From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2005 15:20:36 -0700 Subject: Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! --- arch/x86_64/Kconfig | 477 ++++++++ arch/x86_64/Kconfig.debug | 57 + arch/x86_64/Makefile | 119 ++ arch/x86_64/boot/Makefile | 102 ++ arch/x86_64/boot/bootsect.S | 98 ++ arch/x86_64/boot/compressed/Makefile | 32 + arch/x86_64/boot/compressed/head.S | 142 +++ arch/x86_64/boot/compressed/misc.c | 354 ++++++ arch/x86_64/boot/compressed/miscsetup.h | 39 + arch/x86_64/boot/compressed/vmlinux.scr | 9 + arch/x86_64/boot/install.sh | 40 + arch/x86_64/boot/mtools.conf.in | 17 + arch/x86_64/boot/setup.S | 867 +++++++++++++ arch/x86_64/boot/tools/build.c | 186 +++ arch/x86_64/boot/video.S | 2007 +++++++++++++++++++++++++++++++ arch/x86_64/defconfig | 1129 +++++++++++++++++ arch/x86_64/ia32/Makefile | 32 + arch/x86_64/ia32/fpu32.c | 184 +++ arch/x86_64/ia32/ia32_aout.c | 529 ++++++++ arch/x86_64/ia32/ia32_binfmt.c | 434 +++++++ arch/x86_64/ia32/ia32_ioctl.c | 201 ++++ arch/x86_64/ia32/ia32_signal.c | 621 ++++++++++ arch/x86_64/ia32/ia32entry.S | 602 +++++++++ arch/x86_64/ia32/ipc32.c | 57 + arch/x86_64/ia32/ptrace32.c | 379 ++++++ arch/x86_64/ia32/sys_ia32.c | 1050 ++++++++++++++++ arch/x86_64/ia32/syscall32.c | 111 ++ arch/x86_64/ia32/tls32.c | 163 +++ arch/x86_64/ia32/vsyscall-sigreturn.S | 120 ++ arch/x86_64/ia32/vsyscall-syscall.S | 68 ++ arch/x86_64/ia32/vsyscall-sysenter.S | 94 ++ arch/x86_64/ia32/vsyscall.lds | 77 ++ arch/x86_64/kernel/Makefile | 45 + arch/x86_64/kernel/acpi/Makefile | 3 + arch/x86_64/kernel/acpi/sleep.c | 132 ++ arch/x86_64/kernel/acpi/wakeup.S | 527 ++++++++ arch/x86_64/kernel/aperture.c | 286 +++++ arch/x86_64/kernel/apic.c | 1088 +++++++++++++++++ arch/x86_64/kernel/asm-offsets.c | 69 ++ arch/x86_64/kernel/cpufreq/Kconfig | 96 ++ arch/x86_64/kernel/cpufreq/Makefile | 17 + arch/x86_64/kernel/e820.c | 513 ++++++++ arch/x86_64/kernel/early_printk.c | 220 ++++ arch/x86_64/kernel/entry.S | 920 ++++++++++++++ arch/x86_64/kernel/genapic.c | 89 ++ arch/x86_64/kernel/genapic_cluster.c | 130 ++ arch/x86_64/kernel/genapic_flat.c | 127 ++ arch/x86_64/kernel/head.S | 396 ++++++ arch/x86_64/kernel/head64.c | 117 ++ arch/x86_64/kernel/i387.c | 155 +++ arch/x86_64/kernel/i8259.c | 579 +++++++++ arch/x86_64/kernel/init_task.c | 49 + arch/x86_64/kernel/io_apic.c | 1982 ++++++++++++++++++++++++++++++ arch/x86_64/kernel/ioport.c | 117 ++ arch/x86_64/kernel/irq.c | 108 ++ arch/x86_64/kernel/kprobes.c | 631 ++++++++++ arch/x86_64/kernel/ldt.c | 253 ++++ arch/x86_64/kernel/mce.c | 548 +++++++++ arch/x86_64/kernel/mce_intel.c | 99 ++ arch/x86_64/kernel/module.c | 166 +++ arch/x86_64/kernel/mpparse.c | 949 +++++++++++++++ arch/x86_64/kernel/msr.c | 279 +++++ arch/x86_64/kernel/nmi.c | 488 ++++++++ arch/x86_64/kernel/pci-dma.c | 60 + arch/x86_64/kernel/pci-gart.c | 980 +++++++++++++++ arch/x86_64/kernel/pci-nommu.c | 94 ++ arch/x86_64/kernel/process.c | 770 ++++++++++++ arch/x86_64/kernel/ptrace.c | 547 +++++++++ arch/x86_64/kernel/reboot.c | 163 +++ arch/x86_64/kernel/semaphore.c | 180 +++ arch/x86_64/kernel/setup.c | 1189 ++++++++++++++++++ arch/x86_64/kernel/setup64.c | 292 +++++ arch/x86_64/kernel/signal.c | 486 ++++++++ arch/x86_64/kernel/smp.c | 415 +++++++ arch/x86_64/kernel/smpboot.c | 938 +++++++++++++++ arch/x86_64/kernel/suspend.c | 157 +++ arch/x86_64/kernel/suspend_asm.S | 104 ++ arch/x86_64/kernel/sys_x86_64.c | 173 +++ arch/x86_64/kernel/syscall.c | 26 + arch/x86_64/kernel/time.c | 1262 +++++++++++++++++++ arch/x86_64/kernel/trampoline.S | 64 + arch/x86_64/kernel/traps.c | 948 +++++++++++++++ arch/x86_64/kernel/vmlinux.lds.S | 164 +++ arch/x86_64/kernel/vsyscall.c | 225 ++++ arch/x86_64/kernel/x8664_ksyms.c | 221 ++++ arch/x86_64/lib/Makefile | 14 + arch/x86_64/lib/bitops.c | 141 +++ arch/x86_64/lib/bitstr.c | 28 + arch/x86_64/lib/clear_page.S | 50 + arch/x86_64/lib/copy_page.S | 101 ++ arch/x86_64/lib/copy_user.S | 294 +++++ arch/x86_64/lib/csum-copy.S | 233 ++++ arch/x86_64/lib/csum-partial.c | 150 +++ arch/x86_64/lib/csum-wrappers.c | 129 ++ arch/x86_64/lib/dec_and_lock.c | 40 + arch/x86_64/lib/delay.c | 48 + arch/x86_64/lib/getuser.S | 101 ++ arch/x86_64/lib/io.c | 23 + arch/x86_64/lib/memcpy.S | 121 ++ arch/x86_64/lib/memmove.c | 19 + arch/x86_64/lib/memset.S | 125 ++ arch/x86_64/lib/putuser.S | 89 ++ arch/x86_64/lib/thunk.S | 95 ++ arch/x86_64/lib/usercopy.c | 153 +++ arch/x86_64/mm/Makefile | 11 + arch/x86_64/mm/extable.c | 35 + arch/x86_64/mm/fault.c | 579 +++++++++ arch/x86_64/mm/init.c | 630 ++++++++++ arch/x86_64/mm/ioremap.c | 283 +++++ arch/x86_64/mm/k8topology.c | 168 +++ arch/x86_64/mm/numa.c | 294 +++++ arch/x86_64/mm/pageattr.c | 235 ++++ arch/x86_64/mm/srat.c | 217 ++++ arch/x86_64/oprofile/Kconfig | 23 + arch/x86_64/oprofile/Makefile | 19 + arch/x86_64/pci/Makefile | 24 + arch/x86_64/pci/Makefile-BUS | 22 + arch/x86_64/pci/k8-bus.c | 78 ++ arch/x86_64/pci/mmconfig.c | 104 ++ 119 files changed, 35709 insertions(+) create mode 100644 arch/x86_64/Kconfig create mode 100644 arch/x86_64/Kconfig.debug create mode 100644 arch/x86_64/Makefile create mode 100644 arch/x86_64/boot/Makefile create mode 100644 arch/x86_64/boot/bootsect.S create mode 100644 arch/x86_64/boot/compressed/Makefile create mode 100644 arch/x86_64/boot/compressed/head.S create mode 100644 arch/x86_64/boot/compressed/misc.c create mode 100644 arch/x86_64/boot/compressed/miscsetup.h create mode 100644 arch/x86_64/boot/compressed/vmlinux.scr create mode 100644 arch/x86_64/boot/install.sh create mode 100644 arch/x86_64/boot/mtools.conf.in create mode 100644 arch/x86_64/boot/setup.S create mode 100644 arch/x86_64/boot/tools/build.c create mode 100644 arch/x86_64/boot/video.S create mode 100644 arch/x86_64/defconfig create mode 100644 arch/x86_64/ia32/Makefile create mode 100644 arch/x86_64/ia32/fpu32.c create mode 100644 arch/x86_64/ia32/ia32_aout.c create mode 100644 arch/x86_64/ia32/ia32_binfmt.c create mode 100644 arch/x86_64/ia32/ia32_ioctl.c create mode 100644 arch/x86_64/ia32/ia32_signal.c create mode 100644 arch/x86_64/ia32/ia32entry.S create mode 100644 arch/x86_64/ia32/ipc32.c create mode 100644 arch/x86_64/ia32/ptrace32.c create mode 100644 arch/x86_64/ia32/sys_ia32.c create mode 100644 arch/x86_64/ia32/syscall32.c create mode 100644 arch/x86_64/ia32/tls32.c create mode 100644 arch/x86_64/ia32/vsyscall-sigreturn.S create mode 100644 arch/x86_64/ia32/vsyscall-syscall.S create mode 100644 arch/x86_64/ia32/vsyscall-sysenter.S create mode 100644 arch/x86_64/ia32/vsyscall.lds create mode 100644 arch/x86_64/kernel/Makefile create mode 100644 arch/x86_64/kernel/acpi/Makefile create mode 100644 arch/x86_64/kernel/acpi/sleep.c create mode 100644 arch/x86_64/kernel/acpi/wakeup.S create mode 100644 arch/x86_64/kernel/aperture.c create mode 100644 arch/x86_64/kernel/apic.c create mode 100644 arch/x86_64/kernel/asm-offsets.c create mode 100644 arch/x86_64/kernel/cpufreq/Kconfig create mode 100644 arch/x86_64/kernel/cpufreq/Makefile create mode 100644 arch/x86_64/kernel/e820.c create mode 100644 arch/x86_64/kernel/early_printk.c create mode 100644 arch/x86_64/kernel/entry.S create mode 100644 arch/x86_64/kernel/genapic.c create mode 100644 arch/x86_64/kernel/genapic_cluster.c create mode 100644 arch/x86_64/kernel/genapic_flat.c create mode 100644 arch/x86_64/kernel/head.S create mode 100644 arch/x86_64/kernel/head64.c create mode 100644 arch/x86_64/kernel/i387.c create mode 100644 arch/x86_64/kernel/i8259.c create mode 100644 arch/x86_64/kernel/init_task.c create mode 100644 arch/x86_64/kernel/io_apic.c create mode 100644 arch/x86_64/kernel/ioport.c create mode 100644 arch/x86_64/kernel/irq.c create mode 100644 arch/x86_64/kernel/kprobes.c create mode 100644 arch/x86_64/kernel/ldt.c create mode 100644 arch/x86_64/kernel/mce.c create mode 100644 arch/x86_64/kernel/mce_intel.c create mode 100644 arch/x86_64/kernel/module.c create mode 100644 arch/x86_64/kernel/mpparse.c create mode 100644 arch/x86_64/kernel/msr.c create mode 100644 arch/x86_64/kernel/nmi.c create mode 100644 arch/x86_64/kernel/pci-dma.c create mode 100644 arch/x86_64/kernel/pci-gart.c create mode 100644 arch/x86_64/kernel/pci-nommu.c create mode 100644 arch/x86_64/kernel/process.c create mode 100644 arch/x86_64/kernel/ptrace.c create mode 100644 arch/x86_64/kernel/reboot.c create mode 100644 arch/x86_64/kernel/semaphore.c create mode 100644 arch/x86_64/kernel/setup.c create mode 100644 arch/x86_64/kernel/setup64.c create mode 100644 arch/x86_64/kernel/signal.c create mode 100644 arch/x86_64/kernel/smp.c create mode 100644 arch/x86_64/kernel/smpboot.c create mode 100644 arch/x86_64/kernel/suspend.c create mode 100644 arch/x86_64/kernel/suspend_asm.S create mode 100644 arch/x86_64/kernel/sys_x86_64.c create mode 100644 arch/x86_64/kernel/syscall.c create mode 100644 arch/x86_64/kernel/time.c create mode 100644 arch/x86_64/kernel/trampoline.S create mode 100644 arch/x86_64/kernel/traps.c create mode 100644 arch/x86_64/kernel/vmlinux.lds.S create mode 100644 arch/x86_64/kernel/vsyscall.c create mode 100644 arch/x86_64/kernel/x8664_ksyms.c create mode 100644 arch/x86_64/lib/Makefile create mode 100644 arch/x86_64/lib/bitops.c create mode 100644 arch/x86_64/lib/bitstr.c create mode 100644 arch/x86_64/lib/clear_page.S create mode 100644 arch/x86_64/lib/copy_page.S create mode 100644 arch/x86_64/lib/copy_user.S create mode 100644 arch/x86_64/lib/csum-copy.S create mode 100644 arch/x86_64/lib/csum-partial.c create mode 100644 arch/x86_64/lib/csum-wrappers.c create mode 100644 arch/x86_64/lib/dec_and_lock.c create mode 100644 arch/x86_64/lib/delay.c create mode 100644 arch/x86_64/lib/getuser.S create mode 100644 arch/x86_64/lib/io.c create mode 100644 arch/x86_64/lib/memcpy.S create mode 100644 arch/x86_64/lib/memmove.c create mode 100644 arch/x86_64/lib/memset.S create mode 100644 arch/x86_64/lib/putuser.S create mode 100644 arch/x86_64/lib/thunk.S create mode 100644 arch/x86_64/lib/usercopy.c create mode 100644 arch/x86_64/mm/Makefile create mode 100644 arch/x86_64/mm/extable.c create mode 100644 arch/x86_64/mm/fault.c create mode 100644 arch/x86_64/mm/init.c create mode 100644 arch/x86_64/mm/ioremap.c create mode 100644 arch/x86_64/mm/k8topology.c create mode 100644 arch/x86_64/mm/numa.c create mode 100644 arch/x86_64/mm/pageattr.c create mode 100644 arch/x86_64/mm/srat.c create mode 100644 arch/x86_64/oprofile/Kconfig create mode 100644 arch/x86_64/oprofile/Makefile create mode 100644 arch/x86_64/pci/Makefile create mode 100644 arch/x86_64/pci/Makefile-BUS create mode 100644 arch/x86_64/pci/k8-bus.c create mode 100644 arch/x86_64/pci/mmconfig.c (limited to 'arch/x86_64') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig new file mode 100644 index 000000000000..80c38c5d71fe --- /dev/null +++ b/arch/x86_64/Kconfig @@ -0,0 +1,477 @@ +# +# For a description of the syntax of this configuration file, +# see Documentation/kbuild/kconfig-language.txt. +# +# Note: ISA is disabled and will hopefully never be enabled. +# If you managed to buy an ISA x86-64 box you'll have to fix all the +# ISA drivers you need yourself. +# + +mainmenu "Linux Kernel Configuration" + +config X86_64 + bool + default y + help + Port to the x86-64 architecture. x86-64 is a 64-bit extension to the + classical 32-bit x86 architecture. For details see + . + +config 64BIT + def_bool y + +config X86 + bool + default y + +config MMU + bool + default y + +config ISA + bool + +config SBUS + bool + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + +config GENERIC_CALIBRATE_DELAY + bool + default y + +config X86_CMPXCHG + bool + default y + +config EARLY_PRINTK + bool + default y + +config GENERIC_ISA_DMA + bool + default y + +config GENERIC_IOMAP + bool + default y + +source "init/Kconfig" + + +menu "Processor type and features" + +choice + prompt "Processor family" + default MK8 + +config MK8 + bool "AMD-Opteron/Athlon64" + help + Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. + +config MPSC + bool "Intel EM64T" + help + Optimize for Intel Pentium 4 and Xeon CPUs with Intel + Extended Memory 64 Technology(EM64T). For details see + . + +config GENERIC_CPU + bool "Generic-x86-64" + help + Generic x86-64 CPU. + +endchoice + +# +# Define implied options from the CPU selection here +# +config X86_L1_CACHE_BYTES + int + default "128" if GENERIC_CPU || MPSC + default "64" if MK8 + +config X86_L1_CACHE_SHIFT + int + default "7" if GENERIC_CPU || MPSC + default "6" if MK8 + +config X86_TSC + bool + default y + +config X86_GOOD_APIC + bool + default y + +config MICROCODE + tristate "/dev/cpu/microcode - Intel CPU microcode support" + ---help--- + If you say Y here the 'File systems' section, you will be + able to update the microcode on Intel processors. You will + obviously need the actual microcode binary data itself which is + not shipped with the Linux kernel. + + For latest news and information on obtaining all the required + ingredients for this driver, check: + . + + To compile this driver as a module, choose M here: the + module will be called microcode. + If you use modprobe or kmod you may also want to add the line + 'alias char-major-10-184 microcode' to your /etc/modules.conf file. + +config X86_MSR + tristate "/dev/cpu/*/msr - Model-specific register support" + help + This device gives privileged processes access to the x86 + Model-Specific Registers (MSRs). It is a character device with + major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. + MSR accesses are directed to a specific CPU on multi-processor + systems. + +config X86_CPUID + tristate "/dev/cpu/*/cpuid - CPU information support" + help + This device gives processes access to the x86 CPUID instruction to + be executed on a specific processor. It is a character device + with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to + /dev/cpu/31/cpuid. + +# disable it for opteron optimized builds because it pulls in ACPI_BOOT +config X86_HT + bool + depends on SMP && !MK8 + default y + +config MATH_EMULATION + bool + +config MCA + bool + +config EISA + bool + +config X86_IO_APIC + bool + default y + +config X86_LOCAL_APIC + bool + default y + +config MTRR + bool "MTRR (Memory Type Range Register) support" + ---help--- + On Intel P6 family processors (Pentium Pro, Pentium II and later) + the Memory Type Range Registers (MTRRs) may be used to control + processor access to memory ranges. This is most useful if you have + a video (VGA) card on a PCI or AGP bus. Enabling write-combining + allows bus write transfers to be combined into a larger transfer + before bursting over the PCI/AGP bus. This can increase performance + of image write operations 2.5 times or more. Saying Y here creates a + /proc/mtrr file which may be used to manipulate your processor's + MTRRs. Typically the X server should use this. + + This code has a reasonably generic interface so that similar + control registers on other processors can be easily supported + as well. + + Saying Y here also fixes a problem with buggy SMP BIOSes which only + set the MTRRs for the boot CPU and not for the secondary CPUs. This + can lead to all sorts of problems, so it's good to say Y here. + + Just say Y here, all x86-64 machines support MTRRs. + + See for more information. + +config SMP + bool "Symmetric multi-processing support" + ---help--- + This enables support for systems with more than one CPU. If you have + a system with only one CPU, like most personal computers, say N. If + you have a system with more than one CPU, say Y. + + If you say N here, the kernel will run on single and multiprocessor + machines, but will use only one CPU of a multiprocessor machine. If + you say Y here, the kernel will run on many, but not all, + singleprocessor machines. On a singleprocessor machine, the kernel + will run faster if you say N here. + + If you don't know what to do here, say N. + +config PREEMPT + bool "Preemptible Kernel" + ---help--- + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load. On contrary it may also break your drivers and add + priority inheritance problems to your system. Don't select it if + you rely on a stable system or have slightly obscure hardware. + It's also not very well tested on x86-64 currently. + You have been warned. + + Say Y here if you are feeling brave and building a kernel for a + desktop, embedded or real-time system. Say N if you are unsure. + +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. + +config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP + default n + help + SMT scheduler support improves the CPU scheduler's decision making + when dealing with Intel Pentium 4 chips with HyperThreading at a + cost of slightly increased overhead in some places. If unsure say + N here. + +config K8_NUMA + bool "K8 NUMA support" + select NUMA + depends on SMP + help + Enable NUMA (Non Unified Memory Architecture) support for + AMD Opteron Multiprocessor systems. The kernel will try to allocate + memory used by a CPU on the local memory controller of the CPU + and add some more NUMA awareness to the kernel. + This code is recommended on all multiprocessor Opteron systems + and normally doesn't hurt on others. + +config NUMA_EMU + bool "NUMA emulation support" + select NUMA + depends on SMP + help + Enable NUMA emulation. A flat machine will be split + into virtual nodes when booted with "numa=fake=N", where N is the + number of nodes. This is only useful for debugging. + +config DISCONTIGMEM + bool + depends on NUMA + default y + +config NUMA + bool + default n + +config HAVE_DEC_LOCK + bool + depends on SMP + default y + +config NR_CPUS + int "Maximum number of CPUs (2-256)" + range 2 256 + depends on SMP + default "8" + help + This allows you to specify the maximum number of CPUs which this + kernel will support. Current maximum is 256 CPUs due to + APIC addressing limits. Less depending on the hardware. + + This is purely to save memory - each supported CPU requires + memory in the static kernel configuration. + +config HPET_TIMER + bool + default y + help + Use the IA-PC HPET (High Precision Event Timer) to manage + time in preference to the PIT and RTC, if a HPET is + present. The HPET provides a stable time base on SMP + systems, unlike the TSC, but it is more expensive to access, + as it is off-chip. You can find the HPET spec at + . + +config HPET_EMULATE_RTC + bool "Provide RTC interrupt" + depends on HPET_TIMER && RTC=y + +config GART_IOMMU + bool "IOMMU support" + depends on PCI + help + Support the K8 IOMMU. Needed to run systems with more than 4GB of memory + properly with 32-bit PCI devices that do not support DAC (Double Address + Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter. + Normally the kernel will take the right choice by itself. + If unsure, say Y. + +# need this always enabled with GART_IOMMU for the VIA workaround +config SWIOTLB + bool + depends on GART_IOMMU + default y + +config DUMMY_IOMMU + bool + depends on !GART_IOMMU && !SWIOTLB + default y + help + Don't use IOMMU code. This will cause problems when you have more than 4GB + of memory and any 32-bit devices. Don't turn on unless you know what you + are doing. + +config X86_MCE + bool "Machine check support" if EMBEDDED + default y + help + Include a machine check error handler to report hardware errors. + This version will require the mcelog utility to decode some + machine check error logs. See + ftp://ftp.x86-64.org/pub/linux/tools/mcelog + +config X86_MCE_INTEL + bool "Intel MCE features" + depends on X86_MCE && X86_LOCAL_APIC + default y + help + Additional support for intel specific MCE features such as + the thermal monitor. + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc//seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + +endmenu + +# +# Use the generic interrupt handling code in kernel/irq/: +# +config GENERIC_HARDIRQS + bool + default y + +config GENERIC_IRQ_PROBE + bool + default y + +menu "Power management options" + +source kernel/power/Kconfig + +source "drivers/acpi/Kconfig" + +source "arch/x86_64/kernel/cpufreq/Kconfig" + +endmenu + +menu "Bus options (PCI etc.)" + +config PCI + bool "PCI support" + +# x86-64 doesn't support PCI BIOS access from long mode so always go direct. +config PCI_DIRECT + bool + depends on PCI + default y + +config PCI_MMCONFIG + bool "Support mmconfig PCI config space access" + depends on PCI + select ACPI_BOOT + +config UNORDERED_IO + bool "Unordered IO mapping access" + depends on EXPERIMENTAL + help + Use unordered stores to access IO memory mappings in device drivers. + Still very experimental. When a driver works on IA64/ppc64/pa-risc it should + work with this option, but it makes the drivers behave differently + from i386. Requires that the driver writer used memory barriers + properly. + +source "drivers/pci/pcie/Kconfig" + +source "drivers/pci/Kconfig" + +source "drivers/pcmcia/Kconfig" + +source "drivers/pci/hotplug/Kconfig" + +endmenu + + +menu "Executable file formats / Emulations" + +source "fs/Kconfig.binfmt" + +config IA32_EMULATION + bool "IA32 Emulation" + help + Include code to run 32-bit programs under a 64-bit kernel. You should likely + turn this on, unless you're 100% sure that you don't have any 32-bit programs + left. + +config IA32_AOUT + bool "IA32 a.out support" + depends on IA32_EMULATION + help + Support old a.out binaries in the 32bit emulation. + +config COMPAT + bool + depends on IA32_EMULATION + default y + +config SYSVIPC_COMPAT + bool + depends on COMPAT && SYSVIPC + default y + +config UID16 + bool + depends on IA32_EMULATION + default y + +endmenu + +source drivers/Kconfig + +source "drivers/firmware/Kconfig" + +source fs/Kconfig + +source "arch/x86_64/oprofile/Kconfig" + +source "arch/x86_64/Kconfig.debug" + +source "security/Kconfig" + +source "crypto/Kconfig" + +source "lib/Kconfig" diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug new file mode 100644 index 000000000000..9cf1410d2f5a --- /dev/null +++ b/arch/x86_64/Kconfig.debug @@ -0,0 +1,57 @@ +menu "Kernel hacking" + +source "lib/Kconfig.debug" + +# !SMP for now because the context switch early causes GPF in segment reloading +# and the GS base checking does the wrong thing then, causing a hang. +config CHECKING + bool "Additional run-time checks" + depends on DEBUG_KERNEL && !SMP + help + Enables some internal consistency checks for kernel debugging. + You should normally say N. + +config INIT_DEBUG + bool "Debug __init statements" + depends on DEBUG_KERNEL + help + Fill __init and __initdata at the end of boot. This helps debugging + illegal uses of __init and __initdata after initialization. + +config IOMMU_DEBUG + depends on GART_IOMMU && DEBUG_KERNEL + bool "Enable IOMMU debugging" + help + Force the IOMMU to on even when you have less than 4GB of + memory and add debugging code. On overflow always panic. And + allow to enable IOMMU leak tracing. Can be disabled at boot + time with iommu=noforce. This will also enable scatter gather + list merging. Currently not recommended for production + code. When you use it make sure you have a big enough + IOMMU/AGP aperture. Most of the options enabled by this can + be set more finegrained using the iommu= command line + options. See Documentation/x86_64/boot-options.txt for more + details. + +config KPROBES + bool "Kprobes" + depends on DEBUG_KERNEL + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". + +config IOMMU_LEAK + bool "IOMMU leak tracing" + depends on DEBUG_KERNEL + depends on IOMMU_DEBUG + help + Add a simple leak tracer to the IOMMU code. This is useful when you + are debugging a buggy device driver that leaks IOMMU mappings. + +#config X86_REMOTE_DEBUG +# bool "kgdb debugging stub" + +endmenu diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile new file mode 100644 index 000000000000..6f90c246c418 --- /dev/null +++ b/arch/x86_64/Makefile @@ -0,0 +1,119 @@ +# +# x86_64/Makefile +# +# This file is included by the global makefile so that you can add your own +# architecture-specific flags and dependencies. Remember to do have actions +# for "archclean" and "archdep" for cleaning up and making dependencies for +# this architecture +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1994 by Linus Torvalds +# +# 19990713 Artur Skawina +# Added '-march' and '-mpreferred-stack-boundary' support +# 20000913 Pavel Machek +# Converted for x86_64 architecture +# 20010105 Andi Kleen, add IA32 compiler. +# ....and later removed it again.... +# +# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ + +# +# early bootup linking needs 32bit. You can either use real 32bit tools +# here or 64bit tools in 32bit mode. +# +IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer +IA32_LD := $(LD) -m elf_i386 +IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c +IA32_OBJCOPY := $(CROSS_COMPILE)objcopy +IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E +export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP + + +LDFLAGS := -m elf_x86_64 +OBJCOPYFLAGS := -O binary -R .note -R .comment -S +LDFLAGS_vmlinux := -e stext + +CHECKFLAGS += -D__x86_64__ -m64 + +cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) +cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) +CFLAGS += $(cflags-y) + +CFLAGS += -mno-red-zone +CFLAGS += -mcmodel=kernel +CFLAGS += -pipe +# this makes reading assembly source easier, but produces worse code +# actually it makes the kernel smaller too. +CFLAGS += -fno-reorder-blocks +CFLAGS += -Wno-sign-compare +ifneq ($(CONFIG_DEBUG_INFO),y) +CFLAGS += -fno-asynchronous-unwind-tables +# -fweb shrinks the kernel a bit, but the difference is very small +# it also messes up debugging, so don't use it for now. +#CFLAGS += $(call cc-option,-fweb) +endif +# -funit-at-a-time shrinks the kernel .text considerably +# unfortunately it makes reading oopses harder. +CFLAGS += $(call cc-option,-funit-at-a-time) +# prevent gcc from generating any FP code by mistake +CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) + +head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o + +libs-y += arch/x86_64/lib/ +core-y += arch/x86_64/kernel/ arch/x86_64/mm/ +core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ +drivers-$(CONFIG_PCI) += arch/x86_64/pci/ +drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ + +boot := arch/x86_64/boot + +.PHONY: bzImage bzlilo install archmrproper \ + fdimage fdimage144 fdimage288 archclean + +#Default target when executing "make" +all: bzImage + +BOOTIMAGE := arch/x86_64/boot/bzImage +KBUILD_IMAGE := $(BOOTIMAGE) + +bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) + +bzlilo: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo + +bzdisk: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk + +install fdimage fdimage144 fdimage288: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ + +archclean: + $(Q)$(MAKE) $(clean)=$(boot) + +prepare: include/asm-$(ARCH)/offset.h + +arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ + include/config/MARKER + +include/asm-$(ARCH)/offset.h: arch/$(ARCH)/kernel/asm-offsets.s + $(call filechk,gen-asm-offsets) + +CLEAN_FILES += include/asm-$(ARCH)/offset.h + +define archhelp + echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)' + echo ' install - Install kernel using' + echo ' (your) ~/bin/installkernel or' + echo ' (distribution) /sbin/installkernel or' + echo ' install to $$(INSTALL_PATH) and run lilo' +endef + +CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf + + diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile new file mode 100644 index 000000000000..f4399c701b77 --- /dev/null +++ b/arch/x86_64/boot/Makefile @@ -0,0 +1,102 @@ +# +# arch/x86_64/boot/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1994 by Linus Torvalds +# + +# ROOT_DEV specifies the default root-device when making the image. +# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case +# the default of FLOPPY is used by 'build'. + +ROOT_DEV := CURRENT + +# If you want to preset the SVGA mode, uncomment the next line and +# set SVGA_MODE to whatever number you want. +# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. +# The number is the same as you would ordinarily press at bootup. + +SVGA_MODE := -DSVGA_MODE=NORMAL_VGA + +# If you want the RAM disk device, define this to be the size in blocks. + +#RAMDISK := -DRAMDISK=512 + +targets := vmlinux.bin bootsect bootsect.o \ + setup setup.o bzImage mtools.conf + +EXTRA_CFLAGS := -m32 + +hostprogs-y := tools/build +HOST_EXTRACFLAGS += $(LINUXINCLUDE) +subdir- := compressed/ #Let make clean descend in compressed/ +# --------------------------------------------------------------------------- + +$(obj)/bzImage: IMAGE_OFFSET := 0x100000 +$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: BUILDFLAGS := -b + +quiet_cmd_image = BUILD $@ +cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \ + $(obj)/vmlinux.bin $(ROOT_DEV) > $@ + +$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \ + $(obj)/vmlinux.bin $(obj)/tools/build FORCE + $(call if_changed,image) + @echo 'Kernel: $@ is ready' + +$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE + $(call if_changed,objcopy) + +LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary +LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext + +$(obj)/setup $(obj)/bootsect: %: %.o FORCE + $(call if_changed,ld) + +$(obj)/compressed/vmlinux: FORCE + $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ + +# Set this if you want to pass append arguments to the zdisk/fdimage kernel +FDARGS = + +$(obj)/mtools.conf: $(src)/mtools.conf.in + sed -e 's|@OBJ@|$(obj)|g' < $< > $@ + +# This requires write access to /dev/fd0 +zdisk: $(BOOTIMAGE) $(obj)/mtools.conf + MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync + syslinux /dev/fd0 ; sync + echo 'default linux $(FDARGS)' | \ + MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync + +# These require being root or having syslinux 2.02 or higher installed +fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440 + MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync + syslinux $(obj)/fdimage ; sync + echo 'default linux $(FDARGS)' | \ + MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync + +fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf + dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880 + MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync + syslinux $(obj)/fdimage ; sync + echo 'default linux $(FDARGS)' | \ + MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg + MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync + +zlilo: $(BOOTIMAGE) + if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi + if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi + cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz + cp System.map $(INSTALL_PATH)/ + if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi + +install: $(BOOTIMAGE) + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)" diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S new file mode 100644 index 000000000000..bb15d406ee95 --- /dev/null +++ b/arch/x86_64/boot/bootsect.S @@ -0,0 +1,98 @@ +/* + * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds + * + * modified by Drew Eckhardt + * modified by Bruce Evans (bde) + * modified by Chris Noe (May 1999) (as86 -> gas) + * gutted by H. Peter Anvin (Jan 2003) + * + * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment + * addresses must be multiplied by 16 to obtain their respective linear + * addresses. To avoid confusion, linear addresses are written using leading + * hex while segment addresses are written as segment:offset. + * + */ + +#include + +SETUPSECTS = 4 /* default nr of setup-sectors */ +BOOTSEG = 0x07C0 /* original address of boot-sector */ +INITSEG = DEF_INITSEG /* we move boot here - out of the way */ +SETUPSEG = DEF_SETUPSEG /* setup starts here */ +SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ +SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ + /* to be loaded */ +ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ +SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ + +#ifndef SVGA_MODE +#define SVGA_MODE ASK_VGA +#endif + +#ifndef RAMDISK +#define RAMDISK 0 +#endif + +#ifndef ROOT_RDONLY +#define ROOT_RDONLY 1 +#endif + +.code16 +.text + +.global _start +_start: + + # Normalize the start address + jmpl $BOOTSEG, $start2 + +start2: + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + movw $0x7c00, %sp + sti + cld + + movw $bugger_off_msg, %si + +msg_loop: + lodsb + andb %al, %al + jz die + movb $0xe, %ah + movw $7, %bx + int $0x10 + jmp msg_loop + +die: + # Allow the user to press a key, then reboot + xorw %ax, %ax + int $0x16 + int $0x19 + + # int 0x19 should never return. In case it does anyway, + # invoke the BIOS reset code... + ljmp $0xf000,$0xfff0 + + +bugger_off_msg: + .ascii "Direct booting from floppy is no longer supported.\r\n" + .ascii "Please use a boot loader program instead.\r\n" + .ascii "\n" + .ascii "Remove disk and press any key to reboot . . .\r\n" + .byte 0 + + + # Kernel attributes; used by setup + + .org 497 +setup_sects: .byte SETUPSECTS +root_flags: .word ROOT_RDONLY +syssize: .word SYSSIZE +swap_dev: .word SWAP_DEV +ram_size: .word RAMDISK +vid_mode: .word SVGA_MODE +root_dev: .word ROOT_DEV +boot_flag: .word 0xAA55 diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile new file mode 100644 index 000000000000..f89d96f11a9f --- /dev/null +++ b/arch/x86_64/boot/compressed/Makefile @@ -0,0 +1,32 @@ +# +# linux/arch/x86_64/boot/compressed/Makefile +# +# create a compressed vmlinux image from the original vmlinux +# +# Note all the files here are compiled/linked as 32bit executables. +# + +targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o +EXTRA_AFLAGS := -traditional -m32 + +# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with +# -m32 +CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing +LDFLAGS := -m elf_i386 + +LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 + +$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE + $(call if_changed,ld) + @: + +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) + +LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T + +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE + $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S new file mode 100644 index 000000000000..27264dbd575c --- /dev/null +++ b/arch/x86_64/boot/compressed/head.S @@ -0,0 +1,142 @@ +/* + * linux/boot/head.S + * + * Copyright (C) 1991, 1992, 1993 Linus Torvalds + * + * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $ + */ + +/* + * head.S contains the 32-bit startup code. + * + * NOTE!!! Startup happens at absolute address 0x00001000, which is also where + * the page directory will exist. The startup code will be overwritten by + * the page directory. [According to comments etc elsewhere on a compressed + * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] + * + * Page 0 is deliberately kept safe, since System Management Mode code in + * laptops may need to access the BIOS data stored there. This is also + * useful for future device drivers that either access the BIOS via VM86 + * mode. + */ + +/* + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + */ +.code32 +.text + +#include +#include + + .code32 + .globl startup_32 + +startup_32: + cld + cli + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + + lss stack_start,%esp + xorl %eax,%eax +1: incl %eax # check that A20 really IS enabled + movl %eax,0x000000 # loop forever if it isn't + cmpl %eax,0x100000 + je 1b + +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ + pushl $0 + popfl +/* + * Clear BSS + */ + xorl %eax,%eax + movl $_edata,%edi + movl $_end,%ecx + subl %edi,%ecx + cld + rep + stosb +/* + * Do the decompression, and jump to the new kernel.. + */ + subl $16,%esp # place for structure on the stack + movl %esp,%eax + pushl %esi # real mode pointer as second arg + pushl %eax # address of structure as first arg + call decompress_kernel + orl %eax,%eax + jnz 3f + addl $8,%esp + xorl %ebx,%ebx + ljmp $(__KERNEL_CS), $0x100000 + +/* + * We come here, if we were loaded high. + * We need to move the move-in-place routine down to 0x1000 + * and then start it with the buffer addresses in registers, + * which we got from the stack. + */ +3: + movl %esi,%ebx + movl $move_routine_start,%esi + movl $0x1000,%edi + movl $move_routine_end,%ecx + subl %esi,%ecx + addl $3,%ecx + shrl $2,%ecx + cld + rep + movsl + + popl %esi # discard the address + addl $4,%esp # real mode pointer + popl %esi # low_buffer_start + popl %ecx # lcount + popl %edx # high_buffer_start + popl %eax # hcount + movl $0x100000,%edi + cli # make sure we don't get interrupted + ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine + +/* + * Routine (template) for moving the decompressed kernel in place, + * if we were high loaded. This _must_ PIC-code ! + */ +move_routine_start: + movl %ecx,%ebp + shrl $2,%ecx + rep + movsl + movl %ebp,%ecx + andl $3,%ecx + rep + movsb + movl %edx,%esi + movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 + addl $3,%ecx + shrl $2,%ecx + rep + movsl + movl %ebx,%esi # Restore setup pointer + xorl %ebx,%ebx + ljmp $(__KERNEL_CS), $0x100000 +move_routine_end: + + +/* Stack for uncompression */ + .align 32 +user_stack: + .fill 4096,4,0 +stack_start: + .long user_stack+4096 + .word __KERNEL_DS + diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c new file mode 100644 index 000000000000..c8b9216f9e63 --- /dev/null +++ b/arch/x86_64/boot/compressed/misc.c @@ -0,0 +1,354 @@ +/* + * misc.c + * + * This is a collection of several routines from gzip-1.0.3 + * adapted for Linux. + * + * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 + * puts by Nick Holloway 1993, better puts by Martin Mares 1995 + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + */ + +#include "miscsetup.h" +#include + +/* + * gzip declarations + */ + +#define OF(args) args +#define STATIC static + +#undef memset +#undef memcpy +#define memzero(s, n) memset ((s), 0, (n)) + +typedef unsigned char uch; +typedef unsigned short ush; +typedef unsigned long ulg; + +#define WSIZE 0x8000 /* Window size must be at least 32k, */ + /* and a power of two */ + +static uch *inbuf; /* input buffer */ +static uch window[WSIZE]; /* Sliding window buffer */ + +static unsigned insize = 0; /* valid bytes in inbuf */ +static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ +static unsigned outcnt = 0; /* bytes in output buffer */ + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ +#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ +#define RESERVED 0xC0 /* bit 6,7: reserved */ + +#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) + +/* Diagnostic functions */ +#ifdef DEBUG +# define Assert(cond,msg) {if(!(cond)) error(msg);} +# define Trace(x) fprintf x +# define Tracev(x) {if (verbose) fprintf x ;} +# define Tracevv(x) {if (verbose>1) fprintf x ;} +# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} +# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} +#else +# define Assert(cond,msg) +# define Trace(x) +# define Tracev(x) +# define Tracevv(x) +# define Tracec(c,x) +# define Tracecv(c,x) +#endif + +static int fill_inbuf(void); +static void flush_window(void); +static void error(char *m); +static void gzip_mark(void **); +static void gzip_release(void **); + +/* + * This is set up by the setup-routine at boot-time + */ +static unsigned char *real_mode; /* Pointer to real-mode data */ + +#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) +#ifndef STANDARD_MEMORY_BIOS_CALL +#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) +#endif +#define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) + +extern char input_data[]; +extern int input_len; + +static long bytes_out = 0; +static uch *output_data; +static unsigned long output_ptr = 0; + +static void *malloc(int size); +static void free(void *where); + +static void putstr(const char *); + +extern int end; +static long free_mem_ptr = (long)&end; +static long free_mem_end_ptr; + +#define INPLACE_MOVE_ROUTINE 0x1000 +#define LOW_BUFFER_START 0x2000 +#define LOW_BUFFER_MAX 0x90000 +#define HEAP_SIZE 0x3000 +static unsigned int low_buffer_end, low_buffer_size; +static int high_loaded =0; +static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; + +static char *vidmem = (char *)0xb8000; +static int vidport; +static int lines, cols; + +#include "../../../../lib/inflate.c" + +static void *malloc(int size) +{ + void *p; + + if (size <0) error("Malloc error"); + if (free_mem_ptr <= 0) error("Memory error"); + + free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ + + p = (void *)free_mem_ptr; + free_mem_ptr += size; + + if (free_mem_ptr >= free_mem_end_ptr) + error("Out of memory"); + + return p; +} + +static void free(void *where) +{ /* Don't care */ +} + +static void gzip_mark(void **ptr) +{ + *ptr = (void *) free_mem_ptr; +} + +static void gzip_release(void **ptr) +{ + free_mem_ptr = (long) *ptr; +} + +static void scroll(void) +{ + int i; + + memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); + for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) + vidmem[i] = ' '; +} + +static void putstr(const char *s) +{ + int x,y,pos; + char c; + + x = SCREEN_INFO.orig_x; + y = SCREEN_INFO.orig_y; + + while ( ( c = *s++ ) != '\0' ) { + if ( c == '\n' ) { + x = 0; + if ( ++y >= lines ) { + scroll(); + y--; + } + } else { + vidmem [ ( x + cols * y ) * 2 ] = c; + if ( ++x >= cols ) { + x = 0; + if ( ++y >= lines ) { + scroll(); + y--; + } + } + } + } + + SCREEN_INFO.orig_x = x; + SCREEN_INFO.orig_y = y; + + pos = (x + cols * y) * 2; /* Update cursor position */ + outb_p(14, vidport); + outb_p(0xff & (pos >> 9), vidport+1); + outb_p(15, vidport); + outb_p(0xff & (pos >> 1), vidport+1); +} + +void* memset(void* s, int c, unsigned n) +{ + int i; + char *ss = (char*)s; + + for (i=0;i> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + output_ptr += (ulg)outcnt; + outcnt = 0; +} + +static void flush_window_high(void) +{ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + in = window; + for (n = 0; n < outcnt; n++) { + ch = *output_data++ = *in++; + if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; +} + +static void flush_window(void) +{ + if (high_loaded) flush_window_high(); + else flush_window_low(); +} + +static void error(char *x) +{ + putstr("\n\n"); + putstr(x); + putstr("\n\n -- System halted"); + + while(1); +} + +void setup_normal_output_buffer(void) +{ +#ifdef STANDARD_MEMORY_BIOS_CALL + if (EXT_MEM_K < 1024) error("Less than 2MB of memory"); +#else + if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); +#endif + output_data = (char *)0x100000; /* Points to 1M */ + free_mem_end_ptr = (long)real_mode; +} + +struct moveparams { + uch *low_buffer_start; int lcount; + uch *high_buffer_start; int hcount; +}; + +void setup_output_buffer_if_we_run_high(struct moveparams *mv) +{ + high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); +#ifdef STANDARD_MEMORY_BIOS_CALL + if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); +#else + if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); +#endif + mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; + low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX + ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; + low_buffer_size = low_buffer_end - LOW_BUFFER_START; + high_loaded = 1; + free_mem_end_ptr = (long)high_buffer_start; + if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(0x100000 + low_buffer_size); + mv->hcount = 0; /* say: we need not to move high_buffer */ + } + else mv->hcount = -1; + mv->high_buffer_start = high_buffer_start; +} + +void close_output_buffer_if_we_run_high(struct moveparams *mv) +{ + if (bytes_out > low_buffer_size) { + mv->lcount = low_buffer_size; + if (mv->hcount) + mv->hcount = bytes_out - low_buffer_size; + } else { + mv->lcount = bytes_out; + mv->hcount = 0; + } +} + +int decompress_kernel(struct moveparams *mv, void *rmode) +{ + real_mode = rmode; + + if (SCREEN_INFO.orig_video_mode == 7) { + vidmem = (char *) 0xb0000; + vidport = 0x3b4; + } else { + vidmem = (char *) 0xb8000; + vidport = 0x3d4; + } + + lines = SCREEN_INFO.orig_video_lines; + cols = SCREEN_INFO.orig_video_cols; + + if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); + else setup_output_buffer_if_we_run_high(mv); + + makecrc(); + putstr(".\nDecompressing Linux..."); + gunzip(); + putstr("done.\nBooting the kernel.\n"); + if (high_loaded) close_output_buffer_if_we_run_high(mv); + return high_loaded; +} diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h new file mode 100644 index 000000000000..bb1620531703 --- /dev/null +++ b/arch/x86_64/boot/compressed/miscsetup.h @@ -0,0 +1,39 @@ +#define NULL 0 +//typedef unsigned int size_t; + + +struct screen_info { + unsigned char orig_x; /* 0x00 */ + unsigned char orig_y; /* 0x01 */ + unsigned short dontuse1; /* 0x02 -- EXT_MEM_K sits here */ + unsigned short orig_video_page; /* 0x04 */ + unsigned char orig_video_mode; /* 0x06 */ + unsigned char orig_video_cols; /* 0x07 */ + unsigned short unused2; /* 0x08 */ + unsigned short orig_video_ega_bx; /* 0x0a */ + unsigned short unused3; /* 0x0c */ + unsigned char orig_video_lines; /* 0x0e */ + unsigned char orig_video_isVGA; /* 0x0f */ + unsigned short orig_video_points; /* 0x10 */ + + /* VESA graphic mode -- linear frame buffer */ + unsigned short lfb_width; /* 0x12 */ + unsigned short lfb_height; /* 0x14 */ + unsigned short lfb_depth; /* 0x16 */ + unsigned long lfb_base; /* 0x18 */ + unsigned long lfb_size; /* 0x1c */ + unsigned short dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */ + unsigned short lfb_linelength; /* 0x24 */ + unsigned char red_size; /* 0x26 */ + unsigned char red_pos; /* 0x27 */ + unsigned char green_size; /* 0x28 */ + unsigned char green_pos; /* 0x29 */ + unsigned char blue_size; /* 0x2a */ + unsigned char blue_pos; /* 0x2b */ + unsigned char rsvd_size; /* 0x2c */ + unsigned char rsvd_pos; /* 0x2d */ + unsigned short vesapm_seg; /* 0x2e */ + unsigned short vesapm_off; /* 0x30 */ + unsigned short pages; /* 0x32 */ + /* 0x34 -- 0x3f reserved for future expansion */ +}; diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr new file mode 100644 index 000000000000..1ed9d791f863 --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.scr @@ -0,0 +1,9 @@ +SECTIONS +{ + .data : { + input_len = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + input_data_end = .; + } +} diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh new file mode 100644 index 000000000000..90f2452b3b9e --- /dev/null +++ b/arch/x86_64/boot/install.sh @@ -0,0 +1,40 @@ +#!/bin/sh +# +# arch/i386/boot/install.sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# +# "make install" script for i386 architecture +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) +# + +# User may have a custom install script + +if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi +if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi + +# Default install - same as make zlilo + +if [ -f $4/vmlinuz ]; then + mv $4/vmlinuz $4/vmlinuz.old +fi + +if [ -f $4/System.map ]; then + mv $4/System.map $4/System.old +fi + +cat $2 > $4/vmlinuz +cp $3 $4/System.map + +if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in new file mode 100644 index 000000000000..efd6d2490c1d --- /dev/null +++ b/arch/x86_64/boot/mtools.conf.in @@ -0,0 +1,17 @@ +# +# mtools configuration file for "make (b)zdisk" +# + +# Actual floppy drive +drive a: + file="/dev/fd0" + +# 1.44 MB floppy disk image +drive v: + file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter + +# 2.88 MB floppy disk image (mostly for virtual uses) +drive w: + file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter + + diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S new file mode 100644 index 000000000000..3e838be9dbe7 --- /dev/null +++ b/arch/x86_64/boot/setup.S @@ -0,0 +1,867 @@ +/* + * setup.S Copyright (C) 1991, 1992 Linus Torvalds + * + * setup.s is responsible for getting the system data from the BIOS, + * and putting them into the appropriate places in system memory. + * both setup.s and system has been loaded by the bootblock. + * + * This code asks the bios for memory/disk/other parameters, and + * puts them in a "safe" place: 0x90000-0x901FF, ie where the + * boot-block used to be. It is then up to the protected mode + * system to read them from there before the area is overwritten + * for buffer-blocks. + * + * Move PS/2 aux init code to psaux.c + * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92 + * + * some changes and additional features by Christoph Niemann, + * March 1993/June 1994 (Christoph.Niemann@linux.org) + * + * add APM BIOS checking by Stephen Rothwell, May 1994 + * (sfr@canb.auug.org.au) + * + * High load stuff, initrd support and position independency + * by Hans Lermen & Werner Almesberger, February 1996 + * , + * + * Video handling moved to video.S by Martin Mares, March 1996 + * + * + * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david + * parsons) to avoid loadlin confusion, July 1997 + * + * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999. + * + * + * Fix to work around buggy BIOSes which dont use carry bit correctly + * and/or report extended memory in CX/DX for e801h memory size detection + * call. As a result the kernel got wrong figures. The int15/e801h docs + * from Ralf Brown interrupt list seem to indicate AX/BX should be used + * anyway. So to avoid breaking many machines (presumably there was a reason + * to orginally use CX/DX instead of AX/BX), we do a kludge to see + * if CX/DX have been changed in the e801 call and if so use AX/BX . + * Michael Miller, April 2001 + * + * Added long mode checking and SSE force. March 2003, Andi Kleen. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Signature words to ensure LILO loaded us right */ +#define SIG1 0xAA55 +#define SIG2 0x5A5A + +INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way +SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536). +SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment + # ... and the former contents of CS + +DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020 + +.code16 +.globl begtext, begdata, begbss, endtext, enddata, endbss + +.text +begtext: +.data +begdata: +.bss +begbss: +.text + +start: + jmp trampoline + +# This is the setup header, and it must start at %cs:2 (old 0x9020:2) + + .ascii "HdrS" # header signature + .word 0x0203 # header version number (>= 0x0105) + # or else old loadlin-1.5 will fail) +realmode_swtch: .word 0, 0 # default_switch, SETUPSEG +start_sys_seg: .word SYSSEG + .word kernel_version # pointing to kernel version string + # above section of header is compatible + # with loadlin-1.5 (header v1.5). Don't + # change it. + +type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin, + # Bootlin, SYSLX, bootsect...) + # See Documentation/i386/boot.txt for + # assigned ids + +# flags, unused bits must be zero (RFU) bit within loadflags +loadflags: +LOADED_HIGH = 1 # If set, the kernel is loaded high +CAN_USE_HEAP = 0x80 # If set, the loader also has set + # heap_end_ptr to tell how much + # space behind setup.S can be used for + # heap purposes. + # Only the loader knows what is free +#ifndef __BIG_KERNEL__ + .byte 0 +#else + .byte LOADED_HIGH +#endif + +setup_move_size: .word 0x8000 # size to move, when setup is not + # loaded at 0x90000. We will move setup + # to 0x90000 then just before jumping + # into the kernel. However, only the + # loader knows how much data behind + # us also needs to be loaded. + +code32_start: # here loaders can put a different + # start address for 32-bit code. +#ifndef __BIG_KERNEL__ + .long 0x1000 # 0x1000 = default for zImage +#else + .long 0x100000 # 0x100000 = default for big kernel +#endif + +ramdisk_image: .long 0 # address of loaded ramdisk image + # Here the loader puts the 32-bit + # address where it loaded the image. + # This only will be read by the kernel. + +ramdisk_size: .long 0 # its size in bytes + +bootsect_kludge: + .long 0 # obsolete + +heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later) + # space from here (exclusive) down to + # end of setup code can be used by setup + # for local heap purposes. + +pad1: .word 0 +cmd_line_ptr: .long 0 # (Header version 0x0202 or later) + # If nonzero, a 32-bit pointer + # to the kernel command line. + # The command line should be + # located between the start of + # setup and the end of low + # memory (0xa0000), or it may + # get overwritten before it + # gets read. If this field is + # used, there is no longer + # anything magical about the + # 0x90000 segment; the setup + # can be located anywhere in + # low memory 0x10000 or higher. + +ramdisk_max: .long 0xffffffff + +trampoline: call start_of_setup + .align 16 + # The offset at this point is 0x240 + .space (0x7ff-0x240+1) # E820 & EDD space (ending at 0x7ff) +# End of setup header ##################################################### + +start_of_setup: +# Bootlin depends on this being done early + movw $0x01500, %ax + movb $0x81, %dl + int $0x13 + +#ifdef SAFE_RESET_DISK_CONTROLLER +# Reset the disk controller. + movw $0x0000, %ax + movb $0x80, %dl + int $0x13 +#endif + +# Set %ds = %cs, we know that SETUPSEG = %cs at this point + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds +# Check signature at end of setup + cmpw $SIG1, setup_sig1 + jne bad_sig + + cmpw $SIG2, setup_sig2 + jne bad_sig + + jmp good_sig1 + +# Routine to print asciiz string at ds:si +prtstr: + lodsb + andb %al, %al + jz fin + + call prtchr + jmp prtstr + +fin: ret + +# Space printing +prtsp2: call prtspc # Print double space +prtspc: movb $0x20, %al # Print single space (note: fall-thru) + +prtchr: + pushw %ax + pushw %cx + movw $0007,%bx + movw $0x01, %cx + movb $0x0e, %ah + int $0x10 + popw %cx + popw %ax + ret + +beep: movb $0x07, %al + jmp prtchr + +no_sig_mess: .string "No setup signature found ..." + +good_sig1: + jmp good_sig + +# We now have to find the rest of the setup code/data +bad_sig: + movw %cs, %ax # SETUPSEG + subw $DELTA_INITSEG, %ax # INITSEG + movw %ax, %ds + xorb %bh, %bh + movb (497), %bl # get setup sect from bootsect + subw $4, %bx # LILO loads 4 sectors of setup + shlw $8, %bx # convert to words (1sect=2^8 words) + movw %bx, %cx + shrw $3, %bx # convert to segment + addw $SYSSEG, %bx + movw %bx, %cs:start_sys_seg +# Move rest of setup code/data to here + movw $2048, %di # four sectors loaded by LILO + subw %si, %si + movw %cs, %ax # aka SETUPSEG + movw %ax, %es + movw $SYSSEG, %ax + movw %ax, %ds + rep + movsw + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds + cmpw $SIG1, setup_sig1 + jne no_sig + + cmpw $SIG2, setup_sig2 + jne no_sig + + jmp good_sig + +no_sig: + lea no_sig_mess, %si + call prtstr + +no_sig_loop: + jmp no_sig_loop + +good_sig: + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %ds +# Check if an old loader tries to load a big-kernel + testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel? + jz loader_ok # No, no danger for old loaders. + + cmpb $0, %cs:type_of_loader # Do we have a loader that + # can deal with us? + jnz loader_ok # Yes, continue. + + pushw %cs # No, we have an old loader, + popw %ds # die. + lea loader_panic_mess, %si + call prtstr + + jmp no_sig_loop + +loader_panic_mess: .string "Wrong loader, giving up..." + +loader_ok: + /* check for long mode. */ + /* we have to do this before the VESA setup, otherwise the user + can't see the error message. */ + + pushw %ds + movw %cs,%ax + movw %ax,%ds + + /* minimum CPUID flags for x86-64 */ + /* see http://www.x86-64.org/lists/discuss/msg02971.html */ +#define SSE_MASK ((1<<25)|(1<<26)) +#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ + (1<<13)|(1<<15)|(1<<24)) +#define REQUIRED_MASK2 (1<<29) + + pushfl /* standard way to check for cpuid */ + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz no_longmode /* cpu has no cpuid */ + movl $0x0,%eax + cpuid + cmpl $0x1,%eax + jb no_longmode /* no cpuid 1 */ + xor %di,%di + cmpl $0x68747541,%ebx /* AuthenticAMD */ + jnz noamd + cmpl $0x69746e65,%edx + jnz noamd + cmpl $0x444d4163,%ecx + jnz noamd + mov $1,%di /* cpu is from AMD */ +noamd: + movl $0x1,%eax + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz no_longmode + movl $0x80000000,%eax + cpuid + cmpl $0x80000001,%eax + jb no_longmode /* no extended cpuid */ + movl $0x80000001,%eax + cpuid + andl $REQUIRED_MASK2,%edx + xorl $REQUIRED_MASK2,%edx + jnz no_longmode +sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je sse_ok + test %di,%di + jz no_longmode /* only try to force SSE on AMD */ + movl $0xc0010015,%ecx /* HWCR */ + rdmsr + btr $15,%eax /* enable SSE */ + wrmsr + xor %di,%di /* don't loop */ + jmp sse_test /* try again */ +no_longmode: + call beep + lea long_mode_panic,%si + call prtstr +no_longmode_loop: + jmp no_longmode_loop +long_mode_panic: + .string "Your CPU does not support long mode. Use a 32bit distribution." + .byte 0 + +sse_ok: + popw %ds + +# tell BIOS we want to go to long mode + movl $0xec00,%eax # declare target operating mode + movl $2,%ebx # long mode + int $0x15 + +# Get memory size (extended mem, kB) + + xorl %eax, %eax + movl %eax, (0x1e0) +#ifndef STANDARD_MEMORY_BIOS_CALL + movb %al, (E820NR) +# Try three different memory detection schemes. First, try +# e820h, which lets us assemble a memory map, then try e801h, +# which returns a 32-bit memory size, and finally 88h, which +# returns 0-64m + +# method E820H: +# the memory map from hell. e820h returns memory classified into +# a whole bunch of different types, and allows memory holes and +# everything. We scan through this memory map and build a list +# of the first 32 memory areas, which we return at [E820MAP]. +# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm + +#define SMAP 0x534d4150 + +meme820: + xorl %ebx, %ebx # continuation counter + movw $E820MAP, %di # point into the whitelist + # so we can have the bios + # directly write into it. + +jmpe820: + movl $0x0000e820, %eax # e820, upper word zeroed + movl $SMAP, %edx # ascii 'SMAP' + movl $20, %ecx # size of the e820rec + pushw %ds # data record. + popw %es + int $0x15 # make the call + jc bail820 # fall to e801 if it fails + + cmpl $SMAP, %eax # check the return is `SMAP' + jne bail820 # fall to e801 if it fails + +# cmpl $1, 16(%di) # is this usable memory? +# jne again820 + + # If this is usable memory, we save it by simply advancing %di by + # sizeof(e820rec). + # +good820: + movb (E820NR), %al # up to 32 entries + cmpb $E820MAX, %al + jnl bail820 + + incb (E820NR) + movw %di, %ax + addw $20, %ax + movw %ax, %di +again820: + cmpl $0, %ebx # check to see if + jne jmpe820 # %ebx is set to EOF +bail820: + + +# method E801H: +# memory size is in 1k chunksizes, to avoid confusing loadlin. +# we store the 0xe801 memory size in a completely different place, +# because it will most likely be longer than 16 bits. +# (use 1e0 because that's what Larry Augustine uses in his +# alternative new memory detection scheme, and it's sensible +# to write everything into the same place.) + +meme801: + stc # fix to work around buggy + xorw %cx,%cx # BIOSes which dont clear/set + xorw %dx,%dx # carry on pass/error of + # e801h memory size call + # or merely pass cx,dx though + # without changing them. + movw $0xe801, %ax + int $0x15 + jc mem88 + + cmpw $0x0, %cx # Kludge to handle BIOSes + jne e801usecxdx # which report their extended + cmpw $0x0, %dx # memory in AX/BX rather than + jne e801usecxdx # CX/DX. The spec I have read + movw %ax, %cx # seems to indicate AX/BX + movw %bx, %dx # are more reasonable anyway... + +e801usecxdx: + andl $0xffff, %edx # clear sign extend + shll $6, %edx # and go from 64k to 1k chunks + movl %edx, (0x1e0) # store extended memory size + andl $0xffff, %ecx # clear sign extend + addl %ecx, (0x1e0) # and add lower memory into + # total size. + +# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or +# 64mb, depending on the bios) in ax. +mem88: + +#endif + movb $0x88, %ah + int $0x15 + movw %ax, (2) + +# Set the keyboard repeat rate to the max + movw $0x0305, %ax + xorw %bx, %bx + int $0x16 + +# Check for video adapter and its parameters and allow the +# user to browse video modes. + call video # NOTE: we need %ds pointing + # to bootsector + +# Get hd0 data... + xorw %ax, %ax + movw %ax, %ds + ldsw (4 * 0x41), %si + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + pushw %ax + movw %ax, %es + movw $0x0080, %di + movw $0x10, %cx + pushw %cx + cld + rep + movsb +# Get hd1 data... + xorw %ax, %ax + movw %ax, %ds + ldsw (4 * 0x46), %si + popw %cx + popw %es + movw $0x0090, %di + rep + movsb +# Check that there IS a hd1 :-) + movw $0x01500, %ax + movb $0x81, %dl + int $0x13 + jc no_disk1 + + cmpb $3, %ah + je is_disk1 + +no_disk1: + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %es + movw $0x0090, %di + movw $0x10, %cx + xorw %ax, %ax + cld + rep + stosb +is_disk1: + +# Check for PS/2 pointing device + movw %cs, %ax # aka SETUPSEG + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ax, %ds + movw $0, (0x1ff) # default is no pointing device + int $0x11 # int 0x11: equipment list + testb $0x04, %al # check if mouse installed + jz no_psmouse + + movw $0xAA, (0x1ff) # device present +no_psmouse: + +#include "../../i386/boot/edd.S" + +# Now we want to move to protected mode ... + cmpw $0, %cs:realmode_swtch + jz rmodeswtch_normal + + lcall *%cs:realmode_swtch + + jmp rmodeswtch_end + +rmodeswtch_normal: + pushw %cs + call default_switch + +rmodeswtch_end: +# we get the code32 start address and modify the below 'jmpi' +# (loader may have changed it) + movl %cs:code32_start, %eax + movl %eax, %cs:code32 + +# Now we move the system to its rightful place ... but we check if we have a +# big-kernel. In that case we *must* not move it ... + testb $LOADED_HIGH, %cs:loadflags + jz do_move0 # .. then we have a normal low + # loaded zImage + # .. or else we have a high + # loaded bzImage + jmp end_move # ... and we skip moving + +do_move0: + movw $0x100, %ax # start of destination segment + movw %cs, %bp # aka SETUPSEG + subw $DELTA_INITSEG, %bp # aka INITSEG + movw %cs:start_sys_seg, %bx # start of source segment + cld +do_move: + movw %ax, %es # destination segment + incb %ah # instead of add ax,#0x100 + movw %bx, %ds # source segment + addw $0x100, %bx + subw %di, %di + subw %si, %si + movw $0x800, %cx + rep + movsw + cmpw %bp, %bx # assume start_sys_seg > 0x200, + # so we will perhaps read one + # page more than needed, but + # never overwrite INITSEG + # because destination is a + # minimum one page below source + jb do_move + +end_move: +# then we load the segment descriptors + movw %cs, %ax # aka SETUPSEG + movw %ax, %ds + +# Check whether we need to be downward compatible with version <=201 + cmpl $0, cmd_line_ptr + jne end_move_self # loader uses version >=202 features + cmpb $0x20, type_of_loader + je end_move_self # bootsect loader, we know of it + +# Boot loader doesnt support boot protocol version 2.02. +# If we have our code not at 0x90000, we need to move it there now. +# We also then need to move the params behind it (commandline) +# Because we would overwrite the code on the current IP, we move +# it in two steps, jumping high after the first one. + movw %cs, %ax + cmpw $SETUPSEG, %ax + je end_move_self + + cli # make sure we really have + # interrupts disabled ! + # because after this the stack + # should not be used + subw $DELTA_INITSEG, %ax # aka INITSEG + movw %ss, %dx + cmpw %ax, %dx + jb move_self_1 + + addw $INITSEG, %dx + subw %ax, %dx # this will go into %ss after + # the move +move_self_1: + movw %ax, %ds + movw $INITSEG, %ax # real INITSEG + movw %ax, %es + movw %cs:setup_move_size, %cx + std # we have to move up, so we use + # direction down because the + # areas may overlap + movw %cx, %di + decw %di + movw %di, %si + subw $move_self_here+0x200, %cx + rep + movsb + ljmp $SETUPSEG, $move_self_here + +move_self_here: + movw $move_self_here+0x200, %cx + rep + movsb + movw $SETUPSEG, %ax + movw %ax, %ds + movw %dx, %ss +end_move_self: # now we are at the right place + lidt idt_48 # load idt with 0,0 + xorl %eax, %eax # Compute gdt_base + movw %ds, %ax # (Convert %ds:gdt to a linear ptr) + shll $4, %eax + addl $gdt, %eax + movl %eax, (gdt_48+2) + lgdt gdt_48 # load gdt with whatever is + # appropriate + +# that was painless, now we enable a20 + call empty_8042 + + movb $0xD1, %al # command write + outb %al, $0x64 + call empty_8042 + + movb $0xDF, %al # A20 on + outb %al, $0x60 + call empty_8042 + +# +# You must preserve the other bits here. Otherwise embarrasing things +# like laptops powering off on boot happen. Corrected version by Kira +# Brown from Linux 2.2 +# + inb $0x92, %al # + orb $02, %al # "fast A20" version + outb %al, $0x92 # some chips have only this + +# wait until a20 really *is* enabled; it can take a fair amount of +# time on certain systems; Toshiba Tecras are known to have this +# problem. The memory location used here (0x200) is the int 0x80 +# vector, which should be safe to use. + + xorw %ax, %ax # segment 0x0000 + movw %ax, %fs + decw %ax # segment 0xffff (HMA) + movw %ax, %gs +a20_wait: + incw %ax # unused memory location <0xfff0 + movw %ax, %fs:(0x200) # we use the "int 0x80" vector + cmpw %gs:(0x210), %ax # and its corresponding HMA addr + je a20_wait # loop until no longer aliased + +# make sure any possible coprocessor is properly reset.. + xorw %ax, %ax + outb %al, $0xf0 + call delay + + outb %al, $0xf1 + call delay + +# well, that went ok, I hope. Now we mask all interrupts - the rest +# is done in init_IRQ(). + movb $0xFF, %al # mask all interrupts for now + outb %al, $0xA1 + call delay + + movb $0xFB, %al # mask all irq's but irq2 which + outb %al, $0x21 # is cascaded + +# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't +# need no steenking BIOS anyway (except for the initial loading :-). +# The BIOS-routine wants lots of unnecessary data, and it's less +# "interesting" anyway. This is how REAL programmers do it. +# +# Well, now's the time to actually move into protected mode. To make +# things as simple as possible, we do no register set-up or anything, +# we let the gnu-compiled 32-bit programs do that. We just jump to +# absolute address 0x1000 (or the loader supplied one), +# in 32-bit protected mode. +# +# Note that the short jump isn't strictly needed, although there are +# reasons why it might be a good idea. It won't hurt in any case. + movw $1, %ax # protected mode (PE) bit + lmsw %ax # This is it! + jmp flush_instr + +flush_instr: + xorw %bx, %bx # Flag to indicate a boot + xorl %esi, %esi # Pointer to real-mode code + movw %cs, %si + subw $DELTA_INITSEG, %si + shll $4, %esi # Convert to 32-bit pointer +# NOTE: For high loaded big kernels we need a +# jmpi 0x100000,__KERNEL_CS +# +# but we yet haven't reloaded the CS register, so the default size +# of the target offset still is 16 bit. +# However, using an operant prefix (0x66), the CPU will properly +# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference +# Manual, Mixing 16-bit and 32-bit code, page 16-6) + + .byte 0x66, 0xea # prefix + jmpi-opcode +code32: .long 0x1000 # will be set to 0x100000 + # for big kernels + .word __KERNEL_CS + +# Here's a bunch of information about your current kernel.. +kernel_version: .ascii UTS_RELEASE + .ascii " (" + .ascii LINUX_COMPILE_BY + .ascii "@" + .ascii LINUX_COMPILE_HOST + .ascii ") " + .ascii UTS_VERSION + .byte 0 + +# This is the default real mode switch routine. +# to be called just before protected mode transition +default_switch: + cli # no interrupts allowed ! + movb $0x80, %al # disable NMI for bootup + # sequence + outb %al, $0x70 + lret + + +# This routine checks that the keyboard command queue is empty +# (after emptying the output buffers) +# +# Some machines have delusions that the keyboard buffer is always full +# with no keyboard attached... +# +# If there is no keyboard controller, we will usually get 0xff +# to all the reads. With each IO taking a microsecond and +# a timeout of 100,000 iterations, this can take about half a +# second ("delay" == outb to port 0x80). That should be ok, +# and should also be plenty of time for a real keyboard controller +# to empty. +# + +empty_8042: + pushl %ecx + movl $100000, %ecx + +empty_8042_loop: + decl %ecx + jz empty_8042_end_loop + + call delay + + inb $0x64, %al # 8042 status port + testb $1, %al # output buffer? + jz no_output + + call delay + inb $0x60, %al # read it + jmp empty_8042_loop + +no_output: + testb $2, %al # is input buffer full? + jnz empty_8042_loop # yes - loop +empty_8042_end_loop: + popl %ecx + ret + +# Read the cmos clock. Return the seconds in al +gettime: + pushw %cx + movb $0x02, %ah + int $0x1a + movb %dh, %al # %dh contains the seconds + andb $0x0f, %al + movb %dh, %ah + movb $0x04, %cl + shrb %cl, %ah + aad + popw %cx + ret + +# Delay is needed after doing I/O +delay: + outb %al,$0x80 + ret + +# Descriptor tables +gdt: + .word 0, 0, 0, 0 # dummy + + .word 0, 0, 0, 0 # unused + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9A00 # code read/exec + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9200 # data read/write + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) +idt_48: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L +gdt_48: + .word 0x8000 # gdt limit=2048, + # 256 GDT entries + + .word 0, 0 # gdt base (filled in later) + +# Include video setup & detection code + +#include "video.S" + +# Setup signature -- must be last +setup_sig1: .word SIG1 +setup_sig2: .word SIG2 + +# After this point, there is some free space which is used by the video mode +# handling code to store the temporary mode table (not used by the kernel). + +modelist: + +.text +endtext: +.data +enddata: +.bss +endbss: diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c new file mode 100644 index 000000000000..c2fa66313170 --- /dev/null +++ b/arch/x86_64/boot/tools/build.c @@ -0,0 +1,186 @@ +/* + * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $ + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1997 Martin Mares + */ + +/* + * This file builds a disk-image from three different files: + * + * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest + * - setup: 8086 machine code, sets up system parm + * - system: 80386 code for actual system + * + * It does some checking that all files are of the correct type, and + * just writes the result to stdout, removing headers and padding to + * the right amount. It also writes some system data to stderr. + */ + +/* + * Changes by tytso to allow root device specification + * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 + * Cross compiling fixes by Gertjan van Wingerde, July 1996 + * Rewritten by Martin Mares, April 1997 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef unsigned char byte; +typedef unsigned short word; +typedef unsigned long u32; + +#define DEFAULT_MAJOR_ROOT 0 +#define DEFAULT_MINOR_ROOT 0 + +/* Minimal number of setup sectors (see also bootsect.S) */ +#define SETUP_SECTS 4 + +byte buf[1024]; +int fd; +int is_big_kernel; + +void die(const char * str, ...) +{ + va_list args; + va_start(args, str); + vfprintf(stderr, str, args); + fputc('\n', stderr); + exit(1); +} + +void file_open(const char *name) +{ + if ((fd = open(name, O_RDONLY, 0)) < 0) + die("Unable to open `%s': %m", name); +} + +void usage(void) +{ + die("Usage: build [-b] bootsect setup system [rootdev] [> image]"); +} + +int main(int argc, char ** argv) +{ + unsigned int i, c, sz, setup_sectors; + u32 sys_size; + byte major_root, minor_root; + struct stat sb; + + if (argc > 2 && !strcmp(argv[1], "-b")) + { + is_big_kernel = 1; + argc--, argv++; + } + if ((argc < 4) || (argc > 5)) + usage(); + if (argc > 4) { + if (!strcmp(argv[4], "CURRENT")) { + if (stat("/", &sb)) { + perror("/"); + die("Couldn't stat /"); + } + major_root = major(sb.st_dev); + minor_root = minor(sb.st_dev); + } else if (strcmp(argv[4], "FLOPPY")) { + if (stat(argv[4], &sb)) { + perror(argv[4]); + die("Couldn't stat root device."); + } + major_root = major(sb.st_rdev); + minor_root = minor(sb.st_rdev); + } else { + major_root = 0; + minor_root = 0; + } + } else { + major_root = DEFAULT_MAJOR_ROOT; + minor_root = DEFAULT_MINOR_ROOT; + } + fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root); + + file_open(argv[1]); + i = read(fd, buf, sizeof(buf)); + fprintf(stderr,"Boot sector %d bytes.\n",i); + if (i != 512) + die("Boot block must be exactly 512 bytes"); + if (buf[510] != 0x55 || buf[511] != 0xaa) + die("Boot block hasn't got boot flag (0xAA55)"); + buf[508] = minor_root; + buf[509] = major_root; + if (write(1, buf, 512) != 512) + die("Write call failed"); + close (fd); + + file_open(argv[2]); /* Copy the setup code */ + for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c ) + if (write(1, buf, c) != c) + die("Write call failed"); + if (c != 0) + die("read-error on `setup'"); + close (fd); + + setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */ + /* for compatibility with ancient versions of LILO. */ + if (setup_sectors < SETUP_SECTS) + setup_sectors = SETUP_SECTS; + fprintf(stderr, "Setup is %d bytes.\n", i); + memset(buf, 0, sizeof(buf)); + while (i < setup_sectors * 512) { + c = setup_sectors * 512 - i; + if (c > sizeof(buf)) + c = sizeof(buf); + if (write(1, buf, c) != c) + die("Write call failed"); + i += c; + } + + file_open(argv[3]); + if (fstat (fd, &sb)) + die("Unable to stat `%s': %m", argv[3]); + sz = sb.st_size; + fprintf (stderr, "System is %d kB\n", sz/1024); + sys_size = (sz + 15) / 16; + /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ + if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) + die("System is too big. Try using %smodules.", + is_big_kernel ? "" : "bzImage or "); + while (sz > 0) { + int l, n; + + l = (sz > sizeof(buf)) ? sizeof(buf) : sz; + if ((n=read(fd, buf, l)) != l) { + if (n < 0) + die("Error reading %s: %m", argv[3]); + else + die("%s: Unexpected EOF", argv[3]); + } + if (write(1, buf, l) != l) + die("Write failed"); + sz -= l; + } + close(fd); + + if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */ + die("Output: seek failed"); + buf[0] = setup_sectors; + if (write(1, buf, 1) != 1) + die("Write of setup sector count failed"); + if (lseek(1, 500, SEEK_SET) != 500) + die("Output: seek failed"); + buf[0] = (sys_size & 0xff); + buf[1] = ((sys_size >> 8) & 0xff); + if (write(1, buf, 2) != 2) + die("Write of image length failed"); + + return 0; /* Everything is OK */ +} diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S new file mode 100644 index 000000000000..0587477c99f2 --- /dev/null +++ b/arch/x86_64/boot/video.S @@ -0,0 +1,2007 @@ +/* video.S + * + * Display adapter & video mode setup, version 2.13 (14-May-99) + * + * Copyright (C) 1995 -- 1998 Martin Mares + * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson + * + * Rewritten to use GNU 'as' by Chris Noe May 1999 + * + * For further information, look at Documentation/svga.txt. + * + */ + +#include /* for CONFIG_VIDEO_* */ + +/* Enable autodetection of SVGA adapters and modes. */ +#undef CONFIG_VIDEO_SVGA + +/* Enable autodetection of VESA modes */ +#define CONFIG_VIDEO_VESA + +/* Enable compacting of mode table */ +#define CONFIG_VIDEO_COMPACT + +/* Retain screen contents when switching modes */ +#define CONFIG_VIDEO_RETAIN + +/* Enable local mode list */ +#undef CONFIG_VIDEO_LOCAL + +/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ +#undef CONFIG_VIDEO_400_HACK + +/* Hack that lets you force specific BIOS mode ID and specific dimensions */ +#undef CONFIG_VIDEO_GFX_HACK +#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */ +#define VIDEO_GFX_BIOS_BX 0x0102 +#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */ + +/* This code uses an extended set of video mode numbers. These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. + */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +/* Special video modes */ +#define VIDEO_FIRST_SPECIAL 0x0f00 +#define VIDEO_80x25 0x0f00 +#define VIDEO_8POINT 0x0f01 +#define VIDEO_80x43 0x0f02 +#define VIDEO_80x28 0x0f03 +#define VIDEO_CURRENT_MODE 0x0f04 +#define VIDEO_80x30 0x0f05 +#define VIDEO_80x34 0x0f06 +#define VIDEO_80x60 0x0f07 +#define VIDEO_GFX_HACK 0x0f08 +#define VIDEO_LAST_SPECIAL 0x0f09 + +/* Video modes given by resolution */ +#define VIDEO_FIRST_RESOLUTION 0x1000 + +/* The "recalculate timings" flag */ +#define VIDEO_RECALC 0x8000 + +/* Positions of various video parameters passed to the kernel */ +/* (see also include/linux/tty.h) */ +#define PARAM_CURSOR_POS 0x00 +#define PARAM_VIDEO_PAGE 0x04 +#define PARAM_VIDEO_MODE 0x06 +#define PARAM_VIDEO_COLS 0x07 +#define PARAM_VIDEO_EGA_BX 0x0a +#define PARAM_VIDEO_LINES 0x0e +#define PARAM_HAVE_VGA 0x0f +#define PARAM_FONT_POINTS 0x10 + +#define PARAM_LFB_WIDTH 0x12 +#define PARAM_LFB_HEIGHT 0x14 +#define PARAM_LFB_DEPTH 0x16 +#define PARAM_LFB_BASE 0x18 +#define PARAM_LFB_SIZE 0x1c +#define PARAM_LFB_LINELENGTH 0x24 +#define PARAM_LFB_COLORS 0x26 +#define PARAM_VESAPM_SEG 0x2e +#define PARAM_VESAPM_OFF 0x30 +#define PARAM_LFB_PAGES 0x32 +#define PARAM_VESA_ATTRIB 0x34 + +/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ +#ifdef CONFIG_VIDEO_RETAIN +#define DO_STORE call store_screen +#else +#define DO_STORE +#endif /* CONFIG_VIDEO_RETAIN */ + +# This is the main entry point called by setup.S +# %ds *must* be pointing to the bootsector +video: pushw %ds # We use different segments + pushw %ds # FS contains original DS + popw %fs + pushw %cs # DS is equal to CS + popw %ds + pushw %cs # ES is equal to CS + popw %es + xorw %ax, %ax + movw %ax, %gs # GS is zero + cld + call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA) +#ifdef CONFIG_VIDEO_SELECT + movw %fs:(0x01fa), %ax # User selected video mode + cmpw $ASK_VGA, %ax # Bring up the menu + jz vid2 + + call mode_set # Set the mode + jc vid1 + + leaw badmdt, %si # Invalid mode ID + call prtstr +vid2: call mode_menu +vid1: +#ifdef CONFIG_VIDEO_RETAIN + call restore_screen # Restore screen contents +#endif /* CONFIG_VIDEO_RETAIN */ + call store_edid +#endif /* CONFIG_VIDEO_SELECT */ + call mode_params # Store mode parameters + popw %ds # Restore original DS + ret + +# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel. +basic_detect: + movb $0, %fs:(PARAM_HAVE_VGA) + movb $0x12, %ah # Check EGA/VGA + movb $0x10, %bl + int $0x10 + movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel + cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card. + je basret + + incb adapter + movw $0x1a00, %ax # Check EGA or VGA? + int $0x10 + cmpb $0x1a, %al # 1a means VGA... + jne basret # anything else is EGA. + + incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA + incb adapter +basret: ret + +# Store the video mode parameters for later usage by the kernel. +# This is done by asking the BIOS except for the rows/columns +# parameters in the default 80x25 mode -- these are set directly, +# because some very obscure BIOSes supply insane values. +mode_params: +#ifdef CONFIG_VIDEO_SELECT + cmpb $0, graphic_mode + jnz mopar_gr +#endif + movb $0x03, %ah # Read cursor position + xorb %bh, %bh + int $0x10 + movw %dx, %fs:(PARAM_CURSOR_POS) + movb $0x0f, %ah # Read page/mode/width + int $0x10 + movw %bx, %fs:(PARAM_VIDEO_PAGE) + movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width + cmpb $0x7, %al # MDA/HGA => segment differs + jnz mopar0 + + movw $0xb000, video_segment +mopar0: movw %gs:(0x485), %ax # Font size + movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA) + movw force_size, %ax # Forced size? + orw %ax, %ax + jz mopar1 + + movb %ah, %fs:(PARAM_VIDEO_COLS) + movb %al, %fs:(PARAM_VIDEO_LINES) + ret + +mopar1: movb $25, %al + cmpb $0, adapter # If we are on CGA/MDA/HGA, the + jz mopar2 # screen must have 25 lines. + + movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS + incb %al # location of max lines. +mopar2: movb %al, %fs:(PARAM_VIDEO_LINES) + ret + +#ifdef CONFIG_VIDEO_SELECT +# Fetching of VESA frame buffer parameters +mopar_gr: + leaw modelist+1024, %di + movb $0x23, %fs:(PARAM_HAVE_VGA) + movw 16(%di), %ax + movw %ax, %fs:(PARAM_LFB_LINELENGTH) + movw 18(%di), %ax + movw %ax, %fs:(PARAM_LFB_WIDTH) + movw 20(%di), %ax + movw %ax, %fs:(PARAM_LFB_HEIGHT) + movb 25(%di), %al + movb $0, %ah + movw %ax, %fs:(PARAM_LFB_DEPTH) + movb 29(%di), %al + movb $0, %ah + movw %ax, %fs:(PARAM_LFB_PAGES) + movl 40(%di), %eax + movl %eax, %fs:(PARAM_LFB_BASE) + movl 31(%di), %eax + movl %eax, %fs:(PARAM_LFB_COLORS) + movl 35(%di), %eax + movl %eax, %fs:(PARAM_LFB_COLORS+4) + movw 0(%di), %ax + movw %ax, %fs:(PARAM_VESA_ATTRIB) + +# get video mem size + leaw modelist+1024, %di + movw $0x4f00, %ax + int $0x10 + xorl %eax, %eax + movw 18(%di), %ax + movl %eax, %fs:(PARAM_LFB_SIZE) + +# switching the DAC to 8-bit is for <= 8 bpp only + movw %fs:(PARAM_LFB_DEPTH), %ax + cmpw $8, %ax + jg dac_done + +# get DAC switching capability + xorl %eax, %eax + movb 10(%di), %al + testb $1, %al + jz dac_set + +# attempt to switch DAC to 8-bit + movw $0x4f08, %ax + movw $0x0800, %bx + int $0x10 + cmpw $0x004f, %ax + jne dac_set + movb %bh, dac_size # store actual DAC size + +dac_set: +# set color size to DAC size + movb dac_size, %al + movb %al, %fs:(PARAM_LFB_COLORS+0) + movb %al, %fs:(PARAM_LFB_COLORS+2) + movb %al, %fs:(PARAM_LFB_COLORS+4) + movb %al, %fs:(PARAM_LFB_COLORS+6) + +# set color offsets to 0 + movb $0, %fs:(PARAM_LFB_COLORS+1) + movb $0, %fs:(PARAM_LFB_COLORS+3) + movb $0, %fs:(PARAM_LFB_COLORS+5) + movb $0, %fs:(PARAM_LFB_COLORS+7) + +dac_done: +# get protected mode interface informations + movw $0x4f0a, %ax + xorw %bx, %bx + xorw %di, %di + int $0x10 + cmp $0x004f, %ax + jnz no_pm + + movw %es, %fs:(PARAM_VESAPM_SEG) + movw %di, %fs:(PARAM_VESAPM_OFF) +no_pm: ret + +# The video mode menu +mode_menu: + leaw keymsg, %si # "Return/Space/Timeout" message + call prtstr + call flush +nokey: call getkt + + cmpb $0x0d, %al # ENTER ? + je listm # yes - manual mode selection + + cmpb $0x20, %al # SPACE ? + je defmd1 # no - repeat + + call beep + jmp nokey + +defmd1: ret # No mode chosen? Default 80x25 + +listm: call mode_table # List mode table +listm0: leaw name_bann, %si # Print adapter name + call prtstr + movw card_name, %si + orw %si, %si + jnz an2 + + movb adapter, %al + leaw old_name, %si + orb %al, %al + jz an1 + + leaw ega_name, %si + decb %al + jz an1 + + leaw vga_name, %si + jmp an1 + +an2: call prtstr + leaw svga_name, %si +an1: call prtstr + leaw listhdr, %si # Table header + call prtstr + movb $0x30, %dl # DL holds mode number + leaw modelist, %si +lm1: cmpw $ASK_VGA, (%si) # End? + jz lm2 + + movb %dl, %al # Menu selection number + call prtchr + call prtsp2 + lodsw + call prthw # Mode ID + call prtsp2 + movb 0x1(%si), %al + call prtdec # Rows + movb $0x78, %al # the letter 'x' + call prtchr + lodsw + call prtdec # Columns + movb $0x0d, %al # New line + call prtchr + movb $0x0a, %al + call prtchr + incb %dl # Next character + cmpb $0x3a, %dl + jnz lm1 + + movb $0x61, %dl + jmp lm1 + +lm2: leaw prompt, %si # Mode prompt + call prtstr + leaw edit_buf, %di # Editor buffer +lm3: call getkey + cmpb $0x0d, %al # Enter? + jz lment + + cmpb $0x08, %al # Backspace? + jz lmbs + + cmpb $0x20, %al # Printable? + jc lm3 + + cmpw $edit_buf+4, %di # Enough space? + jz lm3 + + stosb + call prtchr + jmp lm3 + +lmbs: cmpw $edit_buf, %di # Backspace + jz lm3 + + decw %di + movb $0x08, %al + call prtchr + call prtspc + movb $0x08, %al + call prtchr + jmp lm3 + +lment: movb $0, (%di) + leaw crlft, %si + call prtstr + leaw edit_buf, %si + cmpb $0, (%si) # Empty string = default mode + jz lmdef + + cmpb $0, 1(%si) # One character = menu selection + jz mnusel + + cmpw $0x6373, (%si) # "scan" => mode scanning + jnz lmhx + + cmpw $0x6e61, 2(%si) + jz lmscan + +lmhx: xorw %bx, %bx # Else => mode ID in hex +lmhex: lodsb + orb %al, %al + jz lmuse1 + + subb $0x30, %al + jc lmbad + + cmpb $10, %al + jc lmhx1 + + subb $7, %al + andb $0xdf, %al + cmpb $10, %al + jc lmbad + + cmpb $16, %al + jnc lmbad + +lmhx1: shlw $4, %bx + orb %al, %bl + jmp lmhex + +lmuse1: movw %bx, %ax + jmp lmuse + +mnusel: lodsb # Menu selection + xorb %ah, %ah + subb $0x30, %al + jc lmbad + + cmpb $10, %al + jc lmuse + + cmpb $0x61-0x30, %al + jc lmbad + + subb $0x61-0x30-10, %al + cmpb $36, %al + jnc lmbad + +lmuse: call mode_set + jc lmdef + +lmbad: leaw unknt, %si + call prtstr + jmp lm2 +lmscan: cmpb $0, adapter # Scanning only on EGA/VGA + jz lmbad + + movw $0, mt_end # Scanning of modes is + movb $1, scanning # done as new autodetection. + call mode_table + jmp listm0 +lmdef: ret + +# Additional parts of mode_set... (relative jumps, you know) +setv7: # Video7 extended modes + DO_STORE + subb $VIDEO_FIRST_V7>>8, %bh + movw $0x6f05, %ax + int $0x10 + stc + ret + +_setrec: jmp setrec # Ugly... +_set_80x25: jmp set_80x25 + +# Aliases for backward compatibility. +setalias: + movw $VIDEO_80x25, %ax + incw %bx + jz mode_set + + movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al + incw %bx + jnz setbad # Fall-through! + +# Setting of user mode (AX=mode ID) => CF=success +mode_set: + movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S + movw %ax, %bx + cmpb $0xff, %ah + jz setalias + + testb $VIDEO_RECALC>>8, %ah + jnz _setrec + + cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah + jnc setres + + cmpb $VIDEO_FIRST_SPECIAL>>8, %ah + jz setspc + + cmpb $VIDEO_FIRST_V7>>8, %ah + jz setv7 + + cmpb $VIDEO_FIRST_VESA>>8, %ah + jnc check_vesa + + orb %ah, %ah + jz setmenu + + decb %ah + jz setbios + +setbad: clc + movb $0, do_restore # The screen needn't be restored + ret + +setvesa: + DO_STORE + subb $VIDEO_FIRST_VESA>>8, %bh + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz setbad # AH=0 if OK + + stc + ret + +setbios: + DO_STORE + int $0x10 # Standard BIOS mode set call + pushw %bx + movb $0x0f, %ah # Check if really set + int $0x10 + popw %bx + cmpb %bl, %al + jnz setbad + + stc + ret + +setspc: xorb %bh, %bh # Set special mode + cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl + jnc setbad + + addw %bx, %bx + jmp *spec_inits(%bx) + +setmenu: + orb %al, %al # 80x25 is an exception + jz _set_80x25 + + pushw %bx # Set mode chosen from menu + call mode_table # Build the mode table + popw %ax + shlw $2, %ax + addw %ax, %si + cmpw %di, %si + jnc setbad + + movw (%si), %ax # Fetch mode ID +_m_s: jmp mode_set + +setres: pushw %bx # Set mode chosen by resolution + call mode_table + popw %bx + xchgb %bl, %bh +setr1: lodsw + cmpw $ASK_VGA, %ax # End of the list? + jz setbad + + lodsw + cmpw %bx, %ax + jnz setr1 + + movw -4(%si), %ax # Fetch mode ID + jmp _m_s + +check_vesa: + leaw modelist+1024, %di + subb $VIDEO_FIRST_VESA>>8, %bh + movw %bx, %cx # Get mode information structure + movw $0x4f01, %ax + int $0x10 + addb $VIDEO_FIRST_VESA>>8, %bh + cmpw $0x004f, %ax + jnz setbad + + movb (%di), %al # Check capabilities. + andb $0x19, %al + cmpb $0x09, %al + jz setvesa # This is a text mode + + movb (%di), %al # Check capabilities. + andb $0x99, %al + cmpb $0x99, %al + jnz _setbad # Doh! No linear frame buffer. + + subb $VIDEO_FIRST_VESA>>8, %bh + orw $0x4000, %bx # Use linear frame buffer + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz _setbad # AH=0 if OK + + movb $1, graphic_mode # flag graphic mode + movb $0, do_restore # no screen restore + stc + ret + +_setbad: jmp setbad # Ugly... + +# Recalculate vertical display end registers -- this fixes various +# inconsistencies of extended modes on many adapters. Called when +# the VIDEO_RECALC flag is set in the mode ID. + +setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode + call mode_set + jnc rct3 + + movw %gs:(0x485), %ax # Font size in pixels + movb %gs:(0x484), %bl # Number of rows + incb %bl + mulb %bl # Number of visible + decw %ax # scan lines - 1 + movw $0x3d4, %dx + movw %ax, %bx + movb $0x12, %al # Lower 8 bits + movb %bl, %ah + outw %ax, %dx + movb $0x07, %al # Bits 8 and 9 in the overflow register + call inidx + xchgb %al, %ah + andb $0xbd, %ah + shrb %bh + jnc rct1 + orb $0x02, %ah +rct1: shrb %bh + jnc rct2 + orb $0x40, %ah +rct2: movb $0x07, %al + outw %ax, %dx + stc +rct3: ret + +# Table of routines for setting of the special modes. +spec_inits: + .word set_80x25 + .word set_8pixel + .word set_80x43 + .word set_80x28 + .word set_current + .word set_80x30 + .word set_80x34 + .word set_80x60 + .word set_gfx + +# Set the 80x25 mode. If already set, do nothing. +set_80x25: + movw $0x5019, force_size # Override possibly broken BIOS +use_80x25: +#ifdef CONFIG_VIDEO_400_HACK + movw $0x1202, %ax # Force 400 scan lines + movb $0x30, %bl + int $0x10 +#else + movb $0x0f, %ah # Get current mode ID + int $0x10 + cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available + jz st80 # on CGA/MDA/HGA and is also available on EGAM + + cmpw $0x5003, %ax # Unknown mode, force 80x25 color + jnz force3 + +st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25 + jz set80 + + movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc. + orb %al, %al # Some buggy BIOS'es set 0 rows + jz set80 + + cmpb $24, %al # It's hopefully correct + jz set80 +#endif /* CONFIG_VIDEO_400_HACK */ +force3: DO_STORE + movw $0x0003, %ax # Forced set + int $0x10 +set80: stc + ret + +# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls. +set_8pixel: + DO_STORE + call use_80x25 # The base is 80x25 +set_8pt: + movw $0x1112, %ax # Use 8x8 font + xorb %bl, %bl + int $0x10 + movw $0x1200, %ax # Use alternate print screen + movb $0x20, %bl + int $0x10 + movw $0x1201, %ax # Turn off cursor emulation + movb $0x34, %bl + int $0x10 + movb $0x01, %ah # Define cursor scan lines 6-7 + movw $0x0607, %cx + int $0x10 +set_current: + stc + ret + +# Set the 80x28 mode. This mode works on all VGA's, because it's a standard +# 80x25 mode with 14-point fonts instead of 16-point. +set_80x28: + DO_STORE + call use_80x25 # The base is 80x25 +set14: movw $0x1111, %ax # Use 9x14 font + xorb %bl, %bl + int $0x10 + movb $0x01, %ah # Define cursor scan lines 11-12 + movw $0x0b0c, %cx + int $0x10 + stc + ret + +# Set the 80x43 mode. This mode is works on all VGA's. +# It's a 350-scanline mode with 8-pixel font. +set_80x43: + DO_STORE + movw $0x1201, %ax # Set 350 scans + movb $0x30, %bl + int $0x10 + movw $0x0003, %ax # Reset video mode + int $0x10 + jmp set_8pt # Use 8-pixel font + +# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font. +set_80x30: + call use_80x25 # Start with real 80x25 + DO_STORE + movw $0x3cc, %dx # Get CRTC port + inb %dx, %al + movb $0xd4, %dl + rorb %al # Mono or color? + jc set48a + + movb $0xb4, %dl +set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7) + call outidx + movw $0x0b06, %ax # Vertical total + call outidx + movw $0x3e07, %ax # (Vertical) overflow + call outidx + movw $0xea10, %ax # Vertical sync start + call outidx + movw $0xdf12, %ax # Vertical display end + call outidx + movw $0xe715, %ax # Vertical blank start + call outidx + movw $0x0416, %ax # Vertical blank end + call outidx + pushw %dx + movb $0xcc, %dl # Misc output register (read) + inb %dx, %al + movb $0xc2, %dl # (write) + andb $0x0d, %al # Preserve clock select bits and color bit + orb $0xe2, %al # Set correct sync polarity + outb %al, %dx + popw %dx + movw $0x501e, force_size + stc # That's all. + ret + +# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font. +set_80x34: + call set_80x30 # Set 480 scans + call set14 # And 14-pt font + movw $0xdb12, %ax # VGA vertical display end + movw $0x5022, force_size +setvde: call outidx + stc + ret + +# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font. +set_80x60: + call set_80x30 # Set 480 scans + call set_8pt # And 8-pt font + movw $0xdf12, %ax # VGA vertical display end + movw $0x503c, force_size + jmp setvde + +# Special hack for ThinkPad graphics +set_gfx: +#ifdef CONFIG_VIDEO_GFX_HACK + movw $VIDEO_GFX_BIOS_AX, %ax + movw $VIDEO_GFX_BIOS_BX, %bx + int $0x10 + movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size + stc +#endif + ret + +#ifdef CONFIG_VIDEO_RETAIN + +# Store screen contents to temporary buffer. +store_screen: + cmpb $0, do_restore # Already stored? + jnz stsr + + testb $CAN_USE_HEAP, loadflags # Have we space for storing? + jz stsr + + pushw %ax + pushw %bx + pushw force_size # Don't force specific size + movw $0, force_size + call mode_params # Obtain params of current mode + popw force_size + movb %fs:(PARAM_VIDEO_LINES), %ah + movb %fs:(PARAM_VIDEO_COLS), %al + movw %ax, %bx # BX=dimensions + mulb %ah + movw %ax, %cx # CX=number of characters + addw %ax, %ax # Calculate image size + addw $modelist+1024+4, %ax + cmpw heap_end_ptr, %ax + jnc sts1 # Unfortunately, out of memory + + movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params + leaw modelist+1024, %di + stosw + movw %bx, %ax + stosw + pushw %ds # Store the screen + movw video_segment, %ds + xorw %si, %si + rep + movsw + popw %ds + incb do_restore # Screen will be restored later +sts1: popw %bx + popw %ax +stsr: ret + +# Restore screen contents from temporary buffer. +restore_screen: + cmpb $0, do_restore # Has the screen been stored? + jz res1 + + call mode_params # Get parameters of current mode + movb %fs:(PARAM_VIDEO_LINES), %cl + movb %fs:(PARAM_VIDEO_COLS), %ch + leaw modelist+1024, %si # Screen buffer + lodsw # Set cursor position + movw %ax, %dx + cmpb %cl, %dh + jc res2 + + movb %cl, %dh + decb %dh +res2: cmpb %ch, %dl + jc res3 + + movb %ch, %dl + decb %dl +res3: movb $0x02, %ah + movb $0x00, %bh + int $0x10 + lodsw # Display size + movb %ah, %dl # DL=number of lines + movb $0, %ah # BX=phys. length of orig. line + movw %ax, %bx + cmpb %cl, %dl # Too many? + jc res4 + + pushw %ax + movb %dl, %al + subb %cl, %al + mulb %bl + addw %ax, %si + addw %ax, %si + popw %ax + movb %cl, %dl +res4: cmpb %ch, %al # Too wide? + jc res5 + + movb %ch, %al # AX=width of src. line +res5: movb $0, %cl + xchgb %ch, %cl + movw %cx, %bp # BP=width of dest. line + pushw %es + movw video_segment, %es + xorw %di, %di # Move the data + addw %bx, %bx # Convert BX and BP to _bytes_ + addw %bp, %bp +res6: pushw %si + pushw %di + movw %ax, %cx + rep + movsw + popw %di + popw %si + addw %bp, %di + addw %bx, %si + decb %dl + jnz res6 + + popw %es # Done +res1: ret +#endif /* CONFIG_VIDEO_RETAIN */ + +# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port) +outidx: outb %al, %dx + pushw %ax + movb %ah, %al + incw %dx + outb %al, %dx + decw %dx + popw %ax + ret + +# Build the table of video modes (stored after the setup.S code at the +# `modelist' label. Each video mode record looks like: +# .word MODE-ID (our special mode ID (see above)) +# .byte rows (number of rows) +# .byte columns (number of columns) +# Returns address of the end of the table in DI, the end is marked +# with a ASK_VGA ID. +mode_table: + movw mt_end, %di # Already filled? + orw %di, %di + jnz mtab1x + + leaw modelist, %di # Store standard modes: + movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL) + stosl + movb adapter, %al # CGA/MDA/HGA -- no more modes + orb %al, %al + jz mtabe + + decb %al + jnz mtabv + + movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode + stosl + jmp mtabe + +mtab1x: jmp mtab1 + +mtabv: leaw vga_modes, %si # All modes for std VGA + movw $vga_modes_end-vga_modes, %cx + rep # I'm unable to use movsw as I don't know how to store a half + movsb # of the expression above to cx without using explicit shr. + + cmpb $0, scanning # Mode scan requested? + jz mscan1 + + call mode_scan +mscan1: + +#ifdef CONFIG_VIDEO_LOCAL + call local_modes +#endif /* CONFIG_VIDEO_LOCAL */ + +#ifdef CONFIG_VIDEO_VESA + call vesa_modes # Detect VESA VGA modes +#endif /* CONFIG_VIDEO_VESA */ + +#ifdef CONFIG_VIDEO_SVGA + cmpb $0, scanning # Bypass when scanning + jnz mscan2 + + call svga_modes # Detect SVGA cards & modes +mscan2: +#endif /* CONFIG_VIDEO_SVGA */ + +mtabe: + +#ifdef CONFIG_VIDEO_COMPACT + leaw modelist, %si + movw %di, %dx + movw %si, %di +cmt1: cmpw %dx, %si # Scan all modes + jz cmt2 + + leaw modelist, %bx # Find in previous entries + movw 2(%si), %cx +cmt3: cmpw %bx, %si + jz cmt4 + + cmpw 2(%bx), %cx # Found => don't copy this entry + jz cmt5 + + addw $4, %bx + jmp cmt3 + +cmt4: movsl # Copy entry + jmp cmt1 + +cmt5: addw $4, %si # Skip entry + jmp cmt1 + +cmt2: +#endif /* CONFIG_VIDEO_COMPACT */ + + movw $ASK_VGA, (%di) # End marker + movw %di, mt_end +mtab1: leaw modelist, %si # SI=mode list, DI=list end +ret0: ret + +# Modes usable on all standard VGAs +vga_modes: + .word VIDEO_8POINT + .word 0x5032 # 80x50 + .word VIDEO_80x43 + .word 0x502b # 80x43 + .word VIDEO_80x28 + .word 0x501c # 80x28 + .word VIDEO_80x30 + .word 0x501e # 80x30 + .word VIDEO_80x34 + .word 0x5022 # 80x34 + .word VIDEO_80x60 + .word 0x503c # 80x60 +#ifdef CONFIG_VIDEO_GFX_HACK + .word VIDEO_GFX_HACK + .word VIDEO_GFX_DUMMY_RESOLUTION +#endif + +vga_modes_end: +# Detect VESA modes. + +#ifdef CONFIG_VIDEO_VESA +vesa_modes: + cmpb $2, adapter # VGA only + jnz ret0 + + movw %di, %bp # BP=original mode table end + addw $0x200, %di # Buffer space + movw $0x4f00, %ax # VESA Get card info call + int $0x10 + movw %bp, %di + cmpw $0x004f, %ax # Successful? + jnz ret0 + + cmpw $0x4556, 0x200(%di) + jnz ret0 + + cmpw $0x4153, 0x202(%di) + jnz ret0 + + movw $vesa_name, card_name # Set name to "VESA VGA" + pushw %gs + lgsw 0x20e(%di), %si # GS:SI=mode list + movw $128, %cx # Iteration limit +vesa1: +# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst. +# XXX: lodsw %gs:(%si), %ax # Get next mode in the list + gs; lodsw + cmpw $0xffff, %ax # End of the table? + jz vesar + + cmpw $0x0080, %ax # Check validity of mode ID + jc vesa2 + + orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff + jz vesan # Certain BIOSes report 0x80-0xff! + + cmpw $0x0800, %ax + jnc vesae + +vesa2: pushw %cx + movw %ax, %cx # Get mode information structure + movw $0x4f01, %ax + int $0x10 + movw %cx, %bx # BX=mode number + addb $VIDEO_FIRST_VESA>>8, %bh + popw %cx + cmpw $0x004f, %ax + jnz vesan # Don't report errors (buggy BIOSES) + + movb (%di), %al # Check capabilities. We require + andb $0x19, %al # a color text mode. + cmpb $0x09, %al + jnz vesan + + cmpw $0xb800, 8(%di) # Standard video memory address required + jnz vesan + + testb $2, (%di) # Mode characteristics supplied? + movw %bx, (%di) # Store mode number + jz vesa3 + + xorw %dx, %dx + movw 0x12(%di), %bx # Width + orb %bh, %bh + jnz vesan + + movb %bl, 0x3(%di) + movw 0x14(%di), %ax # Height + orb %ah, %ah + jnz vesan + + movb %al, 2(%di) + mulb %bl + cmpw $8193, %ax # Small enough for Linux console driver? + jnc vesan + + jmp vesaok + +vesa3: subw $0x8108, %bx # This mode has no detailed info specified, + jc vesan # so it must be a standard VESA mode. + + cmpw $5, %bx + jnc vesan + + movw vesa_text_mode_table(%bx), %ax + movw %ax, 2(%di) +vesaok: addw $4, %di # The mode is valid. Store it. +vesan: loop vesa1 # Next mode. Limit exceeded => error +vesae: leaw vesaer, %si + call prtstr + movw %bp, %di # Discard already found modes. +vesar: popw %gs + ret + +# Dimensions of standard VESA text modes +vesa_text_mode_table: + .byte 60, 80 # 0108 + .byte 25, 132 # 0109 + .byte 43, 132 # 010A + .byte 50, 132 # 010B + .byte 60, 132 # 010C +#endif /* CONFIG_VIDEO_VESA */ + +# Scan for video modes. A bit dirty, but should work. +mode_scan: + movw $0x0100, %cx # Start with mode 0 +scm1: movb $0, %ah # Test the mode + movb %cl, %al + int $0x10 + movb $0x0f, %ah + int $0x10 + cmpb %cl, %al + jnz scm2 # Mode not set + + movw $0x3c0, %dx # Test if it's a text mode + movb $0x10, %al # Mode bits + call inidx + andb $0x03, %al + jnz scm2 + + movb $0xce, %dl # Another set of mode bits + movb $0x06, %al + call inidx + shrb %al + jc scm2 + + movb $0xd4, %dl # Cursor location + movb $0x0f, %al + call inidx + orb %al, %al + jnz scm2 + + movw %cx, %ax # Ok, store the mode + stosw + movb %gs:(0x484), %al # Number of rows + incb %al + stosb + movw %gs:(0x44a), %ax # Number of columns + stosb +scm2: incb %cl + jns scm1 + + movw $0x0003, %ax # Return back to mode 3 + int $0x10 + ret + +tstidx: outw %ax, %dx # OUT DX,AX and inidx +inidx: outb %al, %dx # Read from indexed VGA register + incw %dx # AL=index, DX=index reg port -> AL=data + inb %dx, %al + decw %dx + ret + +# Try to detect type of SVGA card and supply (usually approximate) video +# mode table for it. + +#ifdef CONFIG_VIDEO_SVGA +svga_modes: + leaw svga_table, %si # Test all known SVGA adapters +dosvga: lodsw + movw %ax, %bp # Default mode table + orw %ax, %ax + jz didsv1 + + lodsw # Pointer to test routine + pushw %si + pushw %di + pushw %es + movw $0xc000, %bx + movw %bx, %es + call *%ax # Call test routine + popw %es + popw %di + popw %si + orw %bp, %bp + jz dosvga + + movw %bp, %si # Found, copy the modes + movb svga_prefix, %ah +cpsvga: lodsb + orb %al, %al + jz didsv + + stosw + movsw + jmp cpsvga + +didsv: movw %si, card_name # Store pointer to card name +didsv1: ret + +# Table of all known SVGA cards. For each card, we store a pointer to +# a table of video modes supported by the card and a pointer to a routine +# used for testing of presence of the card. The video mode table is always +# followed by the name of the card or the chipset. +svga_table: + .word ati_md, ati_test + .word oak_md, oak_test + .word paradise_md, paradise_test + .word realtek_md, realtek_test + .word s3_md, s3_test + .word chips_md, chips_test + .word video7_md, video7_test + .word cirrus5_md, cirrus5_test + .word cirrus6_md, cirrus6_test + .word cirrus1_md, cirrus1_test + .word ahead_md, ahead_test + .word everex_md, everex_test + .word genoa_md, genoa_test + .word trident_md, trident_test + .word tseng_md, tseng_test + .word 0 + +# Test routines and mode tables: + +# S3 - The test algorithm was taken from the SuperProbe package +# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org +s3_test: + movw $0x0f35, %cx # we store some constants in cl/ch + movw $0x03d4, %dx + movb $0x38, %al + call inidx + movb %al, %bh # store current CRT-register 0x38 + movw $0x0038, %ax + call outidx # disable writing to special regs + movb %cl, %al # check whether we can write special reg 0x35 + call inidx + movb %al, %bl # save the current value of CRT reg 0x35 + andb $0xf0, %al # clear bits 0-3 + movb %al, %ah + movb %cl, %al # and write it to CRT reg 0x35 + call outidx + call inidx # now read it back + andb %ch, %al # clear the upper 4 bits + jz s3_2 # the first test failed. But we have a + + movb %bl, %ah # second chance + movb %cl, %al + call outidx + jmp s3_1 # do the other tests + +s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35 + orb %bl, %ah # set the upper 4 bits of ah with the orig value + call outidx # write ... + call inidx # ... and reread + andb %cl, %al # turn off the upper 4 bits + pushw %ax + movb %bl, %ah # restore old value in register 0x35 + movb %cl, %al + call outidx + popw %ax + cmpb %ch, %al # setting lower 4 bits was successful => bad + je no_s3 # writing is allowed => this is not an S3 + +s3_1: movw $0x4838, %ax # allow writing to special regs by putting + call outidx # magic number into CRT-register 0x38 + movb %cl, %al # check whether we can write special reg 0x35 + call inidx + movb %al, %bl + andb $0xf0, %al + movb %al, %ah + movb %cl, %al + call outidx + call inidx + andb %ch, %al + jnz no_s3 # no, we can't write => no S3 + + movw %cx, %ax + orb %bl, %ah + call outidx + call inidx + andb %ch, %al + pushw %ax + movb %bl, %ah # restore old value in register 0x35 + movb %cl, %al + call outidx + popw %ax + cmpb %ch, %al + jne no_s31 # writing not possible => no S3 + movb $0x30, %al + call inidx # now get the S3 id ... + leaw idS3, %di + movw $0x10, %cx + repne + scasb + je no_s31 + + movb %bh, %ah + movb $0x38, %al + jmp s3rest + +no_s3: movb $0x35, %al # restore CRT register 0x35 + movb %bl, %ah + call outidx +no_s31: xorw %bp, %bp # Detection failed +s3rest: movb %bh, %ah + movb $0x38, %al # restore old value of CRT register 0x38 + jmp outidx + +idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95 + .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0 + +s3_md: .byte 0x54, 0x2b, 0x84 + .byte 0x55, 0x19, 0x84 + .byte 0 + .ascii "S3" + .byte 0 + +# ATI cards. +ati_test: + leaw idati, %si + movw $0x31, %di + movw $0x09, %cx + repe + cmpsb + je atiok + + xorw %bp, %bp +atiok: ret + +idati: .ascii "761295520" + +ati_md: .byte 0x23, 0x19, 0x84 + .byte 0x33, 0x2c, 0x84 + .byte 0x22, 0x1e, 0x64 + .byte 0x21, 0x19, 0x64 + .byte 0x58, 0x21, 0x50 + .byte 0x5b, 0x1e, 0x50 + .byte 0 + .ascii "ATI" + .byte 0 + +# AHEAD +ahead_test: + movw $0x200f, %ax + movw $0x3ce, %dx + outw %ax, %dx + incw %dx + inb %dx, %al + cmpb $0x20, %al + je isahed + + cmpb $0x21, %al + je isahed + + xorw %bp, %bp +isahed: ret + +ahead_md: + .byte 0x22, 0x2c, 0x84 + .byte 0x23, 0x19, 0x84 + .byte 0x24, 0x1c, 0x84 + .byte 0x2f, 0x32, 0xa0 + .byte 0x32, 0x22, 0x50 + .byte 0x34, 0x42, 0x50 + .byte 0 + .ascii "Ahead" + .byte 0 + +# Chips & Tech. +chips_test: + movw $0x3c3, %dx + inb %dx, %al + orb $0x10, %al + outb %al, %dx + movw $0x104, %dx + inb %dx, %al + movb %al, %bl + movw $0x3c3, %dx + inb %dx, %al + andb $0xef, %al + outb %al, %dx + cmpb $0xa5, %bl + je cantok + + xorw %bp, %bp +cantok: ret + +chips_md: + .byte 0x60, 0x19, 0x84 + .byte 0x61, 0x32, 0x84 + .byte 0 + .ascii "Chips & Technologies" + .byte 0 + +# Cirrus Logic 5X0 +cirrus1_test: + movw $0x3d4, %dx + movb $0x0c, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bl + xorb %al, %al + outb %al, %dx + decw %dx + movb $0x1f, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bh + xorb %ah, %ah + shlb $4, %al + movw %ax, %cx + movb %bh, %al + shrb $4, %al + addw %ax, %cx + shlw $8, %cx + addw $6, %cx + movw %cx, %ax + movw $0x3c4, %dx + outw %ax, %dx + incw %dx + inb %dx, %al + andb %al, %al + jnz nocirr + + movb %bh, %al + outb %al, %dx + inb %dx, %al + cmpb $0x01, %al + je iscirr + +nocirr: xorw %bp, %bp +iscirr: movw $0x3d4, %dx + movb %bl, %al + xorb %ah, %ah + shlw $8, %ax + addw $0x0c, %ax + outw %ax, %dx + ret + +cirrus1_md: + .byte 0x1f, 0x19, 0x84 + .byte 0x20, 0x2c, 0x84 + .byte 0x22, 0x1e, 0x84 + .byte 0x31, 0x25, 0x64 + .byte 0 + .ascii "Cirrus Logic 5X0" + .byte 0 + +# Cirrus Logic 54XX +cirrus5_test: + movw $0x3c4, %dx + movb $6, %al + call inidx + movb %al, %bl # BL=backup + movw $6, %ax + call tstidx + cmpb $0x0f, %al + jne c5fail + + movw $0x1206, %ax + call tstidx + cmpb $0x12, %al + jne c5fail + + movb $0x1e, %al + call inidx + movb %al, %bh + movb %bh, %ah + andb $0xc0, %ah + movb $0x1e, %al + call tstidx + andb $0x3f, %al + jne c5xx + + movb $0x1e, %al + movb %bh, %ah + orb $0x3f, %ah + call tstidx + xorb $0x3f, %al + andb $0x3f, %al +c5xx: pushf + movb $0x1e, %al + movb %bh, %ah + outw %ax, %dx + popf + je c5done + +c5fail: xorw %bp, %bp +c5done: movb $6, %al + movb %bl, %ah + outw %ax, %dx + ret + +cirrus5_md: + .byte 0x14, 0x19, 0x84 + .byte 0x54, 0x2b, 0x84 + .byte 0 + .ascii "Cirrus Logic 54XX" + .byte 0 + +# Cirrus Logic 64XX -- no known extra modes, but must be identified, because +# it's misidentified by the Ahead test. +cirrus6_test: + movw $0x3ce, %dx + movb $0x0a, %al + call inidx + movb %al, %bl # BL=backup + movw $0xce0a, %ax + call tstidx + orb %al, %al + jne c2fail + + movw $0xec0a, %ax + call tstidx + cmpb $0x01, %al + jne c2fail + + movb $0xaa, %al + call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's. + shrb $4, %al + subb $4, %al + jz c6done + + decb %al + jz c6done + + subb $2, %al + jz c6done + + decb %al + jz c6done + +c2fail: xorw %bp, %bp +c6done: movb $0x0a, %al + movb %bl, %ah + outw %ax, %dx + ret + +cirrus6_md: + .byte 0 + .ascii "Cirrus Logic 64XX" + .byte 0 + +# Everex / Trident +everex_test: + movw $0x7000, %ax + xorw %bx, %bx + int $0x10 + cmpb $0x70, %al + jne noevrx + + shrw $4, %dx + cmpw $0x678, %dx + je evtrid + + cmpw $0x236, %dx + jne evrxok + +evtrid: leaw trident_md, %bp +evrxok: ret + +noevrx: xorw %bp, %bp + ret + +everex_md: + .byte 0x03, 0x22, 0x50 + .byte 0x04, 0x3c, 0x50 + .byte 0x07, 0x2b, 0x64 + .byte 0x08, 0x4b, 0x64 + .byte 0x0a, 0x19, 0x84 + .byte 0x0b, 0x2c, 0x84 + .byte 0x16, 0x1e, 0x50 + .byte 0x18, 0x1b, 0x64 + .byte 0x21, 0x40, 0xa0 + .byte 0x40, 0x1e, 0x84 + .byte 0 + .ascii "Everex/Trident" + .byte 0 + +# Genoa. +genoa_test: + leaw idgenoa, %si # Check Genoa 'clues' + xorw %ax, %ax + movb %es:(0x37), %al + movw %ax, %di + movw $0x04, %cx + decw %si + decw %di +l1: incw %si + incw %di + movb (%si), %al + testb %al, %al + jz l2 + + cmpb %es:(%di), %al +l2: loope l1 + orw %cx, %cx + je isgen + + xorw %bp, %bp +isgen: ret + +idgenoa: .byte 0x77, 0x00, 0x99, 0x66 + +genoa_md: + .byte 0x58, 0x20, 0x50 + .byte 0x5a, 0x2a, 0x64 + .byte 0x60, 0x19, 0x84 + .byte 0x61, 0x1d, 0x84 + .byte 0x62, 0x20, 0x84 + .byte 0x63, 0x2c, 0x84 + .byte 0x64, 0x3c, 0x84 + .byte 0x6b, 0x4f, 0x64 + .byte 0x72, 0x3c, 0x50 + .byte 0x74, 0x42, 0x50 + .byte 0x78, 0x4b, 0x64 + .byte 0 + .ascii "Genoa" + .byte 0 + +# OAK +oak_test: + leaw idoakvga, %si + movw $0x08, %di + movw $0x08, %cx + repe + cmpsb + je isoak + + xorw %bp, %bp +isoak: ret + +idoakvga: .ascii "OAK VGA " + +oak_md: .byte 0x4e, 0x3c, 0x50 + .byte 0x4f, 0x3c, 0x84 + .byte 0x50, 0x19, 0x84 + .byte 0x51, 0x2b, 0x84 + .byte 0 + .ascii "OAK" + .byte 0 + +# WD Paradise. +paradise_test: + leaw idparadise, %si + movw $0x7d, %di + movw $0x04, %cx + repe + cmpsb + je ispara + + xorw %bp, %bp +ispara: ret + +idparadise: .ascii "VGA=" + +paradise_md: + .byte 0x41, 0x22, 0x50 + .byte 0x47, 0x1c, 0x84 + .byte 0x55, 0x19, 0x84 + .byte 0x54, 0x2c, 0x84 + .byte 0 + .ascii "Paradise" + .byte 0 + +# Trident. +trident_test: + movw $0x3c4, %dx + movb $0x0e, %al + outb %al, %dx + incw %dx + inb %dx, %al + xchgb %al, %ah + xorb %al, %al + outb %al, %dx + inb %dx, %al + xchgb %ah, %al + movb %al, %bl # Strange thing ... in the book this wasn't + andb $0x02, %bl # necessary but it worked on my card which + jz setb2 # is a trident. Without it the screen goes + # blurred ... + andb $0xfd, %al + jmp clrb2 + +setb2: orb $0x02, %al +clrb2: outb %al, %dx + andb $0x0f, %ah + cmpb $0x02, %ah + je istrid + + xorw %bp, %bp +istrid: ret + +trident_md: + .byte 0x50, 0x1e, 0x50 + .byte 0x51, 0x2b, 0x50 + .byte 0x52, 0x3c, 0x50 + .byte 0x57, 0x19, 0x84 + .byte 0x58, 0x1e, 0x84 + .byte 0x59, 0x2b, 0x84 + .byte 0x5a, 0x3c, 0x84 + .byte 0 + .ascii "Trident" + .byte 0 + +# Tseng. +tseng_test: + movw $0x3cd, %dx + inb %dx, %al # Could things be this simple ! :-) + movb %al, %bl + movb $0x55, %al + outb %al, %dx + inb %dx, %al + movb %al, %ah + movb %bl, %al + outb %al, %dx + cmpb $0x55, %ah + je istsen + +isnot: xorw %bp, %bp +istsen: ret + +tseng_md: + .byte 0x26, 0x3c, 0x50 + .byte 0x2a, 0x28, 0x64 + .byte 0x23, 0x19, 0x84 + .byte 0x24, 0x1c, 0x84 + .byte 0x22, 0x2c, 0x84 + .byte 0x21, 0x3c, 0x84 + .byte 0 + .ascii "Tseng" + .byte 0 + +# Video7. +video7_test: + movw $0x3cc, %dx + inb %dx, %al + movw $0x3b4, %dx + andb $0x01, %al + jz even7 + + movw $0x3d4, %dx +even7: movb $0x0c, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bl + movb $0x55, %al + outb %al, %dx + inb %dx, %al + decw %dx + movb $0x1f, %al + outb %al, %dx + incw %dx + inb %dx, %al + movb %al, %bh + decw %dx + movb $0x0c, %al + outb %al, %dx + incw %dx + movb %bl, %al + outb %al, %dx + movb $0x55, %al + xorb $0xea, %al + cmpb %bh, %al + jne isnot + + movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching + ret + +video7_md: + .byte 0x40, 0x2b, 0x50 + .byte 0x43, 0x3c, 0x50 + .byte 0x44, 0x3c, 0x64 + .byte 0x41, 0x19, 0x84 + .byte 0x42, 0x2c, 0x84 + .byte 0x45, 0x1c, 0x84 + .byte 0 + .ascii "Video 7" + .byte 0 + +# Realtek VGA +realtek_test: + leaw idrtvga, %si + movw $0x45, %di + movw $0x0b, %cx + repe + cmpsb + je isrt + + xorw %bp, %bp +isrt: ret + +idrtvga: .ascii "REALTEK VGA" + +realtek_md: + .byte 0x1a, 0x3c, 0x50 + .byte 0x1b, 0x19, 0x84 + .byte 0x1c, 0x1e, 0x84 + .byte 0x1d, 0x2b, 0x84 + .byte 0x1e, 0x3c, 0x84 + .byte 0 + .ascii "REALTEK" + .byte 0 + +#endif /* CONFIG_VIDEO_SVGA */ + +# User-defined local mode table (VGA only) +#ifdef CONFIG_VIDEO_LOCAL +local_modes: + leaw local_mode_table, %si +locm1: lodsw + orw %ax, %ax + jz locm2 + + stosw + movsw + jmp locm1 + +locm2: ret + +# This is the table of local video modes which can be supplied manually +# by the user. Each entry consists of mode ID (word) and dimensions +# (byte for column count and another byte for row count). These modes +# are placed before all SVGA and VESA modes and override them if table +# compacting is enabled. The table must end with a zero word followed +# by NUL-terminated video adapter name. +local_mode_table: + .word 0x0100 # Example: 40x25 + .byte 25,40 + .word 0 + .ascii "Local" + .byte 0 +#endif /* CONFIG_VIDEO_LOCAL */ + +# Read a key and return the ASCII code in al, scan code in ah +getkey: xorb %ah, %ah + int $0x16 + ret + +# Read a key with a timeout of 30 seconds. +# The hardware clock is used to get the time. +getkt: call gettime + addb $30, %al # Wait 30 seconds + cmpb $60, %al + jl lminute + + subb $60, %al +lminute: + movb %al, %cl +again: movb $0x01, %ah + int $0x16 + jnz getkey # key pressed, so get it + + call gettime + cmpb %cl, %al + jne again + + movb $0x20, %al # timeout, return `space' + ret + +# Flush the keyboard buffer +flush: movb $0x01, %ah + int $0x16 + jz empty + + xorb %ah, %ah + int $0x16 + jmp flush + +empty: ret + +# Print hexadecimal number. +prthw: pushw %ax + movb %ah, %al + call prthb + popw %ax +prthb: pushw %ax + shrb $4, %al + call prthn + popw %ax + andb $0x0f, %al +prthn: cmpb $0x0a, %al + jc prth1 + + addb $0x07, %al +prth1: addb $0x30, %al + jmp prtchr + +# Print decimal number in al +prtdec: pushw %ax + pushw %cx + xorb %ah, %ah + movb $0x0a, %cl + idivb %cl + cmpb $0x09, %al + jbe lt100 + + call prtdec + jmp skip10 + +lt100: addb $0x30, %al + call prtchr +skip10: movb %ah, %al + addb $0x30, %al + call prtchr + popw %cx + popw %ax + ret + +store_edid: + pushw %es # just save all registers + pushw %ax + pushw %bx + pushw %cx + pushw %dx + pushw %di + + pushw %fs + popw %es + + movl $0x13131313, %eax # memset block with 0x13 + movw $32, %cx + movw $0x140, %di + cld + rep + stosl + + movw $0x4f15, %ax # do VBE/DDC + movw $0x01, %bx + movw $0x00, %cx + movw $0x01, %dx + movw $0x140, %di + int $0x10 + + popw %di # restore all registers + popw %dx + popw %cx + popw %bx + popw %ax + popw %es + ret + +# VIDEO_SELECT-only variables +mt_end: .word 0 # End of video mode table if built +edit_buf: .space 6 # Line editor buffer +card_name: .word 0 # Pointer to adapter name +scanning: .byte 0 # Performing mode scan +do_restore: .byte 0 # Screen contents altered during mode change +svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes +graphic_mode: .byte 0 # Graphic mode with a linear frame buffer +dac_size: .byte 6 # DAC bit depth + +# Status messages +keymsg: .ascii "Press to see video modes available, " + .ascii " to continue or wait 30 secs" + .byte 0x0d, 0x0a, 0 + +listhdr: .byte 0x0d, 0x0a + .ascii "Mode: COLSxROWS:" + +crlft: .byte 0x0d, 0x0a, 0 + +prompt: .byte 0x0d, 0x0a + .asciz "Enter mode number or `scan': " + +unknt: .asciz "Unknown mode ID. Try again." + +badmdt: .ascii "You passed an undefined mode number." + .byte 0x0d, 0x0a, 0 + +vesaer: .ascii "Error: Scanning of VESA modes failed. Please " + .ascii "report to ." + .byte 0x0d, 0x0a, 0 + +old_name: .asciz "CGA/MDA/HGA" + +ega_name: .asciz "EGA" + +svga_name: .ascii " " + +vga_name: .asciz "VGA" + +vesa_name: .asciz "VESA" + +name_bann: .asciz "Video adapter: " +#endif /* CONFIG_VIDEO_SELECT */ + +# Other variables: +adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA +video_segment: .word 0xb800 # Video memory segment +force_size: .word 0 # Use this size instead of the one in BIOS vars diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig new file mode 100644 index 000000000000..9ce51dee30b3 --- /dev/null +++ b/arch/x86_64/defconfig @@ -0,0 +1,1129 @@ +# +# Automatically generated make config: don't edit +# Linux kernel version: 2.6.11-bk7 +# Sat Mar 12 23:43:44 2005 +# +CONFIG_X86_64=y +CONFIG_64BIT=y +CONFIG_X86=y +CONFIG_MMU=y +CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_X86_CMPXCHG=y +CONFIG_EARLY_PRINTK=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_IOMAP=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +CONFIG_LOCK_KERNEL=y + +# +# General setup +# +CONFIG_LOCALVERSION="" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +# CONFIG_BSD_PROCESS_ACCT is not set +CONFIG_SYSCTL=y +# CONFIG_AUDIT is not set +CONFIG_LOG_BUF_SHIFT=18 +# CONFIG_HOTPLUG is not set +CONFIG_KOBJECT_UEVENT=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_CPUSETS is not set +# CONFIG_EMBEDDED is not set +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +# CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SHMEM=y +CONFIG_CC_ALIGN_FUNCTIONS=0 +CONFIG_CC_ALIGN_LABELS=0 +CONFIG_CC_ALIGN_LOOPS=0 +CONFIG_CC_ALIGN_JUMPS=0 +# CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_OBSOLETE_MODPARM=y +# CONFIG_MODVERSIONS is not set +# CONFIG_MODULE_SRCVERSION_ALL is not set +# CONFIG_KMOD is not set +CONFIG_STOP_MACHINE=y + +# +# Processor type and features +# +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_L1_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_TSC=y +CONFIG_X86_GOOD_APIC=y +# CONFIG_MICROCODE is not set +CONFIG_X86_MSR=y +CONFIG_X86_CPUID=y +CONFIG_X86_HT=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_MTRR=y +CONFIG_SMP=y +# CONFIG_PREEMPT is not set +CONFIG_SCHED_SMT=y +CONFIG_K8_NUMA=y +# CONFIG_NUMA_EMU is not set +CONFIG_DISCONTIGMEM=y +CONFIG_NUMA=y +CONFIG_HAVE_DEC_LOCK=y +CONFIG_NR_CPUS=8 +CONFIG_GART_IOMMU=y +CONFIG_SWIOTLB=y +CONFIG_X86_MCE=y +CONFIG_X86_MCE_INTEL=y +CONFIG_SECCOMP=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_IRQ_PROBE=y + +# +# Power management options +# +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_SOFTWARE_SUSPEND=y +CONFIG_PM_STD_PARTITION="" + +# +# ACPI (Advanced Configuration and Power Interface) Support +# +CONFIG_ACPI=y +CONFIG_ACPI_BOOT=y +CONFIG_ACPI_INTERPRETER=y +CONFIG_ACPI_SLEEP=y +CONFIG_ACPI_SLEEP_PROC_FS=y +CONFIG_ACPI_AC=y +CONFIG_ACPI_BATTERY=y +CONFIG_ACPI_BUTTON=y +# CONFIG_ACPI_VIDEO is not set +CONFIG_ACPI_FAN=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ACPI_NUMA=y +# CONFIG_ACPI_ASUS is not set +# CONFIG_ACPI_IBM is not set +CONFIG_ACPI_TOSHIBA=y +CONFIG_ACPI_BLACKLIST_YEAR=2001 +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_BUS=y +CONFIG_ACPI_EC=y +CONFIG_ACPI_POWER=y +CONFIG_ACPI_PCI=y +CONFIG_ACPI_SYSTEM=y +# CONFIG_ACPI_CONTAINER is not set + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_DEBUG is not set +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_STAT_DETAILS is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_TABLE=y + +# +# CPUFreq processor drivers +# +CONFIG_X86_POWERNOW_K8=y +CONFIG_X86_POWERNOW_K8_ACPI=y +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_ACPI_CPUFREQ=y + +# +# shared options +# +CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y + +# +# Bus options (PCI etc.) +# +CONFIG_PCI=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_UNORDERED_IO=y +CONFIG_PCI_MSI=y +# CONFIG_PCI_LEGACY_PROC is not set +# CONFIG_PCI_NAMES is not set + +# +# PCCARD (PCMCIA/CardBus) support +# +# CONFIG_PCCARD is not set + +# +# PC-card bridges +# + +# +# PCI Hotplug Support +# +# CONFIG_HOTPLUG_PCI is not set + +# +# Executable file formats / Emulations +# +CONFIG_BINFMT_ELF=y +# CONFIG_BINFMT_MISC is not set +CONFIG_IA32_EMULATION=y +CONFIG_IA32_AOUT=y +CONFIG_COMPAT=y +CONFIG_SYSVIPC_COMPAT=y +CONFIG_UID16=y + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y +# CONFIG_FW_LOADER is not set +# CONFIG_DEBUG_DRIVER is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Parallel port support +# +# CONFIG_PARPORT is not set + +# +# Plug and Play support +# +# CONFIG_PNP is not set + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +# CONFIG_BLK_DEV_COW_COMMON is not set +CONFIG_BLK_DEV_LOOP=y +# CONFIG_BLK_DEV_CRYPTOLOOP is not set +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_SX8 is not set +# CONFIG_BLK_DEV_UB is not set +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_LBD=y +# CONFIG_CDROM_PKTCDVD is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +# CONFIG_ATA_OVER_ETH is not set + +# +# ATA/ATAPI/MFM/RLL support +# +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# +# CONFIG_BLK_DEV_IDE_SATA is not set +# CONFIG_BLK_DEV_HD_IDE is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_IDEFLOPPY is not set +# CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +# CONFIG_BLK_DEV_CMD640 is not set +CONFIG_BLK_DEV_IDEPCI=y +# CONFIG_IDEPCI_SHARE_IRQ is not set +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_GENERIC is not set +# CONFIG_BLK_DEV_OPTI621 is not set +# CONFIG_BLK_DEV_RZ1000 is not set +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +# CONFIG_BLK_DEV_AEC62XX is not set +# CONFIG_BLK_DEV_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_BLK_DEV_ATIIXP is not set +# CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_TRIFLEX is not set +# CONFIG_BLK_DEV_CY82C693 is not set +# CONFIG_BLK_DEV_CS5520 is not set +# CONFIG_BLK_DEV_CS5530 is not set +# CONFIG_BLK_DEV_HPT34X is not set +# CONFIG_BLK_DEV_HPT366 is not set +# CONFIG_BLK_DEV_SC1200 is not set +CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_NS87415 is not set +# CONFIG_BLK_DEV_PDC202XX_OLD is not set +# CONFIG_BLK_DEV_PDC202XX_NEW is not set +# CONFIG_BLK_DEV_SVWKS is not set +# CONFIG_BLK_DEV_SIIMAGE is not set +# CONFIG_BLK_DEV_SIS5513 is not set +# CONFIG_BLK_DEV_SLC90E66 is not set +# CONFIG_BLK_DEV_TRM290 is not set +# CONFIG_BLK_DEV_VIA82CXXX is not set +# CONFIG_IDE_ARM is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_IVB is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +CONFIG_SCSI=y +# CONFIG_SCSI_PROC_FS is not set + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +# CONFIG_CHR_DEV_ST is not set +# CONFIG_CHR_DEV_OSST is not set +# CONFIG_BLK_DEV_SR is not set +# CONFIG_CHR_DEV_SG is not set + +# +# Some SCSI devices (e.g. CD jukebox) support multiple LUNs +# +# CONFIG_SCSI_MULTI_LUN is not set +# CONFIG_SCSI_CONSTANTS is not set +# CONFIG_SCSI_LOGGING is not set + +# +# SCSI Transport Attributes +# +# CONFIG_SCSI_SPI_ATTRS is not set +# CONFIG_SCSI_FC_ATTRS is not set +# CONFIG_SCSI_ISCSI_ATTRS is not set + +# +# SCSI low-level drivers +# +CONFIG_BLK_DEV_3W_XXXX_RAID=y +# CONFIG_SCSI_3W_9XXX is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC7XXX_OLD is not set +CONFIG_SCSI_AIC79XX=y +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=4000 +# CONFIG_AIC79XX_ENABLE_RD_STRM is not set +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_MEGARAID_NEWGEN is not set +# CONFIG_MEGARAID_LEGACY is not set +CONFIG_SCSI_SATA=y +# CONFIG_SCSI_SATA_AHCI is not set +# CONFIG_SCSI_SATA_SVW is not set +CONFIG_SCSI_ATA_PIIX=y +# CONFIG_SCSI_SATA_NV is not set +# CONFIG_SCSI_SATA_PROMISE is not set +# CONFIG_SCSI_SATA_QSTOR is not set +# CONFIG_SCSI_SATA_SX4 is not set +# CONFIG_SCSI_SATA_SIL is not set +# CONFIG_SCSI_SATA_SIS is not set +# CONFIG_SCSI_SATA_ULI is not set +CONFIG_SCSI_SATA_VIA=y +# CONFIG_SCSI_SATA_VITESSE is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_IPS is not set +# CONFIG_SCSI_INITIO is not set +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_IPR is not set +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +CONFIG_SCSI_QLA2XXX=y +# CONFIG_SCSI_QLA21XX is not set +# CONFIG_SCSI_QLA22XX is not set +# CONFIG_SCSI_QLA2300 is not set +# CONFIG_SCSI_QLA2322 is not set +# CONFIG_SCSI_QLA6312 is not set +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_DC390T is not set +# CONFIG_SCSI_DEBUG is not set + +# +# Multi-device support (RAID and LVM) +# +# CONFIG_MD is not set + +# +# Fusion MPT device support +# +CONFIG_FUSION=y +CONFIG_FUSION_MAX_SGE=40 +# CONFIG_FUSION_CTL is not set + +# +# IEEE 1394 (FireWire) support +# +# CONFIG_IEEE1394 is not set + +# +# I2O device support +# +# CONFIG_I2O is not set + +# +# Networking support +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set +# CONFIG_NETLINK_DEV is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_IP_MROUTE is not set +# CONFIG_ARPD is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_INET_TUNNEL is not set +CONFIG_IP_TCPDIAG=y +CONFIG_IP_TCPDIAG_IPV6=y +CONFIG_IPV6=y +# CONFIG_IPV6_PRIVACY is not set +# CONFIG_INET6_AH is not set +# CONFIG_INET6_ESP is not set +# CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_TUNNEL is not set +# CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETFILTER is not set + +# +# SCTP Configuration (EXPERIMENTAL) +# +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_BRIDGE is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_DECNET is not set +# CONFIG_LLC2 is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set +# CONFIG_NET_CLS_ROUTE is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +CONFIG_NETPOLL=y +# CONFIG_NETPOLL_RX is not set +# CONFIG_NETPOLL_TRAP is not set +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_HAMRADIO is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set +CONFIG_NETDEVICES=y +# CONFIG_DUMMY is not set +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +# CONFIG_TUN is not set + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +CONFIG_MII=y +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_NET_VENDOR_3COM is not set + +# +# Tulip family network device support +# +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_NET_PCI=y +# CONFIG_PCNET32 is not set +CONFIG_AMD8111_ETH=y +# CONFIG_AMD8111E_NAPI is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_B44 is not set +CONFIG_FORCEDETH=y +# CONFIG_DGRS is not set +# CONFIG_EEPRO100 is not set +# CONFIG_E100 is not set +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +# CONFIG_NE2K_PCI is not set +CONFIG_8139CP=m +CONFIG_8139TOO=y +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +# CONFIG_8139TOO_8129 is not set +# CONFIG_8139_OLD_RX_RESET is not set +# CONFIG_SIS900 is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_VIA_RHINE is not set + +# +# Ethernet (1000 Mbit) +# +# CONFIG_ACENIC is not set +# CONFIG_DL2K is not set +CONFIG_E1000=y +# CONFIG_E1000_NAPI is not set +# CONFIG_NS83820 is not set +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_R8169 is not set +# CONFIG_SK98LIN is not set +# CONFIG_VIA_VELOCITY is not set +CONFIG_TIGON3=y + +# +# Ethernet (10000 Mbit) +# +# CONFIG_IXGB is not set +CONFIG_S2IO=m +# CONFIG_S2IO_NAPI is not set +# CONFIG_2BUFF_MODE is not set + +# +# Token Ring devices +# +# CONFIG_TR is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Wan interfaces +# +# CONFIG_WAN is not set +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +# CONFIG_PPP is not set +# CONFIG_SLIP is not set +# CONFIG_NET_FC is not set +# CONFIG_SHAPER is not set +CONFIG_NETCONSOLE=y + +# +# ISDN subsystem +# +# CONFIG_ISDN is not set + +# +# Telephony Support +# +# CONFIG_PHONE is not set + +# +# Input device support +# +CONFIG_INPUT=y + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +# CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_TSDEV is not set +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +# CONFIG_INPUT_MISC is not set + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PCIPS2 is not set +CONFIG_SERIO_LIBPS2=y +# CONFIG_SERIO_RAW is not set +# CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_NR_UARTS=4 +# CONFIG_SERIAL_8250_EXTENDED is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 + +# +# IPMI +# +# CONFIG_IPMI_HANDLER is not set + +# +# Watchdog Cards +# +# CONFIG_WATCHDOG is not set +CONFIG_HW_RANDOM=y +# CONFIG_NVRAM is not set +CONFIG_RTC=y +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set + +# +# Ftape, the floppy tape device driver +# +CONFIG_AGP=y +CONFIG_AGP_AMD64=y +# CONFIG_DRM is not set +# CONFIG_MWAVE is not set +CONFIG_RAW_DRIVER=y +CONFIG_HPET=y +# CONFIG_HPET_RTC_IRQ is not set +CONFIG_HPET_MMAP=y +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HANGCHECK_TIMER=y + +# +# TPM devices +# +# CONFIG_TCG_TPM is not set + +# +# I2C support +# +# CONFIG_I2C is not set + +# +# Dallas's 1-wire bus +# +# CONFIG_W1 is not set + +# +# Misc devices +# +# CONFIG_IBM_ASM is not set + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set + +# +# Digital Video Broadcasting Devices +# +# CONFIG_DVB is not set + +# +# Graphics support +# +# CONFIG_FB is not set +CONFIG_VIDEO_SELECT=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_DUMMY_CONSOLE=y + +# +# Sound +# +CONFIG_SOUND=y + +# +# Advanced Linux Sound Architecture +# +# CONFIG_SND is not set + +# +# Open Sound System +# +CONFIG_SOUND_PRIME=y +# CONFIG_SOUND_BT878 is not set +# CONFIG_SOUND_CMPCI is not set +# CONFIG_SOUND_EMU10K1 is not set +# CONFIG_SOUND_FUSION is not set +# CONFIG_SOUND_CS4281 is not set +# CONFIG_SOUND_ES1370 is not set +# CONFIG_SOUND_ES1371 is not set +# CONFIG_SOUND_ESSSOLO1 is not set +# CONFIG_SOUND_MAESTRO is not set +# CONFIG_SOUND_MAESTRO3 is not set +CONFIG_SOUND_ICH=y +# CONFIG_SOUND_SONICVIBES is not set +# CONFIG_SOUND_TRIDENT is not set +# CONFIG_SOUND_MSNDCLAS is not set +# CONFIG_SOUND_MSNDPIN is not set +# CONFIG_SOUND_VIA82CXXX is not set +# CONFIG_SOUND_OSS is not set +# CONFIG_SOUND_ALI5455 is not set +# CONFIG_SOUND_FORTE is not set +# CONFIG_SOUND_RME96XX is not set +# CONFIG_SOUND_AD1980 is not set + +# +# USB support +# +CONFIG_USB=y +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_SUSPEND is not set +# CONFIG_USB_OTG is not set +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y + +# +# USB Host Controller Drivers +# +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_SPLIT_ISO is not set +# CONFIG_USB_EHCI_ROOT_HUB_TT is not set +CONFIG_USB_OHCI_HCD=y +# CONFIG_USB_OHCI_BIG_ENDIAN is not set +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_UHCI_HCD=y +# CONFIG_USB_SL811_HCD is not set + +# +# USB Device Class drivers +# +# CONFIG_USB_AUDIO is not set +# CONFIG_USB_BLUETOOTH_TTY is not set +# CONFIG_USB_MIDI is not set +# CONFIG_USB_ACM is not set +CONFIG_USB_PRINTER=y + +# +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# +CONFIG_USB_STORAGE=y +# CONFIG_USB_STORAGE_DEBUG is not set +# CONFIG_USB_STORAGE_RW_DETECT is not set +# CONFIG_USB_STORAGE_DATAFAB is not set +# CONFIG_USB_STORAGE_FREECOM is not set +# CONFIG_USB_STORAGE_ISD200 is not set +# CONFIG_USB_STORAGE_DPCM is not set +# CONFIG_USB_STORAGE_USBAT is not set +# CONFIG_USB_STORAGE_SDDR09 is not set +# CONFIG_USB_STORAGE_SDDR55 is not set +# CONFIG_USB_STORAGE_JUMPSHOT is not set + +# +# USB Input Devices +# +CONFIG_USB_HID=y +CONFIG_USB_HIDINPUT=y +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set +# CONFIG_USB_AIPTEK is not set +# CONFIG_USB_WACOM is not set +# CONFIG_USB_KBTAB is not set +# CONFIG_USB_POWERMATE is not set +# CONFIG_USB_MTOUCH is not set +# CONFIG_USB_EGALAX is not set +# CONFIG_USB_XPAD is not set +# CONFIG_USB_ATI_REMOTE is not set + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set + +# +# USB Multimedia devices +# +# CONFIG_USB_DABUSB is not set + +# +# Video4Linux support is needed for USB Multimedia device support +# + +# +# USB Network Adapters +# +# CONFIG_USB_CATC is not set +# CONFIG_USB_KAWETH is not set +# CONFIG_USB_PEGASUS is not set +# CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET is not set +CONFIG_USB_MON=y + +# +# USB port drivers +# + +# +# USB Serial Converter support +# +# CONFIG_USB_SERIAL is not set + +# +# USB Miscellaneous drivers +# +# CONFIG_USB_EMI62 is not set +# CONFIG_USB_EMI26 is not set +# CONFIG_USB_AUERSWALD is not set +# CONFIG_USB_RIO500 is not set +# CONFIG_USB_LEGOTOWER is not set +# CONFIG_USB_LCD is not set +# CONFIG_USB_LED is not set +# CONFIG_USB_CYTHERM is not set +# CONFIG_USB_PHIDGETKIT is not set +# CONFIG_USB_PHIDGETSERVO is not set +# CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_SISUSBVGA is not set +# CONFIG_USB_TEST is not set + +# +# USB ATM/DSL drivers +# + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set + +# +# MMC/SD Card support +# +# CONFIG_MMC is not set + +# +# InfiniBand support +# +# CONFIG_INFINIBAND is not set + +# +# Firmware Drivers +# +# CONFIG_EDD is not set + +# +# File systems +# +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +# CONFIG_EXT2_FS_SECURITY is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +# CONFIG_EXT3_FS_SECURITY is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +# CONFIG_REISERFS_FS_SECURITY is not set +# CONFIG_JFS_FS is not set +CONFIG_FS_POSIX_ACL=y + +# +# XFS support +# +# CONFIG_XFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_ROMFS_FS is not set +# CONFIG_QUOTA is not set +CONFIG_DNOTIFY=y +CONFIG_AUTOFS_FS=y +# CONFIG_AUTOFS4_FS is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +# CONFIG_JOLIET is not set +# CONFIG_ZISOFS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_SYSFS=y +# CONFIG_DEVFS_FS is not set +# CONFIG_DEVPTS_FS_XATTR is not set +CONFIG_TMPFS=y +# CONFIG_TMPFS_XATTR is not set +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_HFSPLUS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_CRAMFS is not set +# CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +# CONFIG_NFS_V4 is not set +# CONFIG_NFS_DIRECTIO is not set +CONFIG_NFSD=y +CONFIG_NFSD_V3=y +# CONFIG_NFSD_V4 is not set +CONFIG_NFSD_TCP=y +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=y +CONFIG_SUNRPC=y +# CONFIG_RPCSEC_GSS_KRB5 is not set +# CONFIG_RPCSEC_GSS_SPKM3 is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=y +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +CONFIG_NLS_ISO8859_15=y +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +CONFIG_NLS_UTF8=y + +# +# Profiling support +# +CONFIG_PROFILING=y +CONFIG_OPROFILE=y + +# +# Kernel hacking +# +CONFIG_DEBUG_KERNEL=y +CONFIG_MAGIC_SYSRQ=y +# CONFIG_PRINTK_TIME is not set +# CONFIG_SCHEDSTATS is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_FS=y +CONFIG_INIT_DEBUG=y +# CONFIG_IOMMU_DEBUG is not set +CONFIG_KPROBES=y + +# +# Security options +# +# CONFIG_KEYS is not set +# CONFIG_SECURITY is not set + +# +# Cryptographic options +# +# CONFIG_CRYPTO is not set + +# +# Hardware crypto devices +# + +# +# Library routines +# +# CONFIG_CRC_CCITT is not set +CONFIG_CRC32=y +# CONFIG_LIBCRC32C is not set diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile new file mode 100644 index 000000000000..a12b19da4b59 --- /dev/null +++ b/arch/x86_64/ia32/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for the ia32 kernel emulation subsystem. +# + +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \ + ia32_signal.o tls32.o \ + ia32_binfmt.o fpu32.o ptrace32.o syscall32.o + +sysv-$(CONFIG_SYSVIPC) := ipc32.o +obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) + +obj-$(CONFIG_IA32_AOUT) += ia32_aout.o + +$(obj)/syscall32.o: $(src)/syscall32.c \ + $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) + +# Teach kbuild about targets +targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) + +# The DSO images are built using a special linker script +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ + -Wl,-soname=linux-gate.so.1 -o $@ \ + -Wl,-T,$(filter-out FORCE,$^) + +$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ +$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE + $(call if_changed,syscall) + +AFLAGS_vsyscall-sysenter.o = -m32 +AFLAGS_vsyscall-syscall.o = -m32 +CFLAGS_ia32_ioctl.o += -Ifs/ diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c new file mode 100644 index 000000000000..1c23095f1813 --- /dev/null +++ b/arch/x86_64/ia32/fpu32.c @@ -0,0 +1,184 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. + * This is used for ptrace, signals and coredumps in 32bit emulation. + * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $ + */ + +#include +#include +#include +#include +#include + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st = NULL; + unsigned long tos = (fxsave->swd >> 11) & 7; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); + + for (i = 0 ; i < 8 ; i++) { + if (twd & 0x1) { + st = FPREG_ADDR( fxsave, (i - tos) & 7 ); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + + +static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, + struct _fpstate_ia32 __user *buf) +{ + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + u32 v; + int err = 0; + +#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) + G(0, fxsave->cwd); + G(1, fxsave->swd); + G(2, fxsave->twd); + fxsave->twd = twd_i387_to_fxsr(fxsave->twd); + G(3, fxsave->rip); + G(4, v); + fxsave->fop = v>>16; /* cs ignored */ + G(5, fxsave->rdp); + /* 6: ds ignored */ +#undef G + if (err) + return -1; + + to = (struct _fpxreg *)&fxsave->st_space[0]; + from = &buf->_st[0]; + for (i = 0 ; i < 8 ; i++, to++, from++) { + if (__copy_from_user(to, from, sizeof(*from))) + return -1; + } + return 0; +} + + +static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, + struct i387_fxsave_struct *fxsave, + struct pt_regs *regs, + struct task_struct *tsk) +{ + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + u16 cs,ds; + int err = 0; + + if (tsk == current) { + /* should be actually ds/cs at fpu exception time, + but that information is not available in 64bit mode. */ + asm("movw %%ds,%0 " : "=r" (ds)); + asm("movw %%cs,%0 " : "=r" (cs)); + } else { /* ptrace. task has stopped. */ + ds = tsk->thread.ds; + cs = regs->cs; + } + +#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) + P(0, (u32)fxsave->cwd | 0xffff0000); + P(1, (u32)fxsave->swd | 0xffff0000); + P(2, twd_fxsr_to_i387(fxsave)); + P(3, (u32)fxsave->rip); + P(4, cs | ((u32)fxsave->fop) << 16); + P(5, fxsave->rdp); + P(6, 0xffff0000 | ds); +#undef P + + if (err) + return -1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + if (__copy_to_user(to, from, sizeof(*to))) + return -1; + } + return 0; +} + +int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) +{ + clear_fpu(tsk); + if (!fsave) { + if (__copy_from_user(&tsk->thread.i387.fxsave, + &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct))) + return -1; + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + set_stopped_child_used_math(tsk); + } + return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); +} + +int save_i387_ia32(struct task_struct *tsk, + struct _fpstate_ia32 __user *buf, + struct pt_regs *regs, + int fsave) +{ + int err = 0; + + init_fpu(tsk); + if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) + return -1; + if (fsave) + return 0; + err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + if (fsave) + return err ? -1 : 1; + err |= __put_user(X86_FXSR_MAGIC, &buf->magic); + err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct)); + return err ? -1 : 1; +} diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c new file mode 100644 index 000000000000..1965efc974dc --- /dev/null +++ b/arch/x86_64/ia32/ia32_aout.c @@ -0,0 +1,529 @@ +/* + * a.out loader for x86-64 + * + * Copyright (C) 1991, 1992, 1996 Linus Torvalds + * Hacked together by Andi Kleen + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#undef WARN_OLD +#undef CORE_DUMP /* probably broken */ + +extern int ia32_setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, int exec_stack); + +static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); +static int load_aout_library(struct file*); + +#if CORE_DUMP +static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); + +/* + * fill in the user structure for a core dump.. + */ +static void dump_thread32(struct pt_regs * regs, struct user32 * dump) +{ + u32 fs,gs; + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < 0xc0000000) + dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; + + dump->regs.ebx = regs->rbx; + dump->regs.ecx = regs->rcx; + dump->regs.edx = regs->rdx; + dump->regs.esi = regs->rsi; + dump->regs.edi = regs->rdi; + dump->regs.ebp = regs->rbp; + dump->regs.eax = regs->rax; + dump->regs.ds = current->thread.ds; + dump->regs.es = current->thread.es; + asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; + asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; + dump->regs.orig_eax = regs->orig_rax; + dump->regs.eip = regs->rip; + dump->regs.cs = regs->cs; + dump->regs.eflags = regs->eflags; + dump->regs.esp = regs->rsp; + dump->regs.ss = regs->ss; + +#if 1 /* FIXME */ + dump->u_fpvalid = 0; +#else + dump->u_fpvalid = dump_fpu (regs, &dump->i387); +#endif +} + +#endif + +static struct linux_binfmt aout_format = { + .module = THIS_MODULE, + .load_binary = load_aout_binary, + .load_shlib = load_aout_library, +#if CORE_DUMP + .core_dump = aout_core_dump, +#endif + .min_coredump = PAGE_SIZE +}; + +static void set_brk(unsigned long start, unsigned long end) +{ + start = PAGE_ALIGN(start); + end = PAGE_ALIGN(end); + if (end <= start) + return; + down_write(¤t->mm->mmap_sem); + do_brk(start, end - start); + up_write(¤t->mm->mmap_sem); +} + +#if CORE_DUMP +/* + * These are the only things you should do on a core-file: use only these + * macros to write out all the necessary info. + */ + +static int dump_write(struct file *file, const void *addr, int nr) +{ + return file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} + +#define DUMP_WRITE(addr, nr) \ + if (!dump_write(file, (void *)(addr), (nr))) \ + goto end_coredump; + +#define DUMP_SEEK(offset) \ +if (file->f_op->llseek) { \ + if (file->f_op->llseek(file,(offset),0) != (offset)) \ + goto end_coredump; \ +} else file->f_pos = (offset) + +/* + * Routine writes a core dump image in the current directory. + * Currently only a stub-function. + * + * Note that setuid/setgid files won't make a core-dump if the uid/gid + * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable" + * field, which also makes sure the core-dumps won't be recursive if the + * dumping of the process results in another error.. + */ + +static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) +{ + mm_segment_t fs; + int has_dumped = 0; + unsigned long dump_start, dump_size; + struct user32 dump; +# define START_DATA(u) (u.u_tsize << PAGE_SHIFT) +# define START_STACK(u) (u.start_stack) + + fs = get_fs(); + set_fs(KERNEL_DS); + has_dumped = 1; + current->flags |= PF_DUMPCORE; + strncpy(dump.u_comm, current->comm, sizeof(current->comm)); + dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); + dump.signal = signr; + dump_thread32(regs, &dump); + +/* If the size of the dump file exceeds the rlimit, then see what would happen + if we wrote the stack, but not the data area. */ + if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > + current->signal->rlim[RLIMIT_CORE].rlim_cur) + dump.u_dsize = 0; + +/* Make sure we have enough room to write the stack and data areas. */ + if ((dump.u_ssize+1) * PAGE_SIZE > + current->signal->rlim[RLIMIT_CORE].rlim_cur) + dump.u_ssize = 0; + +/* make sure we actually have a data and stack area to dump */ + set_fs(USER_DS); + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + dump.u_dsize = 0; + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + dump.u_ssize = 0; + + set_fs(KERNEL_DS); +/* struct user */ + DUMP_WRITE(&dump,sizeof(dump)); +/* Now dump all of the user data. Include malloced stuff as well */ + DUMP_SEEK(PAGE_SIZE); +/* now we start writing out the user space info */ + set_fs(USER_DS); +/* Dump the data area */ + if (dump.u_dsize != 0) { + dump_start = START_DATA(dump); + dump_size = dump.u_dsize << PAGE_SHIFT; + DUMP_WRITE(dump_start,dump_size); + } +/* Now prepare to dump the stack area */ + if (dump.u_ssize != 0) { + dump_start = START_STACK(dump); + dump_size = dump.u_ssize << PAGE_SHIFT; + DUMP_WRITE(dump_start,dump_size); + } +/* Finally dump the task struct. Not be used by gdb, but could be useful */ + set_fs(KERNEL_DS); + DUMP_WRITE(current,sizeof(*current)); +end_coredump: + set_fs(fs); + return has_dumped; +} +#endif + +/* + * create_aout_tables() parses the env- and arg-strings in new user + * memory and creates the pointer tables from them, and puts their + * addresses on the "stack", returning the new stack pointer value. + */ +static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) +{ + u32 __user *argv; + u32 __user *envp; + u32 __user *sp; + int argc = bprm->argc; + int envc = bprm->envc; + + sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); + sp -= envc+1; + envp = sp; + sp -= argc+1; + argv = sp; + put_user((unsigned long) envp,--sp); + put_user((unsigned long) argv,--sp); + put_user(argc,--sp); + current->mm->arg_start = (unsigned long) p; + while (argc-->0) { + char c; + put_user((u32)(unsigned long)p,argv++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,argv); + current->mm->arg_end = current->mm->env_start = (unsigned long) p; + while (envc-->0) { + char c; + put_user((u32)(unsigned long)p,envp++); + do { + get_user(c,p++); + } while (c); + } + put_user(NULL,envp); + current->mm->env_end = (unsigned long) p; + return sp; +} + +/* + * These are the functions used to load a.out style executables and shared + * libraries. There is no binary dependent code anywhere else. + */ + +static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +{ + struct exec ex; + unsigned long error; + unsigned long fd_offset; + unsigned long rlim; + int retval; + + ex = *((struct exec *) bprm->buf); /* exec-header */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && + N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || + N_TRSIZE(ex) || N_DRSIZE(ex) || + i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + return -ENOEXEC; + } + + fd_offset = N_TXTOFF(ex); + + /* Check initial limits. This avoids letting people circumvent + * size limits imposed on them by creating programs with large + * arrays in the data or bss. + */ + rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + if (rlim >= RLIM_INFINITY) + rlim = ~0; + if (ex.a_data + ex.a_bss > rlim) + return -ENOMEM; + + /* Flush all traces of the currently running executable */ + retval = flush_old_exec(bprm); + if (retval) + return retval; + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; + + /* OK, This is the point of no return */ + set_personality(PER_LINUX); + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_ABI_PENDING); + + current->mm->end_code = ex.a_text + + (current->mm->start_code = N_TXTADDR(ex)); + current->mm->end_data = ex.a_data + + (current->mm->start_data = N_DATADDR(ex)); + current->mm->brk = ex.a_bss + + (current->mm->start_brk = N_BSSADDR(ex)); + current->mm->free_area_cache = TASK_UNMAPPED_BASE; + + set_mm_counter(current->mm, rss, 0); + current->mm->mmap = NULL; + compute_creds(bprm); + current->flags &= ~PF_FORKNOEXEC; + + if (N_MAGIC(ex) == OMAGIC) { + unsigned long text_addr, map_size; + loff_t pos; + + text_addr = N_TXTADDR(ex); + + pos = 32; + map_size = ex.a_text+ex.a_data; + + down_write(¤t->mm->mmap_sem); + error = do_brk(text_addr & PAGE_MASK, map_size); + up_write(¤t->mm->mmap_sem); + + if (error != (text_addr & PAGE_MASK)) { + send_sig(SIGKILL, current, 0); + return error; + } + + error = bprm->file->f_op->read(bprm->file, (char *)text_addr, + ex.a_text+ex.a_data, &pos); + if ((signed long)error < 0) { + send_sig(SIGKILL, current, 0); + return error; + } + + flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); + } else { +#ifdef WARN_OLD + static unsigned long error_time, error_time2; + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) + { + printk(KERN_NOTICE "executable not page aligned\n"); + error_time2 = jiffies; + } + + if ((fd_offset & ~PAGE_MASK) != 0 && + (jiffies-error_time) > 5*HZ) + { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert program: %s\n", + bprm->file->f_dentry->d_name.name); + error_time = jiffies; + } +#endif + + if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + loff_t pos = fd_offset; + down_write(¤t->mm->mmap_sem); + do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); + up_write(¤t->mm->mmap_sem); + bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), + ex.a_text+ex.a_data, &pos); + flush_icache_range((unsigned long) N_TXTADDR(ex), + (unsigned long) N_TXTADDR(ex) + + ex.a_text+ex.a_data); + goto beyond_if; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, + fd_offset); + up_write(¤t->mm->mmap_sem); + + if (error != N_TXTADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, + fd_offset + ex.a_text); + up_write(¤t->mm->mmap_sem); + if (error != N_DATADDR(ex)) { + send_sig(SIGKILL, current, 0); + return error; + } + } +beyond_if: + set_binfmt(&aout_format); + + set_brk(current->mm->start_brk, current->mm->brk); + + retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + + current->mm->start_stack = + (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); + /* start thread */ + asm volatile("movl %0,%%fs" :: "r" (0)); \ + asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); + load_gs_index(0); + (regs)->rip = ex.a_entry; + (regs)->rsp = current->mm->start_stack; + (regs)->eflags = 0x200; + (regs)->cs = __USER32_CS; + (regs)->ss = __USER32_DS; + set_fs(USER_DS); + if (unlikely(current->ptrace & PT_PTRACED)) { + if (current->ptrace & PT_TRACE_EXEC) + ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + else + send_sig(SIGTRAP, current, 0); + } + return 0; +} + +static int load_aout_library(struct file *file) +{ + struct inode * inode; + unsigned long bss, start_addr, len; + unsigned long error; + int retval; + struct exec ex; + + inode = file->f_dentry->d_inode; + + retval = -ENOEXEC; + error = kernel_read(file, 0, (char *) &ex, sizeof(ex)); + if (error != sizeof(ex)) + goto out; + + /* We come in here for the regular a.out style of shared libraries */ + if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || + N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || + i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + goto out; + } + + if (N_FLAGS(ex)) + goto out; + + /* For QMAGIC, the starting address is 0x20 into the page. We mask + this off to get the starting address for the page */ + + start_addr = ex.a_entry & 0xfffff000; + + if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { + loff_t pos = N_TXTOFF(ex); + +#ifdef WARN_OLD + static unsigned long error_time; + if ((jiffies-error_time) > 5*HZ) + { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %s\n", + file->f_dentry->d_name.name); + error_time = jiffies; + } +#endif + down_write(¤t->mm->mmap_sem); + do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); + up_write(¤t->mm->mmap_sem); + + file->f_op->read(file, (char *)start_addr, + ex.a_text + ex.a_data, &pos); + flush_icache_range((unsigned long) start_addr, + (unsigned long) start_addr + ex.a_text + ex.a_data); + + retval = 0; + goto out; + } + /* Now use mmap to map the library into memory. */ + down_write(¤t->mm->mmap_sem); + error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT, + N_TXTOFF(ex)); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr) + goto out; + + len = PAGE_ALIGN(ex.a_text + ex.a_data); + bss = ex.a_text + ex.a_data + ex.a_bss; + if (bss > len) { + down_write(¤t->mm->mmap_sem); + error = do_brk(start_addr + len, bss - len); + up_write(¤t->mm->mmap_sem); + retval = error; + if (error != start_addr + len) + goto out; + } + retval = 0; +out: + return retval; +} + +static int __init init_aout_binfmt(void) +{ + return register_binfmt(&aout_format); +} + +static void __exit exit_aout_binfmt(void) +{ + unregister_binfmt(&aout_format); +} + +module_init(init_aout_binfmt); +module_exit(exit_aout_binfmt); +MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c new file mode 100644 index 000000000000..93d568dfa762 --- /dev/null +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -0,0 +1,434 @@ +/* + * Written 2000,2002 by Andi Kleen. + * + * Loosely based on the sparc64 and IA64 32bit emulation loaders. + * This tricks binfmt_elf.c into loading 32bit binaries using lots + * of ugly preprocessor tricks. Talk about very very poor man's inheritance. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ELF_NAME "elf/i386" + +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +int sysctl_vsyscall32 = 1; + +#define ARCH_DLINFO do { \ + if (sysctl_vsyscall32) { \ + NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ + } \ +} while(0) + +struct file; +struct elf_phdr; + +#define IA32_EMULATOR 1 + +#define ELF_ET_DYN_BASE (TASK_UNMAPPED_32 + 0x1000000) + +#undef ELF_ARCH +#define ELF_ARCH EM_386 + +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 + +#define ELF_DATA ELFDATA2LSB + +#define USE_ELF_CORE_DUMP 1 + +/* Overwrite elfcore.h */ +#define _LINUX_ELFCORE_H 1 +typedef unsigned int elf_greg_t; + +#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; + +/* + * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out + * extra segments containing the vsyscall DSO contents. Dumping its + * contents makes post-mortem fully interpretable later without matching up + * the same kernel and hardware config to see what PC values meant. + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. + */ +#define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum) +#define ELF_CORE_WRITE_EXTRA_PHDRS \ +do { \ + const struct elf32_phdr *const vsyscall_phdrs = \ + (const struct elf32_phdr *) (VSYSCALL32_BASE \ + + VSYSCALL32_EHDR->e_phoff); \ + int i; \ + Elf32_Off ofs = 0; \ + for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ + struct elf32_phdr phdr = vsyscall_phdrs[i]; \ + if (phdr.p_type == PT_LOAD) { \ + BUG_ON(ofs != 0); \ + ofs = phdr.p_offset = offset; \ + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ + phdr.p_filesz = phdr.p_memsz; \ + offset += phdr.p_filesz; \ + } \ + else \ + phdr.p_offset += ofs; \ + phdr.p_paddr = 0; /* match other core phdrs */ \ + DUMP_WRITE(&phdr, sizeof(phdr)); \ + } \ +} while (0) +#define ELF_CORE_WRITE_EXTRA_DATA \ +do { \ + const struct elf32_phdr *const vsyscall_phdrs = \ + (const struct elf32_phdr *) (VSYSCALL32_BASE \ + + VSYSCALL32_EHDR->e_phoff); \ + int i; \ + for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ + if (vsyscall_phdrs[i].p_type == PT_LOAD) \ + DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \ + PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ + } \ +} while (0) + +struct elf_siginfo +{ + int si_signo; /* signal number */ + int si_code; /* extra code */ + int si_errno; /* errno */ +}; + +#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) + +struct elf_prstatus +{ + struct elf_siginfo pr_info; /* Info associated with signal */ + short pr_cursig; /* Current signal */ + unsigned int pr_sigpend; /* Set of pending signals */ + unsigned int pr_sighold; /* Set of held signals */ + pid_t pr_pid; + pid_t pr_ppid; + pid_t pr_pgrp; + pid_t pr_sid; + struct compat_timeval pr_utime; /* User time */ + struct compat_timeval pr_stime; /* System time */ + struct compat_timeval pr_cutime; /* Cumulative user time */ + struct compat_timeval pr_cstime; /* Cumulative system time */ + elf_gregset_t pr_reg; /* GP registers */ + int pr_fpvalid; /* True if math co-processor being used. */ +}; + +#define ELF_PRARGSZ (80) /* Number of chars for args */ + +struct elf_prpsinfo +{ + char pr_state; /* numeric process state */ + char pr_sname; /* char for pr_state */ + char pr_zomb; /* zombie */ + char pr_nice; /* nice val */ + unsigned int pr_flag; /* flags */ + __u16 pr_uid; + __u16 pr_gid; + pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; + /* Lots missing */ + char pr_fname[16]; /* filename of executable */ + char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ +}; + +#define __STR(x) #x +#define STR(x) __STR(x) + +#define _GET_SEG(x) \ + ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; }) + +/* Assumes current==process to be dumped */ +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ + pr_reg[0] = regs->rbx; \ + pr_reg[1] = regs->rcx; \ + pr_reg[2] = regs->rdx; \ + pr_reg[3] = regs->rsi; \ + pr_reg[4] = regs->rdi; \ + pr_reg[5] = regs->rbp; \ + pr_reg[6] = regs->rax; \ + pr_reg[7] = _GET_SEG(ds); \ + pr_reg[8] = _GET_SEG(es); \ + pr_reg[9] = _GET_SEG(fs); \ + pr_reg[10] = _GET_SEG(gs); \ + pr_reg[11] = regs->orig_rax; \ + pr_reg[12] = regs->rip; \ + pr_reg[13] = regs->cs; \ + pr_reg[14] = regs->eflags; \ + pr_reg[15] = regs->rsp; \ + pr_reg[16] = regs->ss; + +#define user user32 + +#define __ASM_X86_64_ELF_H 1 +#define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack)) +//#include +#include + +typedef struct user_i387_ia32_struct elf_fpregset_t; +typedef struct user32_fxsr_struct elf_fpxregset_t; + + +static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) +{ + ELF_CORE_COPY_REGS((*elfregs), regs) +} + +static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) +{ + struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0); + --pp; + ELF_CORE_COPY_REGS((*elfregs), pp); + /* fix wrong segments */ + (*elfregs)[7] = t->thread.ds; + (*elfregs)[9] = t->thread.fsindex; + (*elfregs)[10] = t->thread.gsindex; + (*elfregs)[8] = t->thread.es; + return 1; +} + +static inline int +elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) +{ + struct _fpstate_ia32 *fpstate = (void*)fpu; + mm_segment_t oldfs = get_fs(); + + if (!tsk_used_math(tsk)) + return 0; + if (!regs) + regs = (struct pt_regs *)tsk->thread.rsp0; + --regs; + if (tsk == current) + unlazy_fpu(tsk); + set_fs(KERNEL_DS); + save_i387_ia32(tsk, fpstate, regs, 1); + /* Correct for i386 bug. It puts the fop into the upper 16bits of + the tag word (like FXSAVE), not into the fcs*/ + fpstate->cssel |= fpstate->tag & 0xffff0000; + set_fs(oldfs); + return 1; +} + +#define ELF_CORE_COPY_XFPREGS 1 +static inline int +elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) +{ + struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1; + if (!tsk_used_math(t)) + return 0; + if (t == current) + unlazy_fpu(t); + memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); + xfpu->fcs = regs->cs; + xfpu->fos = t->thread.ds; /* right? */ + return 1; +} + +#undef elf_check_arch +#define elf_check_arch(x) \ + ((x)->e_machine == EM_386) + +extern int force_personality32; + +#define ELF_EXEC_PAGESIZE PAGE_SIZE +#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) +#define ELF_PLATFORM ("i686") +#define SET_PERSONALITY(ex, ibcs2) \ +do { \ + unsigned long new_flags = 0; \ + if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ + new_flags = _TIF_IA32; \ + if ((current_thread_info()->flags & _TIF_IA32) \ + != new_flags) \ + set_thread_flag(TIF_ABI_PENDING); \ + else \ + clear_thread_flag(TIF_ABI_PENDING); \ + /* XXX This overwrites the user set personality */ \ + current->personality |= force_personality32; \ +} while (0) + +/* Override some function names */ +#define elf_format elf32_format + +#define init_elf_binfmt init_elf32_binfmt +#define exit_elf_binfmt exit_elf32_binfmt + +#define load_elf_binary load_elf32_binary + +#define ELF_PLAT_INIT(r, load_addr) elf32_init(r) +#define setup_arg_pages(bprm, stack_top, exec_stack) \ + ia32_setup_arg_pages(bprm, stack_top, exec_stack) +int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack); + +#undef start_thread +#define start_thread(regs,new_rip,new_rsp) do { \ + asm volatile("movl %0,%%fs" :: "r" (0)); \ + asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ + load_gs_index(0); \ + (regs)->rip = (new_rip); \ + (regs)->rsp = (new_rsp); \ + (regs)->eflags = 0x200; \ + (regs)->cs = __USER32_CS; \ + (regs)->ss = __USER32_DS; \ + set_fs(USER_DS); \ +} while(0) + + +#define elf_map elf32_map + +#include + +MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); +MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); + +#undef MODULE_DESCRIPTION +#undef MODULE_AUTHOR + +#define elf_addr_t __u32 + +#undef TASK_SIZE +#define TASK_SIZE 0xffffffff + +static void elf32_init(struct pt_regs *); + +#include "../../../fs/binfmt_elf.c" + +static void elf32_init(struct pt_regs *regs) +{ + struct task_struct *me = current; + regs->rdi = 0; + regs->rsi = 0; + regs->rdx = 0; + regs->rcx = 0; + regs->rax = 0; + regs->rbx = 0; + regs->rbp = 0; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; + me->thread.fs = 0; + me->thread.gs = 0; + me->thread.fsindex = 0; + me->thread.gsindex = 0; + me->thread.ds = __USER_DS; + me->thread.es = __USER_DS; +} + +int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) +{ + unsigned long stack_base; + struct vm_area_struct *mpnt; + struct mm_struct *mm = current->mm; + int i, ret; + + stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; + mm->arg_start = bprm->p + stack_base; + + bprm->p += stack_base; + if (bprm->loader) + bprm->loader += stack_base; + bprm->exec += stack_base; + + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) + return -ENOMEM; + + if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + kmem_cache_free(vm_area_cachep, mpnt); + return -ENOMEM; + } + + memset(mpnt, 0, sizeof(*mpnt)); + + down_write(&mm->mmap_sem); + { + mpnt->vm_mm = mm; + mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; + mpnt->vm_end = IA32_STACK_TOP; + if (executable_stack == EXSTACK_ENABLE_X) + mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC; + else + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? + PAGE_COPY_EXEC : PAGE_COPY; + if ((ret = insert_vm_struct(mm, mpnt))) { + up_write(&mm->mmap_sem); + kmem_cache_free(vm_area_cachep, mpnt); + return ret; + } + mm->stack_vm = mm->total_vm = vma_pages(mpnt); + } + + for (i = 0 ; i < MAX_ARG_PAGES ; i++) { + struct page *page = bprm->page[i]; + if (page) { + bprm->page[i] = NULL; + install_arg_page(mpnt, page, stack_base); + } + stack_base += PAGE_SIZE; + } + up_write(&mm->mmap_sem); + + return 0; +} + +static unsigned long +elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) +{ + unsigned long map_addr; + struct task_struct *me = current; + + down_write(&me->mm->mmap_sem); + map_addr = do_mmap(filep, ELF_PAGESTART(addr), + eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, + type, + eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); + up_write(&me->mm->mmap_sem); + return(map_addr); +} + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include + +static ctl_table abi_table2[] = { + { 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL, + proc_dointvec }, + { 0, } +}; + +static ctl_table abi_root_table2[] = { + { .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555, + .child = abi_table2 }, + { 0 }, +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2, 1); + return 0; +} +__initcall(ia32_binfmt_init); +#endif diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c new file mode 100644 index 000000000000..d259f8a6f811 --- /dev/null +++ b/arch/x86_64/ia32/ia32_ioctl.c @@ -0,0 +1,201 @@ +/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $ + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + */ + +#define INCLUDES +#include +#include "compat_ioctl.c" +#include +#include + +#define CODE +#include "compat_ioctl.c" + +#ifndef TIOCGDEV +#define TIOCGDEV _IOR('T',0x32, unsigned int) +#endif +static int tiocgdev(unsigned fd, unsigned cmd, unsigned int __user *ptr) +{ + + struct file *file = fget(fd); + struct tty_struct *real_tty; + + if (!file) + return -EBADF; + if (file->f_op->ioctl != tty_ioctl) + return -EINVAL; + real_tty = (struct tty_struct *)file->private_data; + if (!real_tty) + return -EINVAL; + return put_user(new_encode_dev(tty_devnum(real_tty)), ptr); +} + +#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */ +#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */ +#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */ +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */ + +static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +{ + unsigned long val; + mm_segment_t oldfs = get_fs(); + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val); + set_fs(oldfs); + if (!ret) + ret = put_user(val, (unsigned int __user *) arg); + return ret; + + case RTC_IRQP_SET32: + cmd = RTC_IRQP_SET; + break; + + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val); + set_fs(oldfs); + if (!ret) + ret = put_user(val, (unsigned int __user *) arg); + return ret; + + case RTC_EPOCH_SET32: + cmd = RTC_EPOCH_SET; + break; + } + return sys_ioctl(fd,cmd,arg); +} + +/* /proc/mtrr ioctls */ + + +struct mtrr_sentry32 +{ + compat_ulong_t base; /* Base address */ + compat_uint_t size; /* Size of region */ + compat_uint_t type; /* Type of region */ +}; + +struct mtrr_gentry32 +{ + compat_ulong_t regnum; /* Register number */ + compat_uint_t base; /* Base address */ + compat_uint_t size; /* Size of region */ + compat_uint_t type; /* Type of region */ +}; + +#define MTRR_IOCTL_BASE 'M' + +#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32) +#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32) +#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32) +#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32) +#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32) +#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32) +#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32) +#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32) +#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32) +#define MTRRIOC32_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32) + + +static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct mtrr_gentry g; + struct mtrr_sentry s; + int get = 0, err = 0; + struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg; + mm_segment_t oldfs = get_fs(); + + switch (cmd) { +#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break +#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break + SET(ADD); + SET(SET); + SET(DEL); + GET(GET); + SET(KILL); + SET(ADD_PAGE); + SET(SET_PAGE); + SET(DEL_PAGE); + GET(GET_PAGE); + SET(KILL_PAGE); + } + + if (get) { + err = get_user(g.regnum, &g32->regnum); + err |= get_user(g.base, &g32->base); + err |= get_user(g.size, &g32->size); + err |= get_user(g.type, &g32->type); + + arg = (unsigned long)&g; + } else { + struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg; + err = get_user(s.base, &s32->base); + err |= get_user(s.size, &s32->size); + err |= get_user(s.type, &s32->type); + + arg = (unsigned long)&s; + } + if (err) return err; + + set_fs(KERNEL_DS); + err = sys_ioctl(fd, cmd, arg); + set_fs(oldfs); + + if (!err && get) { + err = put_user(g.base, &g32->base); + err |= put_user(g.size, &g32->size); + err |= put_user(g.regnum, &g32->regnum); + err |= put_user(g.type, &g32->type); + } + return err; +} + +#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) }, +#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) + +struct ioctl_trans ioctl_start[] = { +#include +#define DECLARES +#include "compat_ioctl.c" +COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS) +COMPATIBLE_IOCTL(HDIO_SCAN_HWIF) +COMPATIBLE_IOCTL(BLKRASET) +COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */ +COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */ +COMPATIBLE_IOCTL(FIOQSIZE) + +/* And these ioctls need translation */ +HANDLE_IOCTL(TIOCGDEV, tiocgdev) +/* realtime device */ +HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl) +HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl) +HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl) +HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl) +HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl) +/* take care of sizeof(sizeof()) breakage */ +/* mtrr */ +HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32) +HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32) +}; + +int ioctl_table_size = ARRAY_SIZE(ioctl_start); + diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c new file mode 100644 index 000000000000..fbd09b5126ce --- /dev/null +++ b/arch/x86_64/ia32/ia32_signal.c @@ -0,0 +1,621 @@ +/* + * linux/arch/x86_64/ia32/ia32_signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen + * + * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SIG 0 + +#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); +void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + +int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) +{ + int err; + if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + + /* If you change siginfo_t structure, please make sure that + this code is fixed accordingly. + It should never copy any pad contained in the structure + to avoid security leaks, but must copy the generic + 3 ints plus the relevant union member. */ + err = __put_user(from->si_signo, &to->si_signo); + err |= __put_user(from->si_errno, &to->si_errno); + err |= __put_user((short)from->si_code, &to->si_code); + + if (from->si_code < 0) { + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); + } else { + /* First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) */ + err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + break; + case __SI_CHLD >> 16: + err |= __put_user(from->si_utime, &to->si_utime); + err |= __put_user(from->si_stime, &to->si_stime); + err |= __put_user(from->si_status, &to->si_status); + /* FALL THROUGH */ + default: + case __SI_KILL >> 16: + err |= __put_user(from->si_uid, &to->si_uid); + break; + case __SI_POLL >> 16: + err |= __put_user(from->si_fd, &to->si_fd); + break; + case __SI_TIMER >> 16: + err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(ptr_to_compat(from->si_ptr), + &to->si_ptr); + break; + case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ >> 16: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; + } + } + return err; +} + +int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) +{ + int err; + u32 ptr32; + if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + + err = __get_user(to->si_signo, &from->si_signo); + err |= __get_user(to->si_errno, &from->si_errno); + err |= __get_user(to->si_code, &from->si_code); + + err |= __get_user(to->si_pid, &from->si_pid); + err |= __get_user(to->si_uid, &from->si_uid); + err |= __get_user(ptr32, &from->si_ptr); + to->si_ptr = compat_ptr(ptr32); + + return err; +} + +asmlinkage long +sys32_sigsuspend(int history0, int history1, old_sigset_t mask, + struct pt_regs *regs) +{ + sigset_t saveset; + + mask &= _BLOCKABLE; + spin_lock_irq(¤t->sighand->siglock); + saveset = current->blocked; + siginitset(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + regs->rax = -EINTR; + while (1) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + if (do_signal(regs, &saveset)) + return -EINTR; + } +} + +asmlinkage long +sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, + stack_ia32_t __user *uoss_ptr, + struct pt_regs *regs) +{ + stack_t uss,uoss; + int ret; + mm_segment_t seg; + if (uss_ptr) { + u32 ptr; + memset(&uss,0,sizeof(stack_t)); + if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || + __get_user(ptr, &uss_ptr->ss_sp) || + __get_user(uss.ss_flags, &uss_ptr->ss_flags) || + __get_user(uss.ss_size, &uss_ptr->ss_size)) + return -EFAULT; + uss.ss_sp = compat_ptr(ptr); + } + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); + set_fs(seg); + if (ret >= 0 && uoss_ptr) { + if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || + __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || + __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || + __put_user(uoss.ss_size, &uoss_ptr->ss_size)) + ret = -EFAULT; + } + return ret; +} + +/* + * Do a signal return; undo the signal stack. + */ + +struct sigframe +{ + u32 pretcode; + int sig; + struct sigcontext_ia32 sc; + struct _fpstate_ia32 fpstate; + unsigned int extramask[_COMPAT_NSIG_WORDS-1]; + char retcode[8]; +}; + +struct rt_sigframe +{ + u32 pretcode; + int sig; + u32 pinfo; + u32 puc; + compat_siginfo_t info; + struct ucontext_ia32 uc; + struct _fpstate_ia32 fpstate; + char retcode[8]; +}; + +static int +ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) +{ + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + +#if DEBUG_SIG + printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", + sc, sc->err, sc->eip, sc->cs, sc->eflags); +#endif +#define COPY(x) { \ + unsigned int reg; \ + err |= __get_user(reg, &sc->e ##x); \ + regs->r ## x = reg; \ +} + +#define RELOAD_SEG(seg,mask) \ + { unsigned int cur; \ + unsigned short pre; \ + err |= __get_user(pre, &sc->seg); \ + asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ + pre |= mask; \ + if (pre != cur) loadsegment(seg,pre); } + + /* Reload fs and gs if they have changed in the signal handler. + This does not handle long fs/gs base changes in the handler, but + does not clobber them at least in the normal case. */ + + { + unsigned gs, oldgs; + err |= __get_user(gs, &sc->gs); + gs |= 3; + asm("movl %%gs,%0" : "=r" (oldgs)); + if (gs != oldgs) + load_gs_index(gs); + } + RELOAD_SEG(fs,3); + RELOAD_SEG(ds,3); + RELOAD_SEG(es,3); + + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); + /* Don't touch extended registers */ + + err |= __get_user(regs->cs, &sc->cs); + regs->cs |= 3; + err |= __get_user(regs->ss, &sc->ss); + regs->ss |= 3; + + { + unsigned int tmpflags; + err |= __get_user(tmpflags, &sc->eflags); + regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_rax = -1; /* disable syscall checks */ + } + + { + u32 tmp; + struct _fpstate_ia32 __user * buf; + err |= __get_user(tmp, &sc->fpstate); + buf = compat_ptr(tmp); + if (buf) { + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) + goto badframe; + err |= restore_i387_ia32(current, buf, 0); + } else { + struct task_struct *me = current; + if (used_math()) { + clear_fpu(me); + clear_used_math(); + } + } + } + + { + u32 tmp; + err |= __get_user(tmp, &sc->eax); + *peax = tmp; + } + return err; + +badframe: + return 1; +} + +asmlinkage long sys32_sigreturn(struct pt_regs *regs) +{ + struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); + sigset_t set; + unsigned int eax; + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) + || (_COMPAT_NSIG_WORDS > 1 + && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) + goto badframe; + return eax; + +badframe: + signal_fault(regs, frame, "32bit sigreturn"); + return 0; +} + +asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + sigset_t set; + unsigned int eax; + struct pt_regs tregs; + + frame = (struct rt_sigframe __user *)(regs->rsp - 4); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + sigdelsetmask(&set, ~_BLOCKABLE); + spin_lock_irq(¤t->sighand->siglock); + current->blocked = set; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + goto badframe; + + tregs = *regs; + if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) + goto badframe; + + return eax; + +badframe: + signal_fault(regs,frame,"32bit rt sigreturn"); + return 0; +} + +/* + * Set up a signal frame. + */ + +static int +ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, + struct pt_regs *regs, unsigned int mask) +{ + int tmp, err = 0; + u32 eflags; + + tmp = 0; + __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->fs); + __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->ds); + __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); + err |= __put_user(tmp, (unsigned int __user *)&sc->es); + + err |= __put_user((u32)regs->rdi, &sc->edi); + err |= __put_user((u32)regs->rsi, &sc->esi); + err |= __put_user((u32)regs->rbp, &sc->ebp); + err |= __put_user((u32)regs->rsp, &sc->esp); + err |= __put_user((u32)regs->rbx, &sc->ebx); + err |= __put_user((u32)regs->rdx, &sc->edx); + err |= __put_user((u32)regs->rcx, &sc->ecx); + err |= __put_user((u32)regs->rax, &sc->eax); + err |= __put_user((u32)regs->cs, &sc->cs); + err |= __put_user((u32)regs->ss, &sc->ss); + err |= __put_user(current->thread.trap_no, &sc->trapno); + err |= __put_user(current->thread.error_code, &sc->err); + err |= __put_user((u32)regs->rip, &sc->eip); + eflags = regs->eflags; + if (current->ptrace & PT_PTRACED) + eflags &= ~TF_MASK; + err |= __put_user((u32)eflags, &sc->eflags); + err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); + + tmp = save_i387_ia32(current, fpstate, regs, 0); + if (tmp < 0) + err = -EFAULT; + else { + clear_used_math(); + stts(); + err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), + &sc->fpstate); + } + + /* non-iBCS2 extensions.. */ + err |= __put_user(mask, &sc->oldmask); + err |= __put_user(current->thread.cr2, &sc->cr2); + + return err; +} + +/* + * Determine which stack to use.. + */ +static void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +{ + unsigned long rsp; + + /* Default to using normal stack */ + rsp = regs->rsp; + + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (sas_ss_flags(rsp) == 0) + rsp = current->sas_ss_sp + current->sas_ss_size; + } + + /* This is the legacy signal stack switching. */ + else if ((regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + rsp = (unsigned long) ka->sa.sa_restorer; + } + + return (void __user *)((rsp - frame_size) & -8UL); +} + +void ia32_setup_frame(int sig, struct k_sigaction *ka, + compat_sigset_t *set, struct pt_regs * regs) +{ + struct sigframe __user *frame; + int err = 0; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + { + struct exec_domain *ed = current_thread_info()->exec_domain; + err |= __put_user((ed + && ed->signal_invmap + && sig < 32 + ? ed->signal_invmap[sig] + : sig), + &frame->sig); + } + if (err) + goto give_sigsegv; + + err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, + set->sig[0]); + if (err) + goto give_sigsegv; + + if (_COMPAT_NSIG_WORDS > 1) { + err |= __copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask)); + } + if (err) + goto give_sigsegv; + + /* Return stub is in 32bit vsyscall page */ + { + void __user *restorer = VSYSCALL32_SIGRETURN; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + } + /* These are actually not used anymore, but left because some + gdb versions depend on them as a marker. */ + { + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + u16 pad; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + 0, + }; + err |= __copy_to_user(frame->retcode, &code, 8); + } + if (err) + goto give_sigsegv; + + /* Set up registers for signal handler */ + regs->rsp = (unsigned long) frame; + regs->rip = (unsigned long) ka->sa.sa_handler; + + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + + set_fs(USER_DS); + if (regs->eflags & TF_MASK) { + if (current->ptrace & PT_PTRACED) { + ptrace_notify(SIGTRAP); + } else { + regs->eflags &= ~TF_MASK; + } + } + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + current->comm, current->pid, frame, regs->rip, frame->pretcode); +#endif + + return; + +give_sigsegv: + force_sigsegv(sig, current); +} + +void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + compat_sigset_t *set, struct pt_regs * regs) +{ + struct rt_sigframe __user *frame; + int err = 0; + + frame = get_sigframe(ka, regs, sizeof(*frame)); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto give_sigsegv; + + { + struct exec_domain *ed = current_thread_info()->exec_domain; + err |= __put_user((ed + && ed->signal_invmap + && sig < 32 + ? ed->signal_invmap[sig] + : sig), + &frame->sig); + } + err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); + err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); + err |= copy_siginfo_to_user32(&frame->info, info); + if (err) + goto give_sigsegv; + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(regs->rsp), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; + + + { + void __user *restorer = VSYSCALL32_RTSIGRETURN; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + } + + /* This is movl $,%eax ; int $0x80 */ + /* Not actually used anymore, but left because some gdb versions + need it. */ + { + /* __copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u16 pad; + u8 pad2; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + err |= __copy_to_user(frame->retcode, &code, 8); + } + if (err) + goto give_sigsegv; + + /* Set up registers for signal handler */ + regs->rsp = (unsigned long) frame; + regs->rip = (unsigned long) ka->sa.sa_handler; + + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + + set_fs(USER_DS); + if (regs->eflags & TF_MASK) { + if (current->ptrace & PT_PTRACED) { + ptrace_notify(SIGTRAP); + } else { + regs->eflags &= ~TF_MASK; + } + } + +#if DEBUG_SIG + printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + current->comm, current->pid, frame, regs->rip, frame->pretcode); +#endif + + return; + +give_sigsegv: + force_sigsegv(sig, current); +} + diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S new file mode 100644 index 000000000000..f3ca0db85b5b --- /dev/null +++ b/arch/x86_64/ia32/ia32entry.S @@ -0,0 +1,602 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .macro IA32_ARG_FIXUP noebp=0 + movl %edi,%r8d + .if \noebp + .else + movl %ebp,%r9d + .endif + xchg %ecx,%esi + movl %ebx,%edi + movl %edx,%edx /* zero extension */ + .endm + + /* clobbers %eax */ + .macro CLEAR_RREGS + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %rax,R9(%rsp) + movq %rax,R8(%rsp) + .endm + +/* + * 32bit SYSENTER instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp user stack + * 0(%ebp) Arg6 + * + * Interrupts off. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC + swapgs + movq %gs:pda_kernelstack, %rsp + addq $(PDA_STACKOFFSET),%rsp + sti + movl %ebp,%ebp /* zero extension */ + pushq $__USER32_DS + pushq %rbp + pushfq + movl $VSYSCALL32_SYSEXIT, %r10d + pushq $__USER32_CS + movl %eax, %eax + pushq %r10 + pushq %rax + cld + SAVE_ARGS 0,0,1 + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz sysenter_tracesys +sysenter_do_call: + cmpl $(IA32_NR_syscalls),%eax + jae ia32_badsys + IA32_ARG_FIXUP 1 + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) + cli + testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) + jnz int_ret_from_sys_call + /* clear IF, that popfq doesn't enable interrupts early */ + andl $~0x200,EFLAGS-R11(%rsp) + RESTORE_ARGS 1,24,1,1,1,1 + popfq + popq %rcx /* User %esp */ + movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ + swapgs + sti /* sti only takes effect after the next instruction */ + /* sysexit */ + .byte 0xf, 0x35 + +sysenter_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + movl %ebp, %ebp + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + jmp sysenter_do_call + CFI_ENDPROC + +/* + * 32bit SYSCALL instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx return EIP + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] + * %esp user stack + * 0(%esp) Arg6 + * + * Interrupts off. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC + swapgs + movl %esp,%r8d + movq %gs:pda_kernelstack,%rsp + sti + SAVE_ARGS 8,1,1 + movl %eax,%eax /* zero extension */ + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movl %ebp,%ecx + movq $__USER32_CS,CS-ARGOFFSET(%rsp) + movq $__USER32_DS,SS-ARGOFFSET(%rsp) + movq %r11,EFLAGS-ARGOFFSET(%rsp) + movq %r8,RSP-ARGOFFSET(%rsp) + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ + /* hardware stack frame is complete now */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + GET_THREAD_INFO(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz cstar_tracesys +cstar_do_call: + cmpl $IA32_NR_syscalls,%eax + jae ia32_badsys + IA32_ARG_FIXUP 1 + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + GET_THREAD_INFO(%r10) + cli + testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) + jnz int_ret_from_sys_call + RESTORE_ARGS 1,-ARG_SKIP,1,1,1 + movl RIP-ARGOFFSET(%rsp),%ecx + movl EFLAGS-ARGOFFSET(%rsp),%r11d + movl RSP-ARGOFFSET(%rsp),%esp + swapgs + sysretl + +cstar_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + movl RSP-ARGOFFSET(%rsp), %r8d + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + jmp cstar_do_call + +ia32_badarg: + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except %eax must be saved (but ptrace may violate that) + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + CFI_STARTPROC + swapgs + sti + movl %eax,%eax + pushq %rax + cld + /* note the registers are not zero extended to the sf. + this could be a problem. */ + SAVE_ARGS 0,0,1 + GET_THREAD_INFO(%r10) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + jnz ia32_tracesys +ia32_do_syscall: + cmpl $(IA32_NR_syscalls),%eax + jae ia32_badsys + IA32_ARG_FIXUP + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_REST + movq $-ENOSYS,RAX(%rsp) /* really needed? */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + jmp ia32_do_syscall + +ia32_badsys: + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp int_ret_from_sys_call + +ni_syscall: + movq %rax,%rdi + jmp sys32_ni_syscall + +quiet_ni_syscall: + movq $-ENOSYS,%rax + ret + CFI_ENDPROC + + .macro PTREGSCALL label, func, arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx + PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx + PTREGSCALL stub32_execve, sys32_execve, %rcx + PTREGSCALL stub32_fork, sys_fork, %rdi + PTREGSCALL stub32_clone, sys32_clone, %rdx + PTREGSCALL stub32_vfork, sys_vfork, %rdi + PTREGSCALL stub32_iopl, sys_iopl, %rsi + PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx + +ENTRY(ia32_ptregs_common) + CFI_STARTPROC + popq %r11 + SAVE_REST + call *%rax + RESTORE_REST + jmp ia32_sysret /* misbalances the return cache */ + CFI_ENDPROC + + .data + .align 8 + .globl ia32_sys_call_table +ia32_sys_call_table: + .quad sys_restart_syscall + .quad sys_exit + .quad stub32_fork + .quad sys_read + .quad sys_write + .quad sys32_open /* 5 */ + .quad sys_close + .quad sys32_waitpid + .quad sys_creat + .quad sys_link + .quad sys_unlink /* 10 */ + .quad stub32_execve + .quad sys_chdir + .quad compat_sys_time + .quad sys_mknod + .quad sys_chmod /* 15 */ + .quad sys_lchown16 + .quad quiet_ni_syscall /* old break syscall holder */ + .quad sys_stat + .quad sys32_lseek + .quad sys_getpid /* 20 */ + .quad compat_sys_mount /* mount */ + .quad sys_oldumount /* old_umount */ + .quad sys_setuid16 + .quad sys_getuid16 + .quad compat_sys_stime /* stime */ /* 25 */ + .quad sys32_ptrace /* ptrace */ + .quad sys_alarm + .quad sys_fstat /* (old)fstat */ + .quad sys_pause + .quad compat_sys_utime /* 30 */ + .quad quiet_ni_syscall /* old stty syscall holder */ + .quad quiet_ni_syscall /* old gtty syscall holder */ + .quad sys_access + .quad sys_nice + .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ + .quad sys_sync + .quad sys32_kill + .quad sys_rename + .quad sys_mkdir + .quad sys_rmdir /* 40 */ + .quad sys_dup + .quad sys32_pipe + .quad compat_sys_times + .quad quiet_ni_syscall /* old prof syscall holder */ + .quad sys_brk /* 45 */ + .quad sys_setgid16 + .quad sys_getgid16 + .quad sys_signal + .quad sys_geteuid16 + .quad sys_getegid16 /* 50 */ + .quad sys_acct + .quad sys_umount /* new_umount */ + .quad quiet_ni_syscall /* old lock syscall holder */ + .quad compat_sys_ioctl + .quad compat_sys_fcntl64 /* 55 */ + .quad quiet_ni_syscall /* old mpx syscall holder */ + .quad sys_setpgid + .quad quiet_ni_syscall /* old ulimit syscall holder */ + .quad sys32_olduname + .quad sys_umask /* 60 */ + .quad sys_chroot + .quad sys32_ustat + .quad sys_dup2 + .quad sys_getppid + .quad sys_getpgrp /* 65 */ + .quad sys_setsid + .quad sys32_sigaction + .quad sys_sgetmask + .quad sys_ssetmask + .quad sys_setreuid16 /* 70 */ + .quad sys_setregid16 + .quad stub32_sigsuspend + .quad compat_sys_sigpending + .quad sys_sethostname + .quad compat_sys_setrlimit /* 75 */ + .quad compat_sys_old_getrlimit /* old_getrlimit */ + .quad compat_sys_getrusage + .quad sys32_gettimeofday + .quad sys32_settimeofday + .quad sys_getgroups16 /* 80 */ + .quad sys_setgroups16 + .quad sys32_old_select + .quad sys_symlink + .quad sys_lstat + .quad sys_readlink /* 85 */ +#ifdef CONFIG_IA32_AOUT + .quad sys_uselib +#else + .quad quiet_ni_syscall +#endif + .quad sys_swapon + .quad sys_reboot + .quad compat_sys_old_readdir + .quad sys32_mmap /* 90 */ + .quad sys_munmap + .quad sys_truncate + .quad sys_ftruncate + .quad sys_fchmod + .quad sys_fchown16 /* 95 */ + .quad sys_getpriority + .quad sys_setpriority + .quad quiet_ni_syscall /* old profil syscall holder */ + .quad compat_sys_statfs + .quad compat_sys_fstatfs /* 100 */ + .quad sys_ioperm + .quad compat_sys_socketcall + .quad sys_syslog + .quad compat_sys_setitimer + .quad compat_sys_getitimer /* 105 */ + .quad compat_sys_newstat + .quad compat_sys_newlstat + .quad compat_sys_newfstat + .quad sys32_uname + .quad stub32_iopl /* 110 */ + .quad sys_vhangup + .quad quiet_ni_syscall /* old "idle" system call */ + .quad sys32_vm86_warning /* vm86old */ + .quad compat_sys_wait4 + .quad sys_swapoff /* 115 */ + .quad sys32_sysinfo + .quad sys32_ipc + .quad sys_fsync + .quad stub32_sigreturn + .quad stub32_clone /* 120 */ + .quad sys_setdomainname + .quad sys_uname + .quad sys_modify_ldt + .quad sys32_adjtimex + .quad sys32_mprotect /* 125 */ + .quad compat_sys_sigprocmask + .quad quiet_ni_syscall /* create_module */ + .quad sys_init_module + .quad sys_delete_module + .quad quiet_ni_syscall /* 130 get_kernel_syms */ + .quad sys_quotactl + .quad sys_getpgid + .quad sys_fchdir + .quad quiet_ni_syscall /* bdflush */ + .quad sys_sysfs /* 135 */ + .quad sys_personality + .quad quiet_ni_syscall /* for afs_syscall */ + .quad sys_setfsuid16 + .quad sys_setfsgid16 + .quad sys_llseek /* 140 */ + .quad compat_sys_getdents + .quad compat_sys_select + .quad sys_flock + .quad sys_msync + .quad compat_sys_readv /* 145 */ + .quad compat_sys_writev + .quad sys_getsid + .quad sys_fdatasync + .quad sys32_sysctl /* sysctl */ + .quad sys_mlock /* 150 */ + .quad sys_munlock + .quad sys_mlockall + .quad sys_munlockall + .quad sys_sched_setparam + .quad sys_sched_getparam /* 155 */ + .quad sys_sched_setscheduler + .quad sys_sched_getscheduler + .quad sys_sched_yield + .quad sys_sched_get_priority_max + .quad sys_sched_get_priority_min /* 160 */ + .quad sys_sched_rr_get_interval + .quad compat_sys_nanosleep + .quad sys_mremap + .quad sys_setresuid16 + .quad sys_getresuid16 /* 165 */ + .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* query_module */ + .quad sys_poll + .quad compat_sys_nfsservctl + .quad sys_setresgid16 /* 170 */ + .quad sys_getresgid16 + .quad sys_prctl + .quad stub32_rt_sigreturn + .quad sys32_rt_sigaction + .quad sys32_rt_sigprocmask /* 175 */ + .quad sys32_rt_sigpending + .quad compat_sys_rt_sigtimedwait + .quad sys32_rt_sigqueueinfo + .quad stub32_rt_sigsuspend + .quad sys32_pread /* 180 */ + .quad sys32_pwrite + .quad sys_chown16 + .quad sys_getcwd + .quad sys_capget + .quad sys_capset + .quad stub32_sigaltstack + .quad sys32_sendfile + .quad quiet_ni_syscall /* streams1 */ + .quad quiet_ni_syscall /* streams2 */ + .quad stub32_vfork /* 190 */ + .quad compat_sys_getrlimit + .quad sys32_mmap2 + .quad sys32_truncate64 + .quad sys32_ftruncate64 + .quad sys32_stat64 /* 195 */ + .quad sys32_lstat64 + .quad sys32_fstat64 + .quad sys_lchown + .quad sys_getuid + .quad sys_getgid /* 200 */ + .quad sys_geteuid + .quad sys_getegid + .quad sys_setreuid + .quad sys_setregid + .quad sys_getgroups /* 205 */ + .quad sys_setgroups + .quad sys_fchown + .quad sys_setresuid + .quad sys_getresuid + .quad sys_setresgid /* 210 */ + .quad sys_getresgid + .quad sys_chown + .quad sys_setuid + .quad sys_setgid + .quad sys_setfsuid /* 215 */ + .quad sys_setfsgid + .quad sys_pivot_root + .quad sys_mincore + .quad sys_madvise + .quad compat_sys_getdents64 /* 220 getdents64 */ + .quad compat_sys_fcntl64 + .quad quiet_ni_syscall /* tux */ + .quad quiet_ni_syscall /* security */ + .quad sys_gettid + .quad sys_readahead /* 225 */ + .quad sys_setxattr + .quad sys_lsetxattr + .quad sys_fsetxattr + .quad sys_getxattr + .quad sys_lgetxattr /* 230 */ + .quad sys_fgetxattr + .quad sys_listxattr + .quad sys_llistxattr + .quad sys_flistxattr + .quad sys_removexattr /* 235 */ + .quad sys_lremovexattr + .quad sys_fremovexattr + .quad sys_tkill + .quad sys_sendfile64 + .quad compat_sys_futex /* 240 */ + .quad compat_sys_sched_setaffinity + .quad compat_sys_sched_getaffinity + .quad sys32_set_thread_area + .quad sys32_get_thread_area + .quad compat_sys_io_setup /* 245 */ + .quad sys_io_destroy + .quad compat_sys_io_getevents + .quad compat_sys_io_submit + .quad sys_io_cancel + .quad sys_fadvise64 /* 250 */ + .quad quiet_ni_syscall /* free_huge_pages */ + .quad sys_exit_group + .quad sys32_lookup_dcookie + .quad sys_epoll_create + .quad sys_epoll_ctl /* 255 */ + .quad sys_epoll_wait + .quad sys_remap_file_pages + .quad sys_set_tid_address + .quad sys32_timer_create + .quad compat_sys_timer_settime /* 260 */ + .quad compat_sys_timer_gettime + .quad sys_timer_getoverrun + .quad sys_timer_delete + .quad compat_sys_clock_settime + .quad compat_sys_clock_gettime /* 265 */ + .quad compat_sys_clock_getres + .quad compat_sys_clock_nanosleep + .quad compat_sys_statfs64 + .quad compat_sys_fstatfs64 + .quad sys_tgkill /* 270 */ + .quad compat_sys_utimes + .quad sys32_fadvise64_64 + .quad quiet_ni_syscall /* sys_vserver */ + .quad sys_mbind + .quad compat_sys_get_mempolicy /* 275 */ + .quad sys_set_mempolicy + .quad compat_sys_mq_open + .quad sys_mq_unlink + .quad compat_sys_mq_timedsend + .quad compat_sys_mq_timedreceive /* 280 */ + .quad compat_sys_mq_notify + .quad compat_sys_mq_getsetattr + .quad quiet_ni_syscall /* reserved for kexec */ + .quad compat_sys_waitid + .quad quiet_ni_syscall /* sys_altroot */ + .quad sys_add_key + .quad sys_request_key + .quad sys_keyctl + /* don't forget to change IA32_NR_syscalls */ +ia32_syscall_end: + .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 + .quad ni_syscall + .endr diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c new file mode 100644 index 000000000000..369151dc3213 --- /dev/null +++ b/arch/x86_64/ia32/ipc32.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +asmlinkage long +sys32_ipc(u32 call, int first, int second, int third, + compat_uptr_t ptr, u32 fifth) +{ + int version; + + version = call >> 16; /* hack for backward compatibility */ + call &= 0xffff; + + switch (call) { + case SEMOP: + /* struct sembuf is the same on 32 and 64bit :)) */ + return sys_semtimedop(first, compat_ptr(ptr), second, NULL); + case SEMTIMEDOP: + return compat_sys_semtimedop(first, compat_ptr(ptr), second, + compat_ptr(fifth)); + case SEMGET: + return sys_semget(first, second, third); + case SEMCTL: + return compat_sys_semctl(first, second, third, compat_ptr(ptr)); + + case MSGSND: + return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); + case MSGRCV: + return compat_sys_msgrcv(first, second, fifth, third, + version, compat_ptr(ptr)); + case MSGGET: + return sys_msgget((key_t) first, second); + case MSGCTL: + return compat_sys_msgctl(first, second, compat_ptr(ptr)); + + case SHMAT: + return compat_sys_shmat(first, second, third, version, + compat_ptr(ptr)); + break; + case SHMDT: + return sys_shmdt(compat_ptr(ptr)); + case SHMGET: + return sys_shmget(first, (unsigned)second, third); + case SHMCTL: + return compat_sys_shmctl(first, second, compat_ptr(ptr)); + } + return -ENOSYS; +} diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c new file mode 100644 index 000000000000..b98b6d2462f6 --- /dev/null +++ b/arch/x86_64/ia32/ptrace32.c @@ -0,0 +1,379 @@ +/* + * 32bit ptrace for x86-64. + * + * Copyright 2001,2002 Andi Kleen, SuSE Labs. + * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier + * copyright. + * + * This allows to access 64bit processes too; but there is no way to see the extended + * register contents. + * + * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x44dd5UL + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break + +static int putreg32(struct task_struct *child, unsigned regno, u32 val) +{ + int i; + __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); + + switch (regno) { + case offsetof(struct user32, regs.fs): + if (val && (val & 3) != 3) return -EIO; + child->thread.fs = val & 0xffff; + break; + case offsetof(struct user32, regs.gs): + if (val && (val & 3) != 3) return -EIO; + child->thread.gs = val & 0xffff; + break; + case offsetof(struct user32, regs.ds): + if (val && (val & 3) != 3) return -EIO; + child->thread.ds = val & 0xffff; + break; + case offsetof(struct user32, regs.es): + child->thread.es = val & 0xffff; + break; + case offsetof(struct user32, regs.ss): + if ((val & 3) != 3) return -EIO; + stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; + break; + case offsetof(struct user32, regs.cs): + if ((val & 3) != 3) return -EIO; + stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; + break; + + R32(ebx, rbx); + R32(ecx, rcx); + R32(edx, rdx); + R32(edi, rdi); + R32(esi, rsi); + R32(ebp, rbp); + R32(eax, rax); + R32(orig_eax, orig_rax); + R32(eip, rip); + R32(esp, rsp); + + case offsetof(struct user32, regs.eflags): { + __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; + val &= FLAG_MASK; + *flags = val | (*flags & ~FLAG_MASK); + break; + } + + case offsetof(struct user32, u_debugreg[4]): + case offsetof(struct user32, u_debugreg[5]): + return -EIO; + + case offsetof(struct user32, u_debugreg[0]): + child->thread.debugreg0 = val; + break; + + case offsetof(struct user32, u_debugreg[1]): + child->thread.debugreg1 = val; + break; + + case offsetof(struct user32, u_debugreg[2]): + child->thread.debugreg2 = val; + break; + + case offsetof(struct user32, u_debugreg[3]): + child->thread.debugreg3 = val; + break; + + case offsetof(struct user32, u_debugreg[6]): + child->thread.debugreg6 = val; + break; + + case offsetof(struct user32, u_debugreg[7]): + val &= ~DR_CONTROL_RESERVED; + /* See arch/i386/kernel/ptrace.c for an explanation of + * this awkward check.*/ + for(i=0; i<4; i++) + if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + child->thread.debugreg7 = val; + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* Other dummy fields in the virtual user structure are ignored */ + break; + } + return 0; +} + +#undef R32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs)); + + switch (regno) { + case offsetof(struct user32, regs.fs): + *val = child->thread.fs; + break; + case offsetof(struct user32, regs.gs): + *val = child->thread.gs; + break; + case offsetof(struct user32, regs.ds): + *val = child->thread.ds; + break; + case offsetof(struct user32, regs.es): + *val = child->thread.es; + break; + + R32(cs, cs); + R32(ss, ss); + R32(ebx, rbx); + R32(ecx, rcx); + R32(edx, rdx); + R32(edi, rdi); + R32(esi, rsi); + R32(ebp, rbp); + R32(eax, rax); + R32(orig_eax, orig_rax); + R32(eip, rip); + R32(eflags, eflags); + R32(esp, rsp); + + case offsetof(struct user32, u_debugreg[0]): + *val = child->thread.debugreg0; + break; + case offsetof(struct user32, u_debugreg[1]): + *val = child->thread.debugreg1; + break; + case offsetof(struct user32, u_debugreg[2]): + *val = child->thread.debugreg2; + break; + case offsetof(struct user32, u_debugreg[3]): + *val = child->thread.debugreg3; + break; + case offsetof(struct user32, u_debugreg[6]): + *val = child->thread.debugreg6; + break; + case offsetof(struct user32, u_debugreg[7]): + *val = child->thread.debugreg7; + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* Other dummy fields in the virtual user structure are ignored */ + *val = 0; + break; + } + return 0; +} + +#undef R32 + +static struct task_struct *find_target(int request, int pid, int *err) +{ + struct task_struct *child; + + *err = -EPERM; + if (pid == 1) + return NULL; + + *err = -ESRCH; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); + if (child) { + *err = -EPERM; + if (child->pid == 1) + goto out; + *err = ptrace_check_attach(child, request == PTRACE_KILL); + if (*err < 0) + goto out; + return child; + } + out: + if (child) + put_task_struct(child); + return NULL; + +} + +asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) +{ + struct task_struct *child; + struct pt_regs *childregs; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + default: + return sys_ptrace(request, pid, addr, data); + + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + case PTRACE_POKEDATA: + case PTRACE_POKETEXT: + case PTRACE_POKEUSR: + case PTRACE_PEEKUSR: + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_SETFPREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPXREGS: + case PTRACE_GETFPXREGS: + case PTRACE_GETEVENTMSG: + break; + } + + child = find_target(request, pid, &ret); + if (!child) + return ret; + + childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs)); + + switch (request) { + case PTRACE_PEEKDATA: + case PTRACE_PEEKTEXT: + ret = 0; + if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) + ret = -EIO; + else + ret = put_user(val, (unsigned int __user *)datap); + break; + + case PTRACE_POKEDATA: + case PTRACE_POKETEXT: + ret = 0; + if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) + ret = -EIO; + break; + + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + int i; + if (!access_ok(VERIFY_WRITE, datap, 16*4)) { + ret = -EIO; + break; + } + ret = 0; + for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { + getreg32(child, i, &val); + ret |= __put_user(val,(u32 __user *)datap); + datap += sizeof(u32); + } + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + int i; + if (!access_ok(VERIFY_READ, datap, 16*4)) { + ret = -EIO; + break; + } + ret = 0; + for ( i = 0; i <= 16*4; i += sizeof(u32) ) { + ret |= __get_user(tmp, (u32 __user *)datap); + putreg32(child, i, tmp); + datap += sizeof(u32); + } + break; + } + + case PTRACE_GETFPREGS: + ret = -EIO; + if (!access_ok(VERIFY_READ, compat_ptr(data), + sizeof(struct user_i387_struct))) + break; + save_i387_ia32(child, datap, childregs, 1); + ret = 0; + break; + + case PTRACE_SETFPREGS: + ret = -EIO; + if (!access_ok(VERIFY_WRITE, datap, + sizeof(struct user_i387_struct))) + break; + ret = 0; + /* don't check EFAULT to be bug-to-bug compatible to i386 */ + restore_i387_ia32(child, datap, 1); + break; + + case PTRACE_GETFPXREGS: { + struct user32_fxsr_struct __user *u = datap; + init_fpu(child); + ret = -EIO; + if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) + break; + ret = -EFAULT; + if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) + break; + ret = __put_user(childregs->cs, &u->fcs); + ret |= __put_user(child->thread.ds, &u->fos); + break; + } + case PTRACE_SETFPXREGS: { + struct user32_fxsr_struct __user *u = datap; + unlazy_fpu(child); + ret = -EIO; + if (!access_ok(VERIFY_READ, u, sizeof(*u))) + break; + /* no checking to be bug-to-bug compatible with i386 */ + __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); + set_stopped_child_used_math(child); + child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + ret = 0; + break; + } + + case PTRACE_GETEVENTMSG: + ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); + break; + + default: + ret = -EINVAL; + break; + } + + put_task_struct(child); + return ret; +} + diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c new file mode 100644 index 000000000000..68a9ab06ee7c --- /dev/null +++ b/arch/x86_64/ia32/sys_ia32.c @@ -0,0 +1,1050 @@ +/* + * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on + * sys_sparc32 + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger + * Copyright (C) 1999 Arun Sharma + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * + * These routines maintain argument size conversion between 32bit and 64bit + * environment. In 2.5 most of this should be moved to a generic directory. + * + * This file assumes that there is a hole at the end of user address space. + * + * Some of the functions are LE specific currently. These are hopefully all marked. + * This should be fixed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define AA(__x) ((unsigned long)(__x)) + +int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, kbuf->uid); + SET_GID(gid, kbuf->gid); + if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) + return -EOVERFLOW; + if (kbuf->size >= 0x7fffffff) + return -EOVERFLOW; + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || + __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || + __put_user (kbuf->ino, &ubuf->st_ino) || + __put_user (kbuf->mode, &ubuf->st_mode) || + __put_user (kbuf->nlink, &ubuf->st_nlink) || + __put_user (uid, &ubuf->st_uid) || + __put_user (gid, &ubuf->st_gid) || + __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || + __put_user (kbuf->size, &ubuf->st_size) || + __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || + __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || + __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || + __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user (kbuf->blksize, &ubuf->st_blksize) || + __put_user (kbuf->blocks, &ubuf->st_blocks)) + return -EFAULT; + return 0; +} + +asmlinkage long +sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) +{ + return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); +} + +asmlinkage long +sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) +{ + return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); +} + +/* Another set for IA32/LFS -- x86_64 struct stat is different due to + support for 64bit inode numbers. */ + +static int +cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +{ + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || + __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || + __put_user (stat->ino, &ubuf->__st_ino) || + __put_user (stat->ino, &ubuf->st_ino) || + __put_user (stat->mode, &ubuf->st_mode) || + __put_user (stat->nlink, &ubuf->st_nlink) || + __put_user (uid, &ubuf->st_uid) || + __put_user (gid, &ubuf->st_gid) || + __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || + __put_user (stat->size, &ubuf->st_size) || + __put_user (stat->atime.tv_sec, &ubuf->st_atime) || + __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || + __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || + __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user (stat->blksize, &ubuf->st_blksize) || + __put_user (stat->blocks, &ubuf->st_blocks)) + return -EFAULT; + return 0; +} + +asmlinkage long +sys32_stat64(char __user * filename, struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_stat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +asmlinkage long +sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_lstat(filename, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +asmlinkage long +sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +{ + struct kstat stat; + int ret = vfs_fstat(fd, &stat); + if (!ret) + ret = cp_stat64(statbuf, &stat); + return ret; +} + +/* + * Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct { + unsigned int addr; + unsigned int len; + unsigned int prot; + unsigned int flags; + unsigned int fd; + unsigned int offset; +}; + +asmlinkage long +sys32_mmap(struct mmap_arg_struct __user *arg) +{ + struct mmap_arg_struct a; + struct file *file = NULL; + unsigned long retval; + struct mm_struct *mm ; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + if (!(a.flags & MAP_ANONYMOUS)) { + file = fget(a.fd); + if (!file) + return -EBADF; + } + + mm = current->mm; + down_write(&mm->mmap_sem); + retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); + if (file) + fput(file); + + up_write(&mm->mmap_sem); + + return retval; +} + +asmlinkage long +sys32_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + return sys_mprotect(start,len,prot); +} + +asmlinkage long +sys32_pipe(int __user *fd) +{ + int retval; + int fds[2]; + + retval = do_pipe(fds); + if (retval) + goto out; + if (copy_to_user(fd, fds, sizeof(fds))) + retval = -EFAULT; + out: + return retval; +} + +asmlinkage long +sys32_rt_sigaction(int sig, struct sigaction32 __user *act, + struct sigaction32 __user *oact, unsigned int sigsetsize) +{ + struct k_sigaction new_ka, old_ka; + int ret; + compat_sigset_t set32; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + + if (act) { + compat_uptr_t handler, restorer; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(restorer, &act->sa_restorer)|| + __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) + return -EFAULT; + new_ka.sa.sa_handler = compat_ptr(handler); + new_ka.sa.sa_restorer = compat_ptr(restorer); + /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + switch (_NSIG_WORDS) { + case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] + | (((long)set32.sig[7]) << 32); + case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4] + | (((long)set32.sig[5]) << 32); + case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2] + | (((long)set32.sig[3]) << 32); + case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0] + | (((long)set32.sig[1]) << 32); + } + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + switch (_NSIG_WORDS) { + case 4: + set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); + set32.sig[6] = old_ka.sa.sa_mask.sig[3]; + case 3: + set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32); + set32.sig[4] = old_ka.sa.sa_mask.sig[2]; + case 2: + set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32); + set32.sig[2] = old_ka.sa.sa_mask.sig[1]; + case 1: + set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32); + set32.sig[0] = old_ka.sa.sa_mask.sig[0]; + } + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) + return -EFAULT; + } + + return ret; +} + +asmlinkage long +sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + compat_old_sigset_t mask; + compat_uptr_t handler, restorer; + + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(restorer, &act->sa_restorer) || + __get_user(mask, &act->sa_mask)) + return -EFAULT; + + new_ka.sa.sa_handler = compat_ptr(handler); + new_ka.sa.sa_restorer = compat_ptr(restorer); + + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) + return -EFAULT; + } + + return ret; +} + +asmlinkage long +sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, + compat_sigset_t __user *oset, unsigned int sigsetsize) +{ + sigset_t s; + compat_sigset_t s32; + int ret; + mm_segment_t old_fs = get_fs(); + + if (set) { + if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) + return -EFAULT; + switch (_NSIG_WORDS) { + case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); + case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); + case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); + case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); + } + } + set_fs (KERNEL_DS); + ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, + sigsetsize); + set_fs (old_fs); + if (ret) return ret; + if (oset) { + switch (_NSIG_WORDS) { + case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; + case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; + case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; + case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; + } + if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) + return -EFAULT; + } + return 0; +} + +static inline long +get_tv32(struct timeval *o, struct compat_timeval __user *i) +{ + int err = -EFAULT; + if (access_ok(VERIFY_READ, i, sizeof(*i))) { + err = __get_user(o->tv_sec, &i->tv_sec); + err |= __get_user(o->tv_usec, &i->tv_usec); + } + return err; +} + +static inline long +put_tv32(struct compat_timeval __user *o, struct timeval *i) +{ + int err = -EFAULT; + if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { + err = __put_user(i->tv_sec, &o->tv_sec); + err |= __put_user(i->tv_usec, &o->tv_usec); + } + return err; +} + +extern int do_setitimer(int which, struct itimerval *, struct itimerval *); + +asmlinkage long +sys32_alarm(unsigned int seconds) +{ + struct itimerval it_new, it_old; + unsigned int oldalarm; + + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + do_setitimer(ITIMER_REAL, &it_new, &it_old); + oldalarm = it_old.it_value.tv_sec; + /* ehhh.. We can't return 0 if we have an alarm pending.. */ + /* And we'd better return too much than too little anyway */ + if (it_old.it_value.tv_usec) + oldalarm++; + return oldalarm; +} + +/* Translations due to time_t size differences. Which affects all + sorts of things, like timeval and itimerval. */ + +extern struct timezone sys_tz; + +asmlinkage long +sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (put_tv32(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +asmlinkage long +sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +{ + struct timeval ktv; + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (get_tv32(&ktv, tv)) + return -EFAULT; + kts.tv_sec = ktv.tv_sec; + kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + +struct sel_arg_struct { + unsigned int n; + unsigned int inp; + unsigned int outp; + unsigned int exp; + unsigned int tvp; +}; + +asmlinkage long +sys32_old_select(struct sel_arg_struct __user *arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +extern asmlinkage long +compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, + struct compat_rusage *ru); + +asmlinkage long +sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) +{ + return compat_sys_wait4(pid, stat_addr, options, NULL); +} + +int sys32_ni_syscall(int call) +{ + struct task_struct *me = current; + static char lastcomm[sizeof(me->comm)]; + + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { + printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", + call, me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } + return -ENOSYS; +} + +/* 32-bit timeval and related flotsam. */ + +asmlinkage long +sys32_sysfs(int option, u32 arg1, u32 arg2) +{ + return sys_sysfs(option, arg1, arg2); +} + +struct sysinfo32 { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + unsigned short procs; + unsigned short pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +asmlinkage long +sys32_sysinfo(struct sysinfo32 __user *info) +{ + struct sysinfo s; + int ret; + mm_segment_t old_fs = get_fs (); + int bitcount = 0; + + set_fs (KERNEL_DS); + ret = sys_sysinfo(&s); + set_fs (old_fs); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; + } + + if (!access_ok(VERIFY_WRITE, info, sizeof(struct sysinfo32)) || + __put_user (s.uptime, &info->uptime) || + __put_user (s.loads[0], &info->loads[0]) || + __put_user (s.loads[1], &info->loads[1]) || + __put_user (s.loads[2], &info->loads[2]) || + __put_user (s.totalram, &info->totalram) || + __put_user (s.freeram, &info->freeram) || + __put_user (s.sharedram, &info->sharedram) || + __put_user (s.bufferram, &info->bufferram) || + __put_user (s.totalswap, &info->totalswap) || + __put_user (s.freeswap, &info->freeswap) || + __put_user (s.procs, &info->procs) || + __put_user (s.totalhigh, &info->totalhigh) || + __put_user (s.freehigh, &info->freehigh) || + __put_user (s.mem_unit, &info->mem_unit)) + return -EFAULT; + return 0; +} + +asmlinkage long +sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) +{ + struct timespec t; + int ret; + mm_segment_t old_fs = get_fs (); + + set_fs (KERNEL_DS); + ret = sys_sched_rr_get_interval(pid, &t); + set_fs (old_fs); + if (put_compat_timespec(&t, interval)) + return -EFAULT; + return ret; +} + +asmlinkage long +sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) +{ + sigset_t s; + compat_sigset_t s32; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_rt_sigpending(&s, sigsetsize); + set_fs (old_fs); + if (!ret) { + switch (_NSIG_WORDS) { + case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; + case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2]; + case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; + case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; + } + if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) + return -EFAULT; + } + return ret; +} + +asmlinkage long +sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) +{ + siginfo_t info; + int ret; + mm_segment_t old_fs = get_fs(); + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + set_fs (KERNEL_DS); + ret = sys_rt_sigqueueinfo(pid, sig, &info); + set_fs (old_fs); + return ret; +} + +/* These are here just in case some old ia32 binary calls it. */ +asmlinkage long +sys32_pause(void) +{ + current->state = TASK_INTERRUPTIBLE; + schedule(); + return -ERESTARTNOHAND; +} + + +#ifdef CONFIG_SYSCTL +struct sysctl_ia32 { + unsigned int name; + int nlen; + unsigned int oldval; + unsigned int oldlenp; + unsigned int newval; + unsigned int newlen; + unsigned int __unused[4]; +}; + + +asmlinkage long +sys32_sysctl(struct sysctl_ia32 __user *args32) +{ + struct sysctl_ia32 a32; + mm_segment_t old_fs = get_fs (); + void __user *oldvalp, *newvalp; + size_t oldlen; + int __user *namep; + long ret; + extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen); + + + if (copy_from_user(&a32, args32, sizeof (a32))) + return -EFAULT; + + /* + * We need to pre-validate these because we have to disable address checking + * before calling do_sysctl() because of OLDLEN but we can't run the risk of the + * user specifying bad addresses here. Well, since we're dealing with 32 bit + * addresses, we KNOW that access_ok() will always succeed, so this is an + * expensive NOP, but so what... + */ + namep = compat_ptr(a32.name); + oldvalp = compat_ptr(a32.oldval); + newvalp = compat_ptr(a32.newval); + + if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) + || !access_ok(VERIFY_WRITE, namep, 0) + || !access_ok(VERIFY_WRITE, oldvalp, 0) + || !access_ok(VERIFY_WRITE, newvalp, 0)) + return -EFAULT; + + set_fs(KERNEL_DS); + lock_kernel(); + ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen); + unlock_kernel(); + set_fs(old_fs); + + if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) + return -EFAULT; + + return ret; +} +#endif + +/* warning: next two assume little endian */ +asmlinkage long +sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +{ + return sys_pread64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + +asmlinkage long +sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +{ + return sys_pwrite64(fd, ubuf, count, + ((loff_t)AA(poshi) << 32) | AA(poslo)); +} + + +asmlinkage long +sys32_personality(unsigned long personality) +{ + int ret; + if (personality(current->personality) == PER_LINUX32 && + personality == PER_LINUX) + personality = PER_LINUX32; + ret = sys_personality(personality); + if (ret == PER_LINUX32) + ret = PER_LINUX; + return ret; +} + +asmlinkage long +sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) +{ + mm_segment_t old_fs = get_fs(); + int ret; + off_t of; + + if (offset && get_user(of, offset)) + return -EFAULT; + + set_fs(KERNEL_DS); + ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); + set_fs(old_fs); + + if (!ret && offset && put_user(of, offset)) + return -EFAULT; + + return ret; +} + +/* Handle adjtimex compatibility. */ + +struct timex32 { + u32 modes; + s32 offset, freq, maxerror, esterror; + s32 status, constant, precision, tolerance; + struct compat_timeval time; + s32 tick; + s32 ppsfreq, jitter, shift, stabil; + s32 jitcnt, calcnt, errcnt, stbcnt; + s32 :32; s32 :32; s32 :32; s32 :32; + s32 :32; s32 :32; s32 :32; s32 :32; + s32 :32; s32 :32; s32 :32; s32 :32; +}; + +extern int do_adjtimex(struct timex *); + +asmlinkage long +sys32_adjtimex(struct timex32 __user *utp) +{ + struct timex txc; + int ret; + + memset(&txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) || + __get_user(txc.modes, &utp->modes) || + __get_user(txc.offset, &utp->offset) || + __get_user(txc.freq, &utp->freq) || + __get_user(txc.maxerror, &utp->maxerror) || + __get_user(txc.esterror, &utp->esterror) || + __get_user(txc.status, &utp->status) || + __get_user(txc.constant, &utp->constant) || + __get_user(txc.precision, &utp->precision) || + __get_user(txc.tolerance, &utp->tolerance) || + __get_user(txc.time.tv_sec, &utp->time.tv_sec) || + __get_user(txc.time.tv_usec, &utp->time.tv_usec) || + __get_user(txc.tick, &utp->tick) || + __get_user(txc.ppsfreq, &utp->ppsfreq) || + __get_user(txc.jitter, &utp->jitter) || + __get_user(txc.shift, &utp->shift) || + __get_user(txc.stabil, &utp->stabil) || + __get_user(txc.jitcnt, &utp->jitcnt) || + __get_user(txc.calcnt, &utp->calcnt) || + __get_user(txc.errcnt, &utp->errcnt) || + __get_user(txc.stbcnt, &utp->stbcnt)) + return -EFAULT; + + ret = do_adjtimex(&txc); + + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) || + __put_user(txc.modes, &utp->modes) || + __put_user(txc.offset, &utp->offset) || + __put_user(txc.freq, &utp->freq) || + __put_user(txc.maxerror, &utp->maxerror) || + __put_user(txc.esterror, &utp->esterror) || + __put_user(txc.status, &utp->status) || + __put_user(txc.constant, &utp->constant) || + __put_user(txc.precision, &utp->precision) || + __put_user(txc.tolerance, &utp->tolerance) || + __put_user(txc.time.tv_sec, &utp->time.tv_sec) || + __put_user(txc.time.tv_usec, &utp->time.tv_usec) || + __put_user(txc.tick, &utp->tick) || + __put_user(txc.ppsfreq, &utp->ppsfreq) || + __put_user(txc.jitter, &utp->jitter) || + __put_user(txc.shift, &utp->shift) || + __put_user(txc.stabil, &utp->stabil) || + __put_user(txc.jitcnt, &utp->jitcnt) || + __put_user(txc.calcnt, &utp->calcnt) || + __put_user(txc.errcnt, &utp->errcnt) || + __put_user(txc.stbcnt, &utp->stbcnt)) + ret = -EFAULT; + + return ret; +} + +asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + struct mm_struct *mm = current->mm; + unsigned long error; + struct file * file = NULL; + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + return -EBADF; + } + + down_write(&mm->mmap_sem); + error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(&mm->mmap_sem); + + if (file) + fput(file); + return error; +} + +asmlinkage long sys32_olduname(struct oldold_utsname __user * name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + + error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); + __put_user(0,name->sysname+__OLD_UTS_LEN); + __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); + __put_user(0,name->nodename+__OLD_UTS_LEN); + __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); + __put_user(0,name->release+__OLD_UTS_LEN); + __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); + __put_user(0,name->version+__OLD_UTS_LEN); + { + char *arch = "x86_64"; + if (personality(current->personality) == PER_LINUX32) + arch = "i686"; + + __copy_to_user(&name->machine,arch,strlen(arch)+1); + } + + up_read(&uts_sem); + + error = error ? -EFAULT : 0; + + return error; +} + +long sys32_uname(struct old_utsname __user * name) +{ + int err; + if (!name) + return -EFAULT; + down_read(&uts_sem); + err=copy_to_user(name, &system_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); + return err?-EFAULT:0; +} + +long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) +{ + struct ustat u; + mm_segment_t seg; + int ret; + + seg = get_fs(); + set_fs(KERNEL_DS); + ret = sys_ustat(dev,&u); + set_fs(seg); + if (ret >= 0) { + if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || + __put_user((__u32) u.f_tfree, &u32p->f_tfree) || + __put_user((__u32) u.f_tinode, &u32p->f_tfree) || + __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || + __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) + ret = -EFAULT; + } + return ret; +} + +asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, + compat_uptr_t __user *envp, struct pt_regs *regs) +{ + long error; + char * filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = compat_do_execve(filename, argv, envp, regs); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } + putname(filename); + return error; +} + +asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, + struct pt_regs *regs) +{ + void __user *parent_tid = (void __user *)regs->rdx; + void __user *child_tid = (void __user *)regs->rdi; + if (!newsp) + newsp = regs->rsp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * Some system calls that need sign extended arguments. This could be done by a generic wrapper. + */ + +long sys32_lseek (unsigned int fd, int offset, unsigned int whence) +{ + return sys_lseek(fd, offset, whence); +} + +long sys32_kill(int pid, int sig) +{ + return sys_kill(pid, sig); +} + +asmlinkage long sys32_open(const char __user * filename, int flags, int mode) +{ + char * tmp; + int fd, error; + + /* don't force O_LARGEFILE */ + tmp = getname(filename); + fd = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + fd = get_unused_fd(); + if (fd >= 0) { + struct file *f = filp_open(tmp, flags, mode); + error = PTR_ERR(f); + if (IS_ERR(f)) { + put_unused_fd(fd); + fd = error; + } else + fd_install(fd, f); + } + putname(tmp); + } + return fd; +} + +extern asmlinkage long +sys_timer_create(clockid_t which_clock, + struct sigevent __user *timer_event_spec, + timer_t __user * created_timer_id); + +long +sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id) +{ + struct sigevent __user *p = NULL; + if (se32) { + struct sigevent se; + p = compat_alloc_user_space(sizeof(struct sigevent)); + if (get_compat_sigevent(&se, se32) || + copy_to_user(p, &se, sizeof(se))) + return -EFAULT; + } + return sys_timer_create(clock, p, timer_id); +} + +long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, + __u32 len_low, __u32 len_high, int advice) +{ + return sys_fadvise64_64(fd, + (((u64)offset_high)<<32) | offset_low, + (((u64)len_high)<<32) | len_low, + advice); +} + +long sys32_vm86_warning(void) +{ + struct task_struct *me = current; + static char lastcomm[sizeof(me->comm)]; + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { + printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", + me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } + return -ENOSYS; +} + +long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, + char __user * buf, size_t len) +{ + return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); +} + +static int __init ia32_init (void) +{ + printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n"); + return 0; +} + +__initcall(ia32_init); + +extern unsigned long ia32_sys_call_table[]; +EXPORT_SYMBOL(ia32_sys_call_table); diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c new file mode 100644 index 000000000000..399ff4985099 --- /dev/null +++ b/arch/x86_64/ia32/syscall32.c @@ -0,0 +1,111 @@ +/* Copyright 2002,2003 Andi Kleen, SuSE Labs */ + +/* vsyscall handling for 32bit processes. Map a stub page into it + on demand because 32bit cannot reach the kernel's fixmaps */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* 32bit VDSOs mapped into user space. */ +asm(".section \".init.data\",\"aw\"\n" + "syscall32_syscall:\n" + ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n" + "syscall32_syscall_end:\n" + "syscall32_sysenter:\n" + ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n" + "syscall32_sysenter_end:\n" + ".previous"); + +extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; +extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; +extern int sysctl_vsyscall32; + +char *syscall32_page; +static int use_sysenter = -1; + +/* + * Map the 32bit vsyscall page on demand. + * + * RED-PEN: This knows too much about high level VM. + * + * Alternative would be to generate a vma with appropriate backing options + * and let it be handled by generic VM. + */ +int __map_syscall32(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte; + pmd_t *pmd; + int err = -ENOMEM; + + spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) { + pmd = pmd_alloc(mm, pud, address); + if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { + if (pte_none(*pte)) { + set_pte(pte, + mk_pte(virt_to_page(syscall32_page), + PAGE_KERNEL_VSYSCALL32)); + } + /* Flush only the local CPU. Other CPUs taking a fault + will just end up here again + This probably not needed and just paranoia. */ + __flush_tlb_one(address); + err = 0; + } + } + spin_unlock(&mm->page_table_lock); + return err; +} + +int map_syscall32(struct mm_struct *mm, unsigned long address) +{ + int err; + down_read(&mm->mmap_sem); + err = __map_syscall32(mm, address); + up_read(&mm->mmap_sem); + return err; +} + +static int __init init_syscall32(void) +{ + syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!syscall32_page) + panic("Cannot allocate syscall32 page"); + SetPageReserved(virt_to_page(syscall32_page)); + if (use_sysenter > 0) { + memcpy(syscall32_page, syscall32_sysenter, + syscall32_sysenter_end - syscall32_sysenter); + } else { + memcpy(syscall32_page, syscall32_syscall, + syscall32_syscall_end - syscall32_syscall); + } + return 0; +} + +__initcall(init_syscall32); + +/* May not be __init: called during resume */ +void syscall32_cpu_init(void) +{ + if (use_sysenter < 0) + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); + + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); + checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c new file mode 100644 index 000000000000..1cc4340de3ca --- /dev/null +++ b/arch/x86_64/ia32/tls32.c @@ -0,0 +1,163 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + * When you want addresses > 32bit use arch_prctl() + */ +int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) +{ + struct user_desc info; + struct n_desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); + return 0; +} + +asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) +{ + return do_set_thread_area(¤t->thread, u_info); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) +#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1) + +int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) +{ + struct user_desc info; + struct n_desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; + + memset(&info, 0, sizeof(struct user_desc)); + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + info.lm = GET_LONGMODE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) +{ + return do_get_thread_area(¤t->thread, u_info); +} + + +int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) +{ + struct n_desc_struct *desc; + struct user_desc info; + struct user_desc __user *cp; + int idx; + + cp = (void __user *)childregs->rsi; + if (copy_from_user(&info, cp, sizeof(info))) + return -EFAULT; + if (LDT_empty(&info)) + return -EINVAL; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + + return 0; +} diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S new file mode 100644 index 000000000000..ba4067d350e4 --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-sigreturn.S @@ -0,0 +1,120 @@ +/* + * Common code for the sigreturn entry points on the vsyscall page. + * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) + * to enter the kernel. + * This file is #include'd by vsyscall-*.S to define them after the + * vsyscall entry point. The addresses we get for these entry points + * by doing ".balign 32" must match in both versions of the page. + */ + + .section .text.sigreturn,"ax" + .balign 32 + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax + movl $__NR_ia32_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_sigreturn: + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .section .text.rtsigreturn,"ax" + .balign 32 + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_ia32_rt_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_rt_sigreturn: + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + + .section .eh_frame,"a",@progbits + .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ +.LSTARTFDE2: + .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(IA32_SIGCONTEXT_esp+4) + do_expr(0, IA32_SIGCONTEXT_eax+4) + do_expr(1, IA32_SIGCONTEXT_ecx+4) + do_expr(2, IA32_SIGCONTEXT_edx+4) + do_expr(3, IA32_SIGCONTEXT_ebx+4) + do_expr(5, IA32_SIGCONTEXT_ebp+4) + do_expr(6, IA32_SIGCONTEXT_esi+4) + do_expr(7, IA32_SIGCONTEXT_edi+4) + do_expr(8, IA32_SIGCONTEXT_eip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(IA32_SIGCONTEXT_esp) + do_expr(0, IA32_SIGCONTEXT_eax) + do_expr(1, IA32_SIGCONTEXT_ecx) + do_expr(2, IA32_SIGCONTEXT_edx) + do_expr(3, IA32_SIGCONTEXT_ebx) + do_expr(5, IA32_SIGCONTEXT_ebp) + do_expr(6, IA32_SIGCONTEXT_esi) + do_expr(7, IA32_SIGCONTEXT_edi) + do_expr(8, IA32_SIGCONTEXT_eip) + + .align 4 +.LENDFDE2: + + .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ +.LSTARTFDE3: + .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. */ + + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) + + .align 4 +.LENDFDE3: diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S new file mode 100644 index 000000000000..e2aaf3de8a42 --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-syscall.S @@ -0,0 +1,68 @@ +/* + * Code for the vsyscall page. This version uses the syscall instruction. + */ + +#include +#include +#include + + .text + .section .text.vsyscall,"ax" + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function +__kernel_vsyscall: +.LSTART_vsyscall: + push %ebp +.Lpush_ebp: + movl %ecx, %ebp + syscall + movl $__USER32_DS, %ecx + movl %ecx, %ss + movl %ebp, %ecx + popl %ebp +.Lpop_ebp: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + + .section .eh_frame,"a",@progbits +.LSTARTFRAME: + .long .LENDCIE-.LSTARTCIE +.LSTARTCIE: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIE: + + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ +.LSTARTFDE1: + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 8 + .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ + .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 4 + .align 4 +.LENDFDE1: + +#define SYSCALL_ENTER_KERNEL syscall +#include "vsyscall-sigreturn.S" diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S new file mode 100644 index 000000000000..8fb8e0ff3afa --- /dev/null +++ b/arch/x86_64/ia32/vsyscall-sysenter.S @@ -0,0 +1,94 @@ +/* + * Code for the vsyscall page. This version uses the sysenter instruction. + */ + +#include +#include + + .text + .section .text.vsyscall,"ax" + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function +__kernel_vsyscall: +.LSTART_vsyscall: + push %ecx +.Lpush_ecx: + push %edx +.Lpush_edx: + push %ebp +.Lenter_kernel: + movl %esp,%ebp + sysenter + .space 7,0x90 + jmp .Lenter_kernel + /* 16: System call normal return point is here! */ + pop %ebp +.Lpop_ebp: + pop %edx +.Lpop_edx: + pop %ecx +.Lpop_ecx: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + + .section .eh_frame,"a",@progbits +.LSTARTFRAME: + .long .LENDCIE-.LSTARTCIE +.LSTARTCIE: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIE: + + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ +.LSTARTFDE1: + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpush_ecx-.LSTART_vsyscall + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpush_edx-.Lpush_ecx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lenter_kernel-.Lpush_edx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x10 /* RA at offset 16 now */ + .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ + /* Finally the epilogue. */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_ebp-.Lenter_kernel + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x12 /* RA at offset 12 now */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_edx-.Lpop_ebp + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x04 /* DW_CFA_advance_loc4 */ + .long .Lpop_ecx-.Lpop_edx + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x04 /* RA at offset 4 now */ + .align 4 +.LENDFDE1: + +#define SYSCALL_ENTER_KERNEL int $0x80 +#include "vsyscall-sigreturn.S" diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds new file mode 100644 index 000000000000..fa4b4dd4a9ff --- /dev/null +++ b/arch/x86_64/ia32/vsyscall.lds @@ -0,0 +1,77 @@ +/* + * Linker script for vsyscall DSO. The vsyscall page is an ELF shared + * object prelinked to its virtual address. This script controls its layout. + */ + +/* This must match . */ +VSYSCALL_BASE = 0xffffe000; + +SECTIONS +{ + . = VSYSCALL_BASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + /* This linker script is used both with -r and with -shared. + For the layouts to match, we need to skip more than enough + space for the dynamic symbol table et al. If this amount + is insufficient, ld -shared will barf. Just increase it here. */ + . = VSYSCALL_BASE + 0x400; + + .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 + + /* This is an 32bit object and we cannot easily get the offsets + into the 64bit kernel. Just hardcode them here. This assumes + that all the stubs don't need more than 0x100 bytes. */ + . = VSYSCALL_BASE + 0x500; + + .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 + + . = VSYSCALL_BASE + 0x600; + + .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .dynamic : { *(.dynamic) } :text :dynamic + .useless : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + } :text +} + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile new file mode 100644 index 000000000000..0a3318e08ab6 --- /dev/null +++ b/arch/x86_64/kernel/Makefile @@ -0,0 +1,45 @@ +# +# Makefile for the linux kernel. +# + +extra-y := head.o head64.o init_task.o vmlinux.lds +EXTRA_AFLAGS := -traditional +obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ + ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ + x8664_ksyms.o i387.o syscall.o vsyscall.o \ + setup64.o bootflag.o e820.o reboot.o quirks.o + +obj-$(CONFIG_X86_MCE) += mce.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ +obj-$(CONFIG_ACPI_BOOT) += acpi/ +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ + genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_PM) += suspend.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o +obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_KPROBES) += kprobes.o + +obj-$(CONFIG_MODULES) += module.o + +obj-y += topology.o +obj-y += intel_cacheinfo.o + +CFLAGS_vsyscall.o := $(PROFILING) -g0 + +bootflag-y += ../../i386/kernel/bootflag.o +cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o +topology-y += ../../i386/mach-default/topology.o +swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o +microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o +intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o +quirks-y += ../../i386/kernel/quirks.o diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile new file mode 100644 index 000000000000..d2c2ee5f9a88 --- /dev/null +++ b/arch/x86_64/kernel/acpi/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_ACPI_BOOT) := boot.o +boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c new file mode 100644 index 000000000000..7a275de6df22 --- /dev/null +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -0,0 +1,132 @@ +/* + * acpi.c - Architecture-Specific Low-Level ACPI Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2001 Jun Nakajima + * Copyright (C) 2001 Patrick Mochel + * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) + * Copyright (C) 2003 Pavel Machek, SuSE Labs + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* -------------------------------------------------------------------------- + Low-Level Sleep Support + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_SLEEP + +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_video_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); + +static pgd_t low_ptr; + +static void init_low_mapping(void) +{ + pgd_t *slot0 = pgd_offset(current->mm, 0UL); + low_ptr = *slot0; + set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); + flush_tlb_all(); +} + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem (void) +{ + init_low_mapping(); + + memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); + + return 0; +} + +/* + * acpi_restore_state + */ +void acpi_restore_state_mem (void) +{ + set_pgd(pgd_offset(current->mm, 0UL), low_ptr); + flush_tlb_all(); +} + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page in low memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16M pages, but not + * <1M pages. + */ +void __init acpi_reserve_bootmem(void) +{ + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) + printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); +} + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_video_flags = 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_video_flags |= 2; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); + +#endif /*CONFIG_ACPI_SLEEP*/ + +void acpi_pci_link_exit(void) {} diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S new file mode 100644 index 000000000000..a4c630034cd4 --- /dev/null +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -0,0 +1,527 @@ +.text +#include +#include +#include +#include + +# Copyright 2003 Pavel Machek , distribute under GPLv2 +# +# wakeup_code runs in real mode, and at unknown address (determined at run-time). +# Therefore it must only use relative jumps/calls. +# +# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled +# +# If physical address of wakeup_code is 0x12345, BIOS should call us with +# cs = 0x1234, eip = 0x05 +# + + +ALIGN + .align 16 +ENTRY(wakeup_start) +wakeup_code: + wakeup_code_start = . + .code16 + +# Running in *copy* of this code, somewhere in low 1MB. + + movb $0xa1, %al ; outb %al, $0x80 + cli + cld + # setup data segment + movw %cs, %ax + movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ss + mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + + pushl $0 # Kill any dangerous flags + popfl + + movl real_magic - wakeup_code, %eax + cmpl $0x12345678, %eax + jne bogus_real_magic + + testl $1, video_flags - wakeup_code + jz 1f + lcall $0xc000,$3 + movw %cs, %ax + movw %ax, %ds # Bios might have played with that + movw %ax, %ss +1: + + testl $2, video_flags - wakeup_code + jz 1f + mov video_mode - wakeup_code, %ax + call mode_seta +1: + + movw $0xb800, %ax + movw %ax,%fs + movw $0x0e00 + 'L', %fs:(0x10) + + movb $0xa2, %al ; outb %al, $0x80 + + lidt %ds:idt_48a - wakeup_code + xorl %eax, %eax + movw %ds, %ax # (Convert %ds:gdt to a linear ptr) + shll $4, %eax + addl $(gdta - wakeup_code), %eax + movl %eax, gdt_48a +2 - wakeup_code + lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is + # appropriate + + movl $1, %eax # protected mode (PE) bit + lmsw %ax # This is it! + jmp 1f +1: + + .byte 0x66, 0xea # prefix + jmpi-opcode + .long wakeup_32 - __START_KERNEL_map + .word __KERNEL_CS + + .code32 +wakeup_32: +# Running in this code, but at low address; paging is not yet turned on. + movb $0xa5, %al ; outb %al, $0x80 + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe bogus_cpu + wbinvd + mov $0x80000001, %eax + cpuid + btl $29, %edx + jnc bogus_cpu + movl %edx,%edi + + movw $__KERNEL_DS, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + movw $__KERNEL_DS, %ax + movw %ax, %ss + + mov $(wakeup_stack - __START_KERNEL_map), %esp + movl saved_magic - __START_KERNEL_map, %eax + cmpl $0x9abcdef0, %eax + jne bogus_32_magic + + /* + * Prepare for entering 64bits mode + */ + + /* Enable PAE mode and PGE */ + xorl %eax, %eax + btsl $5, %eax + btsl $7, %eax + movl %eax, %cr4 + + /* Setup early boot stage 4 level pagetables */ + movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax + movl %eax, %cr3 + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + /* Fool rdmsr and reset %eax to avoid dependences */ + xorl %eax, %eax + /* Enable Long Mode */ + btsl $_EFER_LME, %eax + /* Enable System Call */ + btsl $_EFER_SCE, %eax + + /* No Execute supported? */ + btl $20,%edi + jnc 1f + btsl $_EFER_NX, %eax +1: + + /* Make changes effective */ + wrmsr + wbinvd + + xorl %eax, %eax + btsl $31, %eax /* Enable paging and in turn activate Long Mode */ + btsl $0, %eax /* Enable protected mode */ + btsl $1, %eax /* Enable MP */ + btsl $4, %eax /* Enable ET */ + btsl $5, %eax /* Enable NE */ + btsl $16, %eax /* Enable WP */ + btsl $18, %eax /* Enable AM */ + + /* Make changes effective */ + movl %eax, %cr0 + /* At this point: + CR4.PAE must be 1 + CS.L must be 0 + CR3 must point to PML4 + Next instruction must be a branch + This must be on identity-mapped page + */ + jmp reach_compatibility_mode +reach_compatibility_mode: + movw $0x0e00 + 'i', %ds:(0xb8012) + movb $0xa8, %al ; outb %al, $0x80; + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + + movw $0x0e00 + 'n', %ds:(0xb8014) + movb $0xa9, %al ; outb %al, $0x80 + + /* Load new GDT with the 64bit segment using 32bit descriptor */ + movl $(pGDT32 - __START_KERNEL_map), %eax + lgdt (%eax) + + movl $(wakeup_jumpvector - __START_KERNEL_map), %eax + /* Finally jump in 64bit mode */ + ljmp *(%eax) + +wakeup_jumpvector: + .long wakeup_long64 - __START_KERNEL_map + .word __KERNEL_CS + +.code64 + + /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ +wakeup_long64: + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt cpu_gdt_descr - __START_KERNEL_map + + movw $0x0e00 + 'u', %ds:(0xb8016) + + nop + nop + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + movq saved_esp, %rsp + + movw $0x0e00 + 'x', %ds:(0xb8018) + movq saved_ebx, %rbx + movq saved_edi, %rdi + movq saved_esi, %rsi + movq saved_ebp, %rbp + + movw $0x0e00 + '!', %ds:(0xb801a) + movq saved_eip, %rax + jmp *%rax + +.code32 + + .align 64 +gdta: + .word 0, 0, 0, 0 # dummy + + .word 0, 0, 0, 0 # unused + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) + + .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) + .word 0 # base address = 0 + .word 0x9200 # data read/write + .word 0x00CF # granularity = 4096, 386 + # (+5th nibble of limit) +# this is 64bit descriptor for code + .word 0xFFFF + .word 0 + .word 0x9A00 # code read/exec + .word 0x00AF # as above, but it is long mode and with D=0 + +idt_48a: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + +gdt_48a: + .word 0x8000 # gdt limit=2048, + # 256 GDT entries + .word 0, 0 # gdt base (filled in later) + + +real_save_gdt: .word 0 + .quad 0 +real_magic: .quad 0 +video_mode: .quad 0 +video_flags: .quad 0 + +bogus_real_magic: + movb $0xba,%al ; outb %al,$0x80 + jmp bogus_real_magic + +bogus_32_magic: + movb $0xb3,%al ; outb %al,$0x80 + jmp bogus_32_magic + +bogus_31_magic: + movb $0xb1,%al ; outb %al,$0x80 + jmp bogus_31_magic + +bogus_cpu: + movb $0xbc,%al ; outb %al,$0x80 + jmp bogus_cpu + + +/* This code uses an extended set of video mode numbers. These include: + * Aliases for standard modes + * NORMAL_VGA (-1) + * EXTENDED_VGA (-2) + * ASK_VGA (-3) + * Video modes numbered by menu position -- NOT RECOMMENDED because of lack + * of compatibility when extending the table. These are between 0x00 and 0xff. + */ +#define VIDEO_FIRST_MENU 0x0000 + +/* Standard BIOS video modes (BIOS number + 0x0100) */ +#define VIDEO_FIRST_BIOS 0x0100 + +/* VESA BIOS video modes (VESA number + 0x0200) */ +#define VIDEO_FIRST_VESA 0x0200 + +/* Video7 special modes (BIOS number + 0x0900) */ +#define VIDEO_FIRST_V7 0x0900 + +# Setting of user mode (AX=mode ID) => CF=success +mode_seta: + movw %ax, %bx +#if 0 + cmpb $0xff, %ah + jz setalias + + testb $VIDEO_RECALC>>8, %ah + jnz _setrec + + cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah + jnc setres + + cmpb $VIDEO_FIRST_SPECIAL>>8, %ah + jz setspc + + cmpb $VIDEO_FIRST_V7>>8, %ah + jz setv7 +#endif + + cmpb $VIDEO_FIRST_VESA>>8, %ah + jnc check_vesaa +#if 0 + orb %ah, %ah + jz setmenu +#endif + + decb %ah +# jz setbios Add bios modes later + +setbada: clc + ret + +check_vesaa: + subb $VIDEO_FIRST_VESA>>8, %bh + orw $0x4000, %bx # Use linear frame buffer + movw $0x4f02, %ax # VESA BIOS mode set call + int $0x10 + cmpw $0x004f, %ax # AL=4f if implemented + jnz _setbada # AH=0 if OK + + stc + ret + +_setbada: jmp setbada + + .code64 +bogus_magic: + movw $0x0e00 + 'B', %ds:(0xb8018) + jmp bogus_magic + +bogus_magic2: + movw $0x0e00 + '2', %ds:(0xb8018) + jmp bogus_magic2 + + +wakeup_stack_begin: # Stack grows down + +.org 0xff0 +wakeup_stack: # Just below end of page + +ENTRY(wakeup_end) + +## +# acpi_copy_wakeup_routine +# +# Copy the above routine to low memory. +# +# Parameters: +# %rdi: place to copy wakeup routine to +# +# Returned address is location of code in low memory (past data and stack) +# +ENTRY(acpi_copy_wakeup_routine) + pushq %rax + pushq %rcx + pushq %rdx + + sgdt saved_gdt + sidt saved_idt + sldt saved_ldt + str saved_tss + + movq %cr3, %rdx + movq %rdx, saved_cr3 + movq %cr4, %rdx + movq %rdx, saved_cr4 + movq %cr0, %rdx + movq %rdx, saved_cr0 + sgdt real_save_gdt - wakeup_start (,%rdi) + movl $MSR_EFER, %ecx + rdmsr + movl %eax, saved_efer + movl %edx, saved_efer2 + + movl saved_video_mode, %edx + movl %edx, video_mode - wakeup_start (,%rdi) + movl acpi_video_flags, %edx + movl %edx, video_flags - wakeup_start (,%rdi) + movq $0x12345678, real_magic - wakeup_start (,%rdi) + movq $0x123456789abcdef0, %rdx + movq %rdx, saved_magic + + movl saved_magic - __START_KERNEL_map, %eax + cmpl $0x9abcdef0, %eax + jne bogus_32_magic + + # make sure %cr4 is set correctly (features, etc) + movl saved_cr4 - __START_KERNEL_map, %eax + movq %rax, %cr4 + + movl saved_cr0 - __START_KERNEL_map, %eax + movq %rax, %cr0 + jmp 1f # Flush pipelines +1: + # restore the regs we used + popq %rdx + popq %rcx + popq %rax +ENTRY(do_suspend_lowlevel_s4bios) + ret + + .align 2 + .p2align 4,,15 +.globl do_suspend_lowlevel + .type do_suspend_lowlevel,@function +do_suspend_lowlevel: +.LFB5: + subq $8, %rsp + xorl %eax, %eax + call save_processor_state + + movq %rsp, saved_context_esp(%rip) + movq %rax, saved_context_eax(%rip) + movq %rbx, saved_context_ebx(%rip) + movq %rcx, saved_context_ecx(%rip) + movq %rdx, saved_context_edx(%rip) + movq %rbp, saved_context_ebp(%rip) + movq %rsi, saved_context_esi(%rip) + movq %rdi, saved_context_edi(%rip) + movq %r8, saved_context_r08(%rip) + movq %r9, saved_context_r09(%rip) + movq %r10, saved_context_r10(%rip) + movq %r11, saved_context_r11(%rip) + movq %r12, saved_context_r12(%rip) + movq %r13, saved_context_r13(%rip) + movq %r14, saved_context_r14(%rip) + movq %r15, saved_context_r15(%rip) + pushfq ; popq saved_context_eflags(%rip) + + movq $.L97, saved_eip(%rip) + + movq %rsp,saved_esp + movq %rbp,saved_ebp + movq %rbx,saved_ebx + movq %rdi,saved_edi + movq %rsi,saved_esi + + addq $8, %rsp + movl $3, %edi + xorl %eax, %eax + jmp acpi_enter_sleep_state +.L97: + .p2align 4,,7 +.L99: + .align 4 + movl $24, %eax + movw %ax, %ds + movq saved_context+58(%rip), %rax + movq %rax, %cr4 + movq saved_context+50(%rip), %rax + movq %rax, %cr3 + movq saved_context+42(%rip), %rax + movq %rax, %cr2 + movq saved_context+34(%rip), %rax + movq %rax, %cr0 + pushq saved_context_eflags(%rip) ; popfq + movq saved_context_esp(%rip), %rsp + movq saved_context_ebp(%rip), %rbp + movq saved_context_eax(%rip), %rax + movq saved_context_ebx(%rip), %rbx + movq saved_context_ecx(%rip), %rcx + movq saved_context_edx(%rip), %rdx + movq saved_context_esi(%rip), %rsi + movq saved_context_edi(%rip), %rdi + movq saved_context_r08(%rip), %r8 + movq saved_context_r09(%rip), %r9 + movq saved_context_r10(%rip), %r10 + movq saved_context_r11(%rip), %r11 + movq saved_context_r12(%rip), %r12 + movq saved_context_r13(%rip), %r13 + movq saved_context_r14(%rip), %r14 + movq saved_context_r15(%rip), %r15 + + xorl %eax, %eax + addq $8, %rsp + jmp restore_processor_state +.LFE5: +.Lfe5: + .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel + +.data +ALIGN +ENTRY(saved_ebp) .quad 0 +ENTRY(saved_esi) .quad 0 +ENTRY(saved_edi) .quad 0 +ENTRY(saved_ebx) .quad 0 + +ENTRY(saved_eip) .quad 0 +ENTRY(saved_esp) .quad 0 + +ENTRY(saved_magic) .quad 0 + +ALIGN +# saved registers +saved_gdt: .quad 0,0 +saved_idt: .quad 0,0 +saved_ldt: .quad 0 +saved_tss: .quad 0 + +saved_cr0: .quad 0 +saved_cr3: .quad 0 +saved_cr4: .quad 0 +saved_efer: .quad 0 +saved_efer2: .quad 0 diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c new file mode 100644 index 000000000000..4baa99fe1e5c --- /dev/null +++ b/arch/x86_64/kernel/aperture.c @@ -0,0 +1,286 @@ +/* + * Firmware replacement code. + * + * Work around broken BIOSes that don't set an aperture or only set the + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int iommu_aperture; +int iommu_aperture_disabled __initdata = 0; +int iommu_aperture_allowed __initdata = 0; + +int fallback_aper_order __initdata = 1; /* 64MB */ +int fallback_aper_force __initdata = 0; + +int fix_aperture __initdata = 1; + +/* This code runs before the PCI subsystem is initialized, so just + access the northbridge directly. */ + +#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) + +static u32 __init allocate_aperture(void) +{ +#ifdef CONFIG_DISCONTIGMEM + pg_data_t *nd0 = NODE_DATA(0); +#else + pg_data_t *nd0 = &contig_page_data; +#endif + u32 aper_size; + void *p; + + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; + + /* + * Aperture has to be naturally aligned. This means an 2GB aperture won't + * have much chances to find a place in the lower 4GB of memory. + * Unfortunately we cannot move it up because that would make the + * IOMMU useless. + */ + p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); + if (!p || __pa(p)+aper_size > 0xffffffff) { + printk("Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); + if (p) + free_bootmem_node(nd0, (unsigned long)p, aper_size); + return 0; + } + printk("Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); + return (u32)__pa(p); +} + +static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +{ + if (!aper_base) + return 0; + if (aper_size < 64*1024*1024) { + printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + return 0; + } + if (aper_base + aper_size >= 0xffffffff) { + printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + return 0; + } + if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); + return 0; + } + return 1; +} + +/* Find a PCI capability */ +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ + u8 pos; + int bytes; + if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + return 0; + pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + pos &= ~3; + id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + + printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + if (apsizereg == 0xffffffff) { + printk("APSIZE in AGP bridge unreadable\n"); + return 0; + } + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(num,slot,func, 0x10); + aper_hi = read_pci_config(num,slot,func,0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); + + if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + return 0; + return (u32)aper; +} + +/* Look for an AGP bridge. Windows only expects the aperture in the + AGP bridge and some BIOS forget to initialize the Northbridge too. + Work around this here. + + Do an PCI bus scan by hand because we're running before the PCI + subsystem. + + All K8 AGP bridges are AGPv3 compliant, so we can do this scan + generically. It's probably overkill to always scan all slots because + the AGP bridges should be always an own bus on the HT hierarchy, + but do it here for future safety. */ +static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) +{ + int num, slot, func; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class, cap; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + switch (class >> 16) { + case PCI_CLASS_BRIDGE_HOST: + case PCI_CLASS_BRIDGE_OTHER: /* needed? */ + /* AGP bridge? */ + cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + if (!cap) + break; + *valid_agp = 1; + return read_agp(num,slot,func,cap,order); + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } + printk("No AGP bridge found\n"); + return 0; +} + +void __init iommu_hole_init(void) +{ + int fix, num; + u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int valid_agp = 0; + + if (iommu_aperture_disabled || !fix_aperture) + return; + + printk("Checking aperture...\n"); + + fix = 0; + for (num = 24; num < 32; num++) { + char name[30]; + if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + continue; + + iommu_aperture = 1; + + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, + aper_base, aper_size>>20); + + sprintf(name, "northbridge cpu %d", num-24); + + if (!aperture_valid(name, aper_base, aper_size)) { + fix = 1; + break; + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + + if (!fix && !fallback_aper_force) + return; + + if (!fallback_aper_force) + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) || + force_iommu || + valid_agp || + fallback_aper_force) { + printk("Your BIOS doesn't leave a aperture memory hole\n"); + printk("Please enable the IOMMU option in the BIOS setup\n"); + printk("This costs you %d MB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* Could disable AGP and IOMMU here, but it's probably + not worth it. But the later users cannot deal with + bad apertures and turning on the aperture over memory + causes very strange problems, so it's better to + panic early. */ + panic("Not enough memory for aperture"); + } + } else { + return; + } + + /* Fix up the north bridges */ + for (num = 24; num < 32; num++) { + if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + continue; + + /* Don't enable translation yet. That is done later. + Assume this BIOS didn't initialise the GART so + just overwrite all previous bits */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c new file mode 100644 index 000000000000..9826f03dc9a3 --- /dev/null +++ b/arch/x86_64/kernel/apic.c @@ -0,0 +1,1088 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +int apic_verbosity; + +int disable_apic_timer __initdata; + +/* Using APIC to generate smp_local_timer_interrupt? */ +int using_apic_timer = 0; + +static DEFINE_PER_CPU(int, prof_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; + +static void apic_pm_activate(void); + +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v, ver; + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + v = APIC_DM_NMI; /* unmask and set to NMI */ + apic_write_around(APIC_LVT0, v); +} + +int get_maxlvt(void) +{ + unsigned int v, ver, maxlvt; + + v = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(v); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +void clear_local_APIC(void) +{ + int maxlvt; + unsigned int v; + + maxlvt = get_maxlvt(); + + /* + * Masking an LVT entry on a P6 can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* + * Clean APIC state for other OSs: + */ + apic_write_around(APIC_LVTT, APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + v = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (APIC_INTEGRATED(v)) { /* !82489DX */ + if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. + * connect BSP's local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +} + +void disconnect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect + * only on certain older boards). Note that APIC + * interrupts, including IPIs, won't work beyond + * this point! The only exception are INIT IPIs. + */ + apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } +} + +void disable_local_APIC(void) +{ + unsigned int value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write_around(APIC_SPIV, value); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +void __init sync_Arb_IDs(void) +{ + /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ + unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + if (ver >= 0x14) /* P4 or higher */ + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); +} + +extern void __error_in_apic_c (void); + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value, ver; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); +} + +void __init setup_local_APIC (void) +{ + unsigned int value, ver, maxlvt; + + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } + + value = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(value); + + if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) + __error_in_apic_c(); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write_around(APIC_TASKPRI, value); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ +#if 1 + /* Enable focus processor (bit==0) */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#else + /* Disable focus processor (bit==1) */ + value |= APIC_SPIV_FOCUS_DISABLED; +#endif + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessary in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!smp_processor_id() && (pic_mode || !value)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); + } + apic_write_around(APIC_LVT0, value); + + /* + * only the BP should see the LINT1 NMI signal, obviously. + */ + if (!smp_processor_id()) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!APIC_INTEGRATED(ver)) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); + + if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ + unsigned oldvalue; + maxlvt = get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + value = ERROR_APIC_VECTOR; // enables sending errors + apic_write_around(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, + "ESR value after enabling vector: %08x, after %08x\n", + oldvalue, value); + } else { + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n"); + else + apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n"); + } + + nmi_watchdog_default(); + if (nmi_watchdog == NMI_LOCAL_APIC) + setup_apic_nmi_watchdog(); + apic_pm_activate(); +} + +#ifdef CONFIG_PM + +static struct { + /* 'active' is true if the local APIC was enabled by us and + not the BIOS; this signifies that we are also responsible + for disabling it before entering apm/acpi suspend */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, u32 state) +{ + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); + local_save_flags(flags); + local_irq_disable(); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + /* XXX: Pavel needs this for S3 resume, but can't explain why */ + set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); + + local_irq_save(flags); + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; +} + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __init apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug", str); + + return 0; +} + +__setup("apic=", apic_set_verbosity); + +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ + +static int __init detect_init_APIC (void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +void __init init_apic_mappings(void) +{ + unsigned long apic_phys; + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + +#ifdef CONFIG_X86_IO_APIC + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } + } +#endif +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +#define APIC_DIVISOR 16 + +static void __setup_APIC_LVTT(unsigned int clocks) +{ + unsigned int lvtt_value, tmp_value, ver; + + ver = GET_APIC_VERSION(apic_read(APIC_LVR)); + lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + if (!APIC_INTEGRATED(ver)) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + apic_write_around(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void setup_APIC_timer(unsigned int clocks) +{ + unsigned long flags; + + local_irq_save(flags); + + /* For some reasons this doesn't work on Simics, so fake it for now */ + if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { + __setup_APIC_LVTT(clocks); + return; + } + + /* wait for irq slice */ + if (vxtime.hpet_address) { + int trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_COUNTER) >= trigger) + /* do nothing */ ; + while (hpet_readl(HPET_COUNTER) < trigger) + /* do nothing */ ; + } else { + int c1, c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + do { + c1 = c2; + outb_p(0x00, 0x43); + c2 = inb_p(0x40); + c2 |= inb_p(0x40) << 8; + } while (c2 - c1 < 300); + } + + __setup_APIC_LVTT(clocks); + + local_irq_restore(flags); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + int apic, apic_start, tsc, tsc_start; + int result; + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + */ + __setup_APIC_LVTT(1000000000); + + apic_start = apic_read(APIC_TMCCT); + rdtscl(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscl(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + return result * APIC_DIVISOR / HZ; +} + +static unsigned int calibration_result; + +void __init setup_boot_APIC_clock (void) +{ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } + + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + using_apic_timer = 1; + + local_irq_disable(); + + calibration_result = calibrate_APIC_clock(); + /* + * Now set up the timer for real. + */ + setup_APIC_timer(calibration_result); + + local_irq_enable(); +} + +void __init setup_secondary_APIC_clock(void) +{ + local_irq_disable(); /* FIXME: Do we need this? --RR */ + setup_APIC_timer(calibration_result); + local_irq_enable(); +} + +void __init disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + } +} + +void enable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); + } +} + +/* + * the frequency of the profiling timer can be changed + * by writing a multiplier value into /proc/profile. + */ +int setup_profiling_timer(unsigned int multiplier) +{ + int i; + + /* + * Sanity check. [at least 500 APIC cycles should be + * between APIC interrupts as a rule of thumb, to avoid + * irqs flooding us] + */ + if ( (!multiplier) || (calibration_result/multiplier < 500)) + return -EINVAL; + + /* + * Set the new multiplier for each CPU. CPUs don't start using the + * new values until the next timer interrupt in which they do process + * accounting. At that time they also adjust their APIC timers + * accordingly. + */ + for (i = 0; i < NR_CPUS; ++i) + per_cpu(prof_multiplier, i) = multiplier; + + return 0; +} + +#undef APIC_DIVISOR + +/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * + * We do profiling in every local tick, statistics/rescheduling + * happen only every 'profiling multiplier' ticks. The default + * multiplier is 1 and it can be changed by writing the new multiplier + * value into /proc/profile. + */ + +void smp_local_timer_interrupt(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { + /* + * The multiplier may have changed since the last time we got + * to this point as a result of the user writing to + * /proc/profile. In this case we need to adjust the APIC + * timer accordingly. + * + * Interrupts are already masked off at this point. + */ + per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); + if (per_cpu(prof_counter, cpu) != + per_cpu(prof_old_multiplier, cpu)) { + __setup_APIC_LVTT(calibration_result/ + per_cpu(prof_counter, cpu)); + per_cpu(prof_old_multiplier, cpu) = + per_cpu(prof_counter, cpu); + } + +#ifdef CONFIG_SMP + update_process_times(user_mode(regs)); +#endif + } + + /* + * We take the 'long' return path, and there every subsystem + * grabs the appropriate locks (kernel lock/ irq lock). + * + * we might want to decouple profiling from the 'long path', + * and do the profiling totally in assembly. + * + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); +} + +/* + * oem_force_hpet_timer -- force HPET mode for some boxes. + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__init int oem_force_hpet_timer(void) +{ + int i, clusters, zeros; + unsigned id; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + bitmap_empty(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since + * clusters are allocated sequentially, count zeros only if they are + * bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* + * If clusters > 2, then should be multi-chassis. Return 1 for HPET. + * Else return 0 to use TSC. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + +#if 0 + static unsigned long last_warning; + static unsigned long skipped; + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + if (time_before(last_warning+30*HZ,jiffies)) { + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", + smp_processor_id(), skipped); + last_warning = jiffies; + skipped = 0; + } else { + skipped++; + } +#endif + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ + +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +int disable_apic; + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + connect_bsp_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(0); + apic_write_around(APIC_ID, boot_cpu_id); + + setup_local_APIC(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; +#endif + setup_boot_APIC_clock(); + + return 0; +} + +static __init int setup_disableapic(char *str) +{ + disable_apic = 1; + return 0; +} + +static __init int setup_nolapic(char *str) +{ + disable_apic = 1; + return 0; +} + +static __init int setup_noapictimer(char *str) +{ + disable_apic_timer = 1; + return 0; +} + +/* dummy parsing: see setup.c */ + +__setup("disableapic", setup_disableapic); +__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ + +__setup("noapictimer", setup_noapictimer); + +/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c new file mode 100644 index 000000000000..35b4c3fcbb37 --- /dev/null +++ b/arch/x86_64/kernel/asm-offsets.c @@ -0,0 +1,69 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +int main(void) +{ +#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) + ENTRY(state); + ENTRY(flags); + ENTRY(thread); + ENTRY(pid); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) + ENTRY(flags); + ENTRY(addr_limit); + ENTRY(preempt_count); + BLANK(); +#undef ENTRY +#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) + ENTRY(kernelstack); + ENTRY(oldrsp); + ENTRY(pcurrent); + ENTRY(irqrsp); + ENTRY(irqcount); + ENTRY(cpunumber); + ENTRY(irqstackptr); + BLANK(); +#undef ENTRY +#ifdef CONFIG_IA32_EMULATION +#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + ENTRY(eax); + ENTRY(ebx); + ENTRY(ecx); + ENTRY(edx); + ENTRY(esi); + ENTRY(edi); + ENTRY(ebp); + ENTRY(esp); + ENTRY(eip); + BLANK(); +#undef ENTRY + DEFINE(IA32_RT_SIGFRAME_sigcontext, + offsetof (struct rt_sigframe32, uc.uc_mcontext)); + BLANK(); +#endif + DEFINE(pbe_address, offsetof(struct pbe, address)); + DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); + DEFINE(pbe_next, offsetof(struct pbe, next)); + return 0; +} diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig new file mode 100644 index 000000000000..81f1562e5393 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Kconfig @@ -0,0 +1,96 @@ +# +# CPU Frequency scaling +# + +menu "CPU Frequency scaling" + +source "drivers/cpufreq/Kconfig" + +if CPU_FREQ + +comment "CPUFreq processor drivers" + +config X86_POWERNOW_K8 + tristate "AMD Opteron/Athlon64 PowerNow!" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_POWERNOW_K8_ACPI + bool + depends on X86_POWERNOW_K8 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m) + default y + +config X86_SPEEDSTEP_CENTRINO + tristate "Intel Enhanced SpeedStep" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This adds the CPUFreq driver for Enhanced SpeedStep enabled + mobile CPUs. This means Intel Pentium M (Centrino) CPUs + or 64bit enabled Intel Xeons. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO_ACPI + bool + depends on X86_SPEEDSTEP_CENTRINO + default y + +config X86_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. + + For details, take a look at . + + If in doubt, say N. + +comment "shared options" + +config X86_ACPI_CPUFREQ_PROC_INTF + bool "/proc/acpi/processor/../performance interface (deprecated)" + depends on PROC_FS + depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI + help + This enables the deprecated /proc/acpi/processor/../performance + interface. While it is helpful for debugging, the generic, + cross-architecture cpufreq interfaces should be used. + + If in doubt, say N. + +config X86_P4_CLOCKMOD + tristate "Intel Pentium 4 clock modulation" + depends on EMBEDDED + help + This adds the clock modulation driver for Intel Pentium 4 / XEON + processors. When enabled it will lower CPU temperature by skipping + clocks. + + This driver should be only used in exceptional + circumstances when very low power is needed because it causes severe + slowdowns and noticeable latencies. Normally Speedstep should be used + instead. + + For details, take a look at . + + Unless you are absolutely sure say N. + + +config X86_SPEEDSTEP_LIB + tristate + default X86_P4_CLOCKMOD + +endif + +endmenu + diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile new file mode 100644 index 000000000000..d8b593879224 --- /dev/null +++ b/arch/x86_64/kernel/cpufreq/Makefile @@ -0,0 +1,17 @@ +# +# Reuse the i386 cpufreq drivers +# + +SRCDIR := ../../../i386/kernel/cpu/cpufreq + +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o +obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o +obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o + +powernow-k8-objs := ${SRCDIR}/powernow-k8.o +speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o +acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o +p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o +speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c new file mode 100644 index 000000000000..657003e461e6 --- /dev/null +++ b/arch/x86_64/kernel/e820.c @@ -0,0 +1,513 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern char _end[]; + +/* + * PFN of last memory page. + */ +unsigned long end_pfn; + +/* + * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. + * The direct mapping extends to end_pfn_map, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long end_pfn_map; + +/* + * Last pfn which the user wants to use. + */ +unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; + +extern struct resource code_resource, data_resource; + +/* Check for some hardcoded bad areas that early boot is not allowed to touch */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + unsigned long addr = *addrp, last = addr + size; + + /* various gunk below that needed for SMP startup */ + if (addr < 0x8000) { + *addrp = 0x8000; + return 1; + } + + /* direct mapping tables of the kernel */ + if (last >= table_start<= INITRD_START && + addr < INITRD_START+INITRD_SIZE) { + *addrp = INITRD_START + INITRD_SIZE; + return 1; + } +#endif + /* kernel code + 640k memory hole (later should not be needed, but + be paranoid for now) */ + if (last >= 640*1024 && addr < __pa_symbol(&_end)) { + *addrp = __pa_symbol(&_end); + return 1; + } + /* XXX ramdisk image here? */ + return 0; +} + +int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size < start) + continue; + return 1; + } + return 0; +} + +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + if (ei->type != E820_RAM) + continue; + if (addr < start) + addr = start; + if (addr > ei->addr + ei->size) + continue; + while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) + ; + last = addr + size; + if (last > ei->addr + ei->size) + continue; + if (last > end) + continue; + return addr; + } + return -1UL; +} + +/* + * Free bootmem based on the e820 table for a node. + */ +void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long last, addr; + + if (ei->type != E820_RAM || + ei->addr+ei->size <= start || + ei->addr > end) + continue; + + addr = round_up(ei->addr, PAGE_SIZE); + if (addr < start) + addr = start; + + last = round_down(ei->addr + ei->size, PAGE_SIZE); + if (last >= end) + last = end; + + if (last > addr && last-addr >= PAGE_SIZE) + free_bootmem_node(pgdat, addr, last-addr); + } +} + +/* + * Find the highest page frame number we have available + */ +unsigned long __init e820_end_of_ram(void) +{ + int i; + unsigned long end_pfn = 0; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + + start = round_up(ei->addr, PAGE_SIZE); + end = round_down(ei->addr + ei->size, PAGE_SIZE); + if (start >= end) + continue; + if (ei->type == E820_RAM) { + if (end > end_pfn<>PAGE_SHIFT; + } else { + if (end > end_pfn_map<>PAGE_SHIFT; + } + } + + if (end_pfn > end_pfn_map) + end_pfn_map = end_pfn; + if (end_pfn_map > MAXMEM>>PAGE_SHIFT) + end_pfn_map = MAXMEM>>PAGE_SHIFT; + if (end_pfn > end_user_pfn) + end_pfn = end_user_pfn; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; + + return end_pfn; +} + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +void __init e820_reserve_resources(void) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; + res = alloc_bootmem_low(sizeof(struct resource)); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + request_resource(&iomem_resource, res); + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, &code_resource); + request_resource(res, &data_resource); + } + } +} + +/* + * Add a memory region to the kernel e820 map. + */ +void __init add_memory_region(unsigned long start, unsigned long size, int type) +{ + int x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %u\n", e820.map[i].type); + break; + } + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ + }; + static struct change_member change_point_list[2*E820MAX] __initdata; + static struct change_member *change_point[2*E820MAX] __initdata; + static struct e820entry *overlap_list[E820MAX] __initdata; + static struct e820entry new_bios[E820MAX] __initdata; + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; iaddr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < 2*old_nr; i++) { + /* if > , swap */ + /* or, if current= & last=, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < 2*old_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; ipbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; itype > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long start = biosmap->addr; + unsigned long size = biosmap->size; + unsigned long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + * + * This should be removed on Hammer which is supposed to not + * have non e820 covered ISA mappings there, but I had some strange + * problems so it stays for now. -AK + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } + + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + +void __init setup_memory_region(void) +{ + char *who = "BIOS-e820"; + + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + sanitize_e820_map(E820_MAP, &E820_MAP_NR); + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { + unsigned long mem_size; + + /* compare results from other methods and take the greater */ + if (ALT_MEM_K < EXT_MEM_K) { + mem_size = EXT_MEM_K; + who = "BIOS-88"; + } else { + mem_size = ALT_MEM_K; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + add_memory_region(0, LOWMEMSIZE(), E820_RAM); + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + } + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + e820_print_map(who); +} + +void __init parse_memopt(char *p, char **from) +{ + end_user_pfn = memparse(p, from); + end_user_pfn >>= PAGE_SHIFT; +} + diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c new file mode 100644 index 000000000000..750bcd0655dc --- /dev/null +++ b/arch/x86_64/kernel/early_printk.c @@ -0,0 +1,220 @@ +#include +#include +#include +#include +#include +#include + +/* Simple VGA output */ + +#ifdef __i386__ +#define VGABASE (__ISA_IO_base + 0xb8000) +#else +#define VGABASE ((void __iomem *)0xffffffff800b8000UL) +#endif + +#define MAX_YPOS 25 +#define MAX_XPOS 80 + +static int current_ypos = 1, current_xpos = 0; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= MAX_YPOS) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { + for (i = 0; i < MAX_XPOS; i++) { + writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), + VGABASE + 2*(MAX_XPOS*j + i)); + } + } + for (i = 0; i < MAX_XPOS; i++) + writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); + current_ypos = MAX_YPOS-1; + } + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(MAX_XPOS*current_ypos + + current_xpos++)); + if (current_xpos >= MAX_XPOS) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s,"0x",2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static int bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s,"ttyS",4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +struct console *early_console = &early_vga_console; +static int early_console_initialized = 0; + +void early_printk(const char *fmt, ...) +{ + char buf[512]; + int n; + va_list ap; + + va_start(ap,fmt); + n = vscnprintf(buf,512,fmt,ap); + early_console->write(early_console,buf,n); + va_end(ap); +} + +static int keep_early; + +int __init setup_early_printk(char *opt) +{ + char *space; + char buf[256]; + + if (early_console_initialized) + return -1; + + opt = strchr(opt, '=') + 1; + + strlcpy(buf,opt,sizeof(buf)); + space = strchr(buf, ' '); + if (space) + *space = 0; + + if (strstr(buf,"keep")) + keep_early = 1; + + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console = &early_serial_console; + } else if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf); + early_console = &early_serial_console; + } else if (!strncmp(buf, "vga", 3)) { + early_console = &early_vga_console; + } + early_console_initialized = 1; + register_console(early_console); + return 0; +} + +void __init disable_early_printk(void) +{ + if (!early_console_initialized || !early_console) + return; + if (!keep_early) { + printk("disabling early console\n"); + unregister_console(early_console); + early_console_initialized = 0; + } else { + printk("keeping early console\n"); + } +} + +__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S new file mode 100644 index 000000000000..e126284db7a8 --- /dev/null +++ b/arch/x86_64/kernel/entry.S @@ -0,0 +1,920 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + * + * $Id$ + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers upto R11. + * - full stack frame: Like partial stack frame, but all register saved. + * + * TODO: + * - schedule it carefully for the final hardware. + */ + +#define ASSEMBLY 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + .code64 + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli +#else +#define preempt_stop +#define retint_kernel retint_restore_args +#endif + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. + */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp + movq %gs:pda_oldrsp,\tmp + movq \tmp,RSP(%rsp) + movq $__USER_DS,SS(%rsp) + movq $__USER_CS,CS(%rsp) + movq $-1,RCX(%rsp) + movq R11(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp,offset=0 + movq RSP-\offset(%rsp),\tmp + movq \tmp,%gs:pda_oldrsp + movq EFLAGS-\offset(%rsp),\tmp + movq \tmp,R11-\offset(%rsp) + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorq %rax, %rax + pushq %rax /* ss */ + CFI_ADJUST_CFA_OFFSET 8 + pushq %rax /* rsp */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_OFFSET rip,0 + pushq $(1<<9) /* eflags - interrupts on */ + CFI_ADJUST_CFA_OFFSET 8 + pushq $__KERNEL_CS /* cs */ + CFI_ADJUST_CFA_OFFSET 8 + pushq \child_rip /* rip */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_OFFSET rip,0 + pushq %rax /* orig rax */ + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + + .macro CFI_DEFAULT_STACK + CFI_ADJUST_CFA_OFFSET (SS) + CFI_OFFSET r15,R15-SS + CFI_OFFSET r14,R14-SS + CFI_OFFSET r13,R13-SS + CFI_OFFSET r12,R12-SS + CFI_OFFSET rbp,RBP-SS + CFI_OFFSET rbx,RBX-SS + CFI_OFFSET r11,R11-SS + CFI_OFFSET r10,R10-SS + CFI_OFFSET r9,R9-SS + CFI_OFFSET r8,R8-SS + CFI_OFFSET rax,RAX-SS + CFI_OFFSET rcx,RCX-SS + CFI_OFFSET rdx,RDX-SS + CFI_OFFSET rsi,RSI-SS + CFI_OFFSET rdi,RDI-SS + CFI_OFFSET rsp,RSP-SS + CFI_OFFSET rip,RIP-SS + .endm +/* + * A newly forked process directly context switches into this. + */ +/* rdi: prev */ +ENTRY(ret_from_fork) + CFI_STARTPROC + CFI_DEFAULT_STACK + call schedule_tail + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace +rff_action: + RESTORE_REST + testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? + je int_ret_from_sys_call + testl $_TIF_IA32,threadinfo_flags(%rcx) + jnz int_ret_from_sys_call + RESTORE_TOP_OF_STACK %rdi,ARGOFFSET + jmp ret_from_sys_call +rff_trace: + movq %rsp,%rdi + call syscall_trace_leave + GET_THREAD_INFO(%rcx) + jmp rff_action + CFI_ENDPROC + +/* + * System call entry. Upto 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. + */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are off on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + */ + +ENTRY(system_call) + CFI_STARTPROC + swapgs + movq %rsp,%gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + sti + SAVE_ARGS 8,1 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + GET_THREAD_INFO(%rcx) + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) + jnz tracesys + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ + .globl ret_from_sys_call +ret_from_sys_call: + movl $_TIF_WORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + GET_THREAD_INFO(%rcx) + cli + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz sysret_careful + movq RIP-ARGOFFSET(%rsp),%rcx + RESTORE_ARGS 0,-ARG_SKIP,1 + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq + + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + sti + pushq %rdi + call schedule + popq %rdi + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + sti + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ + /* edx: work flags (arg3) */ + leaq do_notify_resume(%rip),%rax + leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call ptregscall_common +1: movl $_TIF_NEED_RESCHED,%edi + jmp sysret_check + + /* Do syscall tracing */ +tracesys: + SAVE_REST + movq $-ENOSYS,RAX(%rsp) + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $__NR_syscall_max,%rax + ja 1f + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) +1: SAVE_REST + movq %rsp,%rdi + call syscall_trace_leave + RESTORE_TOP_OF_STACK %rbx + RESTORE_REST + jmp ret_from_sys_call + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +ENTRY(int_ret_from_sys_call) + cli + testl $3,CS-ARGOFFSET(%rsp) + je retint_restore_args + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +int_with_check: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + jmp retint_swapgs + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + sti + pushq %rdi + call schedule + popq %rdi + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + sti + SAVE_REST + /* Check for syscall exit trace */ + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + jz int_signal + pushq %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq %rdi + btr $TIF_SYSCALL_TRACE,%edi + btr $TIF_SYSCALL_AUDIT,%edi + btr $TIF_SINGLESTEP,%edi + jmp int_restore_rest + +int_signal: + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_NEED_RESCHED,%edi +int_restore_rest: + RESTORE_REST + jmp int_with_check + CFI_ENDPROC + +/* + * Certain special system calls that need to save a complete full stack frame. + */ + + .macro PTREGSCALL label,func,arg + .globl \label +\label: + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ptregscall_common + .endm + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %r11, %r15 + FIXUP_TOP_OF_STACK %r11 + call *%rax + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + RESTORE_REST + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + ret + CFI_ENDPROC + +ENTRY(stub_execve) + CFI_STARTPROC + popq %r11 + CFI_ADJUST_CFA_OFFSET -8 + SAVE_REST + movq %r11, %r15 + FIXUP_TOP_OF_STACK %r11 + call sys_execve + GET_THREAD_INFO(%rcx) + bt $TIF_IA32,threadinfo_flags(%rcx) + jc exec_32bit + RESTORE_TOP_OF_STACK %r11 + movq %r15, %r11 + RESTORE_REST + push %r11 + ret + +exec_32bit: + CFI_ADJUST_CFA_OFFSET REST_SKIP + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): interrupt number */ + .macro interrupt func + CFI_STARTPROC simple + CFI_DEF_CFA rsp,(SS-RDI) + CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) + CFI_REL_OFFSET rip,(RIP-ORIG_RAX) + cld +#ifdef CONFIG_DEBUG_INFO + SAVE_ALL + movq %rsp,%rdi + /* + * Setup a stack frame pointer. This allows gdb to trace + * back to the original stack. + */ + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp +#else + SAVE_ARGS + leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler +#endif + testl $3,CS(%rdi) + je 1f + swapgs +1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count + movq %gs:pda_irqstackptr,%rax + cmoveq %rax,%rsp + pushq %rdi # save old stack + call \func + .endm + +ENTRY(common_interrupt) + interrupt do_IRQ + /* 0(%rsp): oldrsp-ARGOFFSET */ +ret_from_intr: + popq %rdi + cli + subl $1,%gs:pda_irqcount +#ifdef CONFIG_DEBUG_INFO + movq RBP(%rdi),%rbp +#endif + leaq ARGOFFSET(%rdi),%rsp +exit_intr: + GET_THREAD_INFO(%rcx) + testl $3,CS-ARGOFFSET(%rsp) + je retint_kernel + + /* Interrupt came from user space */ + /* + * Has a correct top of stack, but a partial stack frame + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + movl threadinfo_flags(%rcx),%edx + andl %edi,%edx + jnz retint_careful +retint_swapgs: + cli + swapgs +retint_restore_args: + cli + RESTORE_ARGS 0,8,0 +iret_label: + iretq + + .section __ex_table,"a" + .quad iret_label,bad_iret + .previous + .section .fixup,"ax" + /* force a signal here? this matches i386 behaviour */ + /* running with kernel gs */ +bad_iret: + movq $-9999,%rdi /* better code? */ + jmp do_exit + .previous + + /* edi: workmask, edx: work */ +retint_careful: + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + sti + pushq %rdi + call schedule + popq %rdi + GET_THREAD_INFO(%rcx) + cli + jmp retint_check + +retint_signal: + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_swapgs + sti + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorq %rsi,%rsi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + cli + movl $_TIF_NEED_RESCHED,%edi + GET_THREAD_INFO(%rcx) + jmp retint_check + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ + .p2align +retint_kernel: + cmpl $0,threadinfo_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp exit_intr +#endif + CFI_ENDPROC + +/* + * APIC interrupts. + */ + .macro apicinterrupt num,func + pushq $\num-256 + interrupt \func + jmp ret_from_intr + CFI_ENDPROC + .endm + +ENTRY(thermal_interrupt) + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt + +#ifdef CONFIG_SMP +ENTRY(reschedule_interrupt) + apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt + +ENTRY(invalidate_interrupt) + apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt + +ENTRY(call_function_interrupt) + apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +ENTRY(apic_timer_interrupt) + apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt + +ENTRY(error_interrupt) + apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt + +ENTRY(spurious_interrupt) + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +#endif + +/* + * Exception entry points. + */ + .macro zeroentry sym + pushq $0 /* push error code/oldrax */ + pushq %rax /* push real oldrax to the rdi slot */ + leaq \sym(%rip),%rax + jmp error_entry + .endm + + .macro errorentry sym + pushq %rax + leaq \sym(%rip),%rax + jmp error_entry + .endm + + /* error code is on the stack already */ + /* handle NMI like exceptions that can happen everywhere */ + .macro paranoidentry sym + SAVE_ALL + cld + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f + swapgs + xorl %ebx,%ebx +1: movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi + movq $-1,ORIG_RAX(%rsp) + call \sym + .endm + +/* + * Exception entry point. This expects an error code/orig_rax on the stack + * and the exception handler in %rax. + */ +ENTRY(error_entry) + CFI_STARTPROC simple + CFI_DEF_CFA rsp,(SS-RDI) + CFI_REL_OFFSET rsp,(RSP-RDI) + CFI_REL_OFFSET rip,(RIP-RDI) + /* rdi slot contains rax, oldrax contains error code */ + cld + subq $14*8,%rsp + CFI_ADJUST_CFA_OFFSET (14*8) + movq %rsi,13*8(%rsp) + CFI_REL_OFFSET rsi,RSI + movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + movq %rdx,12*8(%rsp) + CFI_REL_OFFSET rdx,RDX + movq %rcx,11*8(%rsp) + CFI_REL_OFFSET rcx,RCX + movq %rsi,10*8(%rsp) /* store rax */ + CFI_REL_OFFSET rax,RAX + movq %r8, 9*8(%rsp) + CFI_REL_OFFSET r8,R8 + movq %r9, 8*8(%rsp) + CFI_REL_OFFSET r9,R9 + movq %r10,7*8(%rsp) + CFI_REL_OFFSET r10,R10 + movq %r11,6*8(%rsp) + CFI_REL_OFFSET r11,R11 + movq %rbx,5*8(%rsp) + CFI_REL_OFFSET rbx,RBX + movq %rbp,4*8(%rsp) + CFI_REL_OFFSET rbp,RBP + movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12,R12 + movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13,R13 + movq %r14,1*8(%rsp) + CFI_REL_OFFSET r14,R14 + movq %r15,(%rsp) + CFI_REL_OFFSET r15,R15 + xorl %ebx,%ebx + testl $3,CS(%rsp) + je error_kernelspace +error_swapgs: + swapgs +error_sti: + movq %rdi,RDI(%rsp) + movq %rsp,%rdi + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) + call *%rax + /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +error_exit: + movl %ebx,%eax + RESTORE_REST + cli + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + movl threadinfo_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + swapgs + RESTORE_ARGS 0,8,0 + iretq + CFI_ENDPROC + +error_kernelspace: + incl %ebx + /* There are two places in the kernel that can potentially fault with + usergs. Handle them here. The exception handlers after + iret run with kernel gs again, so don't set the user space flag. + B stepping K8s sometimes report an truncated RIP for IRET + exceptions returning to compat mode. Check for these here too. */ + leaq iret_label(%rip),%rbp + cmpq %rbp,RIP(%rsp) + je error_swapgs + movl %ebp,%ebp /* zero extend */ + cmpq %rbp,RIP(%rsp) + je error_swapgs + cmpq $gs_change,RIP(%rsp) + je error_swapgs + jmp error_sti + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(load_gs_index) + pushf + cli + swapgs +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + swapgs + popf + ret + + .section __ex_table,"a" + .align 8 + .quad gs_change,bad_gs + .previous + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + swapgs /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* + * Create a kernel thread. + * + * C extern interface: + * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + * + * asm input arguments: + * rdi: fn, rsi: arg, rdx: flags + */ +ENTRY(kernel_thread) + CFI_STARTPROC + FAKE_STACK_FRAME $child_rip + SAVE_ALL + + # rdi: flags, rsi: usp, rdx: will be &pt_regs + movq %rdx,%rdi + orq kernel_thread_flags(%rip),%rdi + movq $-1, %rsi + movq %rsp, %rdx + + xorl %r8d,%r8d + xorl %r9d,%r9d + + # clone now + call do_fork + movq %rax,RAX(%rsp) + xorl %edi,%edi + + /* + * It isn't worth to check for reschedule here, + * so internally to the x86_64 port you can rely on kernel_thread() + * not to reschedule the child before returning, this avoids the need + * of hacks for example to fork off the per-CPU idle tasks. + * [Hopefully no generic code relies on the reschedule -AK] + */ + RESTORE_ALL + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC + + +child_rip: + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + movq %rdi, %rax + movq %rsi, %rdi + call *%rax + # exit + xorq %rdi, %rdi + call do_exit + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. + * + * C extern interface: + * extern long execve(char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, fake frame on the stack + */ +ENTRY(execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + je int_ret_from_sys_call + RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC + +ENTRY(page_fault) + errorentry do_page_fault + +ENTRY(coprocessor_error) + zeroentry do_coprocessor_error + +ENTRY(simd_coprocessor_error) + zeroentry do_simd_coprocessor_error + +ENTRY(device_not_available) + zeroentry math_state_restore + + /* runs on exception stack */ +ENTRY(debug) + CFI_STARTPROC + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_debug + /* switch back to process stack to restore the state ptrace touched */ + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + + /* runs on exception stack */ +ENTRY(nmi) + CFI_STARTPROC + pushq $-1 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_nmi + /* ebx: no swapgs flag */ +paranoid_exit: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore +paranoid_swapgs: + cli + swapgs +paranoid_restore: + RESTORE_ALL 8 + iretq +paranoid_userspace: + cli + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%edx + testl $_TIF_NEED_RESCHED,%edx + jnz paranoid_resched + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jnz paranoid_signal + jmp paranoid_swapgs +paranoid_resched: + sti + call schedule + jmp paranoid_exit +paranoid_signal: + sti + xorl %esi,%esi /* oldset */ + movq %rsp,%rdi /* &pt_regs */ + call do_notify_resume + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(int3) + zeroentry do_int3 + +ENTRY(overflow) + zeroentry do_overflow + +ENTRY(bounds) + zeroentry do_bounds + +ENTRY(invalid_op) + zeroentry do_invalid_op + +ENTRY(coprocessor_segment_overrun) + zeroentry do_coprocessor_segment_overrun + +ENTRY(reserved) + zeroentry do_reserved + + /* runs on exception stack */ +ENTRY(double_fault) + CFI_STARTPROC + paranoidentry do_double_fault + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(invalid_TSS) + errorentry do_invalid_TSS + +ENTRY(segment_not_present) + errorentry do_segment_not_present + + /* runs on exception stack */ +ENTRY(stack_segment) + CFI_STARTPROC + paranoidentry do_stack_segment + movq %rax,%rsp + testl $3,CS(%rsp) + jnz paranoid_userspace + jmp paranoid_exit + CFI_ENDPROC + +ENTRY(general_protection) + errorentry do_general_protection + +ENTRY(alignment_check) + errorentry do_alignment_check + +ENTRY(divide_error) + zeroentry do_divide_error + +ENTRY(spurious_interrupt_bug) + zeroentry do_spurious_interrupt_bug + +#ifdef CONFIG_X86_MCE + /* runs on exception stack */ +ENTRY(machine_check) + CFI_STARTPROC + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_machine_check + jmp paranoid_exit + CFI_ENDPROC +#endif + +ENTRY(call_debug) + zeroentry do_call_debug + diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c new file mode 100644 index 000000000000..d2c42fb99cb3 --- /dev/null +++ b/arch/x86_64/kernel/genapic.c @@ -0,0 +1,89 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +extern struct genapic apic_cluster; +extern struct genapic apic_flat; + +struct genapic *genapic = &apic_flat; + + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init clustered_apic_check(void) +{ + long i; + u8 clusters, max_cluster; + u8 id; + u8 cluster_cnt[NUM_APIC_CLUSTERS]; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + /* AMD always uses flat mode right now */ + genapic = &apic_flat; + goto print; + } + + memset(cluster_cnt, 0, sizeof(cluster_cnt)); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + cluster_cnt[APIC_CLUSTERID(id)]++; + } + + clusters = 0; + max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (cluster_cnt[i] > 0) { + ++clusters; + if (cluster_cnt[i] > max_cluster) + max_cluster = cluster_cnt[i]; + } + } + + /* + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical + * else physical mode. + * (We don't use lowest priority delivery + HW APIC IRQ steering, so + * can ignore the clustered logical case and go straight to physical.) + */ + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + genapic = &apic_flat; + else + genapic = &apic_cluster; + +print: + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +} + +/* Same for both flat and clustered. */ + +void send_IPI_self(int vector) +{ + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c new file mode 100644 index 000000000000..9703da7202e3 --- /dev/null +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -0,0 +1,130 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. + * (A more realistic maximum is around 230 CPUs.) + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void cluster_init_apic_ldr(void) +{ + unsigned long val, id; + long i, count; + u8 lid; + u8 my_id = hard_smp_processor_id(); + u8 my_cluster = APIC_CLUSTER(my_id); + + /* Create logical APIC IDs by counting CPUs already in cluster. */ + for (count = 0, i = NR_CPUS; --i >= 0; ) { + lid = x86_cpu_to_log_apicid[i]; + if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) + ++count; + } + /* + * We only have a 4 wide bitmap in cluster mode. There's no way + * to get above 60 CPUs and still give each one it's own bit. + * But, we're using physical IRQ delivery, so we don't care. + * Use bit 3 for the 4th through Nth CPU in each cluster. + */ + if (count >= XAPIC_DEST_CPUS_SHIFT) + count = 3; + id = my_cluster | (1UL << count); + x86_cpu_to_log_apicid[smp_processor_id()] = id; + apic_write_around(APIC_DFR, APIC_DFR_CLUSTER); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write_around(APIC_LDR, val); +} + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t cluster_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static void cluster_send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static void cluster_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + cluster_send_IPI_mask(mask, vector); +} + +static void cluster_send_IPI_all(int vector) +{ + cluster_send_IPI_mask(cpu_online_map, vector); +} + +static int cluster_apic_id_registered(void) +{ + return 1; +} + +static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return x86_cpu_to_apicid[cpu]; + else + return BAD_APICID; +} + +/* cpuid returns the value latched in the HW at reset, not the APIC ID + * register's value. For any box whose BIOS changes APIC IDs, like + * clustered APIC systems, we must use hard_smp_processor_id. + * + * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. + */ +static unsigned int phys_pkg_id(int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +struct genapic apic_cluster = { + .name = "clustered", + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, + .target_cpus = cluster_target_cpus, + .apic_id_registered = cluster_apic_id_registered, + .init_apic_ldr = cluster_init_apic_ldr, + .send_IPI_all = cluster_send_IPI_all, + .send_IPI_allbutself = cluster_send_IPI_allbutself, + .send_IPI_mask = cluster_send_IPI_mask, + .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c new file mode 100644 index 000000000000..b4cbbad04226 --- /dev/null +++ b/arch/x86_64/kernel/genapic_flat.c @@ -0,0 +1,127 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Flat APIC subarch code. Maximum 8 CPUs, logical delivery. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static cpumask_t flat_target_cpus(void) +{ + return cpu_online_map; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void flat_init_apic_ldr(void) +{ + unsigned long val; + unsigned long num, id; + + num = smp_processor_id(); + id = 1UL << num; + x86_cpu_to_log_apicid[num] = id; + apic_write_around(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write_around(APIC_LDR, val); +} + +static void flat_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. + * thus we have to avoid sending IPIs in this case. + */ + if (num_online_cpus() > 1) + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_all(int vector) +{ + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void flat_send_IPI_mask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned long cfg; + unsigned long flags; + + local_save_flags(flags); + local_irq_disable(); + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + apic_write_around(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); + local_irq_restore(flags); +} + +static int flat_apic_id_registered(void) +{ + return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map); +} + +static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) +{ + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + u32 ebx; + + ebx = cpuid_ebx(1); + return ((ebx >> 24) & 0xFF) >> index_msb; +} + +struct genapic apic_flat = { + .name = "flat", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, + .target_cpus = flat_target_cpus, + .apic_id_registered = flat_apic_id_registered, + .init_apic_ldr = flat_init_apic_ldr, + .send_IPI_all = flat_send_IPI_all, + .send_IPI_allbutself = flat_send_IPI_allbutself, + .send_IPI_mask = flat_send_IPI_mask, + .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S new file mode 100644 index 000000000000..b6d8725c1f61 --- /dev/null +++ b/arch/x86_64/kernel/head.S @@ -0,0 +1,396 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * + * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ + */ + + +#include +#include +#include +#include +#include +#include +#include + +/* we are not able to switch in one step to the final KERNEL ADRESS SPACE + * because we need identity-mapped pages on setup so define __START_KERNEL to + * 0x100000 for this stage + * + */ + + .text + .code32 + .globl startup_32 +/* %bx: 1 if coming from smp trampoline on secondary cpu */ +startup_32: + + /* + * At this point the CPU runs in 32bit protected mode (CS.D = 1) with + * paging disabled and the point of this file is to switch to 64bit + * long mode with a kernel mapping for kerneland to jump into the + * kernel virtual addresses. + * There is no stack until we set one up. + */ + + /* Initialize the %ds segment register */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + lgdt pGDT32 - __START_KERNEL_map + + /* If the CPU doesn't support CPUID this will double fault. + * Unfortunately it is hard to check for CPUID without a stack. + */ + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe no_long_mode + /* Check if long mode is implemented */ + mov $0x80000001, %eax + cpuid + btl $29, %edx + jnc no_long_mode + + /* + * Prepare for entering 64bits mode + */ + + /* Enable PAE mode */ + xorl %eax, %eax + btsl $5, %eax + movl %eax, %cr4 + + /* Setup early boot stage 4 level pagetables */ + movl $(init_level4_pgt - __START_KERNEL_map), %eax + movl %eax, %cr3 + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + /* Enable Long Mode */ + btsl $_EFER_LME, %eax + + /* Make changes effective */ + wrmsr + + xorl %eax, %eax + btsl $31, %eax /* Enable paging and in turn activate Long Mode */ + btsl $0, %eax /* Enable protected mode */ + /* Make changes effective */ + movl %eax, %cr0 + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) + + .code64 + .org 0x100 + .globl startup_64 +startup_64: + /* We come here either from startup_32 + * or directly from a 64bit bootloader. + * Since we may have come directly from a bootloader we + * reload the page tables here. + */ + + /* Enable PAE mode and PGE */ + xorq %rax, %rax + btsq $5, %rax + btsq $7, %rax + movq %rax, %cr4 + + /* Setup early boot stage 4 level pagetables. */ + movq $(init_level4_pgt - __START_KERNEL_map), %rax + movq %rax, %cr3 + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + /* Enable System Call */ + btsl $_EFER_SCE, %eax + + /* No Execute supported? */ + btl $20,%edi + jnc 1f + btsl $_EFER_NX, %eax +1: + /* Make changes effective */ + wrmsr + + /* Setup cr0 */ + xorq %rax, %rax + btsq $31, %rax /* Enable paging */ + btsq $0, %rax /* Enable protected mode */ + btsq $1, %rax /* Enable MP */ + btsq $4, %rax /* Enable ET */ + btsq $5, %rax /* Enable NE */ + btsq $16, %rax /* Enable WP */ + btsq $18, %rax /* Enable AM */ + /* Make changes effective */ + movq %rax, %cr0 + + /* Setup a boot time stack */ + movq init_rsp(%rip),%rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt cpu_gdt_descr + + /* + * Setup up a dummy PDA. this is just for some early bootup code + * that does in_interrupt() + */ + movl $MSR_GS_BASE,%ecx + movq $empty_zero_page,%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + /* set up data segments. actually 0 would do too */ + movl $__KERNEL_DS,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* esi is pointer to real mode structure with interesting info. + pass it to C */ + movl %esi, %edi + + /* Finally jump to run C code and to be on real kernel address + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address , this is only possible as indirect + * jump + */ + movq initial_code(%rip),%rax + jmp *%rax + + /* SMP bootup changes these two */ + .globl initial_code +initial_code: + .quad x86_64_start_kernel + .globl init_rsp +init_rsp: + .quad init_thread_union+THREAD_SIZE-8 + +ENTRY(early_idt_handler) + xorl %eax,%eax + movq 8(%rsp),%rsi # get rip + movq (%rsp),%rdx + movq %cr2,%rcx + leaq early_idt_msg(%rip),%rdi + call early_printk +1: hlt + jmp 1b + +early_idt_msg: + .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" + +.code32 +ENTRY(no_long_mode) + /* This isn't an x86-64 CPU so hang */ +1: + jmp 1b + +.org 0xf00 + .globl pGDT32 +pGDT32: + .word gdt_end-cpu_gdt_table + .long cpu_gdt_table-__START_KERNEL_map + +.org 0xf10 +ljumpvector: + .long startup_64-__START_KERNEL_map + .word __KERNEL_CS + +ENTRY(stext) +ENTRY(_stext) + + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ +.org 0x1000 +ENTRY(init_level4_pgt) + .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000010a007 + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000103007 /* -> level3_kernel_pgt */ + +.org 0x2000 +ENTRY(level3_ident_pgt) + .quad 0x0000000000104007 + .fill 511,8,0 + +.org 0x3000 +ENTRY(level3_kernel_pgt) + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .fill 1,8,0 + +.org 0x4000 +ENTRY(level2_ident_pgt) + /* 40MB for bootup. */ + .quad 0x0000000000000283 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ + .globl temp_boot_pmds +temp_boot_pmds: + .fill 492,8,0 + +.org 0x5000 +ENTRY(level2_kernel_pgt) + /* 40MB kernel mapping. The kernel code cannot be bigger than that. + When you change this change KERNEL_TEXT_SIZE in page.h too. */ + /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ + .quad 0x0000000000000183 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Module mapping starts here */ + .fill 492,8,0 + +.org 0x6000 +ENTRY(empty_zero_page) + +.org 0x7000 +ENTRY(empty_bad_page) + +.org 0x8000 +ENTRY(empty_bad_pte_table) + +.org 0x9000 +ENTRY(empty_bad_pmd_table) + +.org 0xa000 +ENTRY(level3_physmem_pgt) + .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + + .org 0xb000 +#ifdef CONFIG_ACPI_SLEEP +ENTRY(wakeup_level4_pgt) + .quad 0x0000000000102007 /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000010a007 + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000103007 /* -> level3_kernel_pgt */ +#endif + + .data + + .align 16 + .globl cpu_gdt_descr +cpu_gdt_descr: + .word gdt_end-cpu_gdt_table +gdt: + .quad cpu_gdt_table +#ifdef CONFIG_SMP + .rept NR_CPUS-1 + .word 0 + .quad 0 + .endr +#endif + +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ + +.align L1_CACHE_BYTES + +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ + +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ + .quad 0x00cffa000000ffff /* __USER32_CS */ + .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affa000000ffff /* __USER_CS */ + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0,0 /* TSS */ + .quad 0,0 /* LDT */ + .quad 0,0,0 /* three TLS descriptors */ + .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */ + /* base must be patched for real base address. */ +gdt_end: + /* asm/segment.h:GDT_ENTRIES must match this */ + /* This should be a multiple of the cache line size */ + /* GDTs of other CPUs: */ + .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) + + .align L1_CACHE_BYTES +ENTRY(idt_table) + .rept 256 + .quad 0 + .quad 0 + .endr + diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c new file mode 100644 index 000000000000..6cad46c98a23 --- /dev/null +++ b/arch/x86_64/kernel/head64.c @@ -0,0 +1,117 @@ +/* + * linux/arch/x86_64/kernel/head64.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * + * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + extern char __bss_start[], __bss_end[]; + memset(__bss_start, 0, + (unsigned long) __bss_end - (unsigned long) __bss_start); +} + +extern char x86_boot_params[2048]; + +#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ +#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC 0xA33F +#define OLD_CL_BASE_ADDR 0x90000 +#define OLD_CL_OFFSET 0x90022 + +extern char saved_command_line[]; + +static void __init copy_bootdata(char *real_mode_data) +{ + int new_data; + char * command_line; + + memcpy(x86_boot_params, real_mode_data, 2048); + new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + if (!new_data) { + if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + printk("so old bootloader that it does not support commandline?!\n"); + return; + } + new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + printk("old bootloader convention, maybe loadlin?\n"); + } + command_line = (char *) ((u64)(new_data)); + memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); + printk("Bootdata ok (command line is %s)\n", saved_command_line); +} + +static void __init setup_boot_cpu_data(void) +{ + unsigned int dummy, eax; + + /* get vendor info */ + cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, + (unsigned int *)&boot_cpu_data.x86_vendor_id[0], + (unsigned int *)&boot_cpu_data.x86_vendor_id[8], + (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); + + /* get cpu type */ + cpuid(1, &eax, &dummy, &dummy, + (unsigned int *) &boot_cpu_data.x86_capability); + boot_cpu_data.x86 = (eax >> 8) & 0xf; + boot_cpu_data.x86_model = (eax >> 4) & 0xf; + boot_cpu_data.x86_mask = eax & 0xf; +} + +extern char _end[]; + +void __init x86_64_start_kernel(char * real_mode_data) +{ + char *s; + int i; + + for (i = 0; i < 256; i++) + set_intr_gate(i, early_idt_handler); + asm volatile("lidt %0" :: "m" (idt_descr)); + clear_bss(); + pda_init(0); + copy_bootdata(real_mode_data); +#ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); +#endif + /* default console: */ + if (!strstr(saved_command_line, "console=")) + strcat(saved_command_line, " console=tty0"); + s = strstr(saved_command_line, "earlyprintk="); + if (s != NULL) + setup_early_printk(s); +#ifdef CONFIG_DISCONTIGMEM + s = strstr(saved_command_line, "numa="); + if (s != NULL) + numa_setup(s+5); +#endif +#ifdef CONFIG_X86_IO_APIC + if (strstr(saved_command_line, "disableapic")) + disable_apic = 1; +#endif + /* You need early console to see that */ + if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) + panic("Kernel too big for kernel mapping\n"); + + setup_boot_cpu_data(); + start_kernel(); +} diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c new file mode 100644 index 000000000000..ba139cac57ce --- /dev/null +++ b/arch/x86_64/kernel/i387.c @@ -0,0 +1,155 @@ +/* + * linux/arch/x86_64/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + * + * x86-64 rework 2002 Andi Kleen. + * Does direct fxsave in and out of user space now for signal handlers. + * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, + * the 64bit user space sees a FXSAVE frame directly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned int mxcsr_feature_mask = 0xffffffff; + +void mxcsr_feature_mask_init(void) +{ + unsigned int mask; + clts(); + memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) mask = 0x0000ffbf; + mxcsr_feature_mask &= mask; + stts(); +} + +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __init fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} + +void init_fpu(struct task_struct *child) +{ + if (tsk_used_math(child)) { + if (child == current) + unlazy_fpu(child); + return; + } + memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + child->thread.i387.fxsave.cwd = 0x37f; + child->thread.i387.fxsave.mxcsr = 0x1f80; + /* only the device not available exception or ptrace can call init_fpu */ + set_stopped_child_used_math(child); +} + +/* + * Signal frame handlers. + */ + +int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + { + extern void bad_user_i387_struct(void); + if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) + bad_user_i387_struct(); + } + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n",buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (tsk->thread_info->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *)buf); + if (err) return err; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * ptrace request handlers. + */ + +int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) +{ + init_fpu(tsk); + return __copy_to_user(buf, &tsk->thread.i387.fxsave, + sizeof(struct user_i387_struct)) ? -EFAULT : 0; +} + +int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) +{ + if (__copy_from_user(&tsk->thread.i387.fxsave, buf, + sizeof(struct user_i387_struct))) + return -EFAULT; + return 0; +} + +/* + * FPU state for core dumps. + */ + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + struct task_struct *tsk = current; + + if (!used_math()) + return 0; + + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); + return 1; +} + +int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) +{ + int fpvalid = !!tsk_used_math(tsk); + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); +} + return fpvalid; +} diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c new file mode 100644 index 000000000000..d52701246010 --- /dev/null +++ b/arch/x86_64/kernel/i8259.c @@ -0,0 +1,579 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Common place to define all x86 IRQ vectors + * + * This builds up the IRQ handler stubs using some ugly macros in irq.h + * + * These macros create the low-level assembly IRQ routines that save + * register context and call do_IRQ(). do_IRQ() then does all the + * operations that are needed to keep the AT (or SMP IOAPIC) + * interrupt-controller happy. + */ + +#define BI(x,y) \ + BUILD_IRQ(x##y) + +#define BUILD_16_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) BI(x,e) BI(x,f) + +#define BUILD_14_IRQS(x) \ + BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ + BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ + BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ + BI(x,c) BI(x,d) + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x20-0x2f) + */ +BUILD_16_IRQS(0x0) + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) +BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) +BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) + +#ifdef CONFIG_PCI_MSI + BUILD_14_IRQS(0xe) +#endif + +#endif + +#undef BUILD_16_IRQS +#undef BUILD_14_IRQS +#undef BI + + +#define IRQ(x,y) \ + IRQ##x##y##_interrupt + +#define IRQLIST_16(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) + +#define IRQLIST_14(x) \ + IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ + IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ + IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ + IRQ(x,c), IRQ(x,d) + +void (*interrupt[NR_IRQS])(void) = { + IRQLIST_16(0x0), + +#ifdef CONFIG_X86_IO_APIC + IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), + IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), + IRQLIST_16(0xc), IRQLIST_16(0xd) + +#ifdef CONFIG_PCI_MSI + , IRQLIST_14(0xe) +#endif + +#endif +}; + +#undef IRQ +#undef IRQLIST_16 +#undef IRQLIST_14 + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ + +DEFINE_SPINLOCK(i8259A_lock); + +static void end_8259A_irq (unsigned int irq) +{ + if (irq > 256) { + char var; + printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info); + + BUG(); + } + + if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && + irq_desc[irq].action) + enable_8259A_irq(irq); +} + +#define shutdown_8259A_irq disable_8259A_irq + +static void mask_and_ack_8259A(unsigned int); + +static unsigned int startup_8259A_irq(unsigned int irq) +{ + enable_8259A_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type i8259A_irq_type = { + "XT-PIC", + startup_8259A_irq, + shutdown_8259A_irq, + enable_8259A_irq, + disable_8259A_irq, + mask_and_ack_8259A, + end_8259A_irq, + NULL +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +static unsigned int cached_irq_mask = 0xffff; + +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21 (__byte(0,cached_irq_mask)) +#define cached_A1 (__byte(1,cached_irq_mask)) + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,0xA0); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecesserily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(0xA1); /* DUMMY - (do we need this?) */ + outb(cached_A1,0xA1); + outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ + outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + } else { + inb(0x21); /* DUMMY - (do we need this?) */ + outb(cached_21,0x21); + outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. + */ + outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ + outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) + outb_p(0x03, 0x21); /* master does Auto EOI */ + else + outb_p(0x01, 0x21); /* master expects normal EOI */ + + outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ + outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ + outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode + is to be investigated) */ + + if (auto_eoi) + /* + * in AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_irq_type.ack = disable_8259A_irq; + else + i8259A_irq_type.ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_21, 0x21); /* restore master IRQ mask */ + outb(cached_A1, 0xA1); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(0); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, u32 state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .suspend = i8259A_suspend, + .resume = i8259A_resume, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ + +static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + irq_desc[i].handler = &i8259A_irq_type; + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].handler = &no_irq_type; + } + } +} + +void apic_timer_interrupt(void); +void spurious_interrupt(void); +void error_interrupt(void); +void reschedule_interrupt(void); +void call_function_interrupt(void); +void invalidate_interrupt(void); +void thermal_interrupt(void); +void i8254_timer_resume(void); + +static void setup_timer(void) +{ + outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff , 0x40); /* LSB */ + udelay(10); + outb(LATCH >> 8 , 0x40); /* MSB */ +} + +static int timer_resume(struct sys_device *dev) +{ + setup_timer(); + return 0; +} + +void i8254_timer_resume(void) +{ + setup_timer(); +} + +static struct sysdev_class timer_sysclass = { + set_kset_name("timer"), + .resume = timer_resume, +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timer_sysclass, +}; + +static int __init init_timer_sysfs(void) +{ + int error = sysdev_class_register(&timer_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(init_timer_sysfs); + +void __init init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) { + set_intr_gate(vector, interrupt[i]); + } + } + +#ifdef CONFIG_SMP + /* + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ + set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); +#endif + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + +#ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + + /* + * Set the clock to HZ Hz, we already have a valid + * vector now: + */ + setup_timer(); + + if (!acpi_ioapic) + setup_irq(2, &irq2); +} diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c new file mode 100644 index 000000000000..c4dc91491157 --- /dev/null +++ b/arch/x86_64/kernel/init_task.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial task structure. + * + * We need to make sure that this is 8192-byte aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; + +#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c new file mode 100644 index 000000000000..adebe70c180e --- /dev/null +++ b/arch/x86_64/kernel/io_apic.c @@ -0,0 +1,1982 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define __apicdebuginit __init + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static DEFINE_SPINLOCK(ioapic_lock); + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + short apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_pin_list *entry = irq_2_pin + irq; \ + \ + for (;;) { \ + unsigned int reg; \ + pin = entry->pin; \ + if (pin == -1) \ + break; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + if (!entry->next) \ + break; \ + entry = irq_2_pin + entry->next; \ + } \ + FINAL; \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) + /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) + /* mask = 0 */ + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; +int ioapic_force; + +/* dummy parsing: see setup.c */ + +static int __init disable_ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +static int __init enable_ioapic_setup(char *str) +{ + ioapic_force = 1; + skip_ioapic_setup = 0; + return 1; +} + +__setup("noapic", disable_ioapic_setup); +__setup("apic", enable_ioapic_setup); + +#include +#include +#include + +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC + off. Check for an Nvidia or VIA PCI bridge and turn it off. + Use pci direct infrastructure because this runs before the PCI subsystem. + + Can be overwritten with "apic" + + And another hack to disable the IOMMU on VIA chipsets. + + Kludge-O-Rama. */ +void __init check_ioapic(void) +{ + int num,slot,func; + if (ioapic_force) + return; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + switch (vendor) { + case PCI_VENDOR_ID_VIA: +#ifdef CONFIG_GART_IOMMU + if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || + force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); + iommu_aperture_disabled = 1; + } +#endif + return; + case PCI_VENDOR_ID_NVIDIA: +#ifdef CONFIG_ACPI + /* All timer overrides on Nvidia + seem to be wrong. Skip them. */ + acpi_skip_timer_override = 1; + printk(KERN_INFO + "Nvidia board detected. Ignoring ACPI timer override.\n"); +#endif + /* RED-PEN skip them on mptables too? */ + return; + } + + /* No multi-function device? */ + type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; + +int assign_irq_vector(int irq) +{ + static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + + BUG_ON(irq >= NR_IRQ_VECTORS); + if (IO_APIC_VECTOR(irq) > 0) + return IO_APIC_VECTOR(irq); +next: + current_vector += 8; + if (current_vector == IA32_SYSCALL_VECTOR) + goto next; + + if (current_vector >= FIRST_SYSTEM_VECTOR) { + offset++; + if (!(offset%8)) + return -ENOSPC; + current_vector = FIRST_DEVICE_VECTOR + offset; + } + + vector_irq[current_vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = current_vector; + + return current_vector; +} + +extern void (*interrupt[NR_IRQS])(void); +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + if (use_pci_vector() && !platform_legacy_irq(irq)) { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[vector].handler = &ioapic_level_type; + else + irq_desc[vector].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[vector]); + } else { + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[irq].handler = &ioapic_level_type; + else + irq_desc[irq].handler = &ioapic_edge_type; + set_intr_gate(vector, interrupt[irq]); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + } + + irq = pin_2_irq(idx, apic, pin); + add_pin_to_irq(irq, apic, pin); + + if (!apic && !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE," not connected.\n"); +} + +/* + * Set up the 8259A-master output pin as broadcast to all + * CPUs. + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].handler = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +void __init UNEXPECTED_IO_APIC(void) +{ +} + +void __apicdebuginit print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) && + (reg_01.bits.entries != 0x03) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + if (reg_01.bits.version >= 0x10) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +#if 0 + +static __apicdebuginit void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void __apicdebuginit print_PIC(void) +{ + extern spinlock_t i8259A_lock; + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (i = 0; i < nr_ioapics; i++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(i, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[i] = reg_01.bits.entries+1; + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + disconnect_bsp_APIC(); +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc (void) +{ + union IO_APIC_reg_00 reg_00; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + + printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE," ok.\n"); + } +} + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (jiffies - t1 > 4) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs. + */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + unsigned long flags; + unsigned int dest; + + dest = cpu_mask_to_apicid(mask); + + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __DO_ACTION(1, = dest, ) + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +static struct hw_interrupt_type ioapic_edge_type = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static struct hw_interrupt_type ioapic_level_type = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, + .set_affinity = set_ioapic_affinity, +}; + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. */ + irq_desc[irq].handler = &no_irq_type; + } + } +} + +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq, +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + printk(KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(NULL); + + printk(" done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(0, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(0, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void check_timer(void) +{ + int pin1, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + pin2 = find_isa_irq_pin(0, mp_ExtINT); + + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + check_nmi_watchdog(); + } + return; + } + clear_IO_APIC_pin(0, pin1); + apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + check_nmi_watchdog(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(0, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].handler = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, " works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_printk(APIC_VERBOSE," failed.\n"); + + apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_VERBOSE," works.\n"); + return; + } + apic_printk(APIC_VERBOSE," failed :(.\n"); + panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +} + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1<<2) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + + /* + * Set up the IO-APIC IRQ routing table. + */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, u32 state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +#define IO_APIC_MAX_ID 0xFE + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = phys_cpu_present_map; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= IO_APIC_MAX_ID) { + apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (physid_isset(apic_id, apic_id_map)) { + + for (i = 0; i < IO_APIC_MAX_ID; i++) { + if (!physid_isset(i, apic_id_map)) + break; + } + + if (i == IO_APIC_MAX_ID) + panic("Max apic_id exceeded!\n"); + + apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + physid_set(apic_id, apic_id_map); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) + panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); + } + + apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; /* Disabled (masked) */ + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /*CONFIG_ACPI_BOOT*/ + + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c new file mode 100644 index 000000000000..cde0e868efe3 --- /dev/null +++ b/arch/x86_64/kernel/ioport.c @@ -0,0 +1,117 @@ +/* + * linux/arch/x86_64/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + int i; + if (new_value) + for (i = base; i < base + extent; i++) + __set_bit(i, bitmap); + else + for (i = base; i < base + extent; i++) + clear_bit(i, bitmap); +} + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + unsigned int i, max_long, bytes, bytes_updated; + struct thread_struct * t = ¤t->thread; + struct tss_struct * tss; + unsigned long *bitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + } + + /* + * do it in the per-thread copy and in the TSS ... + * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + unsigned int old = (regs->eflags >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); + return 0; +} diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c new file mode 100644 index 000000000000..62b112e4deb4 --- /dev/null +++ b/arch/x86_64/kernel/irq.c @@ -0,0 +1,108 @@ +/* + * linux/arch/x86_64/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include + +atomic_t irq_err_count; +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif + +/* + * Generic, controller-independent functions: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for (j=0; jtypename); + + seq_printf(p, " %s", action->name); + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif +#endif + } + return 0; +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +asmlinkage unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bits used in ret_from_ code */ + unsigned irq = regs->orig_rax & 0xff; + + irq_enter(); + BUG_ON(irq > 256); + + __do_IRQ(irq, regs); + irq_exit(); + + return 1; +} + diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c new file mode 100644 index 000000000000..4f2a852299b6 --- /dev/null +++ b/arch/x86_64/kernel/kprobes.c @@ -0,0 +1,631 @@ +/* + * Kernel Probes (KProbes) + * arch/x86_64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston and Prasanna S Panchamukhi + * adapted for x86_64 + * 2005-Mar Roland McGrath + * Fixed to handle %rip-relative addressing mode correctly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static DECLARE_MUTEX(kprobe_mutex); + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe; +static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags; +static struct pt_regs jprobe_saved_regs; +static long *jprobe_saved_rsp; +static kprobe_opcode_t *get_insn_slot(void); +static void free_insn_slot(kprobe_opcode_t *slot); +void jprobe_return_end(void); + +/* copy of the kernel stack at the probe fire time */ +static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static inline int is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) + return 1; + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86_64. */ + up(&kprobe_mutex); + p->ainsn.insn = get_insn_slot(); + down(&kprobe_mutex); + if (!p->ainsn.insn) { + return -ENOMEM; + } + return 0; +} + +/* + * Determine if the instruction uses the %rip-relative addressing mode. + * If it does, return the address of the 32-bit displacement word. + * If not, return null. + */ +static inline s32 *is_riprel(u8 *insn) +{ +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 64)) + static const u64 onebyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ + W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ + W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ + W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ + W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ + W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ + W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ + W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; + static const u64 twobyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ + W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ + W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ + W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ + W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ + W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ + W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ + W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ + W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if ((*insn & 0xf0) == 0x40) + ++insn; + + if (*insn == 0x0f) { /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, twobyte_has_modrm); + } else { /* One-byte opcode. */ + need_modrm = test_bit(*insn, onebyte_has_modrm); + } + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + return (s32 *) ++insn; + } + } + + /* No %rip-relative addressing mode here. */ + return NULL; +} + +void arch_copy_kprobe(struct kprobe *p) +{ + s32 *ripdisp; + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); + ripdisp = is_riprel(p->ainsn.insn); + if (ripdisp) { + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *ripdisp = disp; + } +} + +void arch_remove_kprobe(struct kprobe *p) +{ + up(&kprobe_mutex); + free_insn_slot(p->ainsn.insn); + down(&kprobe_mutex); +} + +static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + *p->addr = p->opcode; + regs->rip = (unsigned long)p->addr; +} + +static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + /*single step inline if the instruction is an int3*/ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->rip = (unsigned long)p->addr; + else + regs->rip = (unsigned long)p->ainsn.insn; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +int kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); + + /* We're in an interrupt, but this is clear and BUG()-safe. */ + preempt_disable(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + /* We *are* holding lock here, so this is safe. + Disarm the probe we just hit, and ignore it. */ + p = get_kprobe(addr); + if (p) { + if (kprobe_status == KPROBE_HIT_SS) { + regs->eflags &= ~TF_MASK; + regs->eflags |= kprobe_saved_rflags; + unlock_kprobes(); + goto no_kprobe; + } + disarm_kprobe(p, regs); + ret = 1; + } else { + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + /* If it's not ours, can't be delete race, (we hold lock). */ + goto no_kprobe; + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + current_kprobe = p; + kprobe_saved_rflags = kprobe_old_rflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->ainsn.insn)) + kprobe_saved_rflags &= ~IF_MASK; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: + prepare_singlestep(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new rip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed eflags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long *tos = (unsigned long *)regs->rsp; + unsigned long next_rip = 0; + unsigned long copy_rip = (unsigned long)p->ainsn.insn; + unsigned long orig_rip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (*insn >= 0x40 && *insn <= 0x4f) + insn++; + + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(TF_MASK | IF_MASK); + *tos |= kprobe_old_rflags; + break; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_rip + (*tos - copy_rip); + break; + case 0xff: + if ((*insn & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; rip is correct. */ + next_rip = regs->rip; + *tos = orig_rip + (*tos - copy_rip); + } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + /* rip is correct. */ + next_rip = regs->rip; + } + break; + case 0xea: /* jmp absolute -- rip is correct */ + next_rip = regs->rip; + break; + default: + break; + } + + regs->eflags &= ~TF_MASK; + if (next_rip) { + regs->rip = next_rip; + } else { + regs->rip = orig_rip + (regs->rip - copy_rip); + } +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. And we hold kprobe lock. + */ +int post_kprobe_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->post_handler) + current_kprobe->post_handler(current_kprobe, regs, 0); + + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_saved_rflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +/* Interrupts disabled, kprobe_lock held. */ +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (current_kprobe->fault_handler + && current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + regs->eflags |= kprobe_old_rflags; + + unlock_kprobes(); + preempt_enable_no_resched(); + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_GPF: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + break; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + + jprobe_saved_regs = *regs; + jprobe_saved_rsp = (long *) regs->rsp; + addr = (unsigned long)jprobe_saved_rsp; + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->rip = (unsigned long)(jp->entry); + return 1; +} + +void jprobe_return(void) +{ + preempt_enable_no_resched(); + asm volatile (" xchg %%rbx,%%rsp \n" + " int3 \n" + " .globl jprobe_return_end \n" + " jprobe_return_end: \n" + " nop \n"::"b" + (jprobe_saved_rsp):"memory"); +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + u8 *addr = (u8 *) (regs->rip - 1); + unsigned long stack_addr = (unsigned long)jprobe_saved_rsp; + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if ((long *)regs->rsp != jprobe_saved_rsp) { + struct pt_regs *saved_regs = + container_of(jprobe_saved_rsp, struct pt_regs, rsp); + printk("current rsp %p does not match saved rsp %p\n", + (long *)regs->rsp, jprobe_saved_rsp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + return 1; + } + return 0; +} + +/* + * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped. + * By default on x86_64, pages we get from kmalloc or vmalloc are not + * executable. Single-stepping an instruction on such a page yields an + * oops. So instead of storing the instruction copies in their respective + * kprobe objects, we allocate a page, map it executable, and store all the + * instruction copies there. (We can allocate additional pages if somebody + * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE + * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t) + * bytes. + */ +#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t))) +struct kprobe_insn_page { + struct hlist_node hlist; + kprobe_opcode_t *insns; /* page of instruction slots */ + char slot_used[INSNS_PER_PAGE]; + int nused; +}; + +static struct hlist_head kprobe_insn_pages; + +/** + * get_insn_slot() - Find a slot on an executable page for an instruction. + * We allocate an executable page if there's no room on existing ones. + */ +static kprobe_opcode_t *get_insn_slot(void) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->nused < INSNS_PER_PAGE) { + int i; + for (i = 0; i < INSNS_PER_PAGE; i++) { + if (!kip->slot_used[i]) { + kip->slot_used[i] = 1; + kip->nused++; + return kip->insns + (i*MAX_INSN_SIZE); + } + } + /* Surprise! No unused slots. Fix kip->nused. */ + kip->nused = INSNS_PER_PAGE; + } + } + + /* All out of space. Need to allocate a new page. Use slot 0.*/ + kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + if (!kip) { + return NULL; + } + + /* + * For the %rip-relative displacement fixups to be doable, we + * need our instruction copy to be within +/- 2GB of any data it + * might access via %rip. That is, within 2GB of where the + * kernel image and loaded module images reside. So we allocate + * a page in the module loading area. + */ + kip->insns = module_alloc(PAGE_SIZE); + if (!kip->insns) { + kfree(kip); + return NULL; + } + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, &kprobe_insn_pages); + memset(kip->slot_used, 0, INSNS_PER_PAGE); + kip->slot_used[0] = 1; + kip->nused = 1; + return kip->insns; +} + +/** + * free_insn_slot() - Free instruction slot obtained from get_insn_slot(). + */ +static void free_insn_slot(kprobe_opcode_t *slot) +{ + struct kprobe_insn_page *kip; + struct hlist_node *pos; + + hlist_for_each(pos, &kprobe_insn_pages) { + kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + if (kip->insns <= slot + && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) { + int i = (slot - kip->insns) / MAX_INSN_SIZE; + kip->slot_used[i] = 0; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. + */ + hlist_del(&kip->hlist); + if (hlist_empty(&kprobe_insn_pages)) { + INIT_HLIST_NODE(&kip->hlist); + hlist_add_head(&kip->hlist, + &kprobe_insn_pages); + } else { + module_free(NULL, kip->insns); + kfree(kip); + } + } + return; + } + } +} diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c new file mode 100644 index 000000000000..d7e5d0cf4285 --- /dev/null +++ b/arch/x86_64/kernel/ldt.c @@ -0,0 +1,253 @@ +/* + * linux/arch/x86_64/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) +{ + void *oldldt; + void *newldt; + unsigned oldsize; + + if (mincount <= (unsigned)pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + wmb(); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + + preempt_disable(); + mask = cpumask_of_cpu(smp_processor_id()); + load_LDT(pc); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * + * Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + /* Arbitrary number */ + /* x86-64 default LDT is all zeros */ + if (bytecount > 128) + bytecount = 128; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct task_struct *me = current; + struct mm_struct * mm = me->mm; + __u32 entry_1, entry_2, *lp; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, bytecount)) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= (unsigned)mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + *lp = entry_1; + *(lp+1) = entry_2; + error = 0; + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c new file mode 100644 index 000000000000..86f9fd85016a --- /dev/null +++ b/arch/x86_64/kernel/mce.c @@ -0,0 +1,548 @@ +/* + * Machine check handler. + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MISC_MCELOG_MINOR 227 +#define NR_BANKS 5 + +static int mce_dont_init; + +/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, + 3: never panic or exit (for testing only) */ +static int tolerant = 1; +static int banks; +static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; +static unsigned long console_logged; +static int notify_user; + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +struct mce_log mcelog = { + MCE_LOG_SIGNATURE, + MCE_LOG_LEN, +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + mce->finished = 0; + smp_wmb(); + for (;;) { + entry = rcu_dereference(mcelog.next); + /* When the buffer fills up discard new entries. Assume + that the earlier errors are the more interesting. */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, &mcelog.flags); + return; + } + /* Old left over entry. Skip. */ + if (mcelog.entry[entry].finished) + continue; + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + smp_wmb(); + mcelog.entry[entry].finished = 1; + smp_wmb(); + + if (!test_and_set_bit(0, &console_logged)) + notify_user = 1; +} + +static void print_mce(struct mce *m) +{ + printk(KERN_EMERG "\n" + KERN_EMERG + "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + m->cpu, m->mcgstatus, m->bank, m->status); + if (m->rip) { + printk(KERN_EMERG + "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->rip); + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->rip); + printk("\n"); + } + printk(KERN_EMERG "TSC %Lx ", m->tsc); + if (m->addr) + printk("ADDR %Lx ", m->addr); + if (m->misc) + printk("MISC %Lx ", m->misc); + printk("\n"); +} + +static void mce_panic(char *msg, struct mce *backup, unsigned long start) +{ + int i; + oops_begin(); + for (i = 0; i < MCE_LOG_LEN; i++) { + unsigned long tsc = mcelog.entry[i].tsc; + if (time_before(tsc, start)) + continue; + print_mce(&mcelog.entry[i]); + if (backup && mcelog.entry[i].tsc == backup->tsc) + backup = NULL; + } + if (backup) + print_mce(backup); + if (tolerant >= 3) + printk("Fake panic: %s\n", msg); + else + panic(msg); +} + +static int mce_available(struct cpuinfo_x86 *c) +{ + return test_bit(X86_FEATURE_MCE, &c->x86_capability) && + test_bit(X86_FEATURE_MCA, &c->x86_capability); +} + +/* + * The actual machine check handler + */ + +void do_machine_check(struct pt_regs * regs, long error_code) +{ + struct mce m, panicm; + int nowayout = (tolerant < 1); + int kill_it = 0; + u64 mcestart = 0; + int i; + int panicm_found = 0; + + if (regs) + notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL); + if (!banks) + return; + + memset(&m, 0, sizeof(struct mce)); + m.cpu = hard_smp_processor_id(); + rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + rdtscll(mcestart); + barrier(); + + for (i = 0; i < banks; i++) { + if (!bank[i]) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + m.tsc = 0; + + rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + if (m.status & MCI_STATUS_EN) { + /* In theory _OVER could be a nowayout too, but + assume any overflowed errors were no fatal. */ + nowayout |= !!(m.status & MCI_STATUS_PCC); + kill_it |= !!(m.status & MCI_STATUS_UC); + } + + if (m.status & MCI_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc); + if (m.status & MCI_STATUS_ADDRV) + rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr); + + if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) { + m.rip = regs->rip; + m.cs = regs->cs; + } else { + m.rip = 0; + m.cs = 0; + } + + if (error_code != -1) + rdtscll(m.tsc); + wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); + mce_log(&m); + + /* Did this bank cause the exception? */ + /* Assume that the bank with uncorrectable errors did it, + and that there is only a single one. */ + if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) { + panicm = m; + panicm_found = 1; + } + + tainted |= TAINT_MACHINE_CHECK; + } + + /* Never do anything final in the polling timer */ + if (!regs) + goto out; + + /* If we didn't find an uncorrectable error, pick + the last one (shouldn't happen, just being safe). */ + if (!panicm_found) + panicm = m; + if (nowayout) + mce_panic("Machine check", &panicm, mcestart); + if (kill_it) { + int user_space = 0; + + if (m.mcgstatus & MCG_STATUS_RIPV) + user_space = panicm.rip && (panicm.cs & 3); + + /* When the machine was in user space and the CPU didn't get + confused it's normally not necessary to panic, unless you + are paranoid (tolerant == 0) + + RED-PEN could be more tolerant for MCEs in idle, + but most likely they occur at boot anyways, where + it is best to just halt the machine. */ + if ((!user_space && (panic_on_oops || tolerant < 2)) || + (unsigned)current->pid <= 1) + mce_panic("Uncorrected machine check", &panicm, mcestart); + + /* do_exit takes an awful lot of locks and has as + slight risk of deadlocking. If you don't want that + don't set tolerant >= 2 */ + if (tolerant < 3) + do_exit(SIGBUS); + } + + out: + /* Last thing done in the machine check exception to clear state. */ + wrmsrl(MSR_IA32_MCG_STATUS, 0); +} + +/* + * Periodic polling timer for "silent" machine check errors. + */ + +static int check_interval = 5 * 60; /* 5 minutes */ +static void mcheck_timer(void *data); +static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); + +static void mcheck_check_cpu(void *info) +{ + if (mce_available(¤t_cpu_data)) + do_machine_check(NULL, 0); +} + +static void mcheck_timer(void *data) +{ + on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + schedule_delayed_work(&mcheck_work, check_interval * HZ); + + /* + * It's ok to read stale data here for notify_user and + * console_logged as we'll simply get the updated versions + * on the next mcheck_timer execution and atomic operations + * on console_logged act as synchronization for notify_user + * writes. + */ + if (notify_user && console_logged) { + notify_user = 0; + clear_bit(0, &console_logged); + printk(KERN_INFO "Machine check events logged\n"); + } +} + + +static __init int periodic_mcheck_init(void) +{ + if (check_interval) + schedule_delayed_work(&mcheck_work, check_interval*HZ); + return 0; +} +__initcall(periodic_mcheck_init); + + +/* + * Initialize Machine Checks for a CPU. + */ +static void mce_init(void *dummy) +{ + u64 cap; + int i; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + banks = cap & 0xff; + if (banks > NR_BANKS) { + printk(KERN_INFO "MCE: warning: using only %d banks\n", banks); + banks = NR_BANKS; + } + + /* Log the machine checks left over from the previous reset. + This also clears all registers */ + do_machine_check(NULL, -1); + + set_in_cr4(X86_CR4_MCE); + + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < banks; i++) { + wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + } +} + +/* Add per CPU specific workarounds here */ +static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { + /* disable GART TBL walk error reporting, which trips off + incorrectly with the IOMMU & 3ware & Cerberus. */ + clear_bit(10, &bank[4]); + } +} + +static void __init mce_cpu_features(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + break; + default: + break; + } +} + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off. + */ +void __init mcheck_init(struct cpuinfo_x86 *c) +{ + static cpumask_t mce_cpus __initdata = CPU_MASK_NONE; + + mce_cpu_quirks(c); + + if (mce_dont_init || + cpu_test_and_set(smp_processor_id(), mce_cpus) || + !mce_available(c)) + return; + + mce_init(NULL); + mce_cpu_features(c); +} + +/* + * Character device to read and clear the MCE log. + */ + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) +{ + unsigned long cpu_tsc[NR_CPUS]; + static DECLARE_MUTEX(mce_read_sem); + unsigned next; + char __user *buf = ubuf; + int i, err; + + down(&mce_read_sem); + next = rcu_dereference(mcelog.next); + + /* Only supports full reads right now */ + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { + up(&mce_read_sem); + return -EINVAL; + } + + err = 0; + for (i = 0; i < next; i++) { + if (!mcelog.entry[i].finished) + continue; + smp_rmb(); + err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce)); + buf += sizeof(struct mce); + } + + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + + synchronize_kernel(); + + /* Collect entries that were still getting written before the synchronize. */ + + on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + for (i = next; i < MCE_LOG_LEN; i++) { + if (mcelog.entry[i].finished && + mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { + err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce)); + smp_rmb(); + buf += sizeof(struct mce); + memset(&mcelog.entry[i], 0, sizeof(struct mce)); + } + } + up(&mce_read_sem); + return err ? -EFAULT : buf - ubuf; +} + +static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) +{ + int __user *p = (int __user *)arg; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static struct file_operations mce_chrdev_ops = { + .read = mce_read, + .ioctl = mce_ioctl, +}; + +static struct miscdevice mce_log_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +/* + * Old style boot options parsing. Only for compatibility. + */ + +static int __init mcheck_disable(char *str) +{ + mce_dont_init = 1; + return 0; +} + +/* mce=off disables machine check. Note you can reenable it later + using sysfs */ +static int __init mcheck_enable(char *str) +{ + if (!strcmp(str, "off")) + mce_dont_init = 1; + else + printk("mce= argument %s ignored. Please use /sys", str); + return 0; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); + +/* + * Sysfs support + */ + +/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */ +static int mce_resume(struct sys_device *dev) +{ + on_each_cpu(mce_init, NULL, 1, 1); + return 0; +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + if (check_interval) + cancel_delayed_work(&mcheck_work); + /* Timer race is harmless here */ + on_each_cpu(mce_init, NULL, 1, 1); + if (check_interval) + schedule_delayed_work(&mcheck_work, check_interval*HZ); +} + +static struct sysdev_class mce_sysclass = { + .resume = mce_resume, + set_kset_name("machinecheck"), +}; + +static struct sys_device device_mce = { + .id = 0, + .cls = &mce_sysclass, +}; + +/* Why are there no generic functions for this? */ +#define ACCESSOR(name, var, start) \ + static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + return sprintf(buf, "%lx\n", (unsigned long)var); \ + } \ + static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + char *end; \ + unsigned long new = simple_strtoul(buf, &end, 0); \ + if (end == buf) return -EINVAL; \ + var = new; \ + start; \ + return end-buf; \ + } \ + static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); + +ACCESSOR(bank0ctl,bank[0],mce_restart()) +ACCESSOR(bank1ctl,bank[1],mce_restart()) +ACCESSOR(bank2ctl,bank[2],mce_restart()) +ACCESSOR(bank3ctl,bank[3],mce_restart()) +ACCESSOR(bank4ctl,bank[4],mce_restart()) +ACCESSOR(tolerant,tolerant,) +ACCESSOR(check_interval,check_interval,mce_restart()) + +static __init int mce_init_device(void) +{ + int err; + if (!mce_available(&boot_cpu_data)) + return -EIO; + err = sysdev_class_register(&mce_sysclass); + if (!err) + err = sysdev_register(&device_mce); + if (!err) { + /* could create per CPU objects, but it is not worth it. */ + sysdev_create_file(&device_mce, &attr_bank0ctl); + sysdev_create_file(&device_mce, &attr_bank1ctl); + sysdev_create_file(&device_mce, &attr_bank2ctl); + sysdev_create_file(&device_mce, &attr_bank3ctl); + sysdev_create_file(&device_mce, &attr_bank4ctl); + sysdev_create_file(&device_mce, &attr_tolerant); + sysdev_create_file(&device_mce, &attr_check_interval); + } + + misc_register(&mce_log_device); + return err; + +} +device_initcall(mce_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c new file mode 100644 index 000000000000..4db9a640069f --- /dev/null +++ b/arch/x86_64/kernel/mce_intel.c @@ -0,0 +1,99 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo + */ + +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(unsigned long, next_check); + +asmlinkage void smp_thermal_interrupt(void) +{ + struct mce m; + + ack_APIC_irq(); + + irq_enter(); + if (time_before(jiffies, __get_cpu_var(next_check))) + goto done; + + __get_cpu_var(next_check) = jiffies + HZ*300; + memset(&m, 0, sizeof(m)); + m.cpu = smp_processor_id(); + m.bank = MCE_THERMAL_BANK; + rdtscll(m.tsc); + rdmsrl(MSR_IA32_THERM_STATUS, m.status); + if (m.status & 0x1) { + printk(KERN_EMERG + "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); + } + + mce_log(&m); +done: + irq_exit(); +} + +static void __init intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; + + if (!cpu_has(c, X86_FEATURE_ACC)) + return; + + /* first check if TM1 is already enabled by the BIOS, in which + * case there might be some SMM goo which handles it, so we can't even + * put a handler since it might be delivered via SMI already. + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", cpu, (h & APIC_VECTOR_MASK)); + return; + } + + h = THERMAL_APIC_VECTOR; + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); + apic_write_around(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + + l = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + return; +} + +void __init mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); +} diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c new file mode 100644 index 000000000000..c2ffea8845ed --- /dev/null +++ b/arch/x86_64/kernel/module.c @@ -0,0 +1,166 @@ +/* Kernel module help for x86-64 + Copyright (C) 2001 Rusty Russell. + Copyright (C) 2002,2003 Andi Kleen, SuSE Labs. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DEBUGP(fmt...) + +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); +} + +void *module_alloc(unsigned long size) +{ + struct vm_struct *area; + + if (!size) + return NULL; + size = PAGE_ALIGN(size); + if (size > MODULES_LEN) + return NULL; + + area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); + if (!area) + return NULL; + + return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + printk(KERN_ERR "module %s: Unknown rela relocation: %Lu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} + +int apply_relocate(Elf_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk("non add relocation not supported\n"); + return -ENOSYS; +} + +extern void apply_alternatives(void *start, void *end); + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + /* look for .altinstructions to patch */ + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + void *seg; + if (strcmp(".altinstructions", secstrings + s->sh_name)) + continue; + seg = (void *)s->sh_addr; + apply_alternatives(seg, seg + s->sh_size); + } + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ +} diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c new file mode 100644 index 000000000000..7ec031c6ca10 --- /dev/null +++ b/arch/x86_64/kernel/mpparse.c @@ -0,0 +1,949 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000 Ingo Molnar + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +int acpi_found_madt; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; + +static int mp_current_pci_id = 0; +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; +int pic_mode; +unsigned long mp_lapic_addr = 0; + + + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_id = -1U; +/* Internal processor count */ +static unsigned int num_processors = 0; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; + +/* ACPI MADT entry parsing functions */ +#ifdef CONFIG_ACPI_BOOT +extern struct acpi_boot_flags acpi_boot; +#ifdef CONFIG_X86_LOCAL_APIC +extern int acpi_parse_lapic (acpi_table_entry_header *header); +extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); +extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); +#endif /*CONFIG_X86_LOCAL_APIC*/ +#ifdef CONFIG_X86_IO_APIC +extern int acpi_parse_ioapic (acpi_table_entry_header *header); +#endif /*CONFIG_X86_IO_APIC*/ +#endif /*CONFIG_ACPI_BOOT*/ + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + +/* + * Intel MP BIOS table parsing routines: + */ + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +static void __init MP_processor_info (struct mpc_config_processor *m) +{ + int ver; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, + (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, + m->mpc_apicver); + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_id = m->mpc_apicid; + } + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + num_processors++; + + if (m->mpc_apicid > MAX_APICS) { + printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", + m->mpc_apicid, MAX_APICS); + return; + } + ver = m->mpc_apicver; + + physid_set(m->mpc_apicid, phys_cpu_present_map); + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + Dprintk("Bus #%d is %s\n", m->mpc_busid, str); + + if (strncmp(str, "ISA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, "EISA", 4) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, "PCI", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, "MCA", 3) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else { + printk(KERN_ERR "Unknown bustype %s\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk("I/O APIC #%d Version %d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk("SMP mptable: bad signature [%c%c%c%c]!\n", + mpc->mpc_signature[0], + mpc->mpc_signature[1], + mpc->mpc_signature[2], + mpc->mpc_signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk("SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(str,mpc->mpc_oem,8); + str[8]=0; + printk(KERN_INFO "OEM ID: %s ",str); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk(KERN_INFO "Product ID: %s ",str); + + printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); + + /* save the local APIC address, it might be non-default */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + } + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI may be used to obtain the entire SMP configuration or just to + * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc((void *)(unsigned long)mpf->mpf_physptr)) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + extern void __bad_mpf_size(void); + unsigned int *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + __bad_mpf_size(); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) + reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE); + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_intel_smp (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + */ + + address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + if (smp_scan_config(address, 0x1000)) + return; + + /* If we have come this far, we did not find an MP table */ + printk(KERN_INFO "No mptable found.\n"); +} + +/* + * - Intel MP Configuration Table + */ +void __init find_smp_config (void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + find_intel_smp(); +#endif +} + + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI_BOOT + +void __init mp_register_lapic_address ( + u64 address) +{ + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_id == -1U) + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + + +void __init mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (id >= MAX_APICS) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = 0x10; /* TBD: lapic version */ + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_start; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_start) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic IRQ lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI IRQs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_start = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_start, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } + + return; +} + +int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + +#ifdef CONFIG_ACPI_BUS + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; +#endif + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1< +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Note: "err" is handled in a funny way below. Otherwise one version + of gcc or another breaks. */ + +static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx) +{ + int err; + + asm volatile ("1: wrmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" " .quad 1b,3b\n" ".previous":"=&bDS" (err) + :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx) +{ + int err; + + asm volatile ("1: rdmsr\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: movl %4,%0\n" + " jmp 2b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 1b,3b\n" + ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx) + :"c"(reg), "i"(-EIO), "0"(0)); + + return err; +} + +#ifdef CONFIG_SMP + +struct msr_command { + int cpu; + int err; + u32 reg; + u32 data[2]; +}; + +static void msr_smp_wrmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]); +} + +static void msr_smp_rdmsr(void *cmd_block) +{ + struct msr_command *cmd = (struct msr_command *)cmd_block; + + if (cmd->cpu == smp_processor_id()) + cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]); +} + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = wrmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + cmd.data[0] = eax; + cmd.data[1] = edx; + + smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); + ret = cmd.err; + } + preempt_enable(); + return ret; +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx) +{ + struct msr_command cmd; + int ret; + + preempt_disable(); + if (cpu == smp_processor_id()) { + ret = rdmsr_eio(reg, eax, edx); + } else { + cmd.cpu = cpu; + cmd.reg = reg; + + smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); + + *eax = cmd.data[0]; + *edx = cmd.data[1]; + + ret = cmd.err; + } + preempt_enable(); + return ret; +} + +#else /* ! CONFIG_SMP */ + +static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) +{ + return wrmsr_eio(reg, eax, edx); +} + +static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) +{ + return rdmsr_eio(reg, eax, edx); +} + +#endif /* ! CONFIG_SMP */ + +static loff_t msr_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret = -EINVAL; + + lock_kernel(); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + } + unlock_kernel(); + return ret; +} + +static ssize_t msr_read(struct file *file, char __user * buf, + size_t count, loff_t * ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + err = do_rdmsr(cpu, reg, &data[0], &data[1]); + if (err) + return err; + if (copy_to_user(tmp, &data, 8)) + return -EFAULT; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + size_t rv; + u32 reg = *ppos; + int cpu = iminor(file->f_dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (rv = 0; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) + return -EFAULT; + err = do_wrmsr(cpu, reg, data[0], data[1]); + if (err) + return err; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = msr_seek, + .read = msr_read, + .write = msr_write, + .open = msr_open, +}; + +static int __init msr_init(void) +{ + if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + printk(KERN_ERR "msr: unable to get major %d for msr\n", + MSR_MAJOR); + return -EBUSY; + } + + return 0; +} + +static void __exit msr_exit(void) +{ + unregister_chrdev(MSR_MAJOR, "cpu/msr"); +} + +module_init(msr_init); +module_exit(msr_exit) + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c new file mode 100644 index 000000000000..d9867de6a626 --- /dev/null +++ b/arch/x86_64/kernel/nmi.c @@ -0,0 +1,488 @@ +/* + * linux/arch/x86_64/nmi.c + * + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: + * - it may be reserved by some other driver, or not + * - when not reserved by some other driver, it may be used for + * the NMI watchdog, or not + * + * This is maintained separately from nmi_active because the NMI + * watchdog may also be driven from the I/O APIC timer. + */ +static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static unsigned int lapic_nmi_owner; +#define LAPIC_NMI_WATCHDOG (1<<0) +#define LAPIC_NMI_RESERVED (1<<1) + +/* nmi_active: + * +1: the lapic NMI watchdog is active, but can be disabled + * 0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * -1: the lapic NMI watchdog is disabled, but can be enabled + */ +int nmi_active; /* oprofile uses this */ +int panic_on_timeout; + +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; +unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +/* Run after command line and cpu_init init, but before all other checks */ +void __init nmi_watchdog_default(void) +{ + if (nmi_watchdog != NMI_DEFAULT) + return; + + /* For some reason the IO APIC watchdog doesn't work on the AMD + 8111 chipset. For now switch to local APIC mode using + perfctr0 there. On Intel CPUs we don't have code to handle + the perfctr and the IO-APIC seems to work, so use that. */ + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + nmi_watchdog = NMI_LOCAL_APIC; + printk(KERN_INFO + "Using local APIC NMI watchdog using perfctr0\n"); + } else { + printk(KERN_INFO "Using IO APIC NMI watchdog\n"); + nmi_watchdog = NMI_IO_APIC; + } +} + +/* Why is there no CPUID flag for this? */ +static __init int cpu_has_lapic(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + case X86_VENDOR_AMD: + return boot_cpu_data.x86 >= 6; + /* .... add more cpus here or find a different way to figure this out. */ + default: + return 0; + } +} + +int __init check_nmi_watchdog (void) +{ + int counts[NR_CPUS]; + int cpu; + + if (nmi_watchdog == NMI_LOCAL_APIC && !cpu_has_lapic()) { + nmi_watchdog = NMI_NONE; + return -1; + } + + printk(KERN_INFO "testing NMI watchdog ... "); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + counts[cpu] = cpu_pda[cpu].__nmi_count; + local_irq_enable(); + mdelay((10*1000)/nmi_hz); // wait 10 ticks + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_SMP + /* Check cpu_callin_map here because that is set + after the timer is started. */ + if (!cpu_isset(cpu, cpu_callin_map)) + continue; +#endif + if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) { + printk("CPU#%d: NMI appears to be stuck (%d)!\n", + cpu, + cpu_pda[cpu].__nmi_count); + nmi_active = 0; + lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; + return -1; + } + } + printk("OK.\n"); + + /* now that we know it works we can reduce NMI frequency to + something more reasonable; makes a difference in some configs */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = 1; + + return 0; +} + +int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + if (!strncmp(str,"panic",5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID) + return 0; + nmi_watchdog = nmi; + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + +static void disable_lapic_nmi_watchdog(void) +{ + if (nmi_active <= 0) + return; + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + wrmsr(MSR_K7_EVNTSEL0, 0, 0); + break; + case X86_VENDOR_INTEL: + wrmsr(MSR_IA32_EVNTSEL0, 0, 0); + break; + } + nmi_active = -1; + /* tell do_nmi() and others that we're not active any more */ + nmi_watchdog = 0; +} + +static void enable_lapic_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_LOCAL_APIC; + setup_apic_nmi_watchdog(); + } +} + +int reserve_lapic_nmi(void) +{ + unsigned int old_owner; + + spin_lock(&lapic_nmi_owner_lock); + old_owner = lapic_nmi_owner; + lapic_nmi_owner |= LAPIC_NMI_RESERVED; + spin_unlock(&lapic_nmi_owner_lock); + if (old_owner & LAPIC_NMI_RESERVED) + return -EBUSY; + if (old_owner & LAPIC_NMI_WATCHDOG) + disable_lapic_nmi_watchdog(); + return 0; +} + +void release_lapic_nmi(void) +{ + unsigned int new_owner; + + spin_lock(&lapic_nmi_owner_lock); + new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; + lapic_nmi_owner = new_owner; + spin_unlock(&lapic_nmi_owner_lock); + if (new_owner & LAPIC_NMI_WATCHDOG) + enable_lapic_nmi_watchdog(); +} + +void disable_timer_nmi_watchdog(void) +{ + if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + return; + + disable_irq(0); + unset_nmi_callback(); + nmi_active = -1; + nmi_watchdog = NMI_NONE; +} + +void enable_timer_nmi_watchdog(void) +{ + if (nmi_active < 0) { + nmi_watchdog = NMI_IO_APIC; + touch_nmi_watchdog(); + nmi_active = 1; + enable_irq(0); + } +} + +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, u32 state) +{ + nmi_pm_active = nmi_active; + disable_lapic_nmi_watchdog(); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + if (nmi_pm_active > 0) + enable_lapic_nmi_watchdog(); + return 0; +} + +static struct sysdev_class nmi_sysclass = { + set_kset_name("lapic_nmi"), + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. + */ + +static void setup_k7_watchdog(void) +{ + int i; + unsigned int evntsel; + + /* No check, so can start with slow frequency */ + nmi_hz = 1; + + /* XXX should check these in EFER */ + + nmi_perfctr_msr = MSR_K7_PERFCTR0; + + for(i = 0; i < 4; ++i) { + /* Simulator may not support it */ + if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) + return; + wrmsrl(MSR_K7_PERFCTR0+i, 0UL); + } + + evntsel = K7_EVNTSEL_INT + | K7_EVNTSEL_OS + | K7_EVNTSEL_USR + | K7_NMI_EVENT; + + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz*1000) / nmi_hz); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); +} + +void setup_apic_nmi_watchdog(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 < 6) + return; + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + setup_k7_watchdog(); + break; + default: + return; + } + lapic_nmi_owner = LAPIC_NMI_WATCHDOG; + nmi_active = 1; +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up + * here too!] + */ + +static unsigned int + last_irq_sums [NR_CPUS], + alert_counter [NR_CPUS]; + +void touch_nmi_watchdog (void) +{ + int i; + + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for (i = 0; i < NR_CPUS; i++) + alert_counter[i] = 0; +} + +void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +{ + int sum, cpu; + + cpu = safe_smp_processor_id(); + sum = read_pda(apic_timer_irqs); + if (last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*nmi_hz) { + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + alert_counter[cpu] = 0; + return; + } + die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs); + } + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } + if (nmi_perfctr_msr) + wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); +} + +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + +asmlinkage void do_nmi(struct pt_regs * regs, long error_code) +{ + int cpu = safe_smp_processor_id(); + + nmi_enter(); + add_pda(__nmi_count,1); + if (!nmi_callback(regs, cpu)) + default_do_nmi(regs); + nmi_exit(); +} + +void set_nmi_callback(nmi_callback_t callback) +{ + nmi_callback = callback; +} + +void unset_nmi_callback(void) +{ + nmi_callback = dummy_nmi_callback; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + if (!(reason & 0xc0)) { + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf,regs); + } + return 0; +} + +/* + * proc handler for /proc/sys/kernel/unknown_nmi_panic + */ +int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + old_state = unknown_nmi_panic; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!unknown_nmi_panic) + return 0; + + if (unknown_nmi_panic) { + if (reserve_lapic_nmi() < 0) { + unknown_nmi_panic = 0; + return -EBUSY; + } else { + set_nmi_callback(unknown_nmi_panic_callback); + } + } else { + release_lapic_nmi(); + unset_nmi_callback(); + } + return 0; +} + +#endif + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(reserve_lapic_nmi); +EXPORT_SYMBOL(release_lapic_nmi); +EXPORT_SYMBOL(disable_timer_nmi_watchdog); +EXPORT_SYMBOL(enable_timer_nmi_watchdog); +EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c new file mode 100644 index 000000000000..cab471cf3edb --- /dev/null +++ b/arch/x86_64/kernel/pci-dma.c @@ -0,0 +1,60 @@ +/* + * Dynamic DMA mapping support. + */ + +#include +#include +#include +#include +#include +#include + +/* Map a set of buffers described by scatterlist in streaming + * mode for DMA. This is the scatter-gather version of the + * above pci_map_single interface. Here the scatter gather list + * elements are each tagged with the appropriate dma address + * and length. They are obtained via sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for pci_map_single are + * the same here. + */ +int dma_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction) +{ + int i; + + BUG_ON(direction == DMA_NONE); + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nents; +} + +EXPORT_SYMBOL(dma_map_sg); + +/* Unmap a set of streaming mode DMA translations. + * Again, cpu read rules concerning calls here are the same as for + * pci_unmap_single() above. + */ +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + BUG_ON(s->page == NULL); + BUG_ON(s->dma_address == 0); + dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +EXPORT_SYMBOL(dma_unmap_sg); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c new file mode 100644 index 000000000000..57f35c68aa34 --- /dev/null +++ b/arch/x86_64/kernel/pci-gart.c @@ -0,0 +1,980 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +dma_addr_t bad_dma_address; + +unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +u32 *iommu_gatt_base; /* Remapping table */ + +int no_iommu; +static int no_agp; +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow = 1; +int force_iommu = 1; +#else +int panic_on_overflow = 0; +int force_iommu = 0; +#endif +int iommu_merge = 1; +int iommu_sac_force = 0; + +/* If this is disabled the IOMMU will use an optimized flushing strategy + of only flushing when an mapping is reused. With it true the GART is flushed + for every mapping. Problem is that doing the lazy flush seems to trigger + bugs with some popular PCI cards, in particular 3ware (but has been also + also seen with Qlogic at least). */ +int iommu_fullflush = 1; + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge = 0; + +#define MAX_NB 8 + +/* Allocation bitmap for the remapping area */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define to_pages(addr,size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + +#define for_all_nb(dev) \ + dev = NULL; \ + while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)\ + if (dev->bus->number == 0 && \ + (PCI_SLOT(dev->devfn) >= 24) && (PCI_SLOT(dev->devfn) <= 31)) + +static struct pci_dev *northbridges[MAX_NB]; +static u32 northbridge_flush_word[MAX_NB]; + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static int need_flush; /* global flush state. set for each gart wrap */ +static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, + size_t size, int dir, int do_panic); + +/* Dummy device used for NULL arguments (normally ISA). Better would + be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ +static struct device fallback_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = 0xffffffff, + .dma_mask = &fallback_dev.coherent_dma_mask, +}; + +static unsigned long alloc_iommu(int size) +{ + unsigned long offset, flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + if (offset == -1) { + need_flush = 1; + offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); + } + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = 1; + } + } + if (iommu_fullflush) + need_flush = 1; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + if (size == 1) { + clear_bit(offset, iommu_gart_bitmap); + return; + } + spin_lock_irqsave(&iommu_bitmap_lock, flags); + __clear_bit_string(iommu_gart_bitmap, offset, size); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(struct device *dev) +{ + unsigned long flags; + int flushed = 0; + int i, max; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + max = 0; + for (i = 0; i < MAX_NB; i++) { + if (!northbridges[i]) + continue; + pci_write_config_dword(northbridges[i], 0x9c, + northbridge_flush_word[i] | 1); + flushed++; + max = i; + } + for (i = 0; i <= max; i++) { + u32 w; + if (!northbridges[i]) + continue; + /* Make sure the hardware actually executed the flush. */ + do { + pci_read_config_dword(northbridges[i], 0x9c, &w); + } while (w & 1); + } + if (!flushed) + printk("nothing to flush?\n"); + need_flush = 0; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* Allocate DMA memory on node near device */ +noinline +static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order) +{ + struct page *page; + int node; + if (dev->bus == &pci_bus_type) { + cpumask_t mask; + mask = pcibus_to_cpumask(to_pci_dev(dev)->bus); + node = cpu_to_node(first_cpu(mask)); + } else + node = numa_node_id(); + page = alloc_pages_node(node, gfp, order); + return page ? page_address(page) : NULL; +} + +/* + * Allocate memory for a coherent mapping. + */ +void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + unsigned gfp) +{ + void *memory; + unsigned long dma_mask = 0; + u64 bus; + + if (!dev) + dev = &fallback_dev; + dma_mask = dev->coherent_dma_mask; + if (dma_mask == 0) + dma_mask = 0xffffffff; + + /* Kludge to make it bug-to-bug compatible with i386. i386 + uses the normal dma_mask for alloc_coherent. */ + dma_mask &= *dev->dma_mask; + + again: + memory = dma_alloc_pages(dev, gfp, get_order(size)); + if (memory == NULL) + return NULL; + + { + int high, mmu; + bus = virt_to_bus(memory); + high = (bus + size) >= dma_mask; + mmu = high; + if (force_iommu && !(gfp & GFP_DMA)) + mmu = 1; + if (no_iommu || dma_mask < 0xffffffffUL) { + if (high) { + free_pages((unsigned long)memory, + get_order(size)); + + if (swiotlb) { + return + swiotlb_alloc_coherent(dev, size, + dma_handle, + gfp); + } + + if (!(gfp & GFP_DMA)) { + gfp |= GFP_DMA; + goto again; + } + return NULL; + } + mmu = 0; + } + memset(memory, 0, size); + if (!mmu) { + *dma_handle = virt_to_bus(memory); + return memory; + } + } + + *dma_handle = dma_map_area(dev, bus, size, PCI_DMA_BIDIRECTIONAL, 0); + if (*dma_handle == bad_dma_address) + goto error; + flush_gart(dev); + return memory; + +error: + if (panic_on_overflow) + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", size); + free_pages((unsigned long)memory, get_order(size)); + return NULL; +} + +/* + * Unmap coherent memory. + * The caller must ensure that the device has finished accessing the mapping. + */ +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t bus) +{ + if (swiotlb) { + swiotlb_free_coherent(dev, size, vaddr, bus); + return; + } + + dma_unmap_single(dev, bus, size, 0); + free_pages((unsigned long)vaddr, get_order(size)); +} + +#ifdef CONFIG_IOMMU_LEAK + +#define SET_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0); +#define CLEAR_LEAK(x) if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; + +/* Debugging aid for drivers that don't free their IOMMU tables */ +static void **iommu_leak_tab; +static int leak_trace; +int iommu_leak_pages = 20; +void dump_leak(void) +{ + int i; + static int dump; + if (dump || !iommu_leak_tab) return; + dump = 1; + show_stack(NULL,NULL); + /* Very crude. dump some from the end of the table too */ + printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i+=2) { + printk("%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); + printk("%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk("\n"); +} +#else +#define SET_LEAK(x) +#define CLEAR_LEAK(x) +#endif + +static void iommu_full(struct device *dev, size_t size, int dir, int do_panic) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); + + if (size > PAGE_SIZE*EMERGENCY_PAGES && do_panic) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Random memory would be DMAed\n"); + } + +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + if (force_iommu) + mmu = 1; + if (no_iommu) { + if (high) + panic("PCI-DMA: high address but no IOMMU.\n"); + mmu = 0; + } + return mmu; +} + +static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + u64 mask = *dev->dma_mask; + int high = addr + size >= mask; + int mmu = high; + if (no_iommu) { + if (high) + panic("PCI-DMA: high address but no IOMMU.\n"); + mmu = 0; + } + return mmu; +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. + */ +static dma_addr_t dma_map_area(struct device *dev, unsigned long phys_mem, + size_t size, int dir, int do_panic) +{ + unsigned long npages = to_pages(phys_mem, size); + unsigned long iommu_page = alloc_iommu(npages); + int i; + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir, do_panic); + return bad_dma_address; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + SET_LEAK(iommu_page + i); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +dma_addr_t dma_map_single(struct device *dev, void *addr, size_t size, int dir) +{ + unsigned long phys_mem, bus; + + BUG_ON(dir == DMA_NONE); + + if (swiotlb) + return swiotlb_map_single(dev,addr,size,dir); + if (!dev) + dev = &fallback_dev; + + phys_mem = virt_to_phys(addr); + if (!need_iommu(dev, phys_mem, size)) + return phys_mem; + + bus = dma_map_area(dev, phys_mem, size, dir, 1); + flush_gart(dev); + return bus; +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + int i; + +#ifdef CONFIG_IOMMU_DEBUG + printk(KERN_DEBUG "dma_map_sg overflow\n"); +#endif + + for (i = 0; i < nents; i++ ) { + struct scatterlist *s = &sg[i]; + unsigned long addr = page_to_phys(s->page) + s->offset; + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_address) { + if (i > 0) + dma_unmap_sg(dev, sg, i, dir); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(dev); + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. */ +static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(pages); + unsigned long iommu_page = iommu_start; + int i; + + if (iommu_start == -1) + return -1; + + for (i = start; i < stopat; i++) { + struct scatterlist *s = &sg[i]; + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(i > start && s->offset); + if (i == start) { + *sout = *s; + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + SET_LEAK(iommu_page); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; +} + +static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, + struct scatterlist *sout, + unsigned long pages, int need) +{ + if (!need) { + BUG_ON(stopat - start != 1); + *sout = sg[start]; + sout->dma_length = sg[start].length; + return 0; + } + return __dma_map_cont(sg, start, stopat, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + int out; + int start; + unsigned long pages = 0; + int need = 0, nextneed; + + BUG_ON(dir == DMA_NONE); + if (nents == 0) + return 0; + + if (swiotlb) + return swiotlb_map_sg(dev,sg,nents,dir); + if (!dev) + dev = &fallback_dev; + + out = 0; + start = 0; + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + dma_addr_t addr = page_to_phys(s->page) + s->offset; + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + struct scatterlist *ps = &sg[i-1]; + /* Can only merge when the last chunk ends on a page + boundary and the new one doesn't have an offset. */ + if (!iommu_merge || !nextneed || !need || s->offset || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(sg, start, i, sg+out, pages, + need) < 0) + goto error; + out++; + pages = 0; + start = i; + } + } + + need = nextneed; + pages += to_pages(s->offset, s->length); + } + if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + goto error; + out++; + flush_gart(dev); + if (out < nents) + sg[out].dma_length = 0; + return out; + +error: + flush_gart(NULL); + dma_unmap_sg(dev, sg, nents, dir); + /* When it was forced try again unforced */ + if (force_iommu) + return dma_map_sg_nonforce(dev, sg, nents, dir); + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir, 0); + for (i = 0; i < nents; i++) + sg[i].dma_address = bad_dma_address; + return 0; +} + +/* + * Free a DMA mapping. + */ +void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (swiotlb) { + swiotlb_unmap_single(dev,dma_addr,size,direction); + return; + } + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +{ + int i; + if (swiotlb) { + swiotlb_unmap_sg(dev,sg,nents,dir); + return; + } + for (i = 0; i < nents; i++) { + struct scatterlist *s = &sg[i]; + if (!s->dma_length || !s->length) + break; + dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + } +} + +int dma_supported(struct device *dev, u64 mask) +{ + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < 0x00ffffff) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. + This allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area + and return DAC as fallback address the device may not handle it correctly. + + As a special case some controllers have a 39bit address mode + that is as efficient as 32bit (aic79xx). Don't force SAC for these. + Assume all masks <= 40 bits are of this type. Normally this doesn't + make any difference, but gives more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= 0xffffffffffULL)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); + return 0; + } + + return 1; +} + +int dma_get_cache_alignment(void) +{ + return boot_cpu_data.x86_clflush_size; +} + +EXPORT_SYMBOL(dma_unmap_sg); +EXPORT_SYMBOL(dma_map_sg); +EXPORT_SYMBOL(dma_map_single); +EXPORT_SYMBOL(dma_unmap_single); +EXPORT_SYMBOL(dma_supported); +EXPORT_SYMBOL(no_iommu); +EXPORT_SYMBOL(force_iommu); +EXPORT_SYMBOL(bad_dma_address); +EXPORT_SYMBOL(iommu_bio_merge); +EXPORT_SYMBOL(iommu_sac_force); +EXPORT_SYMBOL(dma_get_cache_alignment); +EXPORT_SYMBOL(dma_alloc_coherent); +EXPORT_SYMBOL(dma_free_coherent); + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) + printk(KERN_WARNING + "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32; + u64 aper_base; + unsigned aper_order; + + pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x90, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size >= 0xffffffff || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. + */ +static __init int init_k8_gatt(struct agp_kern_info *info) +{ + struct pci_dev *dev; + void *gatt; + unsigned aper_base, new_aper_base; + unsigned aper_size, gatt_size, new_aper_size; + + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; + for_all_nb(dev) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + info->aper_base = aper_base; + info->aper_size = aper_size>>20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + memset(gatt, 0, gatt_size); + agp_gatt_table = gatt; + + for_all_nb(dev) { + u32 ctl; + u32 gatt_reg; + + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; + pci_write_config_dword(dev, 0x98, gatt_reg); + pci_read_config_dword(dev, 0x90, &ctl); + + ctl |= 1; + ctl &= ~((1<<4) | (1<<5)); + + pci_write_config_dword(dev, 0x90, ctl); + } + flush_gart(NULL); + + printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + return 0; + + nommu: + /* Should not happen anymore */ + printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction."); + return -1; +} + +extern int agp_amd64_init(void); + +static int __init pci_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long aper_size; + unsigned long iommu_start; + struct pci_dev *dev; + unsigned long scratch; + long i; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. */ + /* Add other K8 AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (swiotlb) { + no_iommu = 1; + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + return -1; + } + + if (no_iommu || + (!force_iommu && end_pfn < 0xffffffff>>PAGE_SHIFT) || + !iommu_aperture || + (no_agp && init_k8_gatt(&info) < 0)) { + printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); + no_iommu = 1; + return -1; + } + + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + memset(iommu_gart_bitmap, 0, iommu_pages/8); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + get_order(iommu_pages*sizeof(void *))); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); + else + printk("PCI-DMA: Cannot allocate leak trace area\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + agp_memory_reserved = iommu_size; + printk(KERN_INFO + "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size>>20); + + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_address = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. + */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + for_all_nb(dev) { + u32 flag; + int cpu = PCI_SLOT(dev->devfn) - 24; + if (cpu >= MAX_NB) + continue; + northbridges[cpu] = dev; + pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ + northbridge_flush_word[cpu] = flag; + } + + flush_gart(NULL); + + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); + +/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge] + [,forcesac][,fullflush][,nomerge][,biomerge] + size set size of iommu (in bytes) + noagp don't initialize the AGP driver and use full aperture. + off don't use the IOMMU + leak turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on) + memaper[=order] allocate an own aperture over RAM with size 32MB^order. + noforce don't force IOMMU usage. Default. + force Force IOMMU. + merge Do lazy merging. This may improve performance on some block devices. + Implies force (experimental) + biomerge Do merging at the BIO layer. This is more efficient than merge, + but should be only done with very big IOMMUs. Implies merge,force. + nomerge Don't do SG merging. + forcesac For SAC mode for masks <40bits (experimental) + fullflush Flush IOMMU on each allocation (default) + nofullflush Don't use IOMMU fullflush + allowed overwrite iommu off workarounds for specific chipsets. + soft Use software bounce buffering (default for Intel machines) + noaperture Don't touch the aperture for AGP. +*/ +__init int iommu_setup(char *p) +{ + int arg; + + while (*p) { + if (!strncmp(p,"noagp",5)) + no_agp = 1; + if (!strncmp(p,"off",3)) + no_iommu = 1; + if (!strncmp(p,"force",5)) { + force_iommu = 1; + iommu_aperture_allowed = 1; + } + if (!strncmp(p,"allowed",7)) + iommu_aperture_allowed = 1; + if (!strncmp(p,"noforce",7)) { + iommu_merge = 0; + force_iommu = 0; + } + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } + if (!strncmp(p, "biomerge",8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic",5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic",7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge",5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge",7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac",8)) + iommu_sac_force = 1; + if (!strncmp(p, "fullflush",8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush",11)) + iommu_fullflush = 0; + if (!strncmp(p, "soft",4)) + swiotlb = 1; + if (!strncmp(p, "noaperture",10)) + fix_aperture = 0; +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p,"leak",4)) { + leak_trace = 1; + p += 4; + if (*p == '=') ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } else +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 1; +} diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c new file mode 100644 index 000000000000..67d90b89af0b --- /dev/null +++ b/arch/x86_64/kernel/pci-nommu.c @@ -0,0 +1,94 @@ +/* Fallback functions when the main IOMMU code is not compiled in. This + code is roughly equivalent to i386. */ +#include +#include +#include +#include +#include +#include + +int iommu_merge = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address; +EXPORT_SYMBOL(bad_dma_address); + +int iommu_bio_merge = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +int iommu_sac_force = 0; +EXPORT_SYMBOL(iommu_sac_force); + +/* + * Dummy IO MMU functions + */ + +void *dma_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, unsigned gfp) +{ + void *ret; + u64 mask; + int order = get_order(size); + + if (hwdev) + mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; + else + mask = 0xffffffff; + for (;;) { + ret = (void *)__get_free_pages(gfp, order); + if (ret == NULL) + return NULL; + *dma_handle = virt_to_bus(ret); + if ((*dma_handle & ~mask) == 0) + break; + free_pages((unsigned long)ret, order); + if (gfp & GFP_DMA) + return NULL; + gfp |= GFP_DMA; + } + + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *hwdev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} +EXPORT_SYMBOL(dma_free_coherent); + +int dma_supported(struct device *hwdev, u64 mask) +{ + /* + * we fall back to GFP_DMA when the mask isn't all 1s, + * so we can't guarantee allocations that must be + * within a tighter range than GFP_DMA.. + * RED-PEN this won't work for pci_map_single. Caller has to + * use GFP_DMA in the first place. + */ + if (mask < 0x00ffffff) + return 0; + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +int dma_get_cache_alignment(void) +{ + return boot_cpu_data.x86_clflush_size; +} +EXPORT_SYMBOL(dma_get_cache_alignment); + +static int __init check_ram(void) +{ + if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { + printk( + KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); + } + return 0; +} +__initcall(check_ram); + diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c new file mode 100644 index 000000000000..9922d2ba24a3 --- /dev/null +++ b/arch/x86_64/kernel/process.c @@ -0,0 +1,770 @@ +/* + * linux/arch/x86-64/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage extern void ret_from_fork(void); + +unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +static atomic_t hlt_counter = ATOMIC_INIT(0); + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + atomic_inc(&hlt_counter); +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + atomic_dec(&hlt_counter); +} + +EXPORT_SYMBOL(enable_hlt); + +/* + * We use this if we don't have any better + * idle routine.. + */ +void default_idle(void) +{ + if (!atomic_read(&hlt_counter)) { + local_irq_disable(); + if (!need_resched()) + safe_halt(); + else + local_irq_enable(); + } +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + int oldval; + + local_irq_enable(); + + /* + * Deal with another CPU just having chosen a thread to + * run here: + */ + oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); + + if (!oldval) { + set_thread_flag(TIF_POLLING_NRFLAG); + asm volatile( + "2:" + "testl %0,%1;" + "rep; nop;" + "je 2b;" + : : + "i" (_TIF_NEED_RESCHED), + "m" (current_thread_info()->flags)); + } else { + set_need_resched(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle (void) +{ + /* endless idle loop with no priority at all */ + while (1) { + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + idle = pm_idle; + if (!idle) + idle = default_idle; + idle(); + } + + schedule(); + } +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + */ +static void mwait_idle(void) +{ + local_irq_enable(); + + if (!need_resched()) { + set_thread_flag(TIF_POLLING_NRFLAG); + do { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (need_resched()) + break; + __mwait(0, 0); + } while (!need_resched()); + clear_thread_flag(TIF_POLLING_NRFLAG); + } +} + +void __init select_idle_routine(const struct cpuinfo_x86 *c) +{ + static int printed; + if (cpu_has(c, X86_FEATURE_MWAIT)) { + /* + * Skip, if setup has overridden idle. + * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + if (!printed) { + printk("using mwait in idle threads.\n"); + printed = 1; + } + pm_idle = mwait_idle; + } + } +} + +static int __init idle_setup (char *str) +{ + if (!strncmp(str, "poll", 4)) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } + + boot_option_idle_override = 1; + return 1; +} + +__setup("idle=", idle_setup); + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs * regs) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned int fsindex,gsindex; + unsigned int ds,cs,es; + + printk("\n"); + print_modules(); + printk("Pid: %d, comm: %.20s %s %s\n", + current->pid, current->comm, print_tainted(), system_utsname.release); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); + printk_address(regs->rip); + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->rax, regs->rbx, regs->rcx); + printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->rdx, regs->rsi, regs->rdi); + printk("RBP: %016lx R08: %016lx R09: %016lx\n", + regs->rbp, regs->r8, regs->r9); + printk("R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk("R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("movl %%fs,%0" : "=r" (fsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + asm("movq %%cr0, %0": "=r" (cr0)); + asm("movq %%cr2, %0": "=r" (cr2)); + asm("movq %%cr3, %0": "=r" (cr3)); + asm("movq %%cr4, %0": "=r" (cr4)); + + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs,fsindex,gs,gsindex,shadowgs); + printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); + printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); +} + +void show_regs(struct pt_regs *regs) +{ + __show_regs(regs); + show_trace(®s->rsp); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + if (me->thread.io_bitmap_ptr) { + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + t->io_bitmap_max = 0; + put_cpu(); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + struct thread_info *t = current_thread_info(); + + if (t->flags & _TIF_ABI_PENDING) + t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct n_desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + desc->a = LDT_entry_a(&ud); + desc->b = LDT_entry_b(&ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + struct desc_struct *desc = (void *)t->thread.tls_array; + desc += tls; + return desc->base0 | + (((u32)desc->base1) << 16) | + (((u32)desc->base2) << 24); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + int err; + struct pt_regs * childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; + + *childregs = *regs; + + childregs->rax = 0; + childregs->rsp = rsp; + if (rsp == ~0UL) { + childregs->rsp = (unsigned long)childregs; + } + + p->thread.rsp = (unsigned long) childregs; + p->thread.rsp0 = (unsigned long) (childregs+1); + p->thread.userrsp = me->thread.userrsp; + + set_ti_thread_flag(p->thread_info, TIF_FORK); + + p->thread.fs = me->thread.fs; + p->thread.gs = me->thread.gs; + + asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); + asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); + asm("movl %%es,%0" : "=m" (p->thread.es)); + asm("movl %%ds,%0" : "=m" (p->thread.ds)); + + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = ia32_child_tls(p, childregs); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,r) set_debug(thread->debugreg ## r, r) + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + */ +struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + unlazy_fpu(prev_p); + + /* + * Reload esp0, LDT and the page table pointer: + */ + tss->rsp0 = next->rsp0; + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + asm volatile("movl %%es,%0" : "=m" (prev->es)); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + asm volatile ("movl %%ds,%0" : "=m" (prev->ds)); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + load_TLS(next, cpu); + + /* + * Switch FS and GS. + */ + { + unsigned fsindex; + asm volatile("movl %%fs,%0" : "=r" (fsindex)); + /* segment register != 0 always requires a reload. + also reload when it has changed. + when prev process used 64bit base always reload + to avoid an information leak. */ + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* check if the user used a selector != 0 + * if yes clear 64bit base, since overloaded base + * is always mapped to the Null selector + */ + if (fsindex) + prev->fs = 0; + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + } + { + unsigned gsindex; + asm volatile("movl %%gs,%0" : "=r" (gsindex)); + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) + prev->gs = 0; + } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; + } + + /* + * Switch the PDA context. + */ + prev->userrsp = read_pda(oldrsp); + write_pda(oldrsp, next->userrsp); + write_pda(pcurrent, next_p); + write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(next->debugreg7)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + + /* + * Handle the IO bitmap + */ + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + if (next->io_bitmap_ptr) + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + else { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } + } + + return prev_p; +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs regs) +{ + long error; + char * filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + } + putname(filename); + return error; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +asmlinkage long sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); +} + +asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->rsp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage long sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + NULL, NULL); +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp,rip; + int count = 0; + + if (!p || p == current || p->state==TASK_RUNNING) + return 0; + stack = (unsigned long)p->thread_info; + if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.rsp); + do { + if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) + return 0; + rip = *(u64 *)(fp+8); + if (!in_sched_functions(rip)) + return rip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = checking_wrmsrl(MSR_FS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) { + rdmsrl(MSR_FS_BASE, base); + } else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + rdmsrl(MSR_KERNEL_GS_BASE, base); + } else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs *pp, ptregs; + + pp = (struct pt_regs *)(tsk->thread.rsp0); + --pp; + + ptregs = *pp; + ptregs.cs &= 0xffff; + ptregs.ss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c new file mode 100644 index 000000000000..0b7b101debdf --- /dev/null +++ b/arch/x86_64/kernel/ptrace.c @@ -0,0 +1,547 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * x86-64 port 2000-2002 Andi Kleen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x44dd5UL + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100UL + +/* + * eflags and offset of eflags on child stack.. + */ +#define EFLAGS offsetof(struct pt_regs, eflags) +#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) + +/* + * this routine will get a word off of the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline unsigned long get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->thread.rsp0; + stack += offset; + return (*((unsigned long *)stack)); +} + +/* + * this routine will put a word on the processes privileged stack. + * the offset is how far from the base addr as stored in the TSS. + * this routine assumes that all the privileged stacks are in our + * data space. + */ +static inline long put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *) task->thread.rsp0; + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + long tmp; + + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); +} + +static int putreg(struct task_struct *child, + unsigned long regno, unsigned long value) +{ + unsigned long tmp; + + /* Some code in the 64bit emulation may not be 64bit clean. + Don't take any chances. */ + if (test_tsk_thread_flag(child, TIF_IA32)) + value &= 0xffffffff; + switch (regno) { + case offsetof(struct user_regs_struct,fs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.fsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,gs): + if (value && (value & 3) != 3) + return -EIO; + child->thread.gsindex = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ds): + if (value && (value & 3) != 3) + return -EIO; + child->thread.ds = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,es): + if (value && (value & 3) != 3) + return -EIO; + child->thread.es = value & 0xffff; + return 0; + case offsetof(struct user_regs_struct,ss): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + return 0; + case offsetof(struct user_regs_struct,fs_base): + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + child->thread.fs = value; + return 0; + case offsetof(struct user_regs_struct,gs_base): + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + child->thread.gs = value; + return 0; + case offsetof(struct user_regs_struct, eflags): + value &= FLAG_MASK; + tmp = get_stack_long(child, EFL_OFFSET); + tmp &= ~FLAG_MASK; + value |= tmp; + break; + case offsetof(struct user_regs_struct,cs): + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + } + put_stack_long(child, regno - sizeof(struct pt_regs), value); + return 0; +} + +static unsigned long getreg(struct task_struct *child, unsigned long regno) +{ + unsigned long val; + switch (regno) { + case offsetof(struct user_regs_struct, fs): + return child->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + return child->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + return child->thread.ds; + case offsetof(struct user_regs_struct, es): + return child->thread.es; + case offsetof(struct user_regs_struct, fs_base): + return child->thread.fs; + case offsetof(struct user_regs_struct, gs_base): + return child->thread.gs; + default: + regno = regno - sizeof(struct pt_regs); + val = get_stack_long(child, regno); + if (test_tsk_thread_flag(child, TIF_IA32)) + val &= 0xffffffff; + return val; + } + +} + +asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data) +{ + struct task_struct *child; + long i, ret; + unsigned ui; + + /* This lock_kernel fixes a subtle race with suid exec */ + lock_kernel(); + ret = -EPERM; + if (request == PTRACE_TRACEME) { + /* are we already being traced? */ + if (current->ptrace & PT_PTRACED) + goto out; + ret = security_ptrace(current->parent, current); + if (ret) + goto out; + /* set the ptrace bit in the process flags. */ + current->ptrace |= PT_PTRACED; + ret = 0; + goto out; + } + ret = -ESRCH; + read_lock(&tasklist_lock); + child = find_task_by_pid(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); + if (!child) + goto out; + + ret = -EPERM; + if (pid == 1) /* you may not mess with init */ + goto out_tsk; + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + goto out_tsk; + } + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_tsk; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: { + unsigned long tmp; + int copied; + + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + ret = -EIO; + if (copied != sizeof(tmp)) + break; + ret = put_user(tmp,(unsigned long __user *) data); + break; + } + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... sizeof(struct user_regs_struct): + tmp = getreg(child, addr); + break; + case offsetof(struct user, u_debugreg[0]): + tmp = child->thread.debugreg0; + break; + case offsetof(struct user, u_debugreg[1]): + tmp = child->thread.debugreg1; + break; + case offsetof(struct user, u_debugreg[2]): + tmp = child->thread.debugreg2; + break; + case offsetof(struct user, u_debugreg[3]): + tmp = child->thread.debugreg3; + break; + case offsetof(struct user, u_debugreg[6]): + tmp = child->thread.debugreg6; + break; + case offsetof(struct user, u_debugreg[7]): + tmp = child->thread.debugreg7; + break; + default: + tmp = 0; + break; + } + ret = put_user(tmp,(unsigned long __user *) data); + break; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + ret = 0; + if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) + break; + ret = -EIO; + break; + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & 7) || + addr > sizeof(struct user) - 7) + break; + + switch (addr) { + case 0 ... sizeof(struct user_regs_struct): + ret = putreg(child, addr, data); + break; + /* Disallows to set a breakpoint into the vsyscall */ + case offsetof(struct user, u_debugreg[0]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg0 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[1]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg1 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[2]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg2 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[3]): + if (data >= TASK_SIZE-7) break; + child->thread.debugreg3 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[6]): + if (data >> 32) + break; + child->thread.debugreg6 = data; + ret = 0; + break; + case offsetof(struct user, u_debugreg[7]): + /* See arch/i386/kernel/ptrace.c for an explanation of + * this awkward check.*/ + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5454 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + break; + if (i == 4) { + child->thread.debugreg7 = data; + ret = 0; + } + break; + } + break; + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: { /* restart after signal. */ + long tmp; + + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = data; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, EFL_OFFSET); + tmp &= ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET,tmp); + wake_up_process(child); + ret = 0; + break; + } + +#ifdef CONFIG_IA32_EMULATION + /* This makes only sense with 32bit programs. Allow a + 64bit debugger to fully examine them too. Better + don't use it against 64bit processes, use + PTRACE_ARCH_PRCTL instead. */ + case PTRACE_SET_THREAD_AREA: { + struct user_desc __user *p; + int old; + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_set_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + case PTRACE_GET_THREAD_AREA: + p = (struct user_desc __user *)data; + get_user(old, &p->entry_number); + put_user(addr, &p->entry_number); + ret = do_get_thread_area(&child->thread, p); + put_user(old, &p->entry_number); + break; + } +#endif + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: { + long tmp; + + ret = 0; + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ + break; + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + tmp = get_stack_long(child, EFL_OFFSET) & ~TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); + wake_up_process(child); + break; + } + + case PTRACE_SINGLESTEP: { /* set the trap flag. */ + long tmp; + + ret = -EIO; + if ((unsigned long) data > _NSIG) + break; + clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); + if ((child->ptrace & PT_DTRACE) == 0) { + /* Spurious delayed TF traps may occur */ + child->ptrace |= PT_DTRACE; + } + tmp = get_stack_long(child, EFL_OFFSET) | TRAP_FLAG; + put_stack_long(child, EFL_OFFSET, tmp); + set_tsk_thread_flag(child, TIF_SINGLESTEP); + child->exit_code = data; + /* give it a chance to run. */ + wake_up_process(child); + ret = 0; + break; + } + + case PTRACE_DETACH: + /* detach a process that was attached. */ + ret = ptrace_detach(child, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); + data += sizeof(long); + } + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_regs_struct))) { + ret = -EIO; + break; + } + ret = 0; + for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { + ret |= __get_user(tmp, (unsigned long __user *) data); + putreg(child, ui, tmp); + data += sizeof(long); + } + break; + } + + case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + ret = get_fpregs((struct user_i387_struct __user *)data, child); + break; + } + + case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ + if (!access_ok(VERIFY_READ, (unsigned __user *)data, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + ret = set_fpregs(child, (struct user_i387_struct __user *)data); + break; + } + + default: + ret = ptrace_request(child, request, addr, data); + break; + } +out_tsk: + put_task_struct(child); +out: + unlock_kernel(); + return ret; +} + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_rax); + + if (unlikely(current->audit_context)) + audit_syscall_entry(current, regs->orig_rax, + regs->rdi, regs->rsi, + regs->rdx, regs->r10); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(current, regs->rax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c new file mode 100644 index 000000000000..be4b36f762cf --- /dev/null +++ b/arch/x86_64/kernel/reboot.c @@ -0,0 +1,163 @@ +/* Various gunk just to reboot the machine. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); + +static long no_idt[3]; +static enum { + BOOT_TRIPLE = 't', + BOOT_KBD = 'k' +} reboot_type = BOOT_KBD; +static int reboot_mode = 0; +int reboot_force; + +/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + force Avoid anything that could hang. + */ +static int __init reboot_setup(char *str) +{ + for (;;) { + switch (*str) { + case 'w': + reboot_mode = 0x1234; + break; + + case 'c': + reboot_mode = 0; + break; + + case 't': + case 'b': + case 'k': + reboot_type = *str; + break; + case 'f': + reboot_force = 1; + break; + } + if((str = strchr(str,',')) != NULL) + str++; + else + break; + } + return 1; +} + +__setup("reboot=", reboot_setup); + +#ifdef CONFIG_SMP +static void smp_halt(void) +{ + int cpuid = safe_smp_processor_id(); + static int first_entry = 1; + + if (reboot_force) + return; + + if (first_entry) { + first_entry = 0; + smp_call_function((void *)machine_restart, NULL, 1, 0); + } + + smp_stop_cpu(); + + /* AP calling this. Just halt */ + if (cpuid != boot_cpu_id) { + for (;;) + asm("hlt"); + } + + /* Wait for all other CPUs to have run smp_stop_cpu */ + while (!cpus_empty(cpu_online_map)) + rep_nop(); +} +#endif + +static inline void kb_wait(void) +{ + int i; + + for (i=0; i<0x10000; i++) + if ((inb_p(0x64) & 0x02) == 0) + break; +} + +void machine_restart(char * __unused) +{ + int i; + + printk("machine restart\n"); + +#ifdef CONFIG_SMP + smp_halt(); +#endif + + if (!reboot_force) { + local_irq_disable(); +#ifndef CONFIG_SMP + disable_local_APIC(); +#endif + disable_IO_APIC(); + local_irq_enable(); + } + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i=0; i<100; i++) { + kb_wait(); + udelay(50); + outb(0xfe,0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + } + } +} + +EXPORT_SYMBOL(machine_restart); + +void machine_halt(void) +{ +} + +EXPORT_SYMBOL(machine_halt); + +void machine_power_off(void) +{ + if (pm_power_off) + pm_power_off(); +} + +EXPORT_SYMBOL(machine_power_off); diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c new file mode 100644 index 000000000000..48f7c18172b9 --- /dev/null +++ b/arch/x86_64/kernel/semaphore.c @@ -0,0 +1,180 @@ +/* + * x86_64 semaphore implementation. + * + * (C) Copyright 1999 Linus Torvalds + * + * Portions Copyright 1999 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * rw semaphores implemented November 1999 by Benjamin LaHaise + */ +#include +#include +#include +#include + +#include + +/* + * Semaphores are implemented using a two-way counter: + * The "count" variable is decremented for each process + * that tries to acquire the semaphore, while the "sleeping" + * variable is a count of such acquires. + * + * Notably, the inline "up()" and "down()" functions can + * efficiently test if they need to do any extra work (up + * needs to do something only if count was negative before + * the increment operation. + * + * "sleeping" and the contention routine ordering is protected + * by the spinlock in the semaphore's waitqueue head. + * + * Note that these functions are only called when there is + * contention on the lock, and as such all this is the + * "non-critical" part of the whole semaphore business. The + * critical part is the inline stuff in + * where we want to avoid any extra jumps and calls. + */ + +/* + * Logic: + * - only on a boundary condition do we need to care. When we go + * from a negative count to a non-negative, we wake people up. + * - when we go from a non-negative count to a negative do we + * (a) synchronize with the "sleeper" count and (b) make sure + * that we're on the wakeup list before we synchronize so that + * we cannot lose wakeup events. + */ + +void __up(struct semaphore *sem) +{ + wake_up(&sem->wait); +} + +void __sched __down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_UNINTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * the wait_queue_head. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_UNINTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + tsk->state = TASK_RUNNING; +} + +int __sched __down_interruptible(struct semaphore * sem) +{ + int retval = 0; + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + unsigned long flags; + + tsk->state = TASK_INTERRUPTIBLE; + spin_lock_irqsave(&sem->wait.lock, flags); + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + sem->sleepers++; + for (;;) { + int sleepers = sem->sleepers; + + /* + * With signals pending, this turns into + * the trylock failure case - we won't be + * sleeping, and we* can't get the lock as + * it has contention. Just correct the count + * and exit. + */ + if (signal_pending(current)) { + retval = -EINTR; + sem->sleepers = 0; + atomic_add(sleepers, &sem->count); + break; + } + + /* + * Add "everybody else" into it. They aren't + * playing, because we own the spinlock in + * wait_queue_head. The "-1" is because we're + * still hoping to get the semaphore. + */ + if (!atomic_add_negative(sleepers - 1, &sem->count)) { + sem->sleepers = 0; + break; + } + sem->sleepers = 1; /* us - see -1 above */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + schedule(); + + spin_lock_irqsave(&sem->wait.lock, flags); + tsk->state = TASK_INTERRUPTIBLE; + } + remove_wait_queue_locked(&sem->wait, &wait); + wake_up_locked(&sem->wait); + spin_unlock_irqrestore(&sem->wait.lock, flags); + + tsk->state = TASK_RUNNING; + return retval; +} + +/* + * Trylock failed - make sure we correct for + * having decremented the count. + * + * We could have done the trylock with a + * single "cmpxchg" without failure cases, + * but then it wouldn't work on a 386. + */ +int __down_trylock(struct semaphore * sem) +{ + int sleepers; + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + sleepers = sem->sleepers + 1; + sem->sleepers = 0; + + /* + * Add "everybody else" and us into it. They aren't + * playing, because we own the spinlock in the + * wait_queue_head. + */ + if (!atomic_add_negative(sleepers, &sem->count)) { + wake_up_locked(&sem->wait); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + return 1; +} + + diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c new file mode 100644 index 000000000000..e50fc41de6b9 --- /dev/null +++ b/arch/x86_64/kernel/setup.c @@ -0,0 +1,1189 @@ +/* + * linux/arch/x86-64/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Nov 2001 Dave Jones + * Forked from i386 setup code. + * + * $Id$ + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include