diff options
67 files changed, 1335 insertions, 1167 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index dc3154e49279..1462ed86d40a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -6,7 +6,7 @@ # To add a new book the only step required is to add the book to the # list of DOCBOOKS. -DOCBOOKS := z8530book.xml mcabook.xml \ +DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ procfs-guide.xml writing_usb_driver.xml networking.xml \ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl new file mode 100644 index 000000000000..94a20fe8fedf --- /dev/null +++ b/Documentation/DocBook/device-drivers.tmpl @@ -0,0 +1,418 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN" + "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []> + +<book id="LinuxDriversAPI"> + <bookinfo> + <title>Linux Device Drivers</title> + + <legalnotice> + <para> + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + </para> + + <para> + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + </para> + + <para> + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + </para> + + <para> + For more details see the file COPYING in the source + distribution of Linux. + </para> + </legalnotice> + </bookinfo> + +<toc></toc> + + <chapter id="Basics"> + <title>Driver Basics</title> + <sect1><title>Driver Entry and Exit points</title> +!Iinclude/linux/init.h + </sect1> + + <sect1><title>Atomic and pointer manipulation</title> +!Iarch/x86/include/asm/atomic_32.h +!Iarch/x86/include/asm/unaligned.h + </sect1> + + <sect1><title>Delaying, scheduling, and timer routines</title> +!Iinclude/linux/sched.h +!Ekernel/sched.c +!Ekernel/timer.c + </sect1> + <sect1><title>High-resolution timers</title> +!Iinclude/linux/ktime.h +!Iinclude/linux/hrtimer.h +!Ekernel/hrtimer.c + </sect1> + <sect1><title>Workqueues and Kevents</title> +!Ekernel/workqueue.c + </sect1> + <sect1><title>Internal Functions</title> +!Ikernel/exit.c +!Ikernel/signal.c +!Iinclude/linux/kthread.h +!Ekernel/kthread.c + </sect1> + + <sect1><title>Kernel objects manipulation</title> +<!-- +X!Iinclude/linux/kobject.h +--> +!Elib/kobject.c + </sect1> + + <sect1><title>Kernel utility functions</title> +!Iinclude/linux/kernel.h +!Ekernel/printk.c +!Ekernel/panic.c +!Ekernel/sys.c +!Ekernel/rcupdate.c + </sect1> + + <sect1><title>Device Resource Management</title> +!Edrivers/base/devres.c + </sect1> + + </chapter> + + <chapter id="devdrivers"> + <title>Device drivers infrastructure</title> + <sect1><title>Device Drivers Base</title> +<!-- +X!Iinclude/linux/device.h +--> +!Edrivers/base/driver.c +!Edrivers/base/core.c +!Edrivers/base/class.c +!Edrivers/base/firmware_class.c +!Edrivers/base/transport_class.c +<!-- Cannot be included, because + attribute_container_add_class_device_adapter + and attribute_container_classdev_to_container + exceed allowed 44 characters maximum +X!Edrivers/base/attribute_container.c +--> +!Edrivers/base/sys.c +<!-- +X!Edrivers/base/interface.c +--> +!Edrivers/base/platform.c +!Edrivers/base/bus.c + </sect1> + <sect1><title>Device Drivers Power Management</title> +!Edrivers/base/power/main.c + </sect1> + <sect1><title>Device Drivers ACPI Support</title> +<!-- Internal functions only +X!Edrivers/acpi/sleep/main.c +X!Edrivers/acpi/sleep/wakeup.c +X!Edrivers/acpi/motherboard.c +X!Edrivers/acpi/bus.c +--> +!Edrivers/acpi/scan.c +!Idrivers/acpi/scan.c +<!-- No correct structured comments +X!Edrivers/acpi/pci_bind.c +--> + </sect1> + <sect1><title>Device drivers PnP support</title> +!Idrivers/pnp/core.c +<!-- No correct structured comments +X!Edrivers/pnp/system.c + --> +!Edrivers/pnp/card.c +!Idrivers/pnp/driver.c +!Edrivers/pnp/manager.c +!Edrivers/pnp/support.c + </sect1> + <sect1><title>Userspace IO devices</title> +!Edrivers/uio/uio.c +!Iinclude/linux/uio_driver.h + </sect1> + </chapter> + + <chapter id="parportdev"> + <title>Parallel Port Devices</title> +!Iinclude/linux/parport.h +!Edrivers/parport/ieee1284.c +!Edrivers/parport/share.c +!Idrivers/parport/daisy.c + </chapter> + + <chapter id="message_devices"> + <title>Message-based devices</title> + <sect1><title>Fusion message devices</title> +!Edrivers/message/fusion/mptbase.c +!Idrivers/message/fusion/mptbase.c +!Edrivers/message/fusion/mptscsih.c +!Idrivers/message/fusion/mptscsih.c +!Idrivers/message/fusion/mptctl.c +!Idrivers/message/fusion/mptspi.c +!Idrivers/message/fusion/mptfc.c +!Idrivers/message/fusion/mptlan.c + </sect1> + <sect1><title>I2O message devices</title> +!Iinclude/linux/i2o.h +!Idrivers/message/i2o/core.h +!Edrivers/message/i2o/iop.c +!Idrivers/message/i2o/iop.c +!Idrivers/message/i2o/config-osm.c +!Edrivers/message/i2o/exec-osm.c +!Idrivers/message/i2o/exec-osm.c +!Idrivers/message/i2o/bus-osm.c +!Edrivers/message/i2o/device.c +!Idrivers/message/i2o/device.c +!Idrivers/message/i2o/driver.c +!Idrivers/message/i2o/pci.c +!Idrivers/message/i2o/i2o_block.c +!Idrivers/message/i2o/i2o_scsi.c +!Idrivers/message/i2o/i2o_proc.c + </sect1> + </chapter> + + <chapter id="snddev"> + <title>Sound Devices</title> +!Iinclude/sound/core.h +!Esound/sound_core.c +!Iinclude/sound/pcm.h +!Esound/core/pcm.c +!Esound/core/device.c +!Esound/core/info.c +!Esound/core/rawmidi.c +!Esound/core/sound.c +!Esound/core/memory.c +!Esound/core/pcm_memory.c +!Esound/core/init.c +!Esound/core/isadma.c +!Esound/core/control.c +!Esound/core/pcm_lib.c +!Esound/core/hwdep.c +!Esound/core/pcm_native.c +!Esound/core/memalloc.c +<!-- FIXME: Removed for now since no structured comments in source +X!Isound/sound_firmware.c +--> + </chapter> + + <chapter id="uart16x50"> + <title>16x50 UART Driver</title> +!Iinclude/linux/serial_core.h +!Edrivers/serial/serial_core.c +!Edrivers/serial/8250.c + </chapter> + + <chapter id="fbdev"> + <title>Frame Buffer Library</title> + + <para> + The frame buffer drivers depend heavily on four data structures. + These structures are declared in include/linux/fb.h. They are + fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs. + The last three can be made available to and from userland. + </para> + + <para> + fb_info defines the current state of a particular video card. + Inside fb_info, there exists a fb_ops structure which is a + collection of needed functions to make fbdev and fbcon work. + fb_info is only visible to the kernel. + </para> + + <para> + fb_var_screeninfo is used to describe the features of a video card + that are user defined. With fb_var_screeninfo, things such as + depth and the resolution may be defined. + </para> + + <para> + The next structure is fb_fix_screeninfo. This defines the + properties of a card that are created when a mode is set and can't + be changed otherwise. A good example of this is the start of the + frame buffer memory. This "locks" the address of the frame buffer + memory, so that it cannot be changed or moved. + </para> + + <para> + The last structure is fb_monospecs. In the old API, there was + little importance for fb_monospecs. This allowed for forbidden things + such as setting a mode of 800x600 on a fix frequency monitor. With + the new API, fb_monospecs prevents such things, and if used + correctly, can prevent a monitor from being cooked. fb_monospecs + will not be useful until kernels 2.5.x. + </para> + + <sect1><title>Frame Buffer Memory</title> +!Edrivers/video/fbmem.c + </sect1> +<!-- + <sect1><title>Frame Buffer Console</title> +X!Edrivers/video/console/fbcon.c + </sect1> +--> + <sect1><title>Frame Buffer Colormap</title> +!Edrivers/video/fbcmap.c + </sect1> +<!-- FIXME: + drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment + out until somebody adds docs. KAO + <sect1><title>Frame Buffer Generic Functions</title> +X!Idrivers/video/fbgen.c + </sect1> +KAO --> + <sect1><title>Frame Buffer Video Mode Database</title> +!Idrivers/video/modedb.c +!Edrivers/video/modedb.c + </sect1> + <sect1><title>Frame Buffer Macintosh Video Mode Database</title> +!Edrivers/video/macmodes.c + </sect1> + <sect1><title>Frame Buffer Fonts</title> + <para> + Refer to the file drivers/video/console/fonts.c for more information. + </para> +<!-- FIXME: Removed for now since no structured comments in source +X!Idrivers/video/console/fonts.c +--> + </sect1> + </chapter> + + <chapter id="input_subsystem"> + <title>Input Subsystem</title> +!Iinclude/linux/input.h +!Edrivers/input/input.c +!Edrivers/input/ff-core.c +!Edrivers/input/ff-memless.c + </chapter> + + <chapter id="spi"> + <title>Serial Peripheral Interface (SPI)</title> + <para> + SPI is the "Serial Peripheral Interface", widely used with + embedded systems because it is a simple and efficient + interface: basically a multiplexed shift register. + Its three signal wires hold a clock (SCK, often in the range + of 1-20 MHz), a "Master Out, Slave In" (MOSI) data line, and + a "Master In, Slave Out" (MISO) data line. + SPI is a full duplex protocol; for each bit shifted out the + MOSI line (one per clock) another is shifted in on the MISO line. + Those bits are assembled into words of various sizes on the + way to and from system memory. + An additional chipselect line is usually active-low (nCS); + four signals are normally used for each peripheral, plus + sometimes an interrupt. + </para> + <para> + The SPI bus facilities listed here provide a generalized + interface to declare SPI busses and devices, manage them + according to the standard Linux driver model, and perform + input/output operations. + At this time, only "master" side interfaces are supported, + where Linux talks to SPI peripherals and does not implement + such a peripheral itself. + (Interfaces to support implementing SPI slaves would + necessarily look different.) + </para> + <para> + The programming interface is structured around two kinds of driver, + and two kinds of device. + A "Controller Driver" abstracts the controller hardware, which may + be as simple as a set of GPIO pins or as complex as a pair of FIFOs + connected to dual DMA engines on the other side of the SPI shift + register (maximizing throughput). Such drivers bridge between + whatever bus they sit on (often the platform bus) and SPI, and + expose the SPI side of their device as a + <structname>struct spi_master</structname>. + SPI devices are children of that master, represented as a + <structname>struct spi_device</structname> and manufactured from + <structname>struct spi_board_info</structname> descriptors which + are usually provided by board-specific initialization code. + A <structname>struct spi_driver</structname> is called a + "Protocol Driver", and is bound to a spi_device using normal + driver model calls. + </para> + <para> + The I/O model is a set of queued messages. Protocol drivers + submit one or more <structname>struct spi_message</structname> + objects, which are processed and completed asynchronously. + (There are synchronous wrappers, however.) Messages are + built from one or more <structname>struct spi_transfer</structname> + objects, each of which wraps a full duplex SPI transfer. + A variety of protocol tweaking options are needed, because + different chips adopt very different policies for how they + use the bits transferred with SPI. + </para> +!Iinclude/linux/spi/spi.h +!Fdrivers/spi/spi.c spi_register_board_info +!Edrivers/spi/spi.c + </chapter> + + <chapter id="i2c"> + <title>I<superscript>2</superscript>C and SMBus Subsystem</title> + + <para> + I<superscript>2</superscript>C (or without fancy typography, "I2C") + is an acronym for the "Inter-IC" bus, a simple bus protocol which is + widely used where low data rate communications suffice. + Since it's also a licensed trademark, some vendors use another + name (such as "Two-Wire Interface", TWI) for the same bus. + I2C only needs two signals (SCL for clock, SDA for data), conserving + board real estate and minimizing signal quality issues. + Most I2C devices use seven bit addresses, and bus speeds of up + to 400 kHz; there's a high speed extension (3.4 MHz) that's not yet + found wide use. + I2C is a multi-master bus; open drain signaling is used to + arbitrate between masters, as well as to handshake and to + synchronize clocks from slower clients. + </para> + + <para> + The Linux I2C programming interfaces support only the master + side of bus interactions, not the slave side. + The programming interface is structured around two kinds of driver, + and two kinds of device. + An I2C "Adapter Driver" abstracts the controller hardware; it binds + to a physical device (perhaps a PCI device or platform_device) and + exposes a <structname>struct i2c_adapter</structname> representing + each I2C bus segment it manages. + On each I2C bus segment will be I2C devices represented by a + <structname>struct i2c_client</structname>. Those devices will + be bound to a <structname>struct i2c_driver</structname>, + which should follow the standard Linux driver model. + (At this writing, a legacy model is more widely used.) + There are functions to perform various I2C protocol operations; at + this writing all such functions are usable only from task context. + </para> + + <para> + The System Management Bus (SMBus) is a sibling protocol. Most SMBus + systems are also I2C conformant. The electrical constraints are + tighter for SMBus, and it standardizes particular protocol messages + and idioms. Controllers that support I2C can also support most + SMBus operations, but SMBus controllers don't support all the protocol + options that an I2C controller will. + There are functions to perform various SMBus protocol operations, + either using I2C primitives or by issuing SMBus commands to + i2c_adapter devices which don't support those I2C operations. + </para> + +!Iinclude/linux/i2c.h +!Fdrivers/i2c/i2c-boardinfo.c i2c_register_board_info +!Edrivers/i2c/i2c-core.c + </chapter> + +</book> diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 5818ff75786a..bc962cda6504 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -38,58 +38,6 @@ <toc></toc> - <chapter id="Basics"> - <title>Driver Basics</title> - <sect1><title>Driver Entry and Exit points</title> -!Iinclude/linux/init.h - </sect1> - - <sect1><title>Atomic and pointer manipulation</title> -!Iarch/x86/include/asm/atomic_32.h -!Iarch/x86/include/asm/unaligned.h - </sect1> - - <sect1><title>Delaying, scheduling, and timer routines</title> -!Iinclude/linux/sched.h -!Ekernel/sched.c -!Ekernel/timer.c - </sect1> - <sect1><title>High-resolution timers</title> -!Iinclude/linux/ktime.h -!Iinclude/linux/hrtimer.h -!Ekernel/hrtimer.c - </sect1> - <sect1><title>Workqueues and Kevents</title> -!Ekernel/workqueue.c - </sect1> - <sect1><title>Internal Functions</title> -!Ikernel/exit.c -!Ikernel/signal.c -!Iinclude/linux/kthread.h -!Ekernel/kthread.c - </sect1> - - <sect1><title>Kernel objects manipulation</title> -<!-- -X!Iinclude/linux/kobject.h ---> -!Elib/kobject.c - </sect1> - - <sect1><title>Kernel utility functions</title> -!Iinclude/linux/kernel.h -!Ekernel/printk.c -!Ekernel/panic.c -!Ekernel/sys.c -!Ekernel/rcupdate.c - </sect1> - - <sect1><title>Device Resource Management</title> -!Edrivers/base/devres.c - </sect1> - - </chapter> - <chapter id="adt"> <title>Data Types</title> <sect1><title>Doubly Linked Lists</title> @@ -298,62 +246,6 @@ X!Earch/x86/kernel/mca_32.c !Ikernel/acct.c </chapter> - <chapter id="devdrivers"> - <title>Device drivers infrastructure</title> - <sect1><title>Device Drivers Base</title> -<!-- -X!Iinclude/linux/device.h ---> -!Edrivers/base/driver.c -!Edrivers/base/core.c -!Edrivers/base/class.c -!Edrivers/base/firmware_class.c -!Edrivers/base/transport_class.c -<!-- Cannot be included, because - attribute_container_add_class_device_adapter - and attribute_container_classdev_to_container - exceed allowed 44 characters maximum -X!Edrivers/base/attribute_container.c ---> -!Edrivers/base/sys.c -<!-- -X!Edrivers/base/interface.c ---> -!Edrivers/base/platform.c -!Edrivers/base/bus.c - </sect1> - <sect1><title>Device Drivers Power Management</title> -!Edrivers/base/power/main.c - </sect1> - <sect1><title>Device Drivers ACPI Support</title> -<!-- Internal functions only -X!Edrivers/acpi/sleep/main.c -X!Edrivers/acpi/sleep/wakeup.c -X!Edrivers/acpi/motherboard.c -X!Edrivers/acpi/bus.c ---> -!Edrivers/acpi/scan.c -!Idrivers/acpi/scan.c -<!-- No correct structured comments -X!Edrivers/acpi/pci_bind.c ---> - </sect1> - <sect1><title>Device drivers PnP support</title> -!Idrivers/pnp/core.c -<!-- No correct structured comments -X!Edrivers/pnp/system.c - --> -!Edrivers/pnp/card.c -!Idrivers/pnp/driver.c -!Edrivers/pnp/manager.c -!Edrivers/pnp/support.c - </sect1> - <sect1><title>Userspace IO devices</title> -!Edrivers/uio/uio.c -!Iinclude/linux/uio_driver.h - </sect1> - </chapter> - <chapter id="blkdev"> <title>Block Devices</title> !Eblock/blk-core.c @@ -381,275 +273,6 @@ X!Edrivers/pnp/system.c !Edrivers/char/misc.c </chapter> - <chapter id="parportdev"> - <title>Parallel Port Devices</title> -!Iinclude/linux/parport.h -!Edrivers/parport/ieee1284.c -!Edrivers/parport/share.c -!Idrivers/parport/daisy.c - </chapter> - - <chapter id="message_devices"> - <title>Message-based devices</title> - <sect1><title>Fusion message devices</title> -!Edrivers/message/fusion/mptbase.c -!Idrivers/message/fusion/mptbase.c -!Edrivers/message/fusion/mptscsih.c -!Idrivers/message/fusion/mptscsih.c -!Idrivers/message/fusion/mptctl.c -!Idrivers/message/fusion/mptspi.c -!Idrivers/message/fusion/mptfc.c -!Idrivers/message/fusion/mptlan.c - </sect1> - <sect1><title>I2O message devices</title> -!Iinclude/linux/i2o.h -!Idrivers/message/i2o/core.h -!Edrivers/message/i2o/iop.c -!Idrivers/message/i2o/iop.c -!Idrivers/message/i2o/config-osm.c -!Edrivers/message/i2o/exec-osm.c -!Idrivers/message/i2o/exec-osm.c -!Idrivers/message/i2o/bus-osm.c -!Edrivers/message/i2o/device.c -!Idrivers/message/i2o/device.c -!Idrivers/message/i2o/driver.c -!Idrivers/message/i2o/pci.c -!Idrivers/message/i2o/i2o_block.c -!Idrivers/message/i2o/i2o_scsi.c -!Idrivers/message/i2o/i2o_proc.c - </sect1> - </chapter> - - <chapter id="snddev"> - <title>Sound Devices</title> -!Iinclude/sound/core.h -!Esound/sound_core.c -!Iinclude/sound/pcm.h -!Esound/core/pcm.c -!Esound/core/device.c -!Esound/core/info.c -!Esound/core/rawmidi.c -!Esound/core/sound.c -!Esound/core/memory.c -!Esound/core/pcm_memory.c -!Esound/core/init.c -!Esound/core/isadma.c -!Esound/core/control.c -!Esound/core/pcm_lib.c -!Esound/core/hwdep.c -!Esound/core/pcm_native.c -!Esound/core/memalloc.c -<!-- FIXME: Removed for now since no structured comments in source -X!Isound/sound_firmware.c ---> - </chapter> - - <chapter id="uart16x50"> - <title>16x50 UART Driver</title> -!Iinclude/linux/serial_core.h -!Edrivers/serial/serial_core.c -!Edrivers/serial/8250.c - </chapter> - - <chapter id="fbdev"> - <title>Frame Buffer Library</title> - - <para> - The frame buffer drivers depend heavily on four data structures. - These structures are declared in include/linux/fb.h. They are - fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs. - The last three can be made available to and from userland. - </para> - - <para> - fb_info defines the current state of a particular video card. - Inside fb_info, there exists a fb_ops structure which is a - collection of needed functions to make fbdev and fbcon work. - fb_info is only visible to the kernel. - </para> - - <para> - fb_var_screeninfo is used to describe the features of a video card - that are user defined. With fb_var_screeninfo, things such as - depth and the resolution may be defined. - </para> - - <para> - The next structure is fb_fix_screeninfo. This defines the - properties of a card that are created when a mode is set and can't - be changed otherwise. A good example of this is the start of the - frame buffer memory. This "locks" the address of the frame buffer - memory, so that it cannot be changed or moved. - </para> - - <para> - The last structure is fb_monospecs. In the old API, there was - little importance for fb_monospecs. This allowed for forbidden things - such as setting a mode of 800x600 on a fix frequency monitor. With - the new API, fb_monospecs prevents such things, and if used - correctly, can prevent a monitor from being cooked. fb_monospecs - will not be useful until kernels 2.5.x. - </para> - - <sect1><title>Frame Buffer Memory</title> -!Edrivers/video/fbmem.c - </sect1> -<!-- - <sect1><title>Frame Buffer Console</title> -X!Edrivers/video/console/fbcon.c - </sect1> ---> - <sect1><title>Frame Buffer Colormap</title> -!Edrivers/video/fbcmap.c - </sect1> -<!-- FIXME: - drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment - out until somebody adds docs. KAO - <sect1><title>Frame Buffer Generic Functions</title> -X!Idrivers/video/fbgen.c - </sect1> -KAO --> - <sect1><title>Frame Buffer Video Mode Database</title> -!Idrivers/video/modedb.c -!Edrivers/video/modedb.c - </sect1> - <sect1><title>Frame Buffer Macintosh Video Mode Database</title> -!Edrivers/video/macmodes.c - </sect1> - <sect1><title>Frame Buffer Fonts</title> - <para> - Refer to the file drivers/video/console/fonts.c for more information. - </para> -<!-- FIXME: Removed for now since no structured comments in source -X!Idrivers/video/console/fonts.c ---> - </sect1> - </chapter> - - <chapter id="input_subsystem"> - <title>Input Subsystem</title> -!Iinclude/linux/input.h -!Edrivers/input/input.c -!Edrivers/input/ff-core.c -!Edrivers/input/ff-memless.c - </chapter> - - <chapter id="spi"> - <title>Serial Peripheral Interface (SPI)</title> - <para> - SPI is the "Serial Peripheral Interface", widely used with - embedded systems because it is a simple and efficient - interface: basically a multiplexed shift register. - Its three signal wires hold a clock (SCK, often in the range - of 1-20 MHz), a "Master Out, Slave In" (MOSI) data line, and - a "Master In, Slave Out" (MISO) data line. - SPI is a full duplex protocol; for each bit shifted out the - MOSI line (one per clock) another is shifted in on the MISO line. - Those bits are assembled into words of various sizes on the - way to and from system memory. - An additional chipselect line is usually active-low (nCS); - four signals are normally used for each peripheral, plus - sometimes an interrupt. - </para> - <para> - The SPI bus facilities listed here provide a generalized - interface to declare SPI busses and devices, manage them - according to the standard Linux driver model, and perform - input/output operations. - At this time, only "master" side interfaces are supported, - where Linux talks to SPI peripherals and does not implement - such a peripheral itself. - (Interfaces to support implementing SPI slaves would - necessarily look different.) - </para> - <para> - The programming interface is structured around two kinds of driver, - and two kinds of device. - A "Controller Driver" abstracts the controller hardware, which may - be as simple as a set of GPIO pins or as complex as a pair of FIFOs - connected to dual DMA engines on the other side of the SPI shift - register (maximizing throughput). Such drivers bridge between - whatever bus they sit on (often the platform bus) and SPI, and - expose the SPI side of their device as a - <structname>struct spi_master</structname>. - SPI devices are children of that master, represented as a - <structname>struct spi_device</structname> and manufactured from - <structname>struct spi_board_info</structname> descriptors which - are usually provided by board-specific initialization code. - A <structname>struct spi_driver</structname> is called a - "Protocol Driver", and is bound to a spi_device using normal - driver model calls. - </para> - <para> - The I/O model is a set of queued messages. Protocol drivers - submit one or more <structname>struct spi_message</structname> - objects, which are processed and completed asynchronously. - (There are synchronous wrappers, however.) Messages are - built from one or more <structname>struct spi_transfer</structname> - objects, each of which wraps a full duplex SPI transfer. - A variety of protocol tweaking options are needed, because - different chips adopt very different policies for how they - use the bits transferred with SPI. - </para> -!Iinclude/linux/spi/spi.h -!Fdrivers/spi/spi.c spi_register_board_info -!Edrivers/spi/spi.c - </chapter> - - <chapter id="i2c"> - <title>I<superscript>2</superscript>C and SMBus Subsystem</title> - - <para> - I<superscript>2</superscript>C (or without fancy typography, "I2C") - is an acronym for the "Inter-IC" bus, a simple bus protocol which is - widely used where low data rate communications suffice. - Since it's also a licensed trademark, some vendors use another - name (such as "Two-Wire Interface", TWI) for the same bus. - I2C only needs two signals (SCL for clock, SDA for data), conserving - board real estate and minimizing signal quality issues. - Most I2C devices use seven bit addresses, and bus speeds of up - to 400 kHz; there's a high speed extension (3.4 MHz) that's not yet - found wide use. - I2C is a multi-master bus; open drain signaling is used to - arbitrate between masters, as well as to handshake and to - synchronize clocks from slower clients. - </para> - - <para> - The Linux I2C programming interfaces support only the master - side of bus interactions, not the slave side. - The programming interface is structured around two kinds of driver, - and two kinds of device. - An I2C "Adapter Driver" abstracts the controller hardware; it binds - to a physical device (perhaps a PCI device or platform_device) and - exposes a <structname>struct i2c_adapter</structname> representing - each I2C bus segment it manages. - On each I2C bus segment will be I2C devices represented by a - <structname>struct i2c_client</structname>. Those devices will - be bound to a <structname>struct i2c_driver</structname>, - which should follow the standard Linux driver model. - (At this writing, a legacy model is more widely used.) - There are functions to perform various I2C protocol operations; at - this writing all such functions are usable only from task context. - </para> - - <para> - The System Management Bus (SMBus) is a sibling protocol. Most SMBus - systems are also I2C conformant. The electrical constraints are - tighter for SMBus, and it standardizes particular protocol messages - and idioms. Controllers that support I2C can also support most - SMBus operations, but SMBus controllers don't support all the protocol - options that an I2C controller will. - There are functions to perform various SMBus protocol operations, - either using I2C primitives or by issuing SMBus commands to - i2c_adapter devices which don't support those I2C operations. - </para> - -!Iinclude/linux/i2c.h -!Fdrivers/i2c/i2c-boardinfo.c i2c_register_board_info -!Edrivers/i2c/i2c-core.c - </chapter> - <chapter id="clk"> <title>Clock Framework</title> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index b182626739ea..f6d5d5b9b2b1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -114,7 +114,7 @@ In addition, the following text indicates that the option: Parameters denoted with BOOT are actually interpreted by the boot loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme -need or coordination with <Documentation/x86/i386/boot.txt>. +need or coordination with <Documentation/x86/boot.txt>. There are also arch-specific kernel-parameters not documented here. See for example <Documentation/x86/x86_64/boot-options.txt>. @@ -134,7 +134,7 @@ and is between 256 and 4096 characters. It is defined in the file acpi= [HW,ACPI,X86-64,i386] Advanced Configuration and Power Interface - Format: { force | off | ht | strict | noirq } + Format: { force | off | ht | strict | noirq | rsdt } force -- enable ACPI if default was off off -- disable ACPI if default was on noirq -- do not use ACPI for IRQ routing @@ -2449,7 +2449,7 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/fb/modedb.txt. vga= [BOOT,X86-32] Select a particular video mode - See Documentation/x86/i386/boot.txt and + See Documentation/x86/boot.txt and Documentation/svga.txt. Use vga=ask for menu. This is actually a boot loader parameter; the value is @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 29 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Erotic Pickled Herring # *DOCUMENTATION* diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 29c5fbf08392..3a8a866fb2e2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -25,14 +25,12 @@ #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> .section ".text.head","ax",@progbits - .globl startup_32 - -startup_32: +ENTRY(startup_32) cld /* test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ @@ -113,6 +111,8 @@ startup_32: */ leal relocated(%ebx), %eax jmp *%eax +ENDPROC(startup_32) + .section ".text" relocated: diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1d5dff4123e1..ed4a82948002 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -26,8 +26,8 @@ #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h> #include <asm/boot.h> #include <asm/msr.h> #include <asm/processor-flags.h> @@ -35,9 +35,7 @@ .section ".text.head" .code32 - .globl startup_32 - -startup_32: +ENTRY(startup_32) cld /* test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ @@ -176,6 +174,7 @@ startup_32: /* Jump from 32bit compatibility mode into 64bit mode. */ lret +ENDPROC(startup_32) no_longmode: /* This isn't an x86-64 CPU so hang */ @@ -295,7 +294,6 @@ relocated: call decompress_kernel popq %rsi - /* * Jump to the decompressed kernel. */ diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index ef50c84e8b4b..11f272c6f5e9 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -8,6 +8,8 @@ * * ----------------------------------------------------------------------- */ +#include <linux/linkage.h> + /* * Memory copy routines */ @@ -15,9 +17,7 @@ .code16gcc .text - .globl memcpy - .type memcpy, @function -memcpy: +GLOBAL(memcpy) pushw %si pushw %di movw %ax, %di @@ -31,11 +31,9 @@ memcpy: popw %di popw %si ret - .size memcpy, .-memcpy +ENDPROC(memcpy) - .globl memset - .type memset, @function -memset: +GLOBAL(memset) pushw %di movw %ax, %di movzbl %dl, %eax @@ -48,52 +46,42 @@ memset: rep; stosb popw %di ret - .size memset, .-memset +ENDPROC(memset) - .globl copy_from_fs - .type copy_from_fs, @function -copy_from_fs: +GLOBAL(copy_from_fs) pushw %ds pushw %fs popw %ds call memcpy popw %ds ret - .size copy_from_fs, .-copy_from_fs +ENDPROC(copy_from_fs) - .globl copy_to_fs - .type copy_to_fs, @function -copy_to_fs: +GLOBAL(copy_to_fs) pushw %es pushw %fs popw %es call memcpy popw %es ret - .size copy_to_fs, .-copy_to_fs +ENDPROC(copy_to_fs) #if 0 /* Not currently used, but can be enabled as needed */ - - .globl copy_from_gs - .type copy_from_gs, @function -copy_from_gs: +GLOBAL(copy_from_gs) pushw %ds pushw %gs popw %ds call memcpy popw %ds ret - .size copy_from_gs, .-copy_from_gs - .globl copy_to_gs +ENDPROC(copy_from_gs) - .type copy_to_gs, @function -copy_to_gs: +GLOBAL(copy_to_gs) pushw %es pushw %gs popw %es call memcpy popw %es ret - .size copy_to_gs, .-copy_to_gs - +ENDPROC(copy_to_gs) #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b993062e9a5f..7ccff4884a23 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -19,7 +19,7 @@ #include <linux/utsrelease.h> #include <asm/boot.h> #include <asm/e820.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/setup.h> #include "boot.h" #include "offsets.h" diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index 141b6e20ed31..019c17a75851 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -15,18 +15,15 @@ #include <asm/boot.h> #include <asm/processor-flags.h> #include <asm/segment.h> +#include <linux/linkage.h> .text - - .globl protected_mode_jump - .type protected_mode_jump, @function - .code16 /* * void protected_mode_jump(u32 entrypoint, u32 bootparams); */ -protected_mode_jump: +GLOBAL(protected_mode_jump) movl %edx, %esi # Pointer to boot_params table xorl %ebx, %ebx @@ -47,12 +44,10 @@ protected_mode_jump: .byte 0x66, 0xea # ljmpl opcode 2: .long in_pm32 # offset .word __BOOT_CS # segment - - .size protected_mode_jump, .-protected_mode_jump +ENDPROC(protected_mode_jump) .code32 - .type in_pm32, @function -in_pm32: +GLOBAL(in_pm32) # Set up data segments for flat 32-bit mode movl %ecx, %ds movl %ecx, %es @@ -78,5 +73,4 @@ in_pm32: lldt %cx jmpl *%eax # Jump to the 32-bit entrypoint - - .size in_pm32, .-in_pm32 +ENDPROC(in_pm32) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index dd77ac0cac46..588a7aa937e1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -33,8 +33,6 @@ #include <asm/sigframe.h> #include <asm/sys_ia32.h> -#define DEBUG_SIG 0 - #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ @@ -190,42 +188,47 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, /* * Do a signal return; undo the signal stack. */ +#define loadsegment_gs(v) load_gs_index(v) +#define loadsegment_fs(v) loadsegment(fs, v) +#define loadsegment_ds(v) loadsegment(ds, v) +#define loadsegment_es(v) loadsegment(es, v) + +#define get_user_seg(seg) ({ unsigned int v; savesegment(seg, v); v; }) +#define set_user_seg(seg, v) loadsegment_##seg(v) + #define COPY(x) { \ get_user_ex(regs->x, &sc->x); \ } -#define COPY_SEG_CPL3(seg) { \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} +#define GET_SEG(seg) ({ \ + unsigned short tmp; \ + get_user_ex(tmp, &sc->seg); \ + tmp; \ +}) + +#define COPY_SEG_CPL3(seg) do { \ + regs->seg = GET_SEG(seg) | 3; \ +} while (0) #define RELOAD_SEG(seg) { \ - unsigned int cur, pre; \ - get_user_ex(pre, &sc->seg); \ - savesegment(seg, cur); \ + unsigned int pre = GET_SEG(seg); \ + unsigned int cur = get_user_seg(seg); \ pre |= 3; \ if (pre != cur) \ - loadsegment(seg, pre); \ + set_user_seg(seg, pre); \ } static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *pax) { - unsigned int tmpflags, gs, oldgs, err = 0; + unsigned int tmpflags, err = 0; void __user *buf; u32 tmp; /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG restore_sigcontext: " - "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->ip, sc->cs, sc->flags); -#endif - get_user_try { /* * Reload fs and gs if they have changed in the signal @@ -233,12 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, * the handler, but does not clobber them at least in the * normal case. */ - get_user_ex(gs, &sc->gs); - gs |= 3; - savesegment(gs, oldgs); - if (gs != oldgs) - load_gs_index(gs); - + RELOAD_SEG(gs); RELOAD_SEG(fs); RELOAD_SEG(ds); RELOAD_SEG(es); @@ -337,17 +335,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned int mask) { - int tmp, err = 0; + int err = 0; put_user_try { - savesegment(gs, tmp); - put_user_ex(tmp, (unsigned int __user *)&sc->gs); - savesegment(fs, tmp); - put_user_ex(tmp, (unsigned int __user *)&sc->fs); - savesegment(ds, tmp); - put_user_ex(tmp, (unsigned int __user *)&sc->ds); - savesegment(es, tmp); - put_user_ex(tmp, (unsigned int __user *)&sc->es); + put_user_ex(get_user_seg(gs), (unsigned int __user *)&sc->gs); + put_user_ex(get_user_seg(fs), (unsigned int __user *)&sc->fs); + put_user_ex(get_user_seg(ds), (unsigned int __user *)&sc->ds); + put_user_ex(get_user_seg(es), (unsigned int __user *)&sc->es); put_user_ex(regs->di, &sc->di); put_user_ex(regs->si, &sc->si); @@ -488,11 +482,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->cs = __USER32_CS; regs->ss = __USER32_DS; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->ip, frame->pretcode); -#endif - return 0; } @@ -574,10 +563,5 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER32_CS; regs->ss = __USER32_DS; -#if DEBUG_SIG - printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->ip, frame->pretcode); -#endif - return 0; } diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 4f8e820cf38f..683d0b4c00fc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -124,10 +124,15 @@ static inline void *phys_to_virt(phys_addr_t address) /* * ISA I/O bus memory addresses are 1:1 with the physical address. + * However, we truncate the address to unsigned int to avoid undesirable + * promitions in legacy drivers. */ -#define isa_virt_to_bus (unsigned long)virt_to_phys -#define isa_page_to_bus page_to_phys -#define isa_bus_to_virt phys_to_virt +static inline unsigned int isa_virt_to_bus(volatile void *address) +{ + return (unsigned int)virt_to_phys(address); +} +#define isa_page_to_bus(page) ((unsigned int)page_to_phys(page)) +#define isa_bus_to_virt phys_to_virt /* * However PCI ones are not necessarily 1:1 and therefore these interfaces diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index b07278c55e9e..8a285f356f8a 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -128,7 +128,7 @@ #ifndef __ASSEMBLY__ static inline int invalid_vm86_irq(int irq) { - return irq < 3 || irq > 15; + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; } #endif diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 5d98d0b68ffc..9320e2a8a26a 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -52,70 +52,14 @@ #endif +#define GLOBAL(name) \ + .globl name; \ + name: + #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 #define __ALIGN_STR ".align 16,0x90" #endif -/* - * to check ENTRY_X86/END_X86 and - * KPROBE_ENTRY_X86/KPROBE_END_X86 - * unbalanced-missed-mixed appearance - */ -#define __set_entry_x86 .set ENTRY_X86_IN, 0 -#define __unset_entry_x86 .set ENTRY_X86_IN, 1 -#define __set_kprobe_x86 .set KPROBE_X86_IN, 0 -#define __unset_kprobe_x86 .set KPROBE_X86_IN, 1 - -#define __macro_err_x86 .error "ENTRY_X86/KPROBE_X86 unbalanced,missed,mixed" - -#define __check_entry_x86 \ - .ifdef ENTRY_X86_IN; \ - .ifeq ENTRY_X86_IN; \ - __macro_err_x86; \ - .abort; \ - .endif; \ - .endif - -#define __check_kprobe_x86 \ - .ifdef KPROBE_X86_IN; \ - .ifeq KPROBE_X86_IN; \ - __macro_err_x86; \ - .abort; \ - .endif; \ - .endif - -#define __check_entry_kprobe_x86 \ - __check_entry_x86; \ - __check_kprobe_x86 - -#define ENTRY_KPROBE_FINAL_X86 __check_entry_kprobe_x86 - -#define ENTRY_X86(name) \ - __check_entry_kprobe_x86; \ - __set_entry_x86; \ - .globl name; \ - __ALIGN; \ - name: - -#define END_X86(name) \ - __unset_entry_x86; \ - __check_entry_kprobe_x86; \ - .size name, .-name - -#define KPROBE_ENTRY_X86(name) \ - __check_entry_kprobe_x86; \ - __set_kprobe_x86; \ - .pushsection .kprobes.text, "ax"; \ - .globl name; \ - __ALIGN; \ - name: - -#define KPROBE_END_X86(name) \ - __unset_kprobe_x86; \ - __check_entry_kprobe_x86; \ - .size name, .-name; \ - .popsection - #endif /* _ASM_X86_LINKAGE_H */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index b5486aaf36ec..f1e4a79a6e41 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -33,12 +33,10 @@ /* 44=32+12, the limit we can fit into an unsigned long pfn */ #define __PHYSICAL_MASK_SHIFT 44 #define __VIRTUAL_MASK_SHIFT 32 -#define PAGETABLE_LEVELS 3 #else /* !CONFIG_X86_PAE */ #define __PHYSICAL_MASK_SHIFT 32 #define __VIRTUAL_MASK_SHIFT 32 -#define PAGETABLE_LEVELS 2 #endif /* CONFIG_X86_PAE */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index bc73af3eda9c..d38c91b70248 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define PAGETABLE_LEVELS 4 - #define THREAD_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 2c52ff767584..2d625da6603c 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -16,12 +16,6 @@ (ie, 32-bit PAE). */ #define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK) -/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ -#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) - -/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ -#define PTE_FLAGS_MASK (~PTE_PFN_MASK) - #define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 09ae67efcebd..daacc23e3fb9 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -17,6 +17,7 @@ typedef union { #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 2 /* * traditional i386 two-level paging structure: @@ -25,6 +26,7 @@ typedef union { #define PGDIR_SHIFT 22 #define PTRS_PER_PGD 1024 + /* * the i386 is two-level, so we don't really have any * PMD directory physically. diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..1bd5876c8649 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -24,6 +24,8 @@ typedef union { #define SHARED_KERNEL_PMD 1 #endif +#define PAGETABLE_LEVELS 3 + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 2f59135c6f2a..fbf42b8e0383 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -18,6 +18,7 @@ typedef struct { pteval_t pte; } pte_t; #endif /* !__ASSEMBLY__ */ #define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 /* * PGDIR_SHIFT determines what a top-level page table entry can map diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9dafe87be2de..4d258ad76a0f 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -173,6 +173,12 @@ #include <linux/types.h> +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index dabab1a19ddd..c7a98f738210 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -403,7 +403,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary); #endif #endif /* X86_64 */ -extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; @@ -862,6 +861,7 @@ static inline void spin_lock_prefetch(const void *x) * User space process size: 3GB (default). */ #define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX STACK_TOP @@ -921,7 +921,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); /* * User space process size. 47bits minus one guard page. */ -#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE) +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -930,12 +930,12 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); 0xc0000000 : 0xFFFFe000) #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ - IA32_PAGE_OFFSET : TASK_SIZE64) + IA32_PAGE_OFFSET : TASK_SIZE_MAX) #define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX TASK_SIZE64 +#define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 258ef730aaa4..7043408f6904 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -82,7 +82,7 @@ asmlinkage long sys_iopl(unsigned int, struct pt_regs *); /* kernel/signal_64.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -asmlinkage long sys_rt_sigreturn(struct pt_regs *); +long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 84210c479fca..987a2c10fe20 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -192,14 +192,26 @@ static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_sleep(); - return __copy_user_nocache(dst, src, size, 1); + /* + * In practice this limit means that large file write()s + * which get chunked to 4K copies get handled via + * non-temporal stores here. Smaller writes get handled + * via regular __copy_from_user(): + */ + if (likely(size >= PAGE_SIZE)) + return __copy_user_nocache(dst, src, size, 1); + else + return __copy_from_user(dst, src, size); } static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { - return __copy_user_nocache(dst, src, size, 0); + if (likely(size >= PAGE_SIZE)) + return __copy_user_nocache(dst, src, size, 0); + else + return __copy_from_user_inatomic(dst, src, size); } unsigned long diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 3355973b12ac..580b4e296010 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -3,8 +3,8 @@ */ #include <asm/segment.h> #include <asm/msr-index.h> -#include <asm/page.h> -#include <asm/pgtable.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> #include <asm/processor-flags.h> .code16 diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index a12e6a9fb659..8ded418b0593 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,7 +1,7 @@ .section .text.page_aligned #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h> # Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 96258d9dc974..8ea5164cbd04 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -1,8 +1,8 @@ .text #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/pgtable.h> -#include <asm/page.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h> #include <asm/msr.h> #include <asm/asm-offsets.h> diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index c2f930d86640..41ab3f064cb1 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -204,12 +204,12 @@ static int eps_cpu_init(struct cpufreq_policy *policy) } /* Enable Enhanced PowerSaver */ rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & 1 << 16)) { - val |= 1 << 16; + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; wrmsrl(MSR_IA32_MISC_ENABLE, val); /* Can be locked at 0 */ rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & 1 << 16)) { + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); return -ENODEV; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index f08998278a3a..c9f1fdc02830 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -390,14 +390,14 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) enable it if not. */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & (1<<16))) { - l |= (1<<16); + if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); wrmsr(MSR_IA32_MISC_ENABLE, l, h); /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & (1<<16))) { + if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 7aeef1d327b1..25c559ba8d54 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -146,10 +146,10 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); - if ((lo & (1<<9)) == 0) { + if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); - lo |= (1<<9); /* Disable hw prefetching */ + lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index 4b7d78cdc0a5..aa5e287c98e0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -49,13 +49,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); - if ((l & (1 << 3)) && (h & APIC_DM_SMI)) { + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) tm2 = 1; if (h & APIC_VECTOR_MASK) { @@ -73,7 +73,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h); rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 9b60fce09f75..f53bdcbaf382 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -85,7 +85,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); h = apic_read(APIC_LVTTHMR); - if ((l & (1<<3)) && (h & APIC_DM_SMI)) { + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); return; /* -EBUSY */ @@ -111,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) vendor_thermal_interrupt = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); - wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S index ef00bb77d7e4..fbe66e626c09 100644 --- a/arch/x86/kernel/efi_stub_32.S +++ b/arch/x86/kernel/efi_stub_32.S @@ -6,7 +6,7 @@ */ #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h> /* * efi_call_phys(void *, ...) is a function with variable parameters. @@ -113,6 +113,7 @@ ENTRY(efi_call_phys) movl (%edx), %ecx pushl %ecx ret +ENDPROC(efi_call_phys) .previous .data diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S index 99b47d48c9f4..4c07ccab8146 100644 --- a/arch/x86/kernel/efi_stub_64.S +++ b/arch/x86/kernel/efi_stub_64.S @@ -41,6 +41,7 @@ ENTRY(efi_call0) addq $32, %rsp RESTORE_XMM ret +ENDPROC(efi_call0) ENTRY(efi_call1) SAVE_XMM @@ -50,6 +51,7 @@ ENTRY(efi_call1) addq $32, %rsp RESTORE_XMM ret +ENDPROC(efi_call1) ENTRY(efi_call2) SAVE_XMM @@ -59,6 +61,7 @@ ENTRY(efi_call2) addq $32, %rsp RESTORE_XMM ret +ENDPROC(efi_call2) ENTRY(efi_call3) SAVE_XMM @@ -69,6 +72,7 @@ ENTRY(efi_call3) addq $32, %rsp RESTORE_XMM ret +ENDPROC(efi_call3) ENTRY(efi_call4) SAVE_XMM @@ -80,6 +84,7 @@ ENTRY(efi_call4) addq $32, %rsp RESTORE_XMM ret +ENDPROC(efi_call4) ENTRY(efi_call5) SAVE_XMM @@ -92,6 +97,7 @@ ENTRY(efi_call5) addq $48, %rsp RESTORE_XMM ret +ENDPROC(efi_call5) ENTRY(efi_call6) SAVE_XMM @@ -107,3 +113,4 @@ ENTRY(efi_call6) addq $48, %rsp RESTORE_XMM ret +ENDPROC(efi_call6) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e99206831459..899e8938e79f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -47,7 +47,7 @@ #include <asm/errno.h> #include <asm/segment.h> #include <asm/smp.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/desc.h> #include <asm/percpu.h> #include <asm/dwarf2.h> @@ -1359,7 +1359,7 @@ nmi_espfix_stack: CFI_ADJUST_CFA_OFFSET 4 pushl %esp CFI_ADJUST_CFA_OFFSET 4 - addw $4, (%esp) + addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 pushl 16(%esp) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fbcf96b295ff..83d1836b9467 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -48,7 +48,7 @@ #include <asm/unistd.h> #include <asm/thread_info.h> #include <asm/hw_irq.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/irqflags.h> #include <asm/paravirt.h> #include <asm/ftrace.h> @@ -77,20 +77,17 @@ ENTRY(ftrace_caller) movq 8(%rbp), %rsi subq $MCOUNT_INSN_SIZE, %rdi -.globl ftrace_call -ftrace_call: +GLOBAL(ftrace_call) call ftrace_stub MCOUNT_RESTORE_FRAME #ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: +GLOBAL(ftrace_graph_call) jmp ftrace_stub #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub) retq END(ftrace_caller) @@ -110,8 +107,7 @@ ENTRY(mcount) jnz ftrace_graph_caller #endif -.globl ftrace_stub -ftrace_stub: +GLOBAL(ftrace_stub) retq trace: @@ -148,9 +144,7 @@ ENTRY(ftrace_graph_caller) retq END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: +GLOBAL(return_to_handler) subq $80, %rsp movq %rax, (%rsp) @@ -188,6 +182,7 @@ return_to_handler: ENTRY(native_usergs_sysret64) swapgs sysretq +ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ @@ -633,16 +628,14 @@ tracesys: * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. */ - .globl int_ret_from_sys_call - .globl int_with_check -int_ret_from_sys_call: +GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ -int_with_check: +GLOBAL(int_with_check) LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) movl TI_flags(%rcx),%edx diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2a0aad7718d5..c32ca19d591a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -11,8 +11,8 @@ #include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> -#include <asm/pgtable.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> #include <asm/desc.h> #include <asm/cache.h> #include <asm/thread_info.h> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2e648e3a5ea4..54b29bb24e71 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -329,8 +329,6 @@ early_idt_ripmsg: #endif /* CONFIG_EARLY_PRINTK */ .previous -.balign PAGE_SIZE - #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ ENTRY(name) @@ -419,7 +417,7 @@ ENTRY(phys_base) .section .bss, "aw", @nobits .align L1_CACHE_BYTES ENTRY(idt_table) - .skip 256 * 16 + .skip IDT_ENTRIES * 16 .section .bss.page_aligned, "aw", @nobits .align PAGE_SIZE diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 37f420018a41..f5fc8c781a62 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -121,7 +121,7 @@ static void machine_kexec_page_table_set_one( static void machine_kexec_prepare_page_tables(struct kimage *image) { void *control_page; - pmd_t *pmd = 0; + pmd_t *pmd = NULL; control_page = page_address(image->control_code_page); #ifdef CONFIG_X86_PAE diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d2f7cd5b2c83..fb2159a5c817 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -268,7 +268,7 @@ static unsigned long debugreg_addr_limit(struct task_struct *task) if (test_tsk_thread_flag(task, TIF_IA32)) return IA32_PAGE_OFFSET - 3; #endif - return TASK_SIZE64 - 7; + return TASK_SIZE_MAX - 7; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index a160f3119725..2064d0aa8d28 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -7,7 +7,7 @@ */ #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/kexec.h> #include <asm/processor-flags.h> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index b0bbdd4829c9..d32cfb27a479 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -7,10 +7,10 @@ */ #include <linux/linkage.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/kexec.h> #include <asm/processor-flags.h> -#include <asm/pgtable.h> +#include <asm/pgtable_types.h> /* * Must be relocatable PIC code callable as a C function diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index d8ccc3c6552f..66d874e5404c 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -29,7 +29,7 @@ #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h> /* We can free up trampoline after bootup if cpu hotplug is not supported. */ #ifndef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 95a012a4664e..cddfb8d386b9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,8 +25,8 @@ */ #include <linux/linkage.h> -#include <asm/pgtable.h> -#include <asm/page.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h> #include <asm/msr.h> #include <asm/segment.h> #include <asm/processor-flags.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c8c0a7e530be..c05430ac1b44 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -942,7 +942,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_signo = SIGILL; info.si_errno = 0; info.si_code = ILL_BADSTK; - info.si_addr = 0; + info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 32, SIGILL) == NOTIFY_STOP) return; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 49b4cd6707f9..33a788d5879c 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -287,8 +287,7 @@ static struct clocksource clocksource_vmi; static cycle_t read_real_cycles(void) { cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); - return ret >= clocksource_vmi.cycle_last ? - ret : clocksource_vmi.cycle_last; + return max(ret, clocksource_vmi.cycle_last); } static struct clocksource clocksource_vmi = { diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 3eba7f7bac05..0d860963f268 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -12,7 +12,7 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/cache.h> #include <asm/boot.h> diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 087a7f2c639b..fbfced6f6800 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -6,7 +6,7 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/asm-offsets.h> -#include <asm/page.h> +#include <asm/page_types.h> #undef i386 /* in case the preprocessor is a 32bit one */ diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index ad374003742f..51f1504cddd9 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -28,7 +28,7 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 29644175490f..a03b7279efa0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1,74 +1,79 @@ /* * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. + * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mmiotrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> #include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/mmiotrace.h> +#include <linux/bootmem.h> #include <linux/compiler.h> #include <linux/highmem.h> -#include <linux/bootmem.h> /* for max_low_pfn */ -#include <linux/vmalloc.h> -#include <linux/module.h> #include <linux/kprobes.h> #include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/vt_kern.h> +#include <linux/signal.h> +#include <linux/kernel.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/module.h> #include <linux/kdebug.h> +#include <linux/errno.h> #include <linux/magic.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/mman.h> +#include <linux/tty.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm-generic/sections.h> -#include <asm/system.h> -#include <asm/desc.h> -#include <asm/segment.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/segment.h> +#include <asm/system.h> #include <asm/proto.h> -#include <asm-generic/sections.h> #include <asm/traps.h> +#include <asm/desc.h> /* - * Page fault error code bits - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch */ -#define PF_PROT (1<<0) -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) +enum x86_pf_error_code { + PF_PROT = 1 << 0, + PF_WRITE = 1 << 1, + PF_USER = 1 << 2, + PF_RSVD = 1 << 3, + PF_INSTR = 1 << 4, +}; + +/* + * Returns 0 if mmiotrace is disabled, or if the fault is not + * handled by mmiotrace: + */ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { -#ifdef CONFIG_MMIOTRACE if (unlikely(is_kmmio_active())) if (kmmio_handler(regs, addr) == 1) return -1; -#endif return 0; } static inline int notify_page_fault(struct pt_regs *regs) { -#ifdef CONFIG_KPROBES int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode_vm(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -76,29 +81,76 @@ static inline int notify_page_fault(struct pt_regs *regs) } return ret; -#else - return 0; -#endif } /* - * X86_32 - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. + * Prefetch quirks: * - * X86_64 - * Sometimes the CPU reports invalid exceptions on prefetch. - * Check that here and ignore it. + * 32-bit mode: * - * Opcode checker based on code by Richard Brunner + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * 64-bit mode: + * + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner. */ -static int is_prefetch(struct pt_regs *regs, unsigned long error_code, - unsigned long addr) +static inline int +check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr, + unsigned char opcode, int *prefetch) { + unsigned char instr_hi = opcode & 0xf0; + unsigned char instr_lo = opcode & 0x0f; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + return ((instr_lo & 7) == 0x6); +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + return (!user_mode(regs)) || (regs->cs == __USER_CS); +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + return (instr_lo & 0xC) == 0x4; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + return !instr_lo || (instr_lo>>1) == 1; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + if (probe_kernel_address(instr, opcode)) + return 0; + + *prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + return 0; + default: + return 0; + } +} + +static int +is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) +{ + unsigned char *max_instr; unsigned char *instr; - int scan_more = 1; int prefetch = 0; - unsigned char *max_instr; /* * If it was a exec (instruction fetch) fault on NX page, then @@ -107,106 +159,170 @@ static int is_prefetch(struct pt_regs *regs, unsigned long error_code, if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_ip_to_linear(current, regs); + instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) return 0; - while (scan_more && instr < max_instr) { + while (instr < max_instr) { unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; if (probe_kernel_address(instr, opcode)) break; - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; instr++; - switch (instr_hi) { - case 0x20: - case 0x30: - /* - * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. - * In X86_64 long mode, the CPU will signal invalid - * opcode if some of these prefixes are present so - * X86_64 will never get here anyway - */ - scan_more = ((instr_lo & 7) == 0x6); - break; -#ifdef CONFIG_X86_64 - case 0x40: - /* - * In AMD64 long mode 0x40..0x4F are valid REX prefixes - * Need to figure out under what instruction mode the - * instruction was issued. Could check the LDT for lm, - * but for now it's good enough to assume that long - * mode only uses well known segments or kernel. - */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; -#endif - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); + if (!check_prefetch_opcode(regs, instr, opcode, &prefetch)) break; - default: - scan_more = 0; - break; - } } return prefetch; } -static void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) +static void +force_sig_info_fault(int si_signo, int si_code, unsigned long address, + struct task_struct *tsk) { siginfo_t info; - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); } -#ifdef CONFIG_X86_64 -static int bad_address(void *p) +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + } + + return pmd_k; +} + +void vmalloc_sync_all(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { + + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 32-bit: + * + * Handle a fault on the vmalloc or module mapping area + */ +static noinline int vmalloc_fault(unsigned long address) +{ + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + + return 0; +} + +/* + * Did it hit the DOS screen memory VA from vm86 mode? + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ + unsigned long bit; + + if (!v8086_mode(regs)) + return; + + bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; } -#endif static void dump_pagetable(unsigned long address) { -#ifdef CONFIG_X86_32 __typeof__(pte_val(__pte(0))) page; page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; + #ifdef CONFIG_X86_PAE printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; + & (PTRS_PER_PMD - 1)]; printk(KERN_CONT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } @@ -218,19 +334,145 @@ static void dump_pagetable(unsigned long address) * We must not directly access the pte in the highpte * case if the page table is located in highmem. * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. + * it's allocated already: */ if ((page >> PAGE_SHIFT) < max_low_pfn && (page & _PAGE_PRESENT) && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; + & (PTRS_PER_PTE - 1)]; printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } printk("\n"); -#else /* CONFIG_X86_64 */ +} + +#else /* CONFIG_X86_64: */ + +void vmalloc_sync_all(void) +{ + unsigned long address; + + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { + + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* + * 64-bit: + * + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static noinline int vmalloc_fault(unsigned long address) +{ + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Make sure we are in vmalloc area: */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + + /* + * Copy kernel mappings over when needed. This can also + * happen within a race in page table update. In the later + * case just flush: + */ + pgd = pgd_offset(current->active_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* + * Below here mismatches are bugs because these lower tables + * are shared: + */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + + pte = pte_offset_kernel(pmd, address); + + /* + * Don't use pte_page here, because the mappings can point + * outside mem_map, and the NUMA hash lookup cannot handle + * that: + */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + + return 0; +} + +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; + +/* + * No vm86 mode in 64-bit mode: + */ +static inline void +check_v8086_mode(struct pt_regs *regs, unsigned long address, + struct task_struct *tsk) +{ +} + +static int bad_address(void *p) +{ + unsigned long dummy; + + return probe_kernel_address((unsigned long *)p, dummy); +} + +static void dump_pagetable(unsigned long address) +{ pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -239,102 +481,77 @@ static void dump_pagetable(unsigned long address) pgd = (pgd_t *)read_cr3(); pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; + if (bad_address(pgd)) + goto bad; + printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; + + if (!pgd_present(*pgd)) + goto out; pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; + if (bad_address(pud)) + goto bad; + printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud) || pud_large(*pud)) - goto ret; + goto out; pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; + if (bad_address(pmd)) + goto bad; + printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) + goto out; pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; + if (bad_address(pte)) + goto bad; + printk("PTE %lx", pte_val(*pte)); -ret: +out: printk("\n"); return; bad: printk("BAD\n"); -#endif } -#ifdef CONFIG_X86_32 -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; +#endif /* CONFIG_X86_64 */ - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} -#endif - -#ifdef CONFIG_X86_64 -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; -#endif - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. - Does nothing for X86_32 +/* + * Workaround for K8 erratum #93 & buggy BIOS. + * + * BIOS SMM functions are required to use a specific workaround + * to avoid corruption of the 64bit RIP register on C stepping K8. + * + * A lot of BIOS that didn't get tested properly miss this. + * + * The OS sees this as a page fault with the upper 32bits of RIP cleared. + * Try to work around it here. + * + * Note we only handle faults in kernel here. + * Does nothing on 32-bit. */ static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int warned; + static int once; + if (address != regs->ip) return 0; + if ((address >> 32) != 0) return 0; + address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { + if (!once) { printk(errata93_warning); - warned = 1; + once = 1; } regs->ip = address; return 1; @@ -344,16 +561,17 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) } /* - * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal - * addresses >4GB. We catch this in the page fault handler because these - * addresses are not reachable. Just detect this case and return. Any code + * Work around K8 erratum #100 K8 in compat mode occasionally jumps + * to illegal addresses >4GB. + * + * We catch this in the page fault handler because these addresses + * are not reachable. Just detect this case and return. Any code * segment in LDT is compatibility mode. */ static int is_errata100(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32)) return 1; #endif return 0; @@ -363,8 +581,9 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG unsigned long nr; + /* - * Pentium F0 0F C7 C8 bug workaround. + * Pentium F0 0F C7 C8 bug workaround: */ if (boot_cpu_data.f00f_bug) { nr = (address - idt_descr.address) >> 3; @@ -378,80 +597,87 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) return 0; } -static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static const char nx_warning[] = KERN_CRIT +"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; + +static void +show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { -#ifdef CONFIG_X86_32 if (!oops_may_print()) return; -#endif -#ifdef CONFIG_X86_PAE if (error_code & PF_INSTR) { unsigned int level; + pte_t *pte = lookup_address(address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current_uid()); + printk(nx_warning, current_uid()); } -#endif printk(KERN_ALERT "BUG: unable to handle kernel "); if (address < PAGE_SIZE) printk(KERN_CONT "NULL pointer dereference"); else printk(KERN_CONT "paging request"); + printk(KERN_CONT " at %p\n", (void *) address); printk(KERN_ALERT "IP:"); printk_address(regs->ip, 1); + dump_pagetable(address); } -#ifdef CONFIG_X86_64 -static noinline void pgtable_bad(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +pgtable_bad(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { - unsigned long flags = oops_begin(); - int sig = SIGKILL; - struct task_struct *tsk = current; + struct task_struct *tsk; + unsigned long flags; + int sig; + + flags = oops_begin(); + tsk = current; + sig = SIGKILL; printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", tsk->comm, address); dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) sig = 0; + oops_end(flags, regs, sig); } -#endif -static noinline void no_context(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +no_context(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk = current; unsigned long *stackend; - -#ifdef CONFIG_X86_64 unsigned long flags; int sig; -#endif - /* Are we prepared to handle this kernel fault? */ + /* Are we prepared to handle this kernel fault? */ if (fixup_exception(regs)) return; /* - * X86_32 - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. + * 32-bit: + * + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * 64-bit: * - * X86_64 - * Hall of shame of CPU/BIOS bugs. + * Hall of shame of CPU/BIOS bugs. */ if (is_prefetch(regs, error_code, address)) return; @@ -461,54 +687,70 @@ static noinline void no_context(struct pt_regs *regs, /* * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. + * terminate things with extreme prejudice: */ -#ifdef CONFIG_X86_32 - bust_spinlocks(1); -#else flags = oops_begin(); -#endif show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); + stackend = end_of_stack(tsk); if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; -#ifdef CONFIG_X86_32 - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); -#else sig = SIGKILL; if (__die("Oops", regs, error_code)) sig = 0; + /* Executive summary in case the body of the oops scrolled away */ printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, sig); -#endif } -static void __bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +/* + * Print out info about fatal segfaults, if the show_unhandled_signals + * sysctl is set: + */ +static inline void +show_signal_msg(struct pt_regs *regs, unsigned long error_code, + unsigned long address, struct task_struct *tsk) +{ + if (!unhandled_signal(tsk, SIGSEGV)) + return; + + if (!printk_ratelimit()) + return; + + printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, + (void *)regs->ip, (void *)regs->sp, error_code); + + print_vma_addr(KERN_CONT " in ", regs->ip); + + printk(KERN_CONT "\n"); +} + +static void +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct task_struct *tsk = current; /* User mode accesses just cause a SIGSEGV */ if (error_code & PF_USER) { /* - * It's possible to have interrupts off here. + * It's possible to have interrupts off here: */ local_irq_enable(); /* * Valid to do another page fault here because this one came - * from user space. + * from user space: */ if (is_prefetch(regs, error_code, address)) return; @@ -516,22 +758,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, if (is_errata100(regs, address)) return; - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, - (void *) regs->ip, (void *) regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } + if (unlikely(show_unhandled_signals)) + show_signal_msg(regs, error_code, address, tsk); + + /* Kernel addresses are always protection faults: */ + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; } @@ -541,15 +777,16 @@ static void __bad_area_nosemaphore(struct pt_regs *regs, no_context(regs, error_code, address); } -static noinline void bad_area_nosemaphore(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR); } -static void __bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address, - int si_code) +static void +__bad_area(struct pt_regs *regs, unsigned long error_code, + unsigned long address, int si_code) { struct mm_struct *mm = current->mm; @@ -562,67 +799,75 @@ static void __bad_area(struct pt_regs *regs, __bad_area_nosemaphore(regs, error_code, address, si_code); } -static noinline void bad_area(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) { __bad_area(regs, error_code, address, SEGV_MAPERR); } -static noinline void bad_area_access_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static noinline void +bad_area_access_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { __bad_area(regs, error_code, address, SEGV_ACCERR); } /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ -static void out_of_memory(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +out_of_memory(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { /* * We ran out of memory, call the OOM killer, and return the userspace - * (which will retry the fault, or kill us if we got oom-killed). + * (which will retry the fault, or kill us if we got oom-killed): */ up_read(¤t->mm->mmap_sem); + pagefault_out_of_memory(); } -static void do_sigbus(struct pt_regs *regs, - unsigned long error_code, unsigned long address) +static void +do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; up_read(&mm->mmap_sem); - /* Kernel mode? Handle exceptions or die */ + /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) no_context(regs, error_code, address); -#ifdef CONFIG_X86_32 - /* User space => ok to do another page fault */ + + /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) return; -#endif - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; + + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -static noinline void mm_fault_error(struct pt_regs *regs, - unsigned long error_code, unsigned long address, unsigned int fault) +static noinline void +mm_fault_error(struct pt_regs *regs, unsigned long error_code, + unsigned long address, unsigned int fault) { - if (fault & VM_FAULT_OOM) + if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); - else if (fault & VM_FAULT_SIGBUS) - do_sigbus(regs, error_code, address); - else - BUG(); + } else { + if (fault & VM_FAULT_SIGBUS) + do_sigbus(regs, error_code, address); + else + BUG(); + } } static int spurious_fault_check(unsigned long error_code, pte_t *pte) { if ((error_code & PF_WRITE) && !pte_write(*pte)) return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) return 0; @@ -630,21 +875,25 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte) } /* - * Handle a spurious fault caused by a stale TLB entry. This allows - * us to lazily refresh the TLB when increasing the permissions of a - * kernel page (RO -> RW or NX -> X). Doing it eagerly is very - * expensive since that implies doing a full cross-processor TLB - * flush, even if no stale TLB entries exist on other processors. + * Handle a spurious fault caused by a stale TLB entry. + * + * This allows us to lazily refresh the TLB when increasing the + * permissions of a kernel page (RO -> RW or NX -> X). Doing it + * eagerly is very expensive since that implies doing a full + * cross-processor TLB flush, even if no stale TLB entries exist + * on other processors. + * * There are no security implications to leaving a stale TLB when * increasing the permissions on a page. */ -static noinline int spurious_fault(unsigned long error_code, - unsigned long address) +static noinline int +spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; + int ret; /* Reserved-bit violation or user access to kernel space? */ if (error_code & (PF_USER | PF_RSVD)) @@ -672,123 +921,46 @@ static noinline int spurious_fault(unsigned long error_code, if (!pte_present(*pte)) return 0; - return spurious_fault_check(error_code, pte); -} - -/* - * X86_32 - * Handle a fault on the vmalloc or module mapping area - * - * X86_64 - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static noinline int vmalloc_fault(unsigned long address) -{ -#ifdef CONFIG_X86_32 - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; + ret = spurious_fault_check(error_code, pte); + if (!ret) + return 0; /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. + * Make sure we have permissions in PMD. + * If not, then there's a bug in the page tables: */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -#else - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; + ret = spurious_fault_check(error_code, (pte_t *) pmd); + WARN_ONCE(!ret, "PMD has incorrect permission bits\n"); - /* Make sure we are in vmalloc area */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->active_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -#endif + return ret; } int show_unhandled_signals = 1; -static inline int access_error(unsigned long error_code, int write, - struct vm_area_struct *vma) +static inline int +access_error(unsigned long error_code, int write, struct vm_area_struct *vma) { if (write) { - /* write, present and write, not present */ + /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; - } else if (unlikely(error_code & PF_PROT)) { - /* read, present */ - return 1; - } else { - /* read, not present */ - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) - return 1; + return 0; } + /* read, present: */ + if (unlikely(error_code & PF_PROT)) + return 1; + + /* read, not present: */ + if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) + return 1; + return 0; } static int fault_in_kernel_space(unsigned long address) { -#ifdef CONFIG_X86_32 - return address >= TASK_SIZE; -#else /* !CONFIG_X86_32 */ - return address >= TASK_SIZE64; -#endif /* CONFIG_X86_32 */ + return address >= TASK_SIZE_MAX; } /* @@ -796,23 +968,22 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. */ -#ifdef CONFIG_X86_64 -asmlinkage -#endif -void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) { - unsigned long address; + struct vm_area_struct *vma; struct task_struct *tsk; + unsigned long address; struct mm_struct *mm; - struct vm_area_struct *vma; int write; int fault; tsk = current; mm = tsk->mm; + prefetchw(&mm->mmap_sem); - /* get the address */ + /* Get the faulting address: */ address = read_cr2(); if (unlikely(kmmio_fault(regs, address))) @@ -836,22 +1007,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) vmalloc_fault(address) >= 0) return; - /* Can handle a stale RO->RW TLB */ + /* Can handle a stale RO->RW TLB: */ if (spurious_fault(error_code, address)) return; - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. + * fault we could otherwise deadlock: */ bad_area_nosemaphore(regs, error_code, address); + return; } - /* kprobes don't want to hook the spurious faults. */ + /* kprobes don't want to hook the spurious faults: */ if (unlikely(notify_page_fault(regs))) return; /* @@ -859,22 +1031,22 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) * vmalloc fault has been handled. * * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. + * potential system fault or CPU buglet: */ if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; - } else if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + } else { + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); + } -#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); -#endif /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * If we're in an interrupt, have no user context or are running + * in an atomic region then we must not take the fault: */ if (unlikely(in_atomic() || !mm)) { bad_area_nosemaphore(regs, error_code, address); @@ -883,19 +1055,19 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. + * addresses in user space. All other faults represent errors in + * the kernel and should generate an OOPS. Unfortunately, in the + * case of an erroneous fault occurring in a code path which already + * holds mmap_sem we will deadlock attempting to validate the fault + * against the address space. Luckily the kernel only validly + * references user space from well defined areas of code, which are + * listed in the exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. + * the source reference check when there is a possibility of a + * deadlock. Attempt to lock the address space, if we cannot we then + * validate the source. If this is invalid we can skip the address + * space check, thus avoiding the deadlock: */ if (unlikely(!down_read_trylock(&mm->mmap_sem))) { if ((error_code & PF_USER) == 0 && @@ -906,8 +1078,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) down_read(&mm->mmap_sem); } else { /* - * The above down_read_trylock() might have succeeded in which - * case we'll have missed the might_sleep() from down_read(). + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): */ might_sleep(); } @@ -927,7 +1100,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) /* * Accessing the stack below %sp is always a bug. * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes + * and pusha to work. ("enter $65535, $31" pushes * 32 pointers and then decrements %sp by 65535.) */ if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { @@ -946,6 +1119,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ good_area: write = error_code & PF_WRITE; + if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -954,75 +1128,21 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault. + * the fault: */ fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } + if (fault & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; -#ifdef CONFIG_X86_32 - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (v8086_mode(regs)) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } -#endif - up_read(&mm->mmap_sem); -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); + check_v8086_mode(regs, address, tsk); -void vmalloc_sync_all(void) -{ - unsigned long address; - -#ifdef CONFIG_X86_32 - if (SHARED_KERNEL_PMD) - return; - - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE && address < FIXADDR_TOP; - address += PMD_SIZE) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), - address)) - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#else /* CONFIG_X86_64 */ - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } -#endif + up_read(&mm->mmap_sem); } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d1f7439d173c..3957cd6d6454 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -194,7 +194,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; + return NULL; node_remap_alloc_vaddr[nid] += size; memset(allocation, 0, size); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7be47d1a97e4..8253bc97587e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -482,6 +482,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) pbase = (pte_t *)page_address(base); paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + /* + * If we ever want to utilize the PAT bit, we need to + * update this function to make sure it's converted from + * bit 12 to bit 7 when we cross from the 2MB level to + * the 4K level: + */ + WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE); #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index d1e9b53f9d33..b641388d8286 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -8,7 +8,7 @@ #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 000415947d93..9356547d8c01 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -18,7 +18,7 @@ .text #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 9c98cc6ba978..7133cdf9098b 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -85,8 +85,8 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE64) - end = TASK_SIZE64; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; end -= len; /* This loses some more bits than a modulo, but is cheaper */ offset = get_random_int() & (PTRS_PER_PTE - 1); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 63d49a523ed3..1a5ff24e29c0 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -8,7 +8,7 @@ #include <asm/boot.h> #include <asm/asm.h> -#include <asm/page.h> +#include <asm/page_types.h> #include <xen/interface/elfnote.h> #include <asm/xen/interface.h> diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index d1dd5160daa9..2b6c59028254 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -272,7 +272,7 @@ acpi_os_map_memory(acpi_physical_address phys, acpi_size size) } EXPORT_SYMBOL_GPL(acpi_os_map_memory); -void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) +void __ref acpi_os_unmap_memory(void __iomem *virt, acpi_size size) { if (acpi_gbl_permanent_mmap) iounmap(virt); @@ -281,7 +281,7 @@ void acpi_os_unmap_memory(void __iomem * virt, acpi_size size) } EXPORT_SYMBOL_GPL(acpi_os_unmap_memory); -void early_acpi_os_unmap_memory(void __iomem * virt, acpi_size size) +void __init early_acpi_os_unmap_memory(void __iomem *virt, acpi_size size) { if (!acpi_gbl_permanent_mmap) __acpi_unmap_table(virt, size); diff --git a/drivers/base/sys.c b/drivers/base/sys.c index c98c31ec2f75..b428c8c4bc64 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -303,7 +303,6 @@ void sysdev_unregister(struct sys_device * sysdev) * is guaranteed by virtue of the fact that child devices are registered * after their parents. */ - void sysdev_shutdown(void) { struct sysdev_class * cls; @@ -363,7 +362,6 @@ static void __sysdev_resume(struct sys_device *dev) * This is only called by the device PM core, so we let them handle * all synchronization. */ - int sysdev_suspend(pm_message_t state) { struct sysdev_class * cls; @@ -432,7 +430,7 @@ aux_driver: } return ret; } - +EXPORT_SYMBOL_GPL(sysdev_suspend); /** * sysdev_resume - Bring system devices back to life. @@ -442,7 +440,6 @@ aux_driver: * * Note: Interrupts are disabled when called. */ - int sysdev_resume(void) { struct sysdev_class * cls; @@ -463,7 +460,7 @@ int sysdev_resume(void) } return 0; } - +EXPORT_SYMBOL_GPL(sysdev_resume); int __init system_bus_init(void) { diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 81f1cff56fd5..2d797ffe8137 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -202,7 +202,7 @@ static int i915_initialize(struct drm_device * dev, drm_i915_init_t * init) dev_priv->ring.map.flags = 0; dev_priv->ring.map.mtrr = 0; - drm_core_ioremap(&dev_priv->ring.map, dev); + drm_core_ioremap_wc(&dev_priv->ring.map, dev); if (dev_priv->ring.map.handle == NULL) { i915_dma_cleanup(dev); diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index a31cbdbc3c54..0692622ee2b3 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -27,6 +27,7 @@ * */ +#include <linux/device.h> #include "drmP.h" #include "drm.h" #include "i915_drm.h" @@ -66,6 +67,12 @@ static int i915_suspend(struct drm_device *dev, pm_message_t state) i915_save_state(dev); + /* If KMS is active, we do the leavevt stuff here */ + if (drm_core_check_feature(dev, DRIVER_MODESET) && i915_gem_idle(dev)) { + dev_err(&dev->pdev->dev, "GEM idle failed, aborting suspend\n"); + return -EBUSY; + } + intel_opregion_free(dev); if (state.event == PM_EVENT_SUSPEND) { @@ -79,6 +86,9 @@ static int i915_suspend(struct drm_device *dev, pm_message_t state) static int i915_resume(struct drm_device *dev) { + struct drm_i915_private *dev_priv = dev->dev_private; + int ret = 0; + pci_set_power_state(dev->pdev, PCI_D0); pci_restore_state(dev->pdev); if (pci_enable_device(dev->pdev)) @@ -89,7 +99,18 @@ static int i915_resume(struct drm_device *dev) intel_opregion_init(dev); - return 0; + /* KMS EnterVT equivalent */ + if (drm_core_check_feature(dev, DRIVER_MODESET)) { + mutex_lock(&dev->struct_mutex); + dev_priv->mm.suspended = 0; + + ret = i915_gem_init_ringbuffer(dev); + if (ret != 0) + ret = -1; + mutex_unlock(&dev->struct_mutex); + } + + return ret; } static struct vm_operations_struct i915_gem_vm_ops = { diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 135a08f615cd..17fa40858d26 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -618,6 +618,7 @@ int i915_gem_init_ringbuffer(struct drm_device *dev); void i915_gem_cleanup_ringbuffer(struct drm_device *dev); int i915_gem_do_init(struct drm_device *dev, unsigned long start, unsigned long end); +int i915_gem_idle(struct drm_device *dev); int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); int i915_gem_object_set_to_gtt_domain(struct drm_gem_object *obj, int write); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ac534c9a2f81..25b337438ca7 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -34,10 +34,6 @@ #define I915_GEM_GPU_DOMAINS (~(I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT)) -static void -i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj, - uint32_t read_domains, - uint32_t write_domain); static void i915_gem_object_flush_gpu_write_domain(struct drm_gem_object *obj); static void i915_gem_object_flush_gtt_write_domain(struct drm_gem_object *obj); static void i915_gem_object_flush_cpu_write_domain(struct drm_gem_object *obj); @@ -2021,30 +2017,28 @@ i915_gem_object_set_to_cpu_domain(struct drm_gem_object *obj, int write) * drm_agp_chipset_flush */ static void -i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj, - uint32_t read_domains, - uint32_t write_domain) +i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj) { struct drm_device *dev = obj->dev; struct drm_i915_gem_object *obj_priv = obj->driver_private; uint32_t invalidate_domains = 0; uint32_t flush_domains = 0; - BUG_ON(read_domains & I915_GEM_DOMAIN_CPU); - BUG_ON(write_domain == I915_GEM_DOMAIN_CPU); + BUG_ON(obj->pending_read_domains & I915_GEM_DOMAIN_CPU); + BUG_ON(obj->pending_write_domain == I915_GEM_DOMAIN_CPU); #if WATCH_BUF DRM_INFO("%s: object %p read %08x -> %08x write %08x -> %08x\n", __func__, obj, - obj->read_domains, read_domains, - obj->write_domain, write_domain); + obj->read_domains, obj->pending_read_domains, + obj->write_domain, obj->pending_write_domain); #endif /* * If the object isn't moving to a new write domain, * let the object stay in multiple read domains */ - if (write_domain == 0) - read_domains |= obj->read_domains; + if (obj->pending_write_domain == 0) + obj->pending_read_domains |= obj->read_domains; else obj_priv->dirty = 1; @@ -2054,15 +2048,17 @@ i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj, * any read domains which differ from the old * write domain */ - if (obj->write_domain && obj->write_domain != read_domains) { + if (obj->write_domain && + obj->write_domain != obj->pending_read_domains) { flush_domains |= obj->write_domain; - invalidate_domains |= read_domains & ~obj->write_domain; + invalidate_domains |= + obj->pending_read_domains & ~obj->write_domain; } /* * Invalidate any read caches which may have * stale data. That is, any new read domains. */ - invalidate_domains |= read_domains & ~obj->read_domains; + invalidate_domains |= obj->pending_read_domains & ~obj->read_domains; if ((flush_domains | invalidate_domains) & I915_GEM_DOMAIN_CPU) { #if WATCH_BUF DRM_INFO("%s: CPU domain flush %08x invalidate %08x\n", @@ -2071,9 +2067,15 @@ i915_gem_object_set_to_gpu_domain(struct drm_gem_object *obj, i915_gem_clflush_object(obj); } - if ((write_domain | flush_domains) != 0) - obj->write_domain = write_domain; - obj->read_domains = read_domains; + /* The actual obj->write_domain will be updated with + * pending_write_domain after we emit the accumulated flush for all + * of our domain changes in execbuffers (which clears objects' + * write_domains). So if we have a current write domain that we + * aren't changing, set pending_write_domain to that. + */ + if (flush_domains == 0 && obj->pending_write_domain == 0) + obj->pending_write_domain = obj->write_domain; + obj->read_domains = obj->pending_read_domains; dev->invalidate_domains |= invalidate_domains; dev->flush_domains |= flush_domains; @@ -2583,9 +2585,7 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, struct drm_gem_object *obj = object_list[i]; /* Compute new gpu domains and update invalidate/flush */ - i915_gem_object_set_to_gpu_domain(obj, - obj->pending_read_domains, - obj->pending_write_domain); + i915_gem_object_set_to_gpu_domain(obj); } i915_verify_inactive(dev, __FILE__, __LINE__); @@ -2604,6 +2604,12 @@ i915_gem_execbuffer(struct drm_device *dev, void *data, (void)i915_add_request(dev, dev->flush_domains); } + for (i = 0; i < args->buffer_count; i++) { + struct drm_gem_object *obj = object_list[i]; + + obj->write_domain = obj->pending_write_domain; + } + i915_verify_inactive(dev, __FILE__, __LINE__); #if WATCH_COHERENCY @@ -2866,6 +2872,13 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data, return -EBADF; } + /* Update the active list for the hardware's current position. + * Otherwise this only updates on a delayed timer or when irqs are + * actually unmasked, and our working set ends up being larger than + * required. + */ + i915_gem_retire_requests(dev); + obj_priv = obj->driver_private; /* Don't count being on the flushing list against the object being * done. Otherwise, a buffer left on the flushing list but not getting @@ -2967,7 +2980,7 @@ i915_gem_evict_from_list(struct drm_device *dev, struct list_head *head) return 0; } -static int +int i915_gem_idle(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; @@ -3130,16 +3143,20 @@ static void i915_gem_cleanup_hws(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; - struct drm_gem_object *obj = dev_priv->hws_obj; - struct drm_i915_gem_object *obj_priv = obj->driver_private; + struct drm_gem_object *obj; + struct drm_i915_gem_object *obj_priv; if (dev_priv->hws_obj == NULL) return; + obj = dev_priv->hws_obj; + obj_priv = obj->driver_private; + kunmap(obj_priv->page_list[0]); i915_gem_object_unpin(obj); drm_gem_object_unreference(obj); dev_priv->hws_obj = NULL; + memset(&dev_priv->hws_map, 0, sizeof(dev_priv->hws_map)); dev_priv->hw_status_page = NULL; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 4d2baf7b00be..65b635ce28c8 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -1008,6 +1008,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc, temp = CURSOR_MODE_DISABLE; addr = 0; bo = NULL; + mutex_lock(&dev->struct_mutex); goto finish; } diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 32851eef48f0..2ec6cc14a114 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -182,6 +182,14 @@ struct kprobe_blackpoint { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); +/* + * For #ifdef avoidance: + */ +static inline int kprobes_built_in(void) +{ + return 1; +} + #ifdef CONFIG_KRETPROBES extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); @@ -271,8 +279,16 @@ void unregister_kretprobes(struct kretprobe **rps, int num); void kprobe_flush_task(struct task_struct *tk); void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); -#else /* CONFIG_KPROBES */ +#else /* !CONFIG_KPROBES: */ +static inline int kprobes_built_in(void) +{ + return 0; +} +static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + return 0; +} static inline struct kprobe *get_kprobe(void *addr) { return NULL; @@ -329,5 +345,5 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num) static inline void kprobe_flush_task(struct task_struct *tk) { } -#endif /* CONFIG_KPROBES */ -#endif /* _LINUX_KPROBES_H */ +#endif /* CONFIG_KPROBES */ +#endif /* _LINUX_KPROBES_H */ diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 139d7c88d9c9..3d1b7bde1283 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,5 +1,5 @@ -#ifndef MMIOTRACE_H -#define MMIOTRACE_H +#ifndef _LINUX_MMIOTRACE_H +#define _LINUX_MMIOTRACE_H #include <linux/types.h> #include <linux/list.h> @@ -13,28 +13,34 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; /* kmmio internal list */ - unsigned long addr; /* start location of the probe point */ - unsigned long len; /* length of the probe region */ - kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ - kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *private; + /* kmmio internal list: */ + struct list_head list; + /* start location of the probe point: */ + unsigned long addr; + /* length of the probe region: */ + unsigned long len; + /* Called before addr is executed: */ + kmmio_pre_handler_t pre_handler; + /* Called after addr is executed: */ + kmmio_post_handler_t post_handler; + void *private; }; +extern unsigned int kmmio_count; + +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +#ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ static inline int is_kmmio_active(void) { - extern unsigned int kmmio_count; return kmmio_count; } -extern int register_kmmio_probe(struct kmmio_probe *p); -extern void unregister_kmmio_probe(struct kmmio_probe *p); - /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_MMIOTRACE /* Called from ioremap.c */ extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr); @@ -43,7 +49,17 @@ extern void mmiotrace_iounmap(volatile void __iomem *addr); /* For anyone to insert markers. Remember trailing newline. */ extern int mmiotrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -#else +#else /* !CONFIG_MMIOTRACE: */ +static inline int is_kmmio_active(void) +{ + return 0; +} + +static inline int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + return 0; +} + static inline void mmiotrace_ioremap(resource_size_t offset, unsigned long size, void __iomem *addr) { @@ -63,28 +79,28 @@ static inline int mmiotrace_printk(const char *fmt, ...) #endif /* CONFIG_MMIOTRACE */ enum mm_io_opcode { - MMIO_READ = 0x1, /* struct mmiotrace_rw */ - MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ - MMIO_PROBE = 0x3, /* struct mmiotrace_map */ - MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ - MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ + MMIO_UNKNOWN_OP = 0x5, /* struct mmiotrace_rw */ }; struct mmiotrace_rw { - resource_size_t phys; /* PCI address of register */ - unsigned long value; - unsigned long pc; /* optional program counter */ - int map_id; - unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ - unsigned char width; /* size of register access in bytes */ + resource_size_t phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; struct mmiotrace_map { - resource_size_t phys; /* base address in PCI space */ - unsigned long virt; /* base virtual address */ - unsigned long len; /* mapping size */ - int map_id; - unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ + resource_size_t phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; /* in kernel/trace/trace_mmiotrace.c */ @@ -94,4 +110,4 @@ extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); extern int mmio_trace_printk(const char *fmt, va_list args); -#endif /* MMIOTRACE_H */ +#endif /* _LINUX_MMIOTRACE_H */ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6bb2635b5ded..7bc992976d29 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -3,11 +3,16 @@ * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in - * FIPS-188, copies of both documents can be found in the Documentation - * directory. While CIPSO never became a full IETF RFC standard many vendors + * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * + * The CIPSO draft specification can be found in the kernel's Documentation + * directory as well as the following URL: + * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt + * The FIPS-188 specification can be found at the following URL: + * http://www.itl.nist.gov/fipspubs/fip188.htm + * * Author: Paul Moore <paul.moore@hp.com> * */ diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c index f58701a7b728..3f4b26647386 100644 --- a/security/selinux/netlabel.c +++ b/security/selinux/netlabel.c @@ -490,8 +490,10 @@ int selinux_netlbl_socket_setsockopt(struct socket *sock, lock_sock(sk); rc = netlbl_sock_getattr(sk, &secattr); release_sock(sk); - if (rc == 0 && secattr.flags != NETLBL_SECATTR_NONE) + if (rc == 0) rc = -EACCES; + else if (rc == -ENOMSG) + rc = 0; netlbl_secattr_destroy(&secattr); } |