diff options
Diffstat (limited to 'Documentation')
82 files changed, 2515 insertions, 1492 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 1a7f53068ec2..054a7ecf64c6 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -165,7 +165,7 @@ quiet_cmd_db2man = MAN $@ @touch $@ ### -# Rules to generate postscripts and PNG imgages from .fig format files +# Rules to generate postscripts and PNG images from .fig format files quiet_cmd_fig2eps = FIG2EPS $@ cmd_fig2eps = fig2dev -Leps $< $@ diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index d3290c46af51..aa38cc5692a0 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -46,7 +46,7 @@ <sect1><title>Atomic and pointer manipulation</title> !Iinclude/asm-x86/atomic_32.h -!Iinclude/asm-x86/unaligned_32.h +!Iinclude/asm-x86/unaligned.h </sect1> <sect1><title>Delaying, scheduling, and timer routines</title> diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 6fbc41d98c1e..957cf5c26831 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl @@ -282,7 +282,7 @@ int __init board_init (void) goto out; } - /* map physical adress */ + /* map physical address */ baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024); if(!baseaddr){ printk("Ioremap to access NAND chip failed\n"); @@ -306,7 +306,7 @@ int __init board_init (void) this->dev_ready = board_dev_ready; this->eccmode = NAND_ECC_SOFT; - /* Scan to find existance of the device */ + /* Scan to find existence of the device */ if (nand_scan (board_mtd, 1)) { err = -ENXIO; goto out_ior; @@ -340,7 +340,7 @@ static void __exit board_cleanup (void) /* Release resources, unregister device */ nand_release (board_mtd); - /* unmap physical adress */ + /* unmap physical address */ iounmap((void *)baseaddr); /* Free the MTD device structure */ diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index 24dc3fcf1594..bc38283379f0 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -441,17 +441,20 @@ ACPI, and if none of those then a KCS device at the spec-specified 0xca2. If you want to turn this off, set the "trydefaults" option to false. -If you have high-res timers compiled into the kernel, the driver will -use them to provide much better performance. Note that if you do not -have high-res timers enabled in the kernel and you don't have -interrupts enabled, the driver will run VERY slowly. Don't blame me, +If your IPMI interface does not support interrupts and is a KCS or +SMIC interface, the IPMI driver will start a kernel thread for the +interface to help speed things up. This is a low-priority kernel +thread that constantly polls the IPMI driver while an IPMI operation +is in progress. The force_kipmid module parameter will all the user to +force this thread on or off. If you force it off and don't have +interrupts, the driver will run VERY slowly. Don't blame me, these interfaces suck. The driver supports a hot add and remove of interfaces. This way, interfaces can be added or removed after the kernel is up and running. -This is done using /sys/modules/ipmi_si/hotmod, which is a write-only -parameter. You write a string to this interface. The string has the -format: +This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a +write-only parameter. You write a string to this interface. The string +has the format: <op1>[:op2[:op3...]] The "op"s are: add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]] @@ -581,9 +584,11 @@ The watchdog will panic and start a 120 second reset timeout if it gets a pre-action. During a panic or a reboot, the watchdog will start a 120 timer if it is running to make sure the reboot occurs. -Note that if you use the NMI preaction for the watchdog, you MUST -NOT use nmi watchdog mode 1. If you use the NMI watchdog, you -must use mode 2. +Note that if you use the NMI preaction for the watchdog, you MUST NOT +use the nmi watchdog. There is no reasonable way to tell if an NMI +comes from the IPMI controller, so it must assume that if it gets an +otherwise unhandled NMI, it must be from IPMI and it will panic +immediately. Once you open the watchdog timer, you must write a 'V' character to the device to close it, or the timer will not stop. This is a new semantic diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/Intel-IOMMU.txt new file mode 100644 index 000000000000..c2321903aa09 --- /dev/null +++ b/Documentation/Intel-IOMMU.txt @@ -0,0 +1,115 @@ +Linux IOMMU Support +=================== + +The architecture spec can be obtained from the below location. + +http://www.intel.com/technology/virtualization/ + +This guide gives a quick cheat sheet for some basic understanding. + +Some Keywords + +DMAR - DMA remapping +DRHD - DMA Engine Reporting Structure +RMRR - Reserved memory Region Reporting Structure +ZLR - Zero length reads from PCI devices +IOVA - IO Virtual address. + +Basic stuff +----------- + +ACPI enumerates and lists the different DMA engines in the platform, and +device scope relationships between PCI devices and which DMA engine controls +them. + +What is RMRR? +------------- + +There are some devices the BIOS controls, for e.g USB devices to perform +PS2 emulation. The regions of memory used for these devices are marked +reserved in the e820 map. When we turn on DMA translation, DMA to those +regions will fail. Hence BIOS uses RMRR to specify these regions along with +devices that need to access these regions. OS is expected to setup +unity mappings for these regions for these devices to access these regions. + +How is IOVA generated? +--------------------- + +Well behaved drivers call pci_map_*() calls before sending command to device +that needs to perform DMA. Once DMA is completed and mapping is no longer +required, device performs a pci_unmap_*() calls to unmap the region. + +The Intel IOMMU driver allocates a virtual address per domain. Each PCIE +device has its own domain (hence protection). Devices under p2p bridges +share the virtual address with all devices under the p2p bridge due to +transaction id aliasing for p2p bridges. + +IOVA generation is pretty generic. We used the same technique as vmalloc() +but these are not global address spaces, but separate for each domain. +Different DMA engines may support different number of domains. + +We also allocate gaurd pages with each mapping, so we can attempt to catch +any overflow that might happen. + + +Graphics Problems? +------------------ +If you encounter issues with graphics devices, you can try adding +option intel_iommu=igfx_off to turn off the integrated graphics engine. + +If it happens to be a PCI device included in the INCLUDE_ALL Engine, +then try enabling CONFIG_DMAR_GFX_WA to setup a 1-1 map. We hear +graphics drivers may be in process of using DMA api's in the near +future and at that time this option can be yanked out. + +Some exceptions to IOVA +----------------------- +Interrupt ranges are not address translated, (0xfee00000 - 0xfeefffff). +The same is true for peer to peer transactions. Hence we reserve the +address from PCI MMIO ranges so they are not allocated for IOVA addresses. + + +Fault reporting +--------------- +When errors are reported, the DMA engine signals via an interrupt. The fault +reason and device that caused it with fault reason is printed on console. + +See below for sample. + + +Boot Message Sample +------------------- + +Something like this gets printed indicating presence of DMAR tables +in ACPI. + +ACPI: DMAR (v001 A M I OEMDMAR 0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0 + +When DMAR is being processed and initialized by ACPI, prints DMAR locations +and any RMRR's processed. + +ACPI DMAR:Host address width 36 +ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000 +ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000 +ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000 +ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff +ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff + +When DMAR is enabled for use, you will notice.. + +PCI-DMA: Using DMAR IOMMU + +Fault reporting +--------------- + +DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 +DMAR:[fault reason 05] PTE Write access is not set +DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000 +DMAR:[fault reason 05] PTE Write access is not set + +TBD +---- + +- For compatibility testing, could use unity map domain for all devices, just + provide a 1-1 for all useful memory under a single domain for all devices. +- API for paravirt ops for abstracting functionlity for VMM folks. diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index 19e7f65c269f..34e06d2f194f 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -67,7 +67,7 @@ kernel patches. 20: Check that it all passes `make headers_check'. 21: Has been checked with injection of at least slab and page-allocation - fauilures. See Documentation/fault-injection/. + failures. See Documentation/fault-injection/. If the new code is substantial, addition of subsystem-specific fault injection might be appropriate. diff --git a/Documentation/SubmittingDrivers b/Documentation/SubmittingDrivers index d7e26427e426..24f2eb40cae5 100644 --- a/Documentation/SubmittingDrivers +++ b/Documentation/SubmittingDrivers @@ -36,8 +36,7 @@ Linux 2.4: If the code area has a general maintainer then please submit it to the maintainer listed in MAINTAINERS in the kernel file. If the maintainer does not respond or you cannot find the appropriate - maintainer then please contact Marcelo Tosatti - <marcelo.tosatti@cyclades.com>. + maintainer then please contact Willy Tarreau <w@1wt.eu>. Linux 2.6: The same rules apply as 2.4 except that you should follow linux-kernel diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt new file mode 100644 index 000000000000..eda40fd39cad --- /dev/null +++ b/Documentation/accounting/cgroupstats.txt @@ -0,0 +1,27 @@ +Control Groupstats is inspired by the discussion at +http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as +suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263. + +Per cgroup statistics infrastructure re-uses code from the taskstats +interface. A new set of cgroup operations are registered with commands +and attributes specific to cgroups. It should be very easy to +extend per cgroup statistics, by adding members to the cgroupstats +structure. + +The current model for cgroupstats is a pull, a push model (to post +statistics on interesting events), should be very easy to add. Currently +user space requests for statistics by passing the cgroup path. +Statistics about the state of all the tasks in the cgroup is returned to +user space. + +NOTE: We currently rely on delay accounting for extracting information +about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this +information will not be available. + +To extract cgroup statistics a utility very similar to getdelays.c +has been developed, the sample output of the utility is shown below + +~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" +sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 +~/balbir/cgroupstats # ./getdelays -C "/cgroup" +sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 diff --git a/Documentation/arm/Samsung-S3C24XX/DMA.txt b/Documentation/arm/Samsung-S3C24XX/DMA.txt index 37f4edcc5d87..3ed82383efea 100644 --- a/Documentation/arm/Samsung-S3C24XX/DMA.txt +++ b/Documentation/arm/Samsung-S3C24XX/DMA.txt @@ -5,7 +5,7 @@ Introduction ------------ The kernel provides an interface to manage DMA transfers - using the DMA channels in the cpu, so that the central + using the DMA channels in the CPU, so that the central duty of managing channel mappings, and programming the channel generators is in one place. @@ -17,24 +17,24 @@ DMA Channel Ordering channels to all sources, which means that some devices have a restricted number of channels that can be used. - To allow flexibilty for each cpu type and board, the - dma code can be given an dma ordering structure which + To allow flexibility for each CPU type and board, the + DMA code can be given a DMA ordering structure which allows the order of channel search to be specified, as well as allowing the prohibition of certain claims. struct s3c24xx_dma_order has a list of channels, and - each channel within has a slot for a list of dma - channel numbers. The slots are searched in order, for - the presence of a dma channel number with DMA_CH_VALID - orred in. + each channel within has a slot for a list of DMA + channel numbers. The slots are searched in order for + the presence of a DMA channel number with DMA_CH_VALID + or-ed in. If the order has the flag DMA_CH_NEVER set, then after checking the channel list, the system will return no found channel, thus denying the request. A board support file can call s3c24xx_dma_order_set() - to register an complete ordering set. The routine will - copy the data, so the original can be discared with + to register a complete ordering set. The routine will + copy the data, so the original can be discarded with __initdata. diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index d46306fea230..f20c10c2858f 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt @@ -418,6 +418,20 @@ brothers: */ smp_mb__after_clear_bit(); +There are two special bitops with lock barrier semantics (acquire/release, +same as spinlocks). These operate in the same way as their non-_lock/unlock +postfixed variants, except that they are to provide acquire/release semantics, +respectively. This means they can be used for bit_spin_trylock and +bit_spin_unlock type operations without specifying any more barriers. + + int test_and_set_bit_lock(unsigned long nr, unsigned long *addr); + void clear_bit_unlock(unsigned long nr, unsigned long *addr); + void __clear_bit_unlock(unsigned long nr, unsigned long *addr); + +The __clear_bit_unlock version is non-atomic, however it still implements +unlock barrier semantics. This can be useful if the lock itself is protecting +the other bits in the word. + Finally, there are non-atomic versions of the bitmask operations provided. They are used in contexts where some other higher-level SMP locking scheme is being used to protect the bitmask, and thus less diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index 552cabac0608..da42ab414c48 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -87,30 +87,7 @@ changes occur: This is used primarily during fault processing. -5) void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) - - The software page tables for address space 'mm' for virtual - addresses in the range 'start' to 'end-1' are being torn down. - - Some platforms cache the lowest level of the software page tables - in a linear virtually mapped array, to make TLB miss processing - more efficient. On such platforms, since the TLB is caching the - software page table structure, it needs to be flushed when parts - of the software page table tree are unlinked/freed. - - Sparc64 is one example of a platform which does this. - - Usually, when munmap()'ing an area of user virtual address - space, the kernel leaves the page table parts around and just - marks the individual pte's as invalid. However, if very large - portions of the address space are unmapped, the kernel frees up - those portions of the software page tables to prevent potential - excessive kernel memory usage caused by erratic mmap/mmunmap - sequences. It is at these times that flush_tlb_pgtables will - be invoked. - -6) void update_mmu_cache(struct vm_area_struct *vma, +5) void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) At the end of every page fault, this routine is invoked to @@ -123,7 +100,7 @@ changes occur: translations for software managed TLB configurations. The sparc64 port currently does this. -7) void tlb_migrate_finish(struct mm_struct *mm) +6) void tlb_migrate_finish(struct mm_struct *mm) This interface is called at the end of an explicit process migration. This interface provides a hook diff --git a/Documentation/cdrom/cdrom-standard.tex b/Documentation/cdrom/cdrom-standard.tex index 92f94e597582..c713aeb020c4 100644 --- a/Documentation/cdrom/cdrom-standard.tex +++ b/Documentation/cdrom/cdrom-standard.tex @@ -1009,7 +1009,7 @@ taken over the torch in maintaining \cdromc\ and integrating much \cdrom-related code in the 2.1-kernel. Thanks to Scott Snyder and Gerd Knorr, who were the first to implement this interface for SCSI and IDE-CD drivers and added many ideas for extension of the data -structures relative to kernel~2.0. Further thanks to Heiko Eissfeldt, +structures relative to kernel~2.0. Further thanks to Heiko Ei{\sz}feldt, Thomas Quinot, Jon Tombs, Ken Pizzini, Eberhard M\"onkeberg and Andrew Kroll, the \linux\ \cdrom\ device driver developers who were kind enough to give suggestions and criticisms during the writing. Finally diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt new file mode 100644 index 000000000000..98a26f81fa75 --- /dev/null +++ b/Documentation/cgroups.txt @@ -0,0 +1,545 @@ + CGROUPS + ------- + +Written by Paul Menage <menage@google.com> based on Documentation/cpusets.txt + +Original copyright statements from cpusets.txt: +Portions Copyright (C) 2004 BULL SA. +Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. +Modified by Paul Jackson <pj@sgi.com> +Modified by Christoph Lameter <clameter@sgi.com> + +CONTENTS: +========= + +1. Control Groups + 1.1 What are cgroups ? + 1.2 Why are cgroups needed ? + 1.3 How are cgroups implemented ? + 1.4 What does notify_on_release do ? + 1.5 How do I use cgroups ? +2. Usage Examples and Syntax + 2.1 Basic Usage + 2.2 Attaching processes +3. Kernel API + 3.1 Overview + 3.2 Synchronization + 3.3 Subsystem API +4. Questions + +1. Control Groups +========== + +1.1 What are cgroups ? +---------------------- + +Control Groups provide a mechanism for aggregating/partitioning sets of +tasks, and all their future children, into hierarchical groups with +specialized behaviour. + +Definitions: + +A *cgroup* associates a set of tasks with a set of parameters for one +or more subsystems. + +A *subsystem* is a module that makes use of the task grouping +facilities provided by cgroups to treat groups of tasks in +particular ways. A subsystem is typically a "resource controller" that +schedules a resource or applies per-cgroup limits, but it may be +anything that wants to act on a group of processes, e.g. a +virtualization subsystem. + +A *hierarchy* is a set of cgroups arranged in a tree, such that +every task in the system is in exactly one of the cgroups in the +hierarchy, and a set of subsystems; each subsystem has system-specific +state attached to each cgroup in the hierarchy. Each hierarchy has +an instance of the cgroup virtual filesystem associated with it. + +At any one time there may be multiple active hierachies of task +cgroups. Each hierarchy is a partition of all tasks in the system. + +User level code may create and destroy cgroups by name in an +instance of the cgroup virtual file system, specify and query to +which cgroup a task is assigned, and list the task pids assigned to +a cgroup. Those creations and assignments only affect the hierarchy +associated with that instance of the cgroup file system. + +On their own, the only use for cgroups is for simple job +tracking. The intention is that other subsystems hook into the generic +cgroup support to provide new attributes for cgroups, such as +accounting/limiting the resources which processes in a cgroup can +access. For example, cpusets (see Documentation/cpusets.txt) allows +you to associate a set of CPUs and a set of memory nodes with the +tasks in each cgroup. + +1.2 Why are cgroups needed ? +---------------------------- + +There are multiple efforts to provide process aggregations in the +Linux kernel, mainly for resource tracking purposes. Such efforts +include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server +namespaces. These all require the basic notion of a +grouping/partitioning of processes, with newly forked processes ending +in the same group (cgroup) as their parent process. + +The kernel cgroup patch provides the minimum essential kernel +mechanisms required to efficiently implement such groups. It has +minimal impact on the system fast paths, and provides hooks for +specific subsystems such as cpusets to provide additional behaviour as +desired. + +Multiple hierarchy support is provided to allow for situations where +the division of tasks into cgroups is distinctly different for +different subsystems - having parallel hierarchies allows each +hierarchy to be a natural division of tasks, without having to handle +complex combinations of tasks that would be present if several +unrelated subsystems needed to be forced into the same tree of +cgroups. + +At one extreme, each resource controller or subsystem could be in a +separate hierarchy; at the other extreme, all subsystems +would be attached to the same hierarchy. + +As an example of a scenario (originally proposed by vatsa@in.ibm.com) +that can benefit from multiple hierarchies, consider a large +university server with various users - students, professors, system +tasks etc. The resource planning for this server could be along the +following lines: + + CPU : Top cpuset + / \ + CPUSet1 CPUSet2 + | | + (Profs) (Students) + + In addition (system tasks) are attached to topcpuset (so + that they can run anywhere) with a limit of 20% + + Memory : Professors (50%), students (30%), system (20%) + + Disk : Prof (50%), students (30%), system (20%) + + Network : WWW browsing (20%), Network File System (60%), others (20%) + / \ + Prof (15%) students (5%) + +Browsers like firefox/lynx go into the WWW network class, while (k)nfsd go +into NFS network class. + +At the same time firefox/lynx will share an appropriate CPU/Memory class +depending on who launched it (prof/student). + +With the ability to classify tasks differently for different resources +(by putting those resource subsystems in different hierarchies) then +the admin can easily set up a script which receives exec notifications +and depending on who is launching the browser he can + + # echo browser_pid > /mnt/<restype>/<userclass>/tasks + +With only a single hierarchy, he now would potentially have to create +a separate cgroup for every browser launched and associate it with +approp network and other resource class. This may lead to +proliferation of such cgroups. + +Also lets say that the administrator would like to give enhanced network +access temporarily to a student's browser (since it is night and the user +wants to do online gaming :) OR give one of the students simulation +apps enhanced CPU power, + +With ability to write pids directly to resource classes, its just a +matter of : + + # echo pid > /mnt/network/<new_class>/tasks + (after some time) + # echo pid > /mnt/network/<orig_class>/tasks + +Without this ability, he would have to split the cgroup into +multiple separate ones and then associate the new cgroups with the +new resource classes. + + + +1.3 How are cgroups implemented ? +--------------------------------- + +Control Groups extends the kernel as follows: + + - Each task in the system has a reference-counted pointer to a + css_set. + + - A css_set contains a set of reference-counted pointers to + cgroup_subsys_state objects, one for each cgroup subsystem + registered in the system. There is no direct link from a task to + the cgroup of which it's a member in each hierarchy, but this + can be determined by following pointers through the + cgroup_subsys_state objects. This is because accessing the + subsystem state is something that's expected to happen frequently + and in performance-critical code, whereas operations that require a + task's actual cgroup assignments (in particular, moving between + cgroups) are less common. A linked list runs through the cg_list + field of each task_struct using the css_set, anchored at + css_set->tasks. + + - A cgroup hierarchy filesystem can be mounted for browsing and + manipulation from user space. + + - You can list all the tasks (by pid) attached to any cgroup. + +The implementation of cgroups requires a few, simple hooks +into the rest of the kernel, none in performance critical paths: + + - in init/main.c, to initialize the root cgroups and initial + css_set at system boot. + + - in fork and exit, to attach and detach a task from its css_set. + +In addition a new file system, of type "cgroup" may be mounted, to +enable browsing and modifying the cgroups presently known to the +kernel. When mounting a cgroup hierarchy, you may specify a +comma-separated list of subsystems to mount as the filesystem mount +options. By default, mounting the cgroup filesystem attempts to +mount a hierarchy containing all registered subsystems. + +If an active hierarchy with exactly the same set of subsystems already +exists, it will be reused for the new mount. If no existing hierarchy +matches, and any of the requested subsystems are in use in an existing +hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy +is activated, associated with the requested subsystems. + +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. + +When a cgroup filesystem is unmounted, if there are any +child cgroups created below the top-level cgroup, that hierarchy +will remain active even though unmounted; if there are no +child cgroups then the hierarchy will be deactivated. + +No new system calls are added for cgroups - all support for +querying and modifying cgroups is via this cgroup file system. + +Each task under /proc has an added file named 'cgroup' displaying, +for each active hierarchy, the subsystem names and the cgroup name +as the path relative to the root of the cgroup file system. + +Each cgroup is represented by a directory in the cgroup file system +containing the following files describing that cgroup: + + - tasks: list of tasks (by pid) attached to that cgroup + - notify_on_release flag: run /sbin/cgroup_release_agent on exit? + +Other subsystems such as cpusets may add additional files in each +cgroup dir + +New cgroups are created using the mkdir system call or shell +command. The properties of a cgroup, such as its flags, are +modified by writing to the appropriate file in that cgroups +directory, as listed above. + +The named hierarchical structure of nested cgroups allows partitioning +a large system into nested, dynamically changeable, "soft-partitions". + +The attachment of each task, automatically inherited at fork by any +children of that task, to a cgroup allows organizing the work load +on a system into related sets of tasks. A task may be re-attached to +any other cgroup, if allowed by the permissions on the necessary +cgroup file system directories. + +When a task is moved from one cgroup to another, it gets a new +css_set pointer - if there's an already existing css_set with the +desired collection of cgroups then that group is reused, else a new +css_set is allocated. Note that the current implementation uses a +linear search to locate an appropriate existing css_set, so isn't +very efficient. A future version will use a hash table for better +performance. + +To allow access from a cgroup to the css_sets (and hence tasks) +that comprise it, a set of cg_cgroup_link objects form a lattice; +each cg_cgroup_link is linked into a list of cg_cgroup_links for +a single cgroup on its cont_link_list field, and a list of +cg_cgroup_links for a single css_set on its cg_link_list. + +Thus the set of tasks in a cgroup can be listed by iterating over +each css_set that references the cgroup, and sub-iterating over +each css_set's task set. + +The use of a Linux virtual file system (vfs) to represent the +cgroup hierarchy provides for a familiar permission and name space +for cgroups, with a minimum of additional kernel code. + +1.4 What does notify_on_release do ? +------------------------------------ + +*** notify_on_release is disabled in the current patch set. It will be +*** reactivated in a future patch in a less-intrusive manner + +If the notify_on_release flag is enabled (1) in a cgroup, then +whenever the last task in the cgroup leaves (exits or attaches to +some other cgroup) and the last child cgroup of that cgroup +is removed, then the kernel runs the command specified by the contents +of the "release_agent" file in that hierarchy's root directory, +supplying the pathname (relative to the mount point of the cgroup +file system) of the abandoned cgroup. This enables automatic +removal of abandoned cgroups. The default value of +notify_on_release in the root cgroup at system boot is disabled +(0). The default value of other cgroups at creation is the current +value of their parents notify_on_release setting. The default value of +a cgroup hierarchy's release_agent path is empty. + +1.5 How do I use cgroups ? +-------------------------- + +To start a new job that is to be contained within a cgroup, using +the "cpuset" cgroup subsystem, the steps are something like: + + 1) mkdir /dev/cgroup + 2) mount -t cgroup -ocpuset cpuset /dev/cgroup + 3) Create the new cgroup by doing mkdir's and write's (or echo's) in + the /dev/cgroup virtual file system. + 4) Start a task that will be the "founding father" of the new job. + 5) Attach that task to the new cgroup by writing its pid to the + /dev/cgroup tasks file for that cgroup. + 6) fork, exec or clone the job tasks from this founding father task. + +For example, the following sequence of commands will setup a cgroup +named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, +and then start a subshell 'sh' in that cgroup: + + mount -t cgroup cpuset -ocpuset /dev/cgroup + cd /dev/cgroup + mkdir Charlie + cd Charlie + /bin/echo 2-3 > cpus + /bin/echo 1 > mems + /bin/echo $$ > tasks + sh + # The subshell 'sh' is now running in cgroup Charlie + # The next line should display '/Charlie' + cat /proc/self/cgroup + +2. Usage Examples and Syntax +============================ + +2.1 Basic Usage +--------------- + +Creating, modifying, using the cgroups can be done through the cgroup +virtual filesystem. + +To mount a cgroup hierarchy will all available subsystems, type: +# mount -t cgroup xxx /dev/cgroup + +The "xxx" is not interpreted by the cgroup code, but will appear in +/proc/mounts so may be any useful identifying string that you like. + +To mount a cgroup hierarchy with just the cpuset and numtasks +subsystems, type: +# mount -t cgroup -o cpuset,numtasks hier1 /dev/cgroup + +To change the set of subsystems bound to a mounted hierarchy, just +remount with different options: + +# mount -o remount,cpuset,ns /dev/cgroup + +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. + +Then under /dev/cgroup you can find a tree that corresponds to the +tree of the cgroups in the system. For instance, /dev/cgroup +is the cgroup that holds the whole system. + +If you want to create a new cgroup under /dev/cgroup: +# cd /dev/cgroup +# mkdir my_cgroup + +Now you want to do something with this cgroup. +# cd my_cgroup + +In this directory you can find several files: +# ls +notify_on_release release_agent tasks +(plus whatever files are added by the attached subsystems) + +Now attach your shell to this cgroup: +# /bin/echo $$ > tasks + +You can also create cgroups inside your cgroup by using mkdir in this +directory. +# mkdir my_sub_cs + +To remove a cgroup, just use rmdir: +# rmdir my_sub_cs + +This will fail if the cgroup is in use (has cgroups inside, or +has processes attached, or is held alive by other subsystem-specific +reference). + +2.2 Attaching processes +----------------------- + +# /bin/echo PID > tasks + +Note that it is PID, not PIDs. You can only attach ONE task at a time. +If you have several tasks to attach, you have to do it one after another: + +# /bin/echo PID1 > tasks +# /bin/echo PID2 > tasks + ... +# /bin/echo PIDn > tasks + +3. Kernel API +============= + +3.1 Overview +------------ + +Each kernel subsystem that wants to hook into the generic cgroup +system needs to create a cgroup_subsys object. This contains +various methods, which are callbacks from the cgroup system, along +with a subsystem id which will be assigned by the cgroup system. + +Other fields in the cgroup_subsys object include: + +- subsys_id: a unique array index for the subsystem, indicating which + entry in cgroup->subsys[] this subsystem should be + managing. Initialized by cgroup_register_subsys(); prior to this + it should be initialized to -1 + +- hierarchy: an index indicating which hierarchy, if any, this + subsystem is currently attached to. If this is -1, then the + subsystem is not attached to any hierarchy, and all tasks should be + considered to be members of the subsystem's top_cgroup. It should + be initialized to -1. + +- name: should be initialized to a unique subsystem name prior to + calling cgroup_register_subsystem. Should be no longer than + MAX_CGROUP_TYPE_NAMELEN + +Each cgroup object created by the system has an array of pointers, +indexed by subsystem id; this pointer is entirely managed by the +subsystem; the generic cgroup code will never touch this pointer. + +3.2 Synchronization +------------------- + +There is a global mutex, cgroup_mutex, used by the cgroup +system. This should be taken by anything that wants to modify a +cgroup. It may also be taken to prevent cgroups from being +modified, but more specific locks may be more appropriate in that +situation. + +See kernel/cgroup.c for more details. + +Subsystems can take/release the cgroup_mutex via the functions +cgroup_lock()/cgroup_unlock(), and can +take/release the callback_mutex via the functions +cgroup_lock()/cgroup_unlock(). + +Accessing a task's cgroup pointer may be done in the following ways: +- while holding cgroup_mutex +- while holding the task's alloc_lock (via task_lock()) +- inside an rcu_read_lock() section via rcu_dereference() + +3.3 Subsystem API +-------------------------- + +Each subsystem should: + +- add an entry in linux/cgroup_subsys.h +- define a cgroup_subsys object called <name>_subsys + +Each subsystem may export the following methods. The only mandatory +methods are create/destroy. Any others that are null are presumed to +be successful no-ops. + +struct cgroup_subsys_state *create(struct cgroup *cont) +LL=cgroup_mutex + +Called to create a subsystem state object for a cgroup. The +subsystem should allocate its subsystem state object for the passed +cgroup, returning a pointer to the new object on success or a +negative error code. On success, the subsystem pointer should point to +a structure of type cgroup_subsys_state (typically embedded in a +larger subsystem-specific object), which will be initialized by the +cgroup system. Note that this will be called at initialization to +create the root subsystem state for this subsystem; this case can be +identified by the passed cgroup object having a NULL parent (since +it's the root of the hierarchy) and may be an appropriate place for +initialization code. + +void destroy(struct cgroup *cont) +LL=cgroup_mutex + +The cgroup system is about to destroy the passed cgroup; the +subsystem should do any necessary cleanup + +int can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *task) +LL=cgroup_mutex + +Called prior to moving a task into a cgroup; if the subsystem +returns an error, this will abort the attach operation. If a NULL +task is passed, then a successful result indicates that *any* +unspecified task can be moved into the cgroup. Note that this isn't +called on a fork. If this method returns 0 (success) then this should +remain valid while the caller holds cgroup_mutex. + +void attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *old_cont, struct task_struct *task) +LL=cgroup_mutex + + +Called after the task has been attached to the cgroup, to allow any +post-attachment activity that requires memory allocations or blocking. + +void fork(struct cgroup_subsy *ss, struct task_struct *task) +LL=callback_mutex, maybe read_lock(tasklist_lock) + +Called when a task is forked into a cgroup. Also called during +registration for all existing tasks. + +void exit(struct cgroup_subsys *ss, struct task_struct *task) +LL=callback_mutex + +Called during task exit + +int populate(struct cgroup_subsys *ss, struct cgroup *cont) +LL=none + +Called after creation of a cgroup to allow a subsystem to populate +the cgroup directory with file entries. The subsystem should make +calls to cgroup_add_file() with objects of type cftype (see +include/linux/cgroup.h for details). Note that although this +method can return an error code, the error code is currently not +always handled well. + +void post_clone(struct cgroup_subsys *ss, struct cgroup *cont) + +Called at the end of cgroup_clone() to do any paramater +initialization which might be required before a task could attach. For +example in cpusets, no task may attach before 'cpus' and 'mems' are set +up. + +void bind(struct cgroup_subsys *ss, struct cgroup *root) +LL=callback_mutex + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). + +4. Questions +============ + +Q: what's up with this '/bin/echo' ? +A: bash's builtin 'echo' command does not check calls to write() against + errors. If you use it in the cgroup file system, you won't be + able to tell whether a command succeeded or failed. + +Q: When I attach processes, only the first of the line gets really attached ! +A: We can only return one error code per call to write(). So you should also + put only ONE pid. + diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt index b6d24c22274b..a741f658a3c9 100644 --- a/Documentation/cpu-hotplug.txt +++ b/Documentation/cpu-hotplug.txt @@ -220,7 +220,9 @@ A: The following happen, listed in no particular order :-) CPU_DOWN_PREPARE or CPU_DOWN_PREPARE_FROZEN, depending on whether or not the CPU is being offlined while tasks are frozen due to a suspend operation in progress -- All process is migrated away from this outgoing CPU to a new CPU +- All processes are migrated away from this outgoing CPU to new CPUs. + The new CPU is chosen from each process' current cpuset, which may be + a subset of all online CPUs. - All interrupts targeted to this CPU is migrated to a new CPU - timers/bottom half/task lets are also migrated to a new CPU - Once all services are migrated, kernel calls an arch specific routine diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index ec9de6917f01..141bef1c8599 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -7,6 +7,7 @@ Written by Simon.Derr@bull.net Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson <pj@sgi.com> Modified by Christoph Lameter <clameter@sgi.com> +Modified by Paul Menage <menage@google.com> CONTENTS: ========= @@ -16,9 +17,9 @@ CONTENTS: 1.2 Why are cpusets needed ? 1.3 How are cpusets implemented ? 1.4 What are exclusive cpusets ? - 1.5 What does notify_on_release do ? - 1.6 What is memory_pressure ? - 1.7 What is memory spread ? + 1.5 What is memory_pressure ? + 1.6 What is memory spread ? + 1.7 What is sched_load_balance ? 1.8 How do I use cpusets ? 2. Usage Examples and Syntax 2.1 Basic Usage @@ -44,18 +45,19 @@ hierarchy visible in a virtual file system. These are the essential hooks, beyond what is already present, required to manage dynamic job placement on large systems. -Each task has a pointer to a cpuset. Multiple tasks may reference -the same cpuset. Requests by a task, using the sched_setaffinity(2) -system call to include CPUs in its CPU affinity mask, and using the -mbind(2) and set_mempolicy(2) system calls to include Memory Nodes -in its memory policy, are both filtered through that tasks cpuset, -filtering out any CPUs or Memory Nodes not in that cpuset. The -scheduler will not schedule a task on a CPU that is not allowed in -its cpus_allowed vector, and the kernel page allocator will not -allocate a page on a node that is not allowed in the requesting tasks -mems_allowed vector. - -User level code may create and destroy cpusets by name in the cpuset +Cpusets use the generic cgroup subsystem described in +Documentation/cgroup.txt. + +Requests by a task, using the sched_setaffinity(2) system call to +include CPUs in its CPU affinity mask, and using the mbind(2) and +set_mempolicy(2) system calls to include Memory Nodes in its memory +policy, are both filtered through that tasks cpuset, filtering out any +CPUs or Memory Nodes not in that cpuset. The scheduler will not +schedule a task on a CPU that is not allowed in its cpus_allowed +vector, and the kernel page allocator will not allocate a page on a +node that is not allowed in the requesting tasks mems_allowed vector. + +User level code may create and destroy cpusets by name in the cgroup virtual file system, manage the attributes and permissions of these cpusets and which CPUs and Memory Nodes are assigned to each cpuset, specify and query to which cpuset a task is assigned, and list the @@ -115,7 +117,7 @@ Cpusets extends these two mechanisms as follows: - Cpusets are sets of allowed CPUs and Memory Nodes, known to the kernel. - Each task in the system is attached to a cpuset, via a pointer - in the task structure to a reference counted cpuset structure. + in the task structure to a reference counted cgroup structure. - Calls to sched_setaffinity are filtered to just those CPUs allowed in that tasks cpuset. - Calls to mbind and set_mempolicy are filtered to just @@ -145,15 +147,10 @@ into the rest of the kernel, none in performance critical paths: - in page_alloc.c, to restrict memory to allowed nodes. - in vmscan.c, to restrict page recovery to the current cpuset. -In addition a new file system, of type "cpuset" may be mounted, -typically at /dev/cpuset, to enable browsing and modifying the cpusets -presently known to the kernel. No new system calls are added for -cpusets - all support for querying and modifying cpusets is via -this cpuset file system. - -Each task under /proc has an added file named 'cpuset', displaying -the cpuset name, as the path relative to the root of the cpuset file -system. +You should mount the "cgroup" filesystem type in order to enable +browsing and modifying the cpusets presently known to the kernel. No +new system calls are added for cpusets - all support for querying and +modifying cpusets is via this cpuset file system. The /proc/<pid>/status file for each task has two added lines, displaying the tasks cpus_allowed (on which CPUs it may be scheduled) @@ -163,16 +160,15 @@ in the format seen in the following example: Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff Mems_allowed: ffffffff,ffffffff -Each cpuset is represented by a directory in the cpuset file system -containing the following files describing that cpuset: +Each cpuset is represented by a directory in the cgroup file system +containing (on top of the standard cgroup files) the following +files describing that cpuset: - cpus: list of CPUs in that cpuset - mems: list of Memory Nodes in that cpuset - memory_migrate flag: if set, move pages to cpusets nodes - cpu_exclusive flag: is cpu placement exclusive? - mem_exclusive flag: is memory placement exclusive? - - tasks: list of tasks (by pid) attached to that cpuset - - notify_on_release flag: run /sbin/cpuset_release_agent on exit? - memory_pressure: measure of how much paging pressure in cpuset In addition, the root cpuset only has the following file: @@ -237,21 +233,7 @@ such as requests from interrupt handlers, is allowed to be taken outside even a mem_exclusive cpuset. -1.5 What does notify_on_release do ? ------------------------------------- - -If the notify_on_release flag is enabled (1) in a cpuset, then whenever -the last task in the cpuset leaves (exits or attaches to some other -cpuset) and the last child cpuset of that cpuset is removed, then -the kernel runs the command /sbin/cpuset_release_agent, supplying the -pathname (relative to the mount point of the cpuset file system) of the -abandoned cpuset. This enables automatic removal of abandoned cpusets. -The default value of notify_on_release in the root cpuset at system -boot is disabled (0). The default value of other cpusets at creation -is the current value of their parents notify_on_release setting. - - -1.6 What is memory_pressure ? +1.5 What is memory_pressure ? ----------------------------- The memory_pressure of a cpuset provides a simple per-cpuset metric of the rate that the tasks in a cpuset are attempting to free up in @@ -308,7 +290,7 @@ the tasks in the cpuset, in units of reclaims attempted per second, times 1000. -1.7 What is memory spread ? +1.6 What is memory spread ? --------------------------- There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in @@ -378,6 +360,142 @@ policy, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the jobs cpuset can become very uneven. +1.7 What is sched_load_balance ? +-------------------------------- + +The kernel scheduler (kernel/sched.c) automatically load balances +tasks. If one CPU is underutilized, kernel code running on that +CPU will look for tasks on other more overloaded CPUs and move those +tasks to itself, within the constraints of such placement mechanisms +as cpusets and sched_setaffinity. + +The algorithmic cost of load balancing and its impact on key shared +kernel data structures such as the task list increases more than +linearly with the number of CPUs being balanced. So the scheduler +has support to partition the systems CPUs into a number of sched +domains such that it only load balances within each sched domain. +Each sched domain covers some subset of the CPUs in the system; +no two sched domains overlap; some CPUs might not be in any sched +domain and hence won't be load balanced. + +Put simply, it costs less to balance between two smaller sched domains +than one big one, but doing so means that overloads in one of the +two domains won't be load balanced to the other one. + +By default, there is one sched domain covering all CPUs, except those +marked isolated using the kernel boot time "isolcpus=" argument. + +This default load balancing across all CPUs is not well suited for +the following two situations: + 1) On large systems, load balancing across many CPUs is expensive. + If the system is managed using cpusets to place independent jobs + on separate sets of CPUs, full load balancing is unnecessary. + 2) Systems supporting realtime on some CPUs need to minimize + system overhead on those CPUs, including avoiding task load + balancing if that is not needed. + +When the per-cpuset flag "sched_load_balance" is enabled (the default +setting), it requests that all the CPUs in that cpusets allowed 'cpus' +be contained in a single sched domain, ensuring that load balancing +can move a task (not otherwised pinned, as by sched_setaffinity) +from any CPU in that cpuset to any other. + +When the per-cpuset flag "sched_load_balance" is disabled, then the +scheduler will avoid load balancing across the CPUs in that cpuset, +--except-- in so far as is necessary because some overlapping cpuset +has "sched_load_balance" enabled. + +So, for example, if the top cpuset has the flag "sched_load_balance" +enabled, then the scheduler will have one sched domain covering all +CPUs, and the setting of the "sched_load_balance" flag in any other +cpusets won't matter, as we're already fully load balancing. + +Therefore in the above two situations, the top cpuset flag +"sched_load_balance" should be disabled, and only some of the smaller, +child cpusets have this flag enabled. + +When doing this, you don't usually want to leave any unpinned tasks in +the top cpuset that might use non-trivial amounts of CPU, as such tasks +may be artificially constrained to some subset of CPUs, depending on +the particulars of this flag setting in descendent cpusets. Even if +such a task could use spare CPU cycles in some other CPUs, the kernel +scheduler might not consider the possibility of load balancing that +task to that underused CPU. + +Of course, tasks pinned to a particular CPU can be left in a cpuset +that disables "sched_load_balance" as those tasks aren't going anywhere +else anyway. + +There is an impedance mismatch here, between cpusets and sched domains. +Cpusets are hierarchical and nest. Sched domains are flat; they don't +overlap and each CPU is in at most one sched domain. + +It is necessary for sched domains to be flat because load balancing +across partially overlapping sets of CPUs would risk unstable dynamics +that would be beyond our understanding. So if each of two partially +overlapping cpusets enables the flag 'sched_load_balance', then we +form a single sched domain that is a superset of both. We won't move +a task to a CPU outside it cpuset, but the scheduler load balancing +code might waste some compute cycles considering that possibility. + +This mismatch is why there is not a simple one-to-one relation +between which cpusets have the flag "sched_load_balance" enabled, +and the sched domain configuration. If a cpuset enables the flag, it +will get balancing across all its CPUs, but if it disables the flag, +it will only be assured of no load balancing if no other overlapping +cpuset enables the flag. + +If two cpusets have partially overlapping 'cpus' allowed, and only +one of them has this flag enabled, then the other may find its +tasks only partially load balanced, just on the overlapping CPUs. +This is just the general case of the top_cpuset example given a few +paragraphs above. In the general case, as in the top cpuset case, +don't leave tasks that might use non-trivial amounts of CPU in +such partially load balanced cpusets, as they may be artificially +constrained to some subset of the CPUs allowed to them, for lack of +load balancing to the other CPUs. + +1.7.1 sched_load_balance implementation details. +------------------------------------------------ + +The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary +to most cpuset flags.) When enabled for a cpuset, the kernel will +ensure that it can load balance across all the CPUs in that cpuset +(makes sure that all the CPUs in the cpus_allowed of that cpuset are +in the same sched domain.) + +If two overlapping cpusets both have 'sched_load_balance' enabled, +then they will be (must be) both in the same sched domain. + +If, as is the default, the top cpuset has 'sched_load_balance' enabled, +then by the above that means there is a single sched domain covering +the whole system, regardless of any other cpuset settings. + +The kernel commits to user space that it will avoid load balancing +where it can. It will pick as fine a granularity partition of sched +domains as it can while still providing load balancing for any set +of CPUs allowed to a cpuset having 'sched_load_balance' enabled. + +The internal kernel cpuset to scheduler interface passes from the +cpuset code to the scheduler code a partition of the load balanced +CPUs in the system. This partition is a set of subsets (represented +as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all +the CPUs that must be load balanced. + +Whenever the 'sched_load_balance' flag changes, or CPUs come or go +from a cpuset with this flag enabled, or a cpuset with this flag +enabled is removed, the cpuset code builds a new such partition and +passes it to the scheduler sched domain setup code, to have the sched +domains rebuilt as necessary. + +This partition exactly defines what sched domains the scheduler should +setup - one sched domain for each element (cpumask_t) in the partition. + +The scheduler remembers the currently active sched domain partitions. +When the scheduler routine partition_sched_domains() is invoked from +the cpuset code to update these sched domains, it compares the new +partition requested with the current, and updates its sched domains, +removing the old and adding the new, for each change. 1.8 How do I use cpusets ? -------------------------- @@ -469,7 +587,7 @@ than stress the kernel. To start a new job that is to be contained within a cpuset, the steps are: 1) mkdir /dev/cpuset - 2) mount -t cpuset none /dev/cpuset + 2) mount -t cgroup -ocpuset cpuset /dev/cpuset 3) Create the new cpuset by doing mkdir's and write's (or echo's) in the /dev/cpuset virtual file system. 4) Start a task that will be the "founding father" of the new job. @@ -481,7 +599,7 @@ For example, the following sequence of commands will setup a cpuset named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, and then start a subshell 'sh' in that cpuset: - mount -t cpuset none /dev/cpuset + mount -t cgroup -ocpuset cpuset /dev/cpuset cd /dev/cpuset mkdir Charlie cd Charlie @@ -513,7 +631,7 @@ Creating, modifying, using the cpusets can be done through the cpuset virtual filesystem. To mount it, type: -# mount -t cpuset none /dev/cpuset +# mount -t cgroup -o cpuset cpuset /dev/cpuset Then under /dev/cpuset you can find a tree that corresponds to the tree of the cpusets in the system. For instance, /dev/cpuset @@ -556,6 +674,18 @@ To remove a cpuset, just use rmdir: This will fail if the cpuset is in use (has cpusets inside, or has processes attached). +Note that for legacy reasons, the "cpuset" filesystem exists as a +wrapper around the cgroup filesystem. + +The command + +mount -t cpuset X /dev/cpuset + +is equivalent to + +mount -t cgroup -ocpuset X /dev/cpuset +echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent + 2.2 Adding/removing cpus ------------------------ diff --git a/Documentation/device-mapper/dm-uevent.txt b/Documentation/device-mapper/dm-uevent.txt new file mode 100644 index 000000000000..07edbd85c714 --- /dev/null +++ b/Documentation/device-mapper/dm-uevent.txt @@ -0,0 +1,97 @@ +The device-mapper uevent code adds the capability to device-mapper to create +and send kobject uevents (uevents). Previously device-mapper events were only +available through the ioctl interface. The advantage of the uevents interface +is the event contains environment attributes providing increased context for +the event avoiding the need to query the state of the device-mapper device after +the event is received. + +There are two functions currently for device-mapper events. The first function +listed creates the event and the second function sends the event(s). + +void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti, + const char *path, unsigned nr_valid_paths) + +void dm_send_uevents(struct list_head *events, struct kobject *kobj) + + +The variables added to the uevent environment are: + +Variable Name: DM_TARGET +Uevent Action(s): KOBJ_CHANGE +Type: string +Description: +Value: Name of device-mapper target that generated the event. + +Variable Name: DM_ACTION +Uevent Action(s): KOBJ_CHANGE +Type: string +Description: +Value: Device-mapper specific action that caused the uevent action. + PATH_FAILED - A path has failed. + PATH_REINSTATED - A path has been reinstated. + +Variable Name: DM_SEQNUM +Uevent Action(s): KOBJ_CHANGE +Type: unsigned integer +Description: A sequence number for this specific device-mapper device. +Value: Valid unsigned integer range. + +Variable Name: DM_PATH +Uevent Action(s): KOBJ_CHANGE +Type: string +Description: Major and minor number of the path device pertaining to this +event. +Value: Path name in the form of "Major:Minor" + +Variable Name: DM_NR_VALID_PATHS +Uevent Action(s): KOBJ_CHANGE +Type: unsigned integer +Description: +Value: Valid unsigned integer range. + +Variable Name: DM_NAME +Uevent Action(s): KOBJ_CHANGE +Type: string +Description: Name of the device-mapper device. +Value: Name + +Variable Name: DM_UUID +Uevent Action(s): KOBJ_CHANGE +Type: string +Description: UUID of the device-mapper device. +Value: UUID. (Empty string if there isn't one.) + +An example of the uevents generated as captured by udevmonitor is shown +below. + +1.) Path failure. +UEVENT[1192521009.711215] change@/block/dm-3 +ACTION=change +DEVPATH=/block/dm-3 +SUBSYSTEM=block +DM_TARGET=multipath +DM_ACTION=PATH_FAILED +DM_SEQNUM=1 +DM_PATH=8:32 +DM_NR_VALID_PATHS=0 +DM_NAME=mpath2 +DM_UUID=mpath-35333333000002328 +MINOR=3 +MAJOR=253 +SEQNUM=1130 + +2.) Path reinstate. +UEVENT[1192521132.989927] change@/block/dm-3 +ACTION=change +DEVPATH=/block/dm-3 +SUBSYSTEM=block +DM_TARGET=multipath +DM_ACTION=PATH_REINSTATED +DM_SEQNUM=2 +DM_PATH=8:32 +DM_NR_VALID_PATHS=1 +DM_NAME=mpath2 +DM_UUID=mpath-35333333000002328 +MINOR=3 +MAJOR=253 +SEQNUM=1131 diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 6c46730c631a..e6244cde26e9 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -2188,7 +2188,7 @@ Your cooperation is appreciated. 136-143 char Unix98 PTY slaves 0 = /dev/pts/0 First Unix98 pseudo-TTY - 1 = /dev/pts/1 Second Unix98 pesudo-TTY + 1 = /dev/pts/1 Second Unix98 pseudo-TTY ... These device nodes are automatically generated with diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index 8569072fa387..387b8a720f4a 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -32,7 +32,7 @@ braindamaged document, if it's finally working, well, it's working. For one reason or another, low level drivers don't receive as much attention or testing as core code, and bugs on driver detach or -initilaization failure doesn't happen often enough to be noticeable. +initialization failure don't happen often enough to be noticeable. Init failure path is worse because it's much less travelled while needs to handle multiple entry points. @@ -160,7 +160,7 @@ resources on failure. For example, devres_release_group(dev, NULL); return err_code; -As resource acquision failure usually means probe failure, constructs +As resource acquisition failure usually means probe failure, constructs like above are usually useful in midlayer driver (e.g. libata core layer) where interface function shouldn't have side effect on failure. For LLDs, just returning error code suffices in most cases. diff --git a/Documentation/fb/deferred_io.txt b/Documentation/fb/deferred_io.txt index 73cf9fb7cf60..63883a892120 100644 --- a/Documentation/fb/deferred_io.txt +++ b/Documentation/fb/deferred_io.txt @@ -3,7 +3,7 @@ Deferred IO Deferred IO is a way to delay and repurpose IO. It uses host memory as a buffer and the MMU pagefault as a pretrigger for when to perform the device -IO. The following example may be a useful explaination of how one such setup +IO. The following example may be a useful explanation of how one such setup works: - userspace app like Xfbdev mmaps framebuffer @@ -28,7 +28,7 @@ a relatively more expensive operation. For some types of nonvolatile high latency displays, the desired image is the final image rather than the intermediate stages which is why it's okay -to not update for each write that is occuring. +to not update for each write that is occurring. It may be the case that this is useful in other scenarios as well. Paul Mundt has mentioned a case where it is beneficial to use the page count to decide diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 280ec06573e6..6bb9be54ab76 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -14,18 +14,6 @@ Who: Jiri Slaby <jirislaby@gmail.com> --------------------------- -What: V4L2 VIDIOC_G_MPEGCOMP and VIDIOC_S_MPEGCOMP -When: October 2007 -Why: Broken attempt to set MPEG compression parameters. These ioctls are - not able to implement the wide variety of parameters that can be set - by hardware MPEG encoders. A new MPEG control mechanism was created - in kernel 2.6.18 that replaces these ioctls. See the V4L2 specification - (section 1.9: Extended controls) for more information on this topic. -Who: Hans Verkuil <hverkuil@xs4all.nl> and - Mauro Carvalho Chehab <mchehab@infradead.org> - ---------------------------- - What: dev->power.power_state When: July 2007 Why: Broken design for runtime control over driver power states, confusing @@ -49,10 +37,10 @@ Who: David Miller <davem@davemloft.net> --------------------------- What: Video4Linux API 1 ioctls and video_decoder.h from Video devices. -When: December 2006 -Files: include/linux/video_decoder.h -Check: include/linux/video_decoder.h -Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6 +When: December 2008 +Files: include/linux/video_decoder.h include/linux/videodev.h +Check: include/linux/video_decoder.h include/linux/videodev.h +Why: V4L1 AP1 was replaced by V4L2 API during migration from 2.4 to 2.6 series. The old API have lots of drawbacks and don't provide enough means to work with all video and audio standards. The newer API is already available on the main drivers and should be used instead. @@ -61,7 +49,9 @@ Why: V4L1 AP1 was replaced by V4L2 API. during migration from 2.4 to 2.6 Decoder iocts are using internally to allow video drivers to communicate with video decoders. This should also be improved to allow V4L2 calls being translated into compatible internal ioctls. -Who: Mauro Carvalho Chehab <mchehab@brturbo.com.br> + Compatibility ioctls will be provided, for a while, via + v4l1-compat module. +Who: Mauro Carvalho Chehab <mchehab@infradead.org> --------------------------- @@ -82,6 +72,41 @@ Who: Dominik Brodowski <linux@brodo.de> --------------------------- +What: sys_sysctl +When: September 2010 +Option: CONFIG_SYSCTL_SYSCALL +Why: The same information is available in a more convenient from + /proc/sys, and none of the sysctl variables appear to be + important performance wise. + + Binary sysctls are a long standing source of subtle kernel + bugs and security issues. + + When I looked several months ago all I could find after + searching several distributions were 5 user space programs and + glibc (which falls back to /proc/sys) using this syscall. + + The man page for sysctl(2) documents it as unusable for user + space programs. + + sysctl(2) is not generally ABI compatible to a 32bit user + space application on a 64bit and a 32bit kernel. + + For the last several months the policy has been no new binary + sysctls and no one has put forward an argument to use them. + + Binary sysctls issues seem to keep happening appearing so + properly deprecating them (with a warning to user space) and a + 2 year grace warning period will mean eventually we can kill + them and end the pain. + + In the mean time individual binary sysctls can be dealt with + in a piecewise fashion. + +Who: Eric Biederman <ebiederm@xmission.com> + +--------------------------- + What: a.out interpreter support for ELF executables When: 2.6.25 Files: fs/binfmt_elf.c @@ -184,13 +209,6 @@ Who: Jean Delvare <khali@linux-fr.org>, --------------------------- -What: drivers depending on OBSOLETE_OSS -When: options in 2.6.22, code in 2.6.24 -Why: OSS drivers with ALSA replacements -Who: Adrian Bunk <bunk@stusta.de> - ---------------------------- - What: ACPI procfs interface When: July 2008 Why: ACPI sysfs conversion should be finished by January 2008. diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index d6fd6c6e4244..bf8080640eba 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -42,10 +42,12 @@ OPTIONS trans=name select an alternative transport. Valid options are currently: - unix - specifying a named pipe mount point - tcp - specifying a normal TCP/IP connection - fd - used passed file descriptors for connection + unix - specifying a named pipe mount point + tcp - specifying a normal TCP/IP connection + fd - used passed file descriptors for connection (see rfdno and wfdno) + virtio - connect to the next virtio channel available + (from lguest or KVM with trans_virtio module) uname=name user name to attempt mount as on the remote server. The server may override or ignore this value. Certain user @@ -54,7 +56,7 @@ OPTIONS aname=name aname specifies the file tree to access when the server is offering several exported file systems. - cache=mode specifies a cacheing policy. By default, no caches are used. + cache=mode specifies a caching policy. By default, no caches are used. loose = no attempts are made at consistency, intended for exclusive, read-only mounts diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/Exporting index 31047e0fe14b..87019d2b5981 100644 --- a/Documentation/filesystems/Exporting +++ b/Documentation/filesystems/Exporting @@ -2,9 +2,12 @@ Making Filesystems Exportable ============================= -Most filesystem operations require a dentry (or two) as a starting +Overview +-------- + +All filesystem operations require a dentry (or two) as a starting point. Local applications have a reference-counted hold on suitable -dentrys via open file descriptors or cwd/root. However remote +dentries via open file descriptors or cwd/root. However remote applications that access a filesystem via a remote filesystem protocol such as NFS may not be able to hold such a reference, and so need a different way to refer to a particular dentry. As the alternative @@ -13,14 +16,14 @@ server-reboot (among other things, though these tend to be the most problematic), there is no simple answer like 'filename'. The mechanism discussed here allows each filesystem implementation to -specify how to generate an opaque (out side of the filesystem) byte +specify how to generate an opaque (outside of the filesystem) byte string for any dentry, and how to find an appropriate dentry for any given opaque byte string. This byte string will be called a "filehandle fragment" as it corresponds to part of an NFS filehandle. A filesystem which supports the mapping between filehandle fragments -and dentrys will be termed "exportable". +and dentries will be termed "exportable". @@ -89,11 +92,9 @@ For a filesystem to be exportable it must: 1/ provide the filehandle fragment routines described below. 2/ make sure that d_splice_alias is used rather than d_add when ->lookup finds an inode for a given parent and name. - Typically the ->lookup routine will end: - if (inode) - return d_splice(inode, dentry); - d_add(dentry, inode); - return NULL; + Typically the ->lookup routine will end with a: + + return d_splice_alias(inode, dentry); } @@ -101,67 +102,39 @@ For a filesystem to be exportable it must: A file system implementation declares that instances of the filesystem are exportable by setting the s_export_op field in the struct super_block. This field must point to a "struct export_operations" -struct which could potentially be full of NULLs, though normally at -least get_parent will be set. - - The primary operations are decode_fh and encode_fh. -decode_fh takes a filehandle fragment and tries to find or create a -dentry for the object referred to by the filehandle. -encode_fh takes a dentry and creates a filehandle fragment which can -later be used to find/create a dentry for the same object. - -decode_fh will probably make use of "find_exported_dentry". -This function lives in the "exportfs" module which a filesystem does -not need unless it is being exported. So rather that calling -find_exported_dentry directly, each filesystem should call it through -the find_exported_dentry pointer in it's export_operations table. -This field is set correctly by the exporting agent (e.g. nfsd) when a -filesystem is exported, and before any export operations are called. - -find_exported_dentry needs three support functions from the -filesystem: - get_name. When given a parent dentry and a child dentry, this - should find a name in the directory identified by the parent - dentry, which leads to the object identified by the child dentry. - If no get_name function is supplied, a default implementation is - provided which uses vfs_readdir to find potential names, and - matches inode numbers to find the correct match. - - get_parent. When given a dentry for a directory, this should return - a dentry for the parent. Quite possibly the parent dentry will - have been allocated by d_alloc_anon. - The default get_parent function just returns an error so any - filehandle lookup that requires finding a parent will fail. - ->lookup("..") is *not* used as a default as it can leave ".." - entries in the dcache which are too messy to work with. - - get_dentry. When given an opaque datum, this should find the - implied object and create a dentry for it (possibly with - d_alloc_anon). - The opaque datum is whatever is passed down by the decode_fh - function, and is often simply a fragment of the filehandle - fragment. - decode_fh passes two datums through find_exported_dentry. One that - should be used to identify the target object, and one that can be - used to identify the object's parent, should that be necessary. - The default get_dentry function assumes that the datum contains an - inode number and a generation number, and it attempts to get the - inode using "iget" and check it's validity by matching the - generation number. A filesystem should only depend on the default - if iget can safely be used this way. - -If decode_fh and/or encode_fh are left as NULL, then default -implementations are used. These defaults are suitable for ext2 and -extremely similar filesystems (like ext3). - -The default encode_fh creates a filehandle fragment from the inode -number and generation number of the target together with the inode -number and generation number of the parent (if the parent is -required). - -The default decode_fh extract the target and parent datums from the -filehandle assuming the format used by the default encode_fh and -passed them to find_exported_dentry. +struct which has the following members: + + encode_fh (optional) + Takes a dentry and creates a filehandle fragment which can later be used + to find or create a dentry for the same object. The default + implementation creates a filehandle fragment that encodes a 32bit inode + and generation number for the inode encoded, and if necessary the + same information for the parent. + + fh_to_dentry (mandatory) + Given a filehandle fragment, this should find the implied object and + create a dentry for it (possibly with d_alloc_anon). + + fh_to_parent (optional but strongly recommended) + Given a filehandle fragment, this should find the parent of the + implied object and create a dentry for it (possibly with d_alloc_anon). + May fail if the filehandle fragment is too small. + + get_parent (optional but strongly recommended) + When given a dentry for a directory, this should return a dentry for + the parent. Quite possibly the parent dentry will have been allocated + by d_alloc_anon. The default get_parent function just returns an error + so any filehandle lookup that requires finding a parent will fail. + ->lookup("..") is *not* used as a default as it can leave ".." entries + in the dcache which are too messy to work with. + + get_name (optional) + When given a parent dentry and a child dentry, this should find a name + in the directory identified by the parent dentry, which leads to the + object identified by the child dentry. If no get_name function is + supplied, a default implementation is provided which uses vfs_readdir + to find potential names, and matches inode numbers to find the correct + match. A filehandle fragment consists of an array of 1 or more 4byte words, @@ -172,5 +145,3 @@ generated by encode_fh, in which case it will have been padded with nuls. Rather, the encode_fh routine should choose a "type" which indicates the decode_fh how much of the filehandle is valid, and how it should be interpreted. - - diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fe26cc978523..37c10cba7177 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -224,7 +224,7 @@ against the page the filesystem should redirty the page with redirty_page_for_writepage(), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. -If the filesytem is called for sync then it must wait on any +If the filesystem is called for sync then it must wait on any in-progress I/O and then start new I/O. The filesystem should unlock the page synchronously, before returning to the diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 4aecc9bdb273..b45f3c1b8b43 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -130,12 +130,12 @@ Device layer. Journaling Block Device layer ----------------------------- -The Journaling Block Device layer (JBD) isn't ext3 specific. It was design to -add journaling capabilities on a block device. The ext3 filesystem code will -inform the JBD of modifications it is performing (called a transaction). The -journal supports the transactions start and stop, and in case of crash, the -journal can replayed the transactions to put the partition back in a -consistent state fast. +The Journaling Block Device layer (JBD) isn't ext3 specific. It was designed +to add journaling capabilities to a block device. The ext3 filesystem code +will inform the JBD of modifications it is performing (called a transaction). +The journal supports the transactions start and stop, and in case of a crash, +the journal can replay the transactions to quickly put the partition back into +a consistent state. Handles represent a single atomic update to a filesystem. JBD can handle an external journal on a block device. @@ -164,7 +164,7 @@ written to the journal first, and then to its final location. In the event of a crash, the journal can be replayed, bringing both data and metadata into a consistent state. This mode is the slowest except when data needs to be read from and written to disk at the same time where it -outperforms all others modes. +outperforms all other modes. Compatibility ------------- diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt index 133e213ebb72..bb0142f61084 100644 --- a/Documentation/filesystems/files.txt +++ b/Documentation/filesystems/files.txt @@ -76,13 +76,13 @@ the fdtable structure - 5. Handling of the file structures is special. Since the look-up of the fd (fget()/fget_light()) are lock-free, it is possible that look-up may race with the last put() operation on the - file structure. This is avoided using the rcuref APIs + file structure. This is avoided using atomic_inc_not_zero() on ->f_count : rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (rcuref_inc_lf(&file->f_count)) + if (atomic_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -92,7 +92,7 @@ the fdtable structure - .... return file; - rcuref_inc_lf() detects if refcounts is already zero or + atomic_inc_not_zero() detects if refcounts is already zero or goes to zero during increment. If it does, we fail fget()/fget_light(). diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e5c1df52a876..dec99455321f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -813,9 +813,9 @@ Various pieces of information about kernel activity are available in the since the system first booted. For a quick look, simply cat the file: > cat /proc/stat - cpu 2255 34 2290 22625563 6290 127 456 - cpu0 1132 34 1441 11311718 3675 127 438 - cpu1 1123 0 849 11313845 2614 0 18 + cpu 2255 34 2290 22625563 6290 127 456 0 + cpu0 1132 34 1441 11311718 3675 127 438 0 + cpu1 1123 0 849 11313845 2614 0 18 0 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] ctxt 1990473 btime 1062191376 @@ -835,6 +835,7 @@ second). The meanings of the columns are as follows, from left to right: - iowait: waiting for I/O to complete - irq: servicing interrupts - softirq: servicing softirqs +- steal: involuntary wait The "intr" line gives counts of interrupts serviced since boot time, for each of the possible system interrupts. The first column is the total of all diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 4b5ca26e5048..4598ef7b622b 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -51,7 +51,7 @@ for the attributes, providing a means to read and write kernel attributes. Attributes should be ASCII text files, preferably with only one value -per file. It is noted that it may not be efficient to contain only +per file. It is noted that it may not be efficient to contain only one value per file, so it is socially acceptable to express an array of values of the same type. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 6f8e16e3d6c0..9d019d35728f 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -706,7 +706,7 @@ struct address_space_operations { wants to make it a free page. If ->releasepage succeeds, the page will be removed from the address_space and become free. - The second case if when a request has been made to invalidate + The second case is when a request has been made to invalidate some or all pages in an address_space. This can happen through the fadvice(POSIX_FADV_DONTNEED) system call or by the filesystem explicitly requesting it as nfs and 9fs do (when diff --git a/Documentation/i2c/i2c-protocol b/Documentation/i2c/i2c-protocol index 579b92d5f3a3..10518dd58814 100644 --- a/Documentation/i2c/i2c-protocol +++ b/Documentation/i2c/i2c-protocol @@ -68,7 +68,7 @@ We have found some I2C devices that needs the following modifications: Flags I2C_M_IGNORE_NAK Normally message is interrupted immediately if there is [NA] from the - client. Setting this flag treats any [NA] as [A], and all of + client. Setting this flag treats any [NA] as [A], and all of message is sent. These messages may still fail to SCL lo->hi timeout. diff --git a/Documentation/i386/boot.txt b/Documentation/i386/boot.txt index 35985b34d5a6..2f75e750e4f5 100644 --- a/Documentation/i386/boot.txt +++ b/Documentation/i386/boot.txt @@ -168,6 +168,8 @@ Offset Proto Name Meaning 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 0235/3 N/A pad2 Unused 0238/4 2.06+ cmdline_size Maximum size of the kernel command line +023C/4 2.07+ hardware_subarch Hardware subarchitecture +0240/8 2.07+ hardware_subarch_data Subarchitecture-specific data (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -204,7 +206,7 @@ boot loaders can ignore those fields. The byte order of all fields is littleendian (this is x86, after all.) -Field name: setup_secs +Field name: setup_sects Type: read Offset/size: 0x1f1/1 Protocol: ALL @@ -356,6 +358,13 @@ Protocol: 2.00+ - If 0, the protected-mode code is loaded at 0x10000. - If 1, the protected-mode code is loaded at 0x100000. + Bit 6 (write): KEEP_SEGMENTS + Protocol: 2.07+ + - if 0, reload the segment registers in the 32bit entry point. + - if 1, do not reload the segment registers in the 32bit entry point. + Assume that %cs %ds %ss %es are all set to flat segments with + a base of 0 (or the equivalent for their environment). + Bit 7 (write): CAN_USE_HEAP Set this bit to 1 to indicate that the value entered in the heap_end_ptr is valid. If this field is clear, some setup code @@ -480,6 +489,29 @@ Protocol: 2.06+ cmdline_size characters. With protocol version 2.05 and earlier, the maximum size was 255. +Field name: hardware_subarch +Type: write +Offset/size: 0x23c/4 +Protocol: 2.07+ + + In a paravirtualized environment the hardware low level architectural + pieces such as interrupt handling, page table handling, and + accessing process control registers needs to be done differently. + + This field allows the bootloader to inform the kernel we are in one + one of those environments. + + 0x00000000 The default x86/PC environment + 0x00000001 lguest + 0x00000002 Xen + +Field name: hardware_subarch_data +Type: write +Offset/size: 0x240/8 +Protocol: 2.07+ + + A pointer to data that is specific to hardware subarch + **** THE KERNEL COMMAND LINE diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt index 6449a7090dbb..223e4f0582d0 100644 --- a/Documentation/ia64/err_inject.txt +++ b/Documentation/ia64/err_inject.txt @@ -21,10 +21,10 @@ software test suits to do stressful testing on IPF. Below is a sample application as part of the whole tool. The sample can be used as a working test tool. Or it can be expanded to include -more features. It also can be a integrated into a libary or other user +more features. It also can be a integrated into a library or other user application to have more thorough test. -The sample application takes err.conf as error configuation input. Gcc +The sample application takes err.conf as error configuration input. GCC compiles the code. After you install err_inject driver, you can run this sample application to inject errors. @@ -809,7 +809,7 @@ int err_inj() } /* Create semaphore: If one_lock, one semaphore for all processors. - Otherwise, one sempaphore for each processor. */ + Otherwise, one semaphore for each processor. */ if (one_lock) { if (create_sem(0)) { printf("Can not create semaphore...exit\n"); diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt index ab050621e20f..f3a3ba8847ba 100644 --- a/Documentation/input/atarikbd.txt +++ b/Documentation/input/atarikbd.txt @@ -170,7 +170,7 @@ major controller faults (ROM checksum and RAM test) and such things as stuck keys. Any keys down at power-up are presumed to be stuck, and their BREAK (sic) code is returned (which without the preceding MAKE code is a flag for a keyboard error). If the controller self-test completes without error, the code -0xF0 is returned. (This code will be used to indicate the version/rlease of +0xF0 is returned. (This code will be used to indicate the version/release of the ikbd controller. The first release of the ikbd is version 0xF0, should there be a second release it will be 0xF1, and so on.) The ikbd defaults to a mouse position reporting with threshold of 1 unit in @@ -413,7 +413,7 @@ INTERROGATION MODE. %nnnnmmmm ; where m is JOYSTICK1 state ; and n is JOYSTICK0 state -Sets the ikbd to do nothing but monitor the serial command lne, maintain the +Sets the ikbd to do nothing but monitor the serial command line, maintain the time-of-day clock, and monitor the joystick. The rate sets the interval between joystick samples. N.B. The user should not set the rate higher than the serial communications @@ -446,10 +446,10 @@ The sample interval should be as constant as possible. ; until vertical cursor key is generated before RY ; has elapsed VX ; length (in tenths of seconds) of joystick closure - ; until horizontal cursor keystokes are generated + ; until horizontal cursor keystrokes are generated ; after RX has elapsed VY ; length (in tenths of seconds) of joystick closure - ; until vertical cursor keystokes are generated + ; until vertical cursor keystrokes are generated ; after RY has elapsed In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes. diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt index 085eb15b45b7..ded4d5f53109 100644 --- a/Documentation/input/ff.txt +++ b/Documentation/input/ff.txt @@ -1,5 +1,5 @@ Force feedback for Linux. -By Johann Deneux <deneux@ifrance.com> on 2001/04/22. +By Johann Deneux <johann.deneux@gmail.com> on 2001/04/22. Updated by Anssi Hannula <anssi.hannula@gmail.com> on 2006/04/09. You may redistribute this file. Please remember to include shape.fig and interactive.fig as well. diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt index 8777d2d321e3..3ac92413c874 100644 --- a/Documentation/input/iforce-protocol.txt +++ b/Documentation/input/iforce-protocol.txt @@ -4,10 +4,10 @@ specify force effects to I-Force 2.0 devices. None of this information comes from Immerse. That's why you should not trust what is written in this document. This document is intended to help understanding the protocol. This is not a reference. Comments and corrections are welcome. To contact me, -send an email to: deneux@ifrance.com +send an email to: johann.deneux@gmail.com ** WARNING ** -I may not be held responsible for any dammage or harm caused if you try to +I shall not be held responsible for any damage or harm caused if you try to send data to your I-Force device based on what you read in this document. ** Preliminary Notes: @@ -151,13 +151,13 @@ OP= ff Query command. Length varies according to the query type. The general format of this packet is: ff 01 QUERY [INDEX] CHECKSUM -reponses are of the same form: +responses are of the same form: FF LEN QUERY VALUE_QUERIED CHECKSUM2 where LEN = 1 + length(VALUE_QUERIED) **** Query ram size **** QUERY = 42 ('B'uffer size) -The device should reply with the same packet plus two additionnal bytes +The device should reply with the same packet plus two additional bytes containing the size of the memory: ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. @@ -234,19 +234,23 @@ is the amount of memory apparently needed for every set of parameters: ** Appendix: How to study the protocol ? ** -1. Generate effects using the force editor provided with the DirectX SDK, or use Immersion Studio (freely available at their web site in the developer section: www.immersion.com) -2. Start a soft spying RS232 or USB (depending on where you connected your joystick/wheel). I used ComPortSpy from fCoder (alpha version!) +1. Generate effects using the force editor provided with the DirectX SDK, or +use Immersion Studio (freely available at their web site in the developer section: +www.immersion.com) +2. Start a soft spying RS232 or USB (depending on where you connected your +joystick/wheel). I used ComPortSpy from fCoder (alpha version!) 3. Play the effect, and watch what happens on the spy screen. A few words about ComPortSpy: -At first glance, this soft seems, hum, well... buggy. In fact, data appear with a few seconds latency. Personnaly, I restart it every time I play an effect. +At first glance, this software seems, hum, well... buggy. In fact, data appear with a +few seconds latency. Personally, I restart it every time I play an effect. Remember it's free (as in free beer) and alpha! ** URLS ** Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy. ** Author of this document ** -Johann Deneux <deneux@ifrance.com> +Johann Deneux <johann.deneux@gmail.com> Home page at http://www.esil.univ-mrs.fr/~jdeneux/projects/ff/ Additions by Vojtech Pavlik. diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt index d9d523099bb7..47fc86830cd7 100644 --- a/Documentation/input/input-programming.txt +++ b/Documentation/input/input-programming.txt @@ -42,8 +42,8 @@ static int __init button_init(void) goto err_free_irq; } - button_dev->evbit[0] = BIT(EV_KEY); - button_dev->keybit[LONG(BTN_0)] = BIT(BTN_0); + button_dev->evbit[0] = BIT_MASK(EV_KEY); + button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); error = input_register_device(button_dev); if (error) { @@ -79,7 +79,7 @@ In the _init function, which is called either upon module load or when booting the kernel, it grabs the required resources (it should also check for the presence of the device). -Then it allocates a new input device structure with input_aloocate_device() +Then it allocates a new input device structure with input_allocate_device() and sets up input bitfields. This way the device driver tells the other parts of the input systems what it is - what events can be generated or accepted by this input device. Our example device can only generate EV_KEY @@ -217,14 +217,15 @@ If you don't need absfuzz and absflat, you can set them to zero, which mean that the thing is precise and always returns to exactly the center position (if it has any). -1.4 NBITS(), LONG(), BIT() +1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK() ~~~~~~~~~~~~~~~~~~~~~~~~~~ -These three macros from input.h help some bitfield computations: +These three macros from bitops.h help some bitfield computations: - NBITS(x) - returns the length of a bitfield array in longs for x bits - LONG(x) - returns the index in the array in longs for bit x - BIT(x) - returns the index in a long for bit x + BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for + x bits + BIT_WORD(x) - returns the index in the array in longs for bit x + BIT_MASK(x) - returns the index in a long for bit x 1.5 The id* and name fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Documentation/isdn/CREDITS b/Documentation/isdn/CREDITS index 7c17c837064f..8cac6c2f23ee 100644 --- a/Documentation/isdn/CREDITS +++ b/Documentation/isdn/CREDITS @@ -40,7 +40,7 @@ Andreas Kool (akool@Kool.f.EUnet.de) Pedro Roque Marques (roque@di.fc.ul.pt) For lot of new ideas and the pcbit driver. -Eberhard Moenkeberg (emoenke@gwdg.de) +Eberhard Mönkeberg (emoenke@gwdg.de) For testing and help to get into kernel. Thomas Neumann (tn@ruhr.de) diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap index 2f114babe4b6..a76d74845a4c 100644 --- a/Documentation/isdn/README.concap +++ b/Documentation/isdn/README.concap @@ -111,7 +111,7 @@ struct concap_proto_ops{ struct concap_proto * (*proto_new) (void); /* delete encapsulation protocol instance and free all its resources. - cprot may no loger be referenced after calling this */ + cprot may no longer be referenced after calling this */ void (*proto_del)(struct concap_proto *cprot); /* initialize the protocol's data. To be called at interface startup diff --git a/Documentation/java.txt b/Documentation/java.txt index 3cce3fbb6644..e6a723281547 100644 --- a/Documentation/java.txt +++ b/Documentation/java.txt @@ -37,7 +37,7 @@ other program after you have done the following: or the following, if you want to be more selective: ':Applet:M::<!--applet::/usr/bin/appletviewer:' - Of cause you have to fix the path names. Given path/file names in this + Of course you have to fix the path names. The path/file names given in this document match the Debian 2.1 system. (i.e. jdk installed in /usr, custom wrappers from this document in /usr/local) diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt index fe8b0c4892cf..616043a6da99 100644 --- a/Documentation/kbuild/kconfig-language.txt +++ b/Documentation/kbuild/kconfig-language.txt @@ -77,7 +77,12 @@ applicable everywhere (see syntax). Optionally, dependencies only for this default value can be added with "if". -- dependencies: "depends on"/"requires" <expr> +- type definition + default value: + "def_bool"/"def_tristate" <expr> ["if" <expr>] + This is a shorthand notation for a type definition plus a value. + Optionally dependencies for this default value can be added with "if". + +- dependencies: "depends on" <expr> This defines a dependency for this menu entry. If multiple dependencies are defined, they are connected with '&&'. Dependencies are applied to all other options within this menu entry (which also @@ -289,3 +294,10 @@ source: "source" <prompt> This reads the specified configuration file. This file is always parsed. + +mainmenu: + + "mainmenu" <prompt> + +This sets the config program's title bar if the config program chooses +to use it. diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index f099b814d383..7a7753321a26 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -518,6 +518,28 @@ more details, with real examples. In this example for a specific GCC version the build will error out explaining to the user why it stops. + cc-cross-prefix + cc-cross-prefix is used to check if there exists a $(CC) in path with + one of the listed prefixes. The first prefix where there exist a + prefix$(CC) in the PATH is returned - and if no prefix$(CC) is found + then nothing is returned. + Additional prefixes are separated by a single space in the + call of cc-cross-prefix. + This functionality is useful for architecture Makefiles that try + to set CROSS_COMPILE to well-known values but may have several + values to select between. + It is recommended only to try to set CROSS_COMPILE if it is a cross + build (host arch is different from target arch). And if CROSS_COMPILE + is already set then leave it with the old value. + + Example: + #arch/m68k/Makefile + ifneq ($(SUBARCH),$(ARCH)) + ifeq ($(CROSS_COMPILE),) + CROSS_COMPILE := $(call cc-cross-prefix, m68k-linux-gnu-) + endif + endif + === 4 Host Program support Kbuild supports building executables on the host for use during the diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 1b37b28cc234..d0ac72cc19ff 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -231,6 +231,32 @@ Dump-capture kernel config options (Arch Dependent, ia64) any space below the alignment point will be wasted. +Extended crashkernel syntax +=========================== + +While the "crashkernel=size[@offset]" syntax is sufficient for most +configurations, sometimes it's handy to have the reserved memory dependent +on the value of System RAM -- that's mostly for distributors that pre-setup +the kernel command line to avoid a unbootable system after some memory has +been removed from the machine. + +The syntax is: + + crashkernel=<range1>:<size1>[,<range2>:<size2>,...][@offset] + range=start-[end] + +For example: + + crashkernel=512M-2G:64M,2G-:128M + +This would mean: + + 1) if the RAM is smaller than 512M, then don't reserve anything + (this is the "rescue" case) + 2) if the RAM size is between 512M and 2G, then reserve 64M + 3) if the RAM size is larger than 2G, then reserve 128M + + Boot into System Kernel ======================= diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt index d9e3b199929b..5a4ef48224ae 100644 --- a/Documentation/kernel-docs.txt +++ b/Documentation/kernel-docs.txt @@ -76,9 +76,9 @@ * Title: "Conceptual Architecture of the Linux Kernel" Author: Ivan T. Bowman. URL: http://plg.uwaterloo.ca/~itbowman/papers/CS746G-a1.html - Keywords: conceptual software arquitecture, extracted design, + Keywords: conceptual software architecture, extracted design, reverse engineering, system structure. - Description: Conceptual software arquitecture of the Linux kernel, + Description: Conceptual software architecture of the Linux kernel, automatically extracted from the source code. Very detailed. Good figures. Gives good overall kernel understanding. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 98cf90f2631d..b2361667839f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -222,9 +222,6 @@ and is between 256 and 4096 characters. It is defined in the file Warning: Many of these options can produce a lot of output and make your system unusable. Be very careful. - - acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT - acpi_pm_good [X86-32,X86-64] Override the pmtimer bug detection: force the kernel to assume that this machine's pmtimer latches its value @@ -297,9 +294,6 @@ and is between 256 and 4096 characters. It is defined in the file apm= [APM] Advanced Power Management See header of arch/i386/kernel/apm.c. - applicom= [HW] - Format: <mem>,<irq> - arcrimi= [HW,NET] ARCnet - "RIM I" (entirely mem-mapped) cards Format: <io>,<irq>,<nodeID> @@ -345,12 +339,6 @@ and is between 256 and 4096 characters. It is defined in the file Format: <io>,<irq>,<mode> See header of drivers/net/hamradio/baycom_ser_hdx.c. - blkmtd_device= [HW,MTD] - blkmtd_erasesz= - blkmtd_ro= - blkmtd_bs= - blkmtd_count= - boot_delay= Milliseconds to delay each printk during boot. Values larger than 10 seconds (10000) are changed to no delay (0). @@ -431,8 +419,10 @@ and is between 256 and 4096 characters. It is defined in the file over the 8254 in addition to over the IO-APIC. The kernel tries to set a sensible default. - hpet= [X86-32,HPET] option to disable HPET and use PIT. - Format: disable + hpet= [X86-32,HPET] option to control HPET usage + Format: { enable (default) | disable | force } + disable: disable HPET and use PIT instead + force: allow force enabled of undocumented chips (ICH4, VIA) com20020= [HW,NET] ARCnet - COM20020 chipset Format: @@ -479,6 +469,16 @@ and is between 256 and 4096 characters. It is defined in the file UART at the specified I/O port or MMIO address. The options are the same as for ttyS, above. + no_console_suspend + [HW] Never suspend the console + Disable suspending of consoles during suspend and + hibernate operations. Once disabled, debugging + messages can reach various consoles while the rest + of the system is being put to sleep (ie, while + debugging driver suspend/resume hooks). This may + not work reliably with all consoles, but is known + to work with serial and VGA consoles. + cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver Format: <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>] @@ -487,6 +487,13 @@ and is between 256 and 4096 characters. It is defined in the file [KNL] Reserve a chunk of physical memory to hold a kernel to switch to with kexec on panic. + crashkernel=range1:size1[,range2:size2,...][@offset] + [KNL] Same as above, but depends on the memory + in the running system. The syntax of range is + start-[end] where start and end are both + a memory unit (amount[KMG]). See also + Documentation/kdump/kdump.txt for a example. + cs4232= [HW,OSS] Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq> @@ -496,8 +503,6 @@ and is between 256 and 4096 characters. It is defined in the file cs89x0_media= [HW,NET] Format: { rj45 | aui | bnc } - cyclades= [HW,SERIAL] Cyclades multi-serial port adapter. - dasd= [HW,NET] See header of drivers/s390/block/dasd_devmap.c. @@ -555,10 +560,6 @@ and is between 256 and 4096 characters. It is defined in the file See drivers/char/README.epca and Documentation/digiepca.txt. - dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA - support available. - Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]] - dmasound= [HW,OSS] Sound subsystem buffers dscc4.setup= [NET] @@ -589,17 +590,10 @@ and is between 256 and 4096 characters. It is defined in the file 0: polling mode non-0: interrupt mode (default) - eda= [HW,PS2] - - edb= [HW,PS2] - edd= [EDD] Format: {"of[f]" | "sk[ipmbr]"} See comment in arch/i386/boot/edd.S - eicon= [HW,ISDN] - Format: <id>,<membase>,<irq> - eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. @@ -778,6 +772,23 @@ and is between 256 and 4096 characters. It is defined in the file inttest= [IA64] + intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option + off + Disable intel iommu driver. + igfx_off [Default Off] + By default, gfx is mapped as normal device. If a gfx + device has a dedicated DMAR unit, the DMAR unit is + bypassed by not enabling DMAR with this option. In + this case, gfx device will use physical address for + DMA. + forcedac [x86_64] + With this option iommu will not optimize to look + for io virtual address below 32 bit forcing dual + address cycle on pci bus for cards supporting greater + than 32 bit addressing. The default is to look + for translation below 32 bit and if not available + then look in the higher range. + io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. @@ -875,9 +886,6 @@ and is between 256 and 4096 characters. It is defined in the file lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in C2 power state. - lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip - Format: addr:<io>,irq:<irq> - libata.noacpi [LIBATA] Disables use of ACPI in libata suspend/resume when set. Format: <int> @@ -1125,9 +1133,6 @@ and is between 256 and 4096 characters. It is defined in the file noapic [SMP,APIC] Tells the kernel to not make use of any IOAPICs that may be present in the system. - noasync [HW,M68K] Disables async and sync negotiation for - all devices. - nobats [PPC] Do not use BATs for mapping kernel lowmem on "Classic" PPC cores. @@ -1439,6 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file Param: <number> - step/bucket size as a power of 2 for statistical time based profiling. Param: "sleep" - profile D-state sleeping (millisecs) + Param: "kvm" - profile VM exits. processor.max_cstate= [HW,ACPI] Limit processor to maximum C-state @@ -1565,9 +1571,6 @@ and is between 256 and 4096 characters. It is defined in the file sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. - sb= [HW,OSS] - Format: <io>,<irq>,<dma>,<dma2> - sbni= [NET] Granch SBNI12 leased line adapter sc1200wdt= [HW,WDT] SC1200 WDT (watchdog) driver @@ -1611,8 +1614,6 @@ and is between 256 and 4096 characters. It is defined in the file serialnumber [BUGS=X86-32] - sg_def_reserved_size= [SCSI] - shapers= [NET] Maximal number of shapers. @@ -2003,10 +2004,6 @@ and is between 256 and 4096 characters. It is defined in the file norandmaps Don't use address space randomization Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space - unwind_debug=N N > 0 will enable dwarf2 unwinder debugging - This is useful to get more information why - you got a "dwarf2 unwinder stuck" - ______________________________________________________________________ TODO: diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index c0b7a4556390..bac037eb1cda 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile @@ -1,28 +1,8 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. - -# For those people that have a separate object dir, look there for .config -KBUILD_OUTPUT := ../.. -ifdef O - ifeq ("$(origin O)", "command line") - KBUILD_OUTPUT := $(O) - endif -endif -# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary. -include $(KBUILD_OUTPUT)/.config -LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) - -CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds +CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include LDLIBS:=-lz -# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and -# not others (eg. FC7). -LDFLAGS+=-static -all: lguest.lds lguest -# The linker script on x86 is so complex the only way of creating one -# which will link our binary in the right place is to mangle the -# default one. -lguest.lds: - $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@ +all: lguest clean: - rm -f lguest.lds lguest + rm -f lguest diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 103e346c8b6a..5bdc37f81842 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,10 +1,7 @@ /*P:100 This is the Launcher code, a simple program which lays out the * "physical" memory for the new Guest by mapping the kernel image and the * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. - * - * The only trick: the Makefile links it at a high address so it will be clear - * of the guest memory region. It means that each Guest cannot have more than - * about 2.5G of memory on a normally configured Host. :*/ +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include <stdio.h> @@ -15,6 +12,7 @@ #include <stdlib.h> #include <elf.h> #include <sys/mman.h> +#include <sys/param.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> @@ -34,7 +32,9 @@ #include <termios.h> #include <getopt.h> #include <zlib.h> -/*L:110 We can ignore the 28 include files we need for this program, but I do +#include <assert.h> +#include <sched.h> +/*L:110 We can ignore the 30 include files we need for this program, but I do * want to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -45,8 +45,14 @@ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; -#include "../../include/linux/lguest_launcher.h" -#include "../../include/asm-x86/e820_32.h" +#include "linux/lguest_launcher.h" +#include "linux/pci_ids.h" +#include "linux/virtio_config.h" +#include "linux/virtio_net.h" +#include "linux/virtio_blk.h" +#include "linux/virtio_console.h" +#include "linux/virtio_ring.h" +#include "asm-x86/bootparam.h" /*:*/ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ @@ -55,6 +61,10 @@ typedef uint8_t u8; #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #endif +/* We can have up to 256 pages for devices. */ +#define DEVICE_PAGES 256 +/* This fits nicely in a single 4096-byte page. */ +#define VIRTQUEUE_NUM 127 /*L:120 verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */ @@ -65,8 +75,10 @@ static bool verbose; /* The pipe to send commands to the waker process */ static int waker_fd; -/* The top of guest physical memory. */ -static u32 top; +/* The pointer to the start of guest memory. */ +static void *guest_base; +/* The maximum guest physical address allowed, and maximum possible. */ +static unsigned long guest_limit, guest_max; /* This is our list of devices. */ struct device_list @@ -76,8 +88,17 @@ struct device_list fd_set infds; int max_infd; + /* Counter to assign interrupt numbers. */ + unsigned int next_irq; + + /* Counter to print out convenient device numbers. */ + unsigned int device_num; + /* The descriptor page for the devices. */ - struct lguest_device_desc *descs; + u8 *descpage; + + /* The tail of the last descriptor. */ + unsigned int desc_used; /* A single linked list of devices. */ struct device *dev; @@ -85,31 +106,111 @@ struct device_list struct device **lastdev; }; +/* The list of Guest devices, based on command line arguments. */ +static struct device_list devices; + /* The device structure describes a single device. */ struct device { /* The linked-list pointer. */ struct device *next; - /* The descriptor for this device, as mapped into the Guest. */ + + /* The this device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; - /* The memory page(s) of this device, if any. Also mapped in Guest. */ - void *mem; + + /* The name of this device, for --verbose. */ + const char *name; /* If handle_input is set, it wants to be called when this file * descriptor is ready. */ int fd; bool (*handle_input)(int fd, struct device *me); - /* If handle_output is set, it wants to be called when the Guest sends - * DMA to this key. */ - unsigned long watch_key; - u32 (*handle_output)(int fd, const struct iovec *iov, - unsigned int num, struct device *me); + /* Any queues attached to this device */ + struct virtqueue *vq; /* Device-specific data. */ void *priv; }; +/* The virtqueue structure describes a queue attached to a device. */ +struct virtqueue +{ + struct virtqueue *next; + + /* Which device owns me. */ + struct device *dev; + + /* The configuration for this queue. */ + struct lguest_vqconfig config; + + /* The actual ring of buffers. */ + struct vring vring; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* The routine to call when the Guest pings us. */ + void (*handle_output)(int fd, struct virtqueue *me); +}; + +/* Since guest is UP and we don't run at the same time, we don't need barriers. + * But I include them in the code in case others copy it. */ +#define wmb() + +/* Convert an iovec element to the given type. + * + * This is a fairly ugly trick: we need to know the size of the type and + * alignment requirement to check the pointer is kosher. It's also nice to + * have the name of the type in case we report failure. + * + * Typing those three things all the time is cumbersome and error prone, so we + * have a macro which sets them all up and passes to the real function. */ +#define convert(iov, type) \ + ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) + +static void *_convert(struct iovec *iov, size_t size, size_t align, + const char *name) +{ + if (iov->iov_len != size) + errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); + if ((unsigned long)iov->iov_base % align != 0) + errx(1, "Bad alignment %p for %s", iov->iov_base, name); + return iov->iov_base; +} + +/* The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. */ +#define cpu_to_le16(v16) (v16) +#define cpu_to_le32(v32) (v32) +#define cpu_to_le64(v64) (v64) +#define le16_to_cpu(v16) (v16) +#define le32_to_cpu(v32) (v32) +#define le64_to_cpu(v32) (v64) + +/*L:100 The Launcher code itself takes us out into userspace, that scary place + * where pointers run wild and free! Unfortunately, like most userspace + * programs, it's quite boring (which is why everyone likes to hack on the + * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it + * will get you through this section. Or, maybe not. + * + * The Launcher sets up a big chunk of memory to be the Guest's "physical" + * memory and stores it in "guest_base". In other words, Guest physical == + * Launcher virtual with an offset. + * + * This can be tough to get your head around, but usually it just means that we + * use these trivial conversion functions when the Guest gives us it's + * "physical" addresses: */ +static void *from_guest_phys(unsigned long addr) +{ + return guest_base + addr; +} + +static unsigned long to_guest_phys(const void *addr) +{ + return (addr - guest_base); +} + /*L:130 * Loading the Kernel. * @@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags) return fd; } -/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ -static void *map_zeroed_pages(unsigned long addr, unsigned int num) +/* map_zeroed_pages() takes a number of pages. */ +static void *map_zeroed_pages(unsigned int num) { - /* We cache the /dev/zero file-descriptor so we only open it once. */ - static int fd = -1; - - if (fd == -1) - fd = open_or_die("/dev/zero", O_RDONLY); + int fd = open_or_die("/dev/zero", O_RDONLY); + void *addr; /* We use a private mapping (ie. if we write to the page, it will be - * copied), and obviously we insist that it be mapped where we ask. */ - if (mmap((void *)addr, getpagesize() * num, - PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) - != (void *)addr) - err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); - - /* Returning the address is just a courtesy: can simplify callers. */ - return (void *)addr; + * copied). */ + addr = mmap(NULL, getpagesize() * num, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + err(1, "Mmaping %u pages of /dev/zero", num); + + return addr; } -/* To find out where to start we look for the magic Guest string, which marks - * the code we see in lguest_asm.S. This is a hack which we are currently - * plotting to replace with the normal Linux entry point. */ -static unsigned long entry_point(void *start, void *end, - unsigned long page_offset) +/* Get some more pages for a device. */ +static void *get_pages(unsigned int num) { - void *p; + void *addr = from_guest_phys(guest_limit); - /* The scan gives us the physical starting address. We want the - * virtual address in this case, and fortunately, we already figured - * out the physical-virtual difference and passed it here in - * "page_offset". */ - for (p = start; p < end; p++) - if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) - return (long)p + strlen("GenuineLguest") + page_offset; + guest_limit += num * getpagesize(); + if (guest_limit > guest_max) + errx(1, "Not enough memory for devices"); + return addr; +} - err(1, "Is this image a genuine lguest?"); +/* This routine is used to load the kernel or initrd. It tries mmap, but if + * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), + * it falls back to reading the memory in. */ +static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) +{ + ssize_t r; + + /* We map writable even though for some segments are marked read-only. + * The kernel really wants to be writable: it patches its own + * instructions. + * + * MAP_PRIVATE means that the page won't be copied until a write is + * done to it. This allows us to share untouched memory between + * Guests. */ + if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) + return; + + /* pread does a seek and a read in one shot: saves a few lines. */ + r = pread(fd, addr, len, offset); + if (r != len) + err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } /* This routine takes an open vmlinux image, which is in ELF, and maps it into @@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end, * by all modern binaries on Linux including the kernel. * * The ELF headers give *two* addresses: a physical address, and a virtual - * address. The Guest kernel expects to be placed in memory at the physical - * address, and the page tables set up so it will correspond to that virtual - * address. We return the difference between the virtual and physical - * addresses in the "page_offset" pointer. + * address. We use the physical address; the Guest will map itself to the + * virtual address. * * We return the starting address. */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, - unsigned long *page_offset) +static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { - void *addr; Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - unsigned long start = -1UL, end = 0; /* Sanity checks on the main ELF header: an x86 executable with a * reasonable number of correctly-sized program headers. */ @@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* We don't know page_offset yet. */ - *page_offset = 0; - /* Try all the headers: there are usually only three. A read-only one, * a read-write one, and a "note" section which isn't loadable. */ for (i = 0; i < ehdr->e_phnum; i++) { @@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, verbose("Section %i: size %i addr %p\n", i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - /* We expect a simple linear address space: every segment must - * have the same difference between virtual (p_vaddr) and - * physical (p_paddr) address. */ - if (!*page_offset) - *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; - else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) - errx(1, "Page offset of section %i different", i); - - /* We track the first and last address we mapped, so we can - * tell entry_point() where to scan. */ - if (phdr[i].p_paddr < start) - start = phdr[i].p_paddr; - if (phdr[i].p_paddr + phdr[i].p_filesz > end) - end = phdr[i].p_paddr + phdr[i].p_filesz; - - /* We map this section of the file at its physical address. We - * map it read & write even if the header says this segment is - * read-only. The kernel really wants to be writable: it - * patches its own instructions which would normally be - * read-only. - * - * MAP_PRIVATE means that the page won't be copied until a - * write is done to it. This allows us to share much of the - * kernel memory between Guests. */ - addr = mmap((void *)phdr[i].p_paddr, - phdr[i].p_filesz, - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_FIXED|MAP_PRIVATE, - elf_fd, phdr[i].p_offset); - if (addr != (void *)phdr[i].p_paddr) - err(1, "Mmaping vmlinux seg %i gave %p not %p", - i, addr, (void *)phdr[i].p_paddr); + /* We map this section of the file at its physical address. */ + map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), + phdr[i].p_offset, phdr[i].p_filesz); } - return entry_point((void *)start, (void *)end, *page_offset); + /* The entry point is given in the ELF header. */ + return ehdr->e_entry; } -/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. - * - * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects - * to be. We don't know what that option was, but we can figure it out - * approximately by looking at the addresses in the code. I chose the common - * case of reading a memory location into the %eax register: - * - * movl <some-address>, %eax - * - * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example, - * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax. - * - * In this example can guess that the kernel was compiled with - * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the - * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our - * kernel isn't that bloated yet. - * - * Unfortunately, x86 has variable-length instructions, so finding this - * particular instruction properly involves writing a disassembler. Instead, - * we rely on statistics. We look for "0xA1" and tally the different bytes - * which occur 4 bytes later (the "0xC0" in our example above). When one of - * those bytes appears three times, we can be reasonably confident that it - * forms the start of CONFIG_PAGE_OFFSET. +/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're + * supposed to jump into it and it will unpack itself. We used to have to + * perform some hairy magic because the unpacking code scared me. * - * This is amazingly reliable. */ -static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) + * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote + * a small patch to jump over the tricky bits in the Guest, so now we just read + * the funky header so we know where in the file to load, and away we go! */ +static unsigned long load_bzimage(int fd) { - unsigned int i, possibilities[256] = { 0 }; + struct boot_params boot; + int r; + /* Modern bzImages get loaded at 1M. */ + void *p = from_guest_phys(0x100000); - for (i = 0; i + 4 < len; i++) { - /* mov 0xXXXXXXXX,%eax */ - if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) - return (unsigned long)img[i+4] << 24; - } - errx(1, "could not determine page offset"); -} + /* Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/i386/boot.txt) */ + lseek(fd, 0, SEEK_SET); + read(fd, &boot, sizeof(boot)); -/*L:160 Unfortunately the entire ELF image isn't compressed: the segments - * which need loading are extracted and compressed raw. This denies us the - * information we need to make a fully-general loader. */ -static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) -{ - gzFile f; - int ret, len = 0; - /* A bzImage always gets loaded at physical address 1M. This is - * actually configurable as CONFIG_PHYSICAL_START, but as the comment - * there says, "Don't change this unless you know what you are doing". - * Indeed. */ - void *img = (void *)0x100000; - - /* gzdopen takes our file descriptor (carefully placed at the start of - * the GZIP header we found) and returns a gzFile. */ - f = gzdopen(fd, "rb"); - /* We read it into memory in 64k chunks until we hit the end. */ - while ((ret = gzread(f, img + len, 65536)) > 0) - len += ret; - if (ret < 0) - err(1, "reading image from bzImage"); - - verbose("Unpacked size %i addr %p\n", len, img); - - /* Without the ELF header, we can't tell virtual-physical gap. This is - * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately, - * I have a clever way of figuring it out from the code itself. */ - *page_offset = intuit_page_offset(img, len); - - return entry_point(img, img + len, *page_offset); -} + /* Inside the setup_hdr, we expect the magic "HdrS" */ + if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) + errx(1, "This doesn't look like a bzImage to me"); -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We can't do that - * because the Guest can't run the unpacking code, and adding features to - * lguest kills puppies, so we don't want to. - * - * The bzImage is formed by putting the decompressing code in front of the - * compressed kernel code. So we can simple scan through it looking for the - * first "gzip" header, and start decompressing from there. */ -static unsigned long load_bzimage(int fd, unsigned long *page_offset) -{ - unsigned char c; - int state = 0; - - /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */ - while (read(fd, &c, 1) == 1) { - switch (state) { - case 0: - if (c == 0x1F) - state++; - break; - case 1: - if (c == 0x8B) - state++; - else - state = 0; - break; - case 2 ... 8: - state++; - break; - case 9: - /* Seek back to the start of the gzip header. */ - lseek(fd, -10, SEEK_CUR); - /* One final check: "compressed under UNIX". */ - if (c != 0x03) - state = -1; - else - return unpack_bzimage(fd, page_offset); - } - } - errx(1, "Could not find kernel in bzImage"); + /* Skip over the extra sectors of the header. */ + lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); + + /* Now read everything into memory. in nice big chunks. */ + while ((r = read(fd, p, 65536)) > 0) + p += r; + + /* Finally, code32_start tells us where to enter the kernel. */ + return boot.hdr.code32_start; } /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With some funky * coding, we can load those, too. */ -static unsigned long load_kernel(int fd, unsigned long *page_offset) +static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset) /* If it's an ELF file, it starts with "\177ELF" */ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr, page_offset); + return map_elf(fd, &hdr); /* Otherwise we assume it's a bzImage, and try to unpack it */ - return load_bzimage(fd, page_offset); + return load_bzimage(fd); } /* This is a trivial little helper to align pages. Andi Kleen hated it because @@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem) int ifd; struct stat st; unsigned long len; - void *iaddr; ifd = open_or_die(name, O_RDONLY); /* fstat() is needed to get the file size. */ if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* The length needs to be rounded up to a page size: mmap needs the - * address to be page aligned. */ + /* We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. */ len = page_align(st.st_size); - /* We map the initrd at the top of memory. */ - iaddr = mmap((void *)mem - len, st.st_size, - PROT_READ|PROT_EXEC|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, ifd, 0); - if (iaddr != (void *)mem - len) - err(1, "Mmaping initrd '%s' returned %p not %p", - name, iaddr, (void *)mem - len); + map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); /* Once a file is mapped, you can close the file descriptor. It's a * little odd, but quite useful. */ close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); + verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); /* We return the initrd size. */ return len; } -/* Once we know how much memory we have, and the address the Guest kernel - * expects, we can construct simple linear page tables which will get the Guest - * far enough into the boot to create its own. +/* Once we know how much memory we have, we can construct simple linear page + * tables which set virtual == physical which will get the Guest far enough + * into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to * know its size). */ static unsigned long setup_pagetables(unsigned long mem, - unsigned long initrd_size, - unsigned long page_offset) + unsigned long initrd_size) { - u32 *pgdir, *linear; + unsigned long *pgdir, *linear; unsigned int mapped_pages, i, linear_pages; - unsigned int ptes_per_page = getpagesize()/sizeof(u32); + unsigned int ptes_per_page = getpagesize()/sizeof(void *); - /* Ideally we map all physical memory starting at page_offset. - * However, if page_offset is 0xC0000000 we can only map 1G of physical - * (0xC0000000 + 1G overflows). */ - if (mem <= -page_offset) - mapped_pages = mem/getpagesize(); - else - mapped_pages = -page_offset/getpagesize(); + mapped_pages = mem/getpagesize(); /* Each PTE page can map ptes_per_page pages: how many do we need? */ linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; /* We put the toplevel page directory page at the top of memory. */ - pgdir = (void *)mem - initrd_size - getpagesize(); + pgdir = from_guest_phys(mem) - initrd_size - getpagesize(); /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages*getpagesize(); @@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem, for (i = 0; i < mapped_pages; i++) linear[i] = ((i * getpagesize()) | PAGE_PRESENT); - /* The top level points to the linear page table pages above. The - * entry representing page_offset points to the first one, and they - * continue from there. */ + /* The top level points to the linear page table pages above. */ for (i = 0; i < mapped_pages; i += ptes_per_page) { - pgdir[(i + page_offset/getpagesize())/ptes_per_page] - = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); + pgdir[i/ptes_per_page] + = ((to_guest_phys(linear) + i*sizeof(void *)) + | PAGE_PRESENT); } - verbose("Linear mapping of %u pages in %u pte pages at %p\n", - mapped_pages, linear_pages, linear); + verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", + mapped_pages, linear_pages, to_guest_phys(linear)); /* We return the top level (guest-physical) address: the kernel needs * to know where it is. */ - return (unsigned long)pgdir; + return to_guest_phys(pgdir); } /* Simple routine to roll all the commandline arguments together with spaces @@ -498,14 +483,17 @@ static void concat(char *dst, char *args[]) /* This is where we actually tell the kernel to initialize the Guest. We saw * the arguments it expects when we looked at initialize() in lguest_user.c: - * the top physical page to allow, the top level pagetable, the entry point and - * the page_offset constant for the Guest. */ -static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) + * the base of guest "physical" memory, the top physical page to allow, the + * top level pagetable and the entry point for the Guest. */ +static int tell_kernel(unsigned long pgdir, unsigned long start) { - u32 args[] = { LHREQ_INITIALIZE, - top/getpagesize(), pgdir, start, page_offset }; + unsigned long args[] = { LHREQ_INITIALIZE, + (unsigned long)guest_base, + guest_limit / getpagesize(), pgdir, start }; int fd; + verbose("Guest: %p - %p (%#lx)\n", + guest_base, guest_base + guest_limit, guest_limit); fd = open_or_die("/dev/lguest", O_RDWR); if (write(fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); @@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) } /*:*/ -static void set_fd(int fd, struct device_list *devices) +static void add_device_fd(int fd) { - FD_SET(fd, &devices->infds); - if (fd > devices->max_infd) - devices->max_infd = fd; + FD_SET(fd, &devices.infds); + if (fd > devices.max_infd) + devices.max_infd = fd; } /*L:200 @@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices) * * This, of course, is merely a different *kind* of icky. */ -static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) +static void wake_parent(int pipefd, int lguest_fd) { /* Add the pipe from the Launcher to the fdset in the device_list, so * we watch it, too. */ - set_fd(pipefd, devices); + add_device_fd(pipefd); for (;;) { - fd_set rfds = devices->infds; - u32 args[] = { LHREQ_BREAK, 1 }; + fd_set rfds = devices.infds; + unsigned long args[] = { LHREQ_BREAK, 1 }; /* Wait until input is ready from one of the devices. */ - select(devices->max_infd+1, &rfds, NULL, NULL, NULL); + select(devices.max_infd+1, &rfds, NULL, NULL, NULL); /* Is it a message from the Launcher? */ if (FD_ISSET(pipefd, &rfds)) { - int ignorefd; + int fd; /* If read() returns 0, it means the Launcher has * exited. We silently follow. */ - if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) + if (read(pipefd, &fd, sizeof(fd)) == 0) exit(0); - /* Otherwise it's telling us there's a problem with one - * of the devices, and we should ignore that file - * descriptor from now on. */ - FD_CLR(ignorefd, &devices->infds); + /* Otherwise it's telling us to change what file + * descriptors we're to listen to. */ + if (fd >= 0) + FD_SET(fd, &devices.infds); + else + FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ write(lguest_fd, args, sizeof(args)); } } /* This routine just sets up a pipe to the Waker process. */ -static int setup_waker(int lguest_fd, struct device_list *device_list) +static int setup_waker(int lguest_fd) { int pipefd[2], child; @@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list) if (child == 0) { /* Close the "writing" end of our copy of the pipe */ close(pipefd[1]); - wake_parent(pipefd[0], lguest_fd, device_list); + wake_parent(pipefd[0], lguest_fd); } /* Close the reading end of our copy of the pipe. */ close(pipefd[0]); @@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size, { /* We have to separately check addr and addr+size, because size could * be huge and addr + size might wrap around. */ - if (addr >= top || addr + size >= top) - errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); + if (addr >= guest_limit || addr + size >= guest_limit) + errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); /* We return a pointer for the caller's convenience, now we know it's * safe to use. */ - return (void *)addr; + return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* The Guest has given us the address of a "struct lguest_dma". We check it's - * OK and convert it to an iovec (which is a simple array of ptr/size - * pairs). */ -static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) +/* This function returns the next descriptor in the chain, or vq->vring.num. */ +static unsigned next_desc(struct virtqueue *vq, unsigned int i) { - unsigned int i; - struct lguest_dma *udma; - - /* First we make sure that the array memory itself is valid. */ - udma = check_pointer(dma, sizeof(*udma)); - /* Now we check each element */ - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { - /* A zero length ends the array. */ - if (!udma->len[i]) - break; + unsigned int next; - iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); - iov[i].iov_len = udma->len[i]; - } - *num = i; + /* If this descriptor says it doesn't chain, we're done. */ + if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) + return vq->vring.num; + + /* Check they're not leading us off end of descriptors. */ + next = vq->vring.desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); - /* We return the pointer to where the caller should write the amount of - * the buffer used. */ - return &udma->used_len; + if (next >= vq->vring.num) + errx(1, "Desc next is %u", next); + + return next; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->vring.num (which + * is never a valid descriptor number) if none was found. */ +static unsigned get_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + unsigned int i, head; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) + errx(1, "Guest moved used index from %u to %u", + vq->last_avail_idx, vq->vring.avail->idx); + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->vring.avail->idx == vq->last_avail_idx) + return vq->vring.num; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) + errx(1, "Guest says index %u is available", head); + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; + iov[*out_num + *in_num].iov_base + = check_pointer(vq->vring.desc[i].addr, + vq->vring.desc[i].len); + /* If this is an input descriptor, increment that count. */ + if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) + (*in_num)++; + else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) + errx(1, "Descriptor has out after in"); + (*out_num)++; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (*out_num + *in_num > vq->vring.num) + errx(1, "Looped descriptor"); + } while ((i = next_desc(vq, i)) != vq->vring.num); + + return head; } -/* This routine gets a DMA buffer from the Guest for a given key, and converts - * it to an iovec array. It returns the interrupt the Guest wants when we're - * finished, and a pointer to the "used_len" field to fill in. */ -static u32 *get_dma_buffer(int fd, void *key, - struct iovec iov[], unsigned int *num, u32 *irq) +/* Once we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using trigger_irq(). */ +static void add_used(struct virtqueue *vq, unsigned int head, int len) { - u32 buf[] = { LHREQ_GETDMA, (u32)key }; - unsigned long udma; - u32 *res; - - /* Ask the kernel for a DMA buffer corresponding to this key. */ - udma = write(fd, buf, sizeof(buf)); - /* They haven't registered any, or they're all used? */ - if (udma == (unsigned long)-1) - return NULL; - - /* Convert it into our iovec array */ - res = dma2iov(udma, iov, num); - /* The kernel stashes irq in ->used_len to get it out to us. */ - *irq = *res; - /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */ - return res; + struct vring_used_elem *used; + + /* Get a pointer to the next entry in the used ring. */ + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; + used->id = head; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vq->vring.used->idx++; } -/* This is a convenient routine to send the Guest an interrupt. */ -static void trigger_irq(int fd, u32 irq) +/* This actually sends the interrupt for this virtqueue */ +static void trigger_irq(int fd, struct virtqueue *vq) { - u32 buf[] = { LHREQ_IRQ, irq }; + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + return; + + /* Send the Guest an interrupt tell them we used something up. */ if (write(fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", irq); + err(1, "Triggering irq %i", vq->config.irq); } -/* This simply sets up an iovec array where we can put data to be discarded. - * This happens when the Guest doesn't want or can't handle the input: we have - * to get rid of it somewhere, and if we bury it in the ceiling space it will - * start to smell after a week. */ -static void discard_iovec(struct iovec *iov, unsigned int *num) +/* And here's the combo meal deal. Supersize me! */ +static void add_used_and_trigger(int fd, struct virtqueue *vq, + unsigned int head, int len) { - static char discard_buf[1024]; - *num = 1; - iov->iov_base = discard_buf; - iov->iov_len = sizeof(discard_buf); + add_used(vq, head, len); + trigger_irq(fd, vq); } /* Here is the input terminal setting we save, and the routine to restore them @@ -701,38 +736,39 @@ struct console_abort /* This is the routine which handles console input (ie. stdin). */ static bool handle_console_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; int len; - unsigned int num; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + unsigned int head, in_num, out_num; + struct iovec iov[dev->vq->vring.num]; struct console_abort *abort = dev->priv; - /* First we get the console buffer from the Guest. The key is dev->mem - * which was set to 0 in setup_console(). */ - lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); - if (!lenp) { - /* If it's not ready for input, warn and set up to discard. */ - warn("console: no dma buffer!"); - discard_iovec(iov, &num); - } + /* First we need a console buffer from the Guests's input virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + + /* If they're not ready for input, stop listening to this file + * descriptor. We'll start again once they add an input buffer. */ + if (head == dev->vq->vring.num) + return false; + + if (out_num) + errx(1, "Output buffers in console in queue?"); /* This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. */ - len = readv(dev->fd, iov, num); + len = readv(dev->fd, iov, in_num); if (len <= 0) { /* This implies that the console is closed, is /dev/null, or - * something went terribly wrong. We still go through the rest - * of the logic, though, especially the exit handling below. */ + * something went terribly wrong. */ warnx("Failed to get console input, ignoring console."); - len = 0; + /* Put the input terminal back. */ + restore_term(); + /* Remove callback from input vq, so it doesn't restart us. */ + dev->vq->handle_output = NULL; + /* Stop listening to this fd: don't call us again. */ + return false; } - /* If we read the data into the Guest, fill in the length and send the - * interrupt. */ - if (lenp) { - *lenp = len; - trigger_irq(fd, irq); - } + /* Tell the Guest about the new input. */ + add_used_and_trigger(fd, dev->vq, head, len); /* Three ^C within one second? Exit. * @@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev) struct timeval now; gettimeofday(&now, NULL); if (now.tv_sec <= abort->start.tv_sec+1) { - u32 args[] = { LHREQ_BREAK, 0 }; + unsigned long args[] = { LHREQ_BREAK, 0 }; /* Close the fd so Waker will know it has to * exit. */ close(waker_fd); @@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev) /* Any other key resets the abort counter. */ abort->count = 0; - /* Now, if we didn't read anything, put the input terminal back and - * return failure (meaning, don't call us again). */ - if (!len) { - restore_term(); - return false; - } /* Everything went OK! */ return true; } -/* Handling console output is much simpler than input. */ -static u32 handle_console_output(int fd, const struct iovec *iov, - unsigned num, struct device*dev) +/* Handling output for console is simple: we just get all the output buffers + * and write them to stdout. */ +static void handle_console_output(int fd, struct virtqueue *vq) { - /* Whatever the Guest sends, write it to standard output. Return the - * number of bytes written. */ - return writev(STDOUT_FILENO, iov, num); -} - -/* Guest->Host network output is also pretty easy. */ -static u32 handle_tun_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) -{ - /* We put a flag in the "priv" pointer of the network device, and set - * it as soon as we see output. We'll see why in handle_tun_input() */ - *(bool *)dev->priv = true; - /* Whatever packet the Guest sent us, write it out to the tun - * device. */ - return writev(dev->fd, iov, num); + unsigned int head, out, in; + int len; + struct iovec iov[vq->vring.num]; + + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + len = writev(STDOUT_FILENO, iov, out); + add_used_and_trigger(fd, vq, head, len); + } } -/* This matches the peer_key() in lguest_net.c. The key for any given slot - * is the address of the network device's page plus 4 * the slot number. */ -static unsigned long peer_offset(unsigned int peernum) +/* Handling output for network is also simple: we get all the output buffers + * and write them (ignoring the first element) to this device's file descriptor + * (stdout). */ +static void handle_net_output(int fd, struct virtqueue *vq) { - return 4 * peernum; + unsigned int head, out, in; + int len; + struct iovec iov[vq->vring.num]; + + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + /* Check header, but otherwise ignore it (we said we supported + * no features). */ + (void)convert(&iov[0], struct virtio_net_hdr); + len = writev(vq->dev->fd, iov+1, out-1); + add_used_and_trigger(fd, vq, head, len); + } } -/* This is where we handle a packet coming in from the tun device */ +/* This is where we handle a packet coming in from the tun device to our + * Guest. */ static bool handle_tun_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; + unsigned int head, in_num, out_num; int len; - unsigned num; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + struct iovec iov[dev->vq->vring.num]; + struct virtio_net_hdr *hdr; - /* First we get a buffer the Guest has bound to its key. */ - lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, - &irq); - if (!lenp) { + /* First we need a network buffer from the Guests's recv virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) { /* Now, it's expected that if we try to send a packet too - * early, the Guest won't be ready yet. This is why we set a - * flag when the Guest sends its first packet. If it's sent a - * packet we assume it should be ready to receive them. - * - * Actually, this is what the status bits in the descriptor are - * for: we should *use* them. FIXME! */ - if (*(bool *)dev->priv) + * early, the Guest won't be ready yet. Wait until the device + * status says it's ready. */ + /* FIXME: Actually want DRIVER_ACTIVE here. */ + if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) warn("network: no dma buffer!"); - discard_iovec(iov, &num); - } + /* We'll turn this back on if input buffers are registered. */ + return false; + } else if (out_num) + errx(1, "Output buffers in network recv queue?"); + + /* First element is the header: we set it to 0 (no features). */ + hdr = convert(&iov[0], struct virtio_net_hdr); + hdr->flags = 0; + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; /* Read the packet from the device directly into the Guest's buffer. */ - len = readv(dev->fd, iov, num); + len = readv(dev->fd, iov+1, in_num-1); if (len <= 0) err(1, "reading network"); - /* Write the used_len, and trigger the interrupt for the Guest */ - if (lenp) { - *lenp = len; - trigger_irq(fd, irq); - } + /* Tell the Guest about the new packet. */ + add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); + verbose("tun input packet len %i [%02x %02x] (%s)\n", len, - ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], - lenp ? "sent" : "discarded"); + ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], + head != dev->vq->vring.num ? "sent" : "discarded"); + /* All good. */ return true; } -/* The last device handling routine is block output: the Guest has sent a DMA - * to the block device. It will have placed the command it wants in the - * "struct lguest_block_page". */ -static u32 handle_block_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) +/* This callback ensures we try again, in case we stopped console or net + * delivery because Guest didn't have any buffers. */ +static void enable_fd(int fd, struct virtqueue *vq) { - struct lguest_block_page *p = dev->mem; - u32 irq, *lenp; - unsigned int len, reply_num; - struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; - off64_t device_len, off = (off64_t)p->sector * 512; - - /* First we extract the device length from the dev->priv pointer. */ - device_len = *(off64_t *)dev->priv; - - /* We first check that the read or write is within the length of the - * block file. */ - if (off >= device_len) - err(1, "Bad offset %llu vs %llu", off, device_len); - /* Move to the right location in the block file. This shouldn't fail, - * but best to check. */ - if (lseek64(dev->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %i", p->sector); - - verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); - - /* They were supposed to bind a reply buffer at key equal to the start - * of the block device memory. We need this to tell them when the - * request is finished. */ - lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); - if (!lenp) - err(1, "Block request didn't give us a dma buffer"); - - if (p->type) { - /* A write request. The DMA they sent contained the data, so - * write it out. */ - len = writev(dev->fd, iov, num); - /* Grr... Now we know how long the "struct lguest_dma" they - * sent was, we make sure they didn't try to write over the end - * of the block file (possibly extending it). */ - if (off + len > device_len) { - /* Trim it back to the correct length */ - ftruncate64(dev->fd, device_len); - /* Die, bad Guest, die. */ - errx(1, "Write past end %llu+%u", off, len); - } - /* The reply length is 0: we just send back an empty DMA to - * interrupt them and tell them the write is finished. */ - *lenp = 0; - } else { - /* A read request. They sent an empty DMA to start the - * request, and we put the read contents into the reply - * buffer. */ - len = readv(dev->fd, reply, reply_num); - *lenp = len; - } - - /* The result is 1 (done), 2 if there was an error (short read or - * write). */ - p->result = 1 + (p->bytes != len); - /* Now tell them we've used their reply buffer. */ - trigger_irq(fd, irq); - - /* We're supposed to return the number of bytes of the output buffer we - * used. But the block device uses the "result" field instead, so we - * don't bother. */ - return 0; + add_device_fd(vq->dev->fd); + /* Tell waker to listen to it again */ + write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } -/* This is the generic routine we call when the Guest sends some DMA out. */ -static void handle_output(int fd, unsigned long dma, unsigned long key, - struct device_list *devices) +/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +static void handle_output(int fd, unsigned long addr) { struct device *i; - u32 *lenp; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; - unsigned num = 0; - - /* Convert the "struct lguest_dma" they're sending to a "struct - * iovec". */ - lenp = dma2iov(dma, iov, &num); - - /* Check each device: if they expect output to this key, tell them to - * handle it. */ - for (i = devices->dev; i; i = i->next) { - if (i->handle_output && key == i->watch_key) { - /* We write the result straight into the used_len field - * for them. */ - *lenp = i->handle_output(fd, iov, num, i); - return; + struct virtqueue *vq; + + /* Check each virtqueue. */ + for (i = devices.dev; i; i = i->next) { + for (vq = i->vq; vq; vq = vq->next) { + if (vq->config.pfn == addr/getpagesize() + && vq->handle_output) { + verbose("Output to %s\n", vq->dev->name); + vq->handle_output(fd, vq); + return; + } } } - /* This can happen: the kernel sends any SEND_DMA which doesn't match - * another Guest to us. It could be that another Guest just left a - * network, for example. But it's unusual. */ - warnx("Pending dma %p, key %p", (void *)dma, (void *)key); + /* Early console write is done using notify on a nul-terminated string + * in Guest memory. */ + if (addr >= guest_limit) + errx(1, "Bad NOTIFY %#lx", addr); + + write(STDOUT_FILENO, from_guest_phys(addr), + strnlen(from_guest_phys(addr), guest_limit - addr)); } /* This is called when the waker wakes us up: check for incoming file * descriptors. */ -static void handle_input(int fd, struct device_list *devices) +static void handle_input(int fd) { /* select() wants a zeroed timeval to mean "don't wait". */ struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; for (;;) { struct device *i; - fd_set fds = devices->infds; + fd_set fds = devices.infds; /* If nothing is ready, we're done. */ - if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) + if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) break; /* Otherwise, call the device(s) which have readable * file descriptors and a method of handling them. */ - for (i = devices->dev; i; i = i->next) { + for (i = devices.dev; i; i = i->next) { if (i->handle_input && FD_ISSET(i->fd, &fds)) { + int dev_fd; + if (i->handle_input(fd, i)) + continue; + /* If handle_input() returns false, it means we - * should no longer service it. - * handle_console_input() does this. */ - if (!i->handle_input(fd, i)) { - /* Clear it from the set of input file - * descriptors kept at the head of the - * device list. */ - FD_CLR(i->fd, &devices->infds); - /* Tell waker to ignore it too... */ - write(waker_fd, &i->fd, sizeof(i->fd)); - } + * should no longer service it. Networking and + * console do this when there's no input + * buffers to deliver into. Console also uses + * it when it discovers that stdin is + * closed. */ + FD_CLR(i->fd, &devices.infds); + /* Tell waker to ignore it too, by sending a + * negative fd number (-1, since 0 is a valid + * FD number). */ + dev_fd = -i->fd - 1; + write(waker_fd, &dev_fd, sizeof(dev_fd)); } } } @@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices) * routines to allocate them. * * This routine allocates a new "struct lguest_device_desc" from descriptor - * table in the devices array just above the Guest's normal memory. */ -static struct lguest_device_desc * -new_dev_desc(struct lguest_device_desc *descs, - u16 type, u16 features, u16 num_pages) + * table just above the Guest's normal memory. It returns a pointer to that + * descriptor. */ +static struct lguest_device_desc *new_dev_desc(u16 type) { - unsigned int i; + struct lguest_device_desc *d; - for (i = 0; i < LGUEST_MAX_DEVICES; i++) { - if (!descs[i].type) { - descs[i].type = type; - descs[i].features = features; - descs[i].num_pages = num_pages; - /* If they said the device needs memory, we allocate - * that now, bumping up the top of Guest memory. */ - if (num_pages) { - map_zeroed_pages(top, num_pages); - descs[i].pfn = top/getpagesize(); - top += num_pages*getpagesize(); - } - return &descs[i]; - } - } - errx(1, "too many devices"); + /* We only have one page for all the descriptors. */ + if (devices.desc_used + sizeof(*d) > getpagesize()) + errx(1, "Too many devices"); + + /* We don't need to set config_len or status: page is 0 already. */ + d = (void *)devices.descpage + devices.desc_used; + d->type = type; + devices.desc_used += sizeof(*d); + + return d; } -/* This monster routine does all the creation and setup of a new device, - * including caling new_dev_desc() to allocate the descriptor and device - * memory. */ -static struct device *new_device(struct device_list *devices, - u16 type, u16 num_pages, u16 features, - int fd, - bool (*handle_input)(int, struct device *), - unsigned long watch_off, - u32 (*handle_output)(int, - const struct iovec *, - unsigned, - struct device *)) +/* Each device descriptor is followed by some configuration information. + * The first byte is a "status" byte for the Guest to report what's happening. + * After that are fields: u8 type, u8 len, [... len bytes...]. + * + * This routine adds a new field to an existing device's descriptor. It only + * works for the last device, but that's OK because that's how we use it. */ +static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) +{ + /* This is the last descriptor, right? */ + assert(devices.descpage + devices.desc_used + == (u8 *)(dev->desc + 1) + dev->desc->config_len); + + /* We only have one page of device descriptions. */ + if (devices.desc_used + 2 + len > getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the new config header: type then length. */ + devices.descpage[devices.desc_used++] = type; + devices.descpage[devices.desc_used++] = len; + memcpy(devices.descpage + devices.desc_used, c, len); + devices.desc_used += len; + + /* Update the device descriptor length: two byte head then data. */ + dev->desc->config_len += 2 + len; +} + +/* This routine adds a virtqueue to a device. We specify how many descriptors + * the virtqueue is to have. */ +static void add_virtqueue(struct device *dev, unsigned int num_descs, + void (*handle_output)(int fd, struct virtqueue *me)) +{ + unsigned int pages; + struct virtqueue **i, *vq = malloc(sizeof(*vq)); + void *p; + + /* First we need some pages for this virtqueue. */ + pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize(); + p = get_pages(pages); + + /* Initialize the configuration. */ + vq->config.num = num_descs; + vq->config.irq = devices.next_irq++; + vq->config.pfn = to_guest_phys(p) / getpagesize(); + + /* Initialize the vring. */ + vring_init(&vq->vring, num_descs, p); + + /* Add the configuration information to this device's descriptor. */ + add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, + sizeof(vq->config), &vq->config); + + /* Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. */ + for (i = &dev->vq; *i; i = &(*i)->next); + *i = vq; + + /* Link virtqueue back to device. */ + vq->dev = dev; + + /* Set up handler. */ + vq->handle_output = handle_output; + if (!handle_output) + vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; +} + +/* This routine does all the creation and setup of a new device, including + * caling new_dev_desc() to allocate the descriptor and device memory. */ +static struct device *new_device(const char *name, u16 type, int fd, + bool (*handle_input)(int, struct device *)) { struct device *dev = malloc(sizeof(*dev)); @@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices, * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line * is eth0, the first block device /dev/lgba, etc. */ - *devices->lastdev = dev; + *devices.lastdev = dev; dev->next = NULL; - devices->lastdev = &dev->next; + devices.lastdev = &dev->next; /* Now we populate the fields one at a time. */ dev->fd = fd; /* If we have an input handler for this file descriptor, then we add it * to the device_list's fdset and maxfd. */ if (handle_input) - set_fd(dev->fd, devices); - dev->desc = new_dev_desc(devices->descs, type, features, num_pages); - dev->mem = (void *)(dev->desc->pfn * getpagesize()); + add_device_fd(dev->fd); + dev->desc = new_dev_desc(type); dev->handle_input = handle_input; - dev->watch_key = (unsigned long)dev->mem + watch_off; - dev->handle_output = handle_output; + dev->name = name; return dev; } /* Our first setup routine is the console. It's a fairly simple device, but * UNIX tty handling makes it uglier than it could be. */ -static void setup_console(struct device_list *devices) +static void setup_console(void) { struct device *dev; @@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices) atexit(restore_term); } - /* We don't currently require any memory for the console, so we ask for - * 0 pages. */ - dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, - STDIN_FILENO, handle_console_input, - LGUEST_CONSOLE_DMA_KEY, handle_console_output); + dev = new_device("console", VIRTIO_ID_CONSOLE, + STDIN_FILENO, handle_console_input); /* We store the console state in dev->priv, and initialize it. */ dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - verbose("device %p: console\n", - (void *)(dev->desc->pfn * getpagesize())); -} -/* Setting up a block file is also fairly straightforward. */ -static void setup_block_file(const char *filename, struct device_list *devices) -{ - int fd; - struct device *dev; - off64_t *device_len; - struct lguest_block_page *p; - - /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We - * open with O_DIRECT because otherwise our benchmarks go much too - * fast. */ - fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); - - /* We want one page, and have no input handler (the block file never - * has anything interesting to say to us). Our timing will be quite - * random, so it should be a reasonable randomness source. */ - dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, - LGUEST_DEVICE_F_RANDOMNESS, - fd, NULL, 0, handle_block_output); - - /* We store the device size in the private area */ - device_len = dev->priv = malloc(sizeof(*device_len)); - /* This is the safe way of establishing the size of our device: it - * might be a normal file or an actual block device like /dev/hdb. */ - *device_len = lseek64(fd, 0, SEEK_END); - - /* The device memory is a "struct lguest_block_page". It's zeroed - * already, we just need to put in the device size. Block devices - * think in sectors (ie. 512 byte chunks), so we translate here. */ - p = dev->mem; - p->num_sectors = *device_len/512; - verbose("device %p: block %i sectors\n", - (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); + /* The console needs two virtqueues: the input then the output. When + * they put something the input queue, we make sure we're listening to + * stdin. When they put something in the output queue, we write it to + * stdout. */ + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); + + verbose("device %u: console\n", devices.device_num++); } +/*:*/ -/* - * Network Devices. +/*M:010 Inter-guest networking is an interesting area. Simplest is to have a + * --sharenet=<name> option which opens or creates a named pipe. This can be + * used to send packets to another guest in a 1:1 manner. * - * Setting up network devices is quite a pain, because we have three types. - * First, we have the inter-Guest network. This is a file which is mapped into - * the address space of the Guests who are on the network. Because it is a - * shared mapping, the same page underlies all the devices, and they can send - * DMA to each other. + * More sopisticated is to use one of the tools developed for project like UML + * to do networking. * - * Remember from our network driver, the Guest is told what slot in the page it - * is to use. We use exclusive fnctl locks to reserve a slot. If another - * Guest is using a slot, the lock will fail and we try another. Because fnctl - * locks are cleaned up automatically when we die, this cleverly means that our - * reservation on the slot will vanish if we crash. */ -static unsigned int find_slot(int netfd, const char *filename) -{ - struct flock fl; - - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_len = 1; - /* Try a 1 byte lock in each possible position number */ - for (fl.l_start = 0; - fl.l_start < getpagesize()/sizeof(struct lguest_net); - fl.l_start++) { - /* If we succeed, return the slot number. */ - if (fcntl(netfd, F_SETLK, &fl) == 0) - return fl.l_start; - } - errx(1, "No free slots in network file %s", filename); -} - -/* This function sets up the network file */ -static void setup_net_file(const char *filename, - struct device_list *devices) -{ - int netfd; - struct device *dev; - - /* We don't use open_or_die() here: for friendliness we create the file - * if it doesn't already exist. */ - netfd = open(filename, O_RDWR, 0); - if (netfd < 0) { - if (errno == ENOENT) { - netfd = open(filename, O_RDWR|O_CREAT, 0600); - if (netfd >= 0) { - /* If we succeeded, initialize the file with a - * blank page. */ - char page[getpagesize()]; - memset(page, 0, sizeof(page)); - write(netfd, page, sizeof(page)); - } - } - if (netfd < 0) - err(1, "cannot open net file '%s'", filename); - } - - /* We need 1 page, and the features indicate the slot to use and that - * no checksum is needed. We never touch this device again; it's - * between the Guests on the network, so we don't register input or - * output handlers. */ - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, - find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, - -1, NULL, 0, NULL); - - /* Map the shared file. */ - if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) - err(1, "could not mmap '%s'", filename); - verbose("device %p: shared net %s, peer %i\n", - (void *)(dev->desc->pfn * getpagesize()), filename, - dev->desc->features & ~LGUEST_NET_F_NOCSUM); -} -/*:*/ + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be + * completely generic ("here's my vring, attach to your vring") and would work + * for any traffic. Of course, namespace and permissions issues need to be + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide + * multiple inter-guest channels behind one interface, although it would + * require some manner of hotplugging new virtio channels. + * + * Finally, we could implement a virtio network switch in the kernel. :*/ static u32 str2ip(const char *ipaddr) { @@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) /* This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer (in practice, the Host's slot in the network device's memory). */ + * pointer. */ static void configure_device(int fd, const char *devname, u32 ipaddr, unsigned char hwaddr[6]) { @@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr, memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); } -/*L:195 The other kind of network is a Host<->Guest network. This can either - * use briding or routing, but the principle is the same: it uses the "tun" - * device to inject packets into the Host as if they came in from a normal - * network card. We just shunt packets between the Guest and the tun - * device. */ -static void setup_tun_net(const char *arg, struct device_list *devices) +/*L:195 Our network is a Host<->Guest network. This can either use bridging or + * routing, but the principle is the same: it uses the "tun" device to inject + * packets into the Host as if they came in from a normal network card. We + * just shunt packets between the Guest and the tun device. */ +static void setup_tun_net(const char *arg) { struct device *dev; struct ifreq ifr; int netfd, ipfd; u32 ip; const char *br_name = NULL; + u8 hwaddr[6]; /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell @@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices) * device: trust us! */ ioctl(netfd, TUNSETNOCSUM, 1); - /* We create the net device with 1 page, using the features field of - * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and - * that the device has fairly random timing. We do *not* specify - * LGUEST_NET_F_NOCSUM: these packets can reach the real world. - * - * We will put our MAC address is slot 0 for the Guest to see, so - * it will send packets to us using the key "peer_offset(0)": */ - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, - NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, - handle_tun_input, peer_offset(0), handle_tun_output); + /* First we create a new network device. */ + dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); - /* We keep a flag which says whether we've seen packets come out from - * this network device. */ - dev->priv = malloc(sizeof(bool)); - *(bool *)dev->priv = false; + /* Network devices need a receive and a send queue, just like + * console. */ + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); /* We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ @@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices) } else /* It is an IP address to set up the device with */ ip = str2ip(arg); - /* We are peer 0, ie. first slot, so we hand dev->mem to this routine - * to write the MAC address at the start of the device memory. */ - configure_device(ipfd, ifr.ifr_name, ip, dev->mem); + /* Set up the tun device, and get the mac address for the interface. */ + configure_device(ipfd, ifr.ifr_name, ip, hwaddr); - /* Set "promisc" bit: we want every single packet if we're going to - * bridge to other machines (and otherwise it doesn't matter). */ - *((u8 *)dev->mem) |= 0x1; + /* Tell Guest what MAC address to use. */ + add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); + /* We don't seed the socket any more; setup is done. */ close(ipfd); - verbose("device %p: tun net %u.%u.%u.%u\n", - (void *)(dev->desc->pfn * getpagesize()), - (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); + verbose("device %u: tun net %u.%u.%u.%u\n", + devices.device_num++, + (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); if (br_name) verbose("attached to bridge: %s\n", br_name); } + + +/* + * Block device. + * + * Serving a block device is really easy: the Guest asks for a block number and + * we read or write that position in the file. + * + * Unfortunately, this is amazingly slow: the Guest waits until the read is + * finished before running anything else, even if it could be doing useful + * work. We could use async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. + * + * So we farm the I/O out to thread, and communicate with it via a pipe. */ + +/* This hangs off device->priv, with the data. */ +struct vblk_info +{ + /* The size of the file. */ + off64_t len; + + /* The file descriptor for the file. */ + int fd; + + /* IO thread listens on this file descriptor [0]. */ + int workpipe[2]; + + /* IO thread writes to this file descriptor to mark it done, then + * Launcher triggers interrupt to Guest. */ + int done_fd; +}; + +/* This is the core of the I/O thread. It returns true if it did something. */ +static bool service_io(struct device *dev) +{ + struct vblk_info *vblk = dev->priv; + unsigned int head, out_num, in_num, wlen; + int ret; + struct virtio_blk_inhdr *in; + struct virtio_blk_outhdr *out; + struct iovec iov[dev->vq->vring.num]; + off64_t off; + + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) + return false; + + if (out_num == 0 || in_num == 0) + errx(1, "Bad virtblk cmd %u out=%u in=%u", + head, out_num, in_num); + + out = convert(&iov[0], struct virtio_blk_outhdr); + in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); + off = out->sector * 512; + + /* This is how we implement barriers. Pretty poor, no? */ + if (out->type & VIRTIO_BLK_T_BARRIER) + fdatasync(vblk->fd); + + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { + fprintf(stderr, "Scsi commands unsupported\n"); + in->status = VIRTIO_BLK_S_UNSUPP; + wlen = sizeof(in); + } else if (out->type & VIRTIO_BLK_T_OUT) { + /* Write */ + + /* Move to the right location in the block file. This can fail + * if they try to write past end. */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = writev(vblk->fd, iov+1, out_num-1); + verbose("WRITE to sector %llu: %i\n", out->sector, ret); + + /* Grr... Now we know how long the descriptor they sent was, we + * make sure they didn't try to write over the end of the block + * file (possibly extending it). */ + if (ret > 0 && off + ret > vblk->len) { + /* Trim it back to the correct length */ + ftruncate64(vblk->fd, vblk->len); + /* Die, bad Guest, die. */ + errx(1, "Write past end %llu+%u", off, ret); + } + wlen = sizeof(in); + in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else { + /* Read */ + + /* Move to the right location in the block file. This can fail + * if they try to read past end. */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = readv(vblk->fd, iov+1, in_num-1); + verbose("READ from sector %llu: %i\n", out->sector, ret); + if (ret >= 0) { + wlen = sizeof(in) + ret; + in->status = VIRTIO_BLK_S_OK; + } else { + wlen = sizeof(in); + in->status = VIRTIO_BLK_S_IOERR; + } + } + + /* We can't trigger an IRQ, because we're not the Launcher. It does + * that when we tell it we're done. */ + add_used(dev->vq, head, wlen); + return true; +} + +/* This is the thread which actually services the I/O. */ +static int io_thread(void *_dev) +{ + struct device *dev = _dev; + struct vblk_info *vblk = dev->priv; + char c; + + /* Close other side of workpipe so we get 0 read when main dies. */ + close(vblk->workpipe[1]); + /* Close the other side of the done_fd pipe. */ + close(dev->fd); + + /* When this read fails, it means Launcher died, so we follow. */ + while (read(vblk->workpipe[0], &c, 1) == 1) { + /* We acknowledge each request immediately, to reduce latency, + * rather than waiting until we've done them all. I haven't + * measured to see if it makes any difference. */ + while (service_io(dev)) + write(vblk->done_fd, &c, 1); + } + return 0; +} + +/* When the thread says some I/O is done, we interrupt the Guest. */ +static bool handle_io_finish(int fd, struct device *dev) +{ + char c; + + /* If child died, presumably it printed message. */ + if (read(dev->fd, &c, 1) != 1) + exit(1); + + /* It did some work, so trigger the irq. */ + trigger_irq(fd, dev->vq); + return true; +} + +/* When the Guest submits some I/O, we wake the I/O thread. */ +static void handle_virtblk_output(int fd, struct virtqueue *vq) +{ + struct vblk_info *vblk = vq->dev->priv; + char c = 0; + + /* Wake up I/O thread and tell it to go to work! */ + if (write(vblk->workpipe[1], &c, 1) != 1) + /* Presumably it indicated why it died. */ + exit(1); +} + +/* This creates a virtual block device. */ +static void setup_block_file(const char *filename) +{ + int p[2]; + struct device *dev; + struct vblk_info *vblk; + void *stack; + u64 cap; + unsigned int val; + + /* This is the pipe the I/O thread will use to tell us I/O is done. */ + pipe(p); + + /* The device responds to return from I/O thread. */ + dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); + + /* The device has a virtqueue. */ + add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); + + /* Allocate the room for our own bookkeeping */ + vblk = dev->priv = malloc(sizeof(*vblk)); + + /* First we open the file and store the length. */ + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); + vblk->len = lseek64(vblk->fd, 0, SEEK_END); + + /* Tell Guest how many sectors this device has. */ + cap = cpu_to_le64(vblk->len / 512); + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); + + /* Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. */ + val = cpu_to_le32(VIRTQUEUE_NUM - 2); + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); + + /* The I/O thread writes to this end of the pipe when done. */ + vblk->done_fd = p[1]; + + /* This is how we tell the I/O thread about more work. */ + pipe(vblk->workpipe); + + /* Create stack for thread and run it */ + stack = malloc(32768); + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) + err(1, "Creating clone"); + + /* We don't need to keep the I/O thread's end of the pipes open. */ + close(vblk->done_fd); + close(vblk->workpipe[0]); + + verbose("device %u: virtblock %llu sectors\n", + devices.device_num, cap); +} /* That's the end of device setup. */ /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ -static void __attribute__((noreturn)) -run_guest(int lguest_fd, struct device_list *device_list) +static void __attribute__((noreturn)) run_guest(int lguest_fd) { for (;;) { - u32 args[] = { LHREQ_BREAK, 0 }; - unsigned long arr[2]; + unsigned long args[] = { LHREQ_BREAK, 0 }; + unsigned long notify_addr; int readval; /* We read from the /dev/lguest device to run the Guest. */ - readval = read(lguest_fd, arr, sizeof(arr)); - - /* The read can only really return sizeof(arr) (the Guest did a - * SEND_DMA to us), or an error. */ + readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr)); - /* For a successful read, arr[0] is the address of the "struct - * lguest_dma", and arr[1] is the key the Guest sent to. */ - if (readval == sizeof(arr)) { - handle_output(lguest_fd, arr[0], arr[1], device_list); + /* One unsigned long means the Guest did HCALL_NOTIFY */ + if (readval == sizeof(notify_addr)) { + verbose("Notify on address %#lx\n", notify_addr); + handle_output(lguest_fd, notify_addr); continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { @@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list) /* Service input, then unset the BREAK which releases * the Waker. */ - handle_input(lguest_fd, device_list); + handle_input(lguest_fd); if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Resetting break"); } @@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list) static struct option opts[] = { { "verbose", 0, NULL, 'v' }, - { "sharenet", 1, NULL, 's' }, { "tunnet", 1, NULL, 't' }, { "block", 1, NULL, 'b' }, { "initrd", 1, NULL, 'i' }, @@ -1374,37 +1516,21 @@ static struct option opts[] = { static void usage(void) { errx(1, "Usage: lguest [--verbose] " - "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" + "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" "|--block=<filename>|--initrd=<filename>]...\n" "<mem-in-mb> vmlinux [args...]"); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone like to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. - * - * The Launcher binary sits up high, usually starting at address 0xB8000000. - * Everything below this is the "physical" memory for the Guest. For example, - * if the Guest were to write a "1" at physical address 0, we would see a "1" - * in the Launcher at "(int *)0". Guest physical == Launcher virtual. - * - * This can be tough to get your head around, but usually it just means that we - * don't need to do any conversion when the Guest gives us it's "physical" - * addresses. - */ +/*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size - * of the (optional) initrd. */ - unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; + /* Memory, top-level pagetable, code startpoint and size of the + * (optional) initrd. */ + unsigned long mem = 0, pgdir, start, initrd_size = 0; /* A temporary and the /dev/lguest file descriptor. */ int i, c, lguest_fd; - /* The list of Guest devices, based on command line arguments. */ - struct device_list device_list; - /* The boot information for the Guest: at guest-physical address 0. */ - void *boot = (void *)0; + /* The boot information for the Guest. */ + struct boot_params *boot; /* If they specify an initrd file to load. */ const char *initrd_name = NULL; @@ -1412,11 +1538,12 @@ int main(int argc, char *argv[]) * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the * list. We also keep a pointer to the last device, for easy appending - * to the list. */ - device_list.max_infd = -1; - device_list.dev = NULL; - device_list.lastdev = &device_list.dev; - FD_ZERO(&device_list.infds); + * to the list. Finally, we keep the next interrupt number to hand out + * (1: remember that 0 is used by the timer). */ + FD_ZERO(&devices.infds); + devices.max_infd = -1; + devices.lastdev = &devices.dev; + devices.next_irq = 1; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command @@ -1424,9 +1551,16 @@ int main(int argc, char *argv[]) * of memory now. */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { - mem = top = atoi(argv[i]) * 1024 * 1024; - device_list.descs = map_zeroed_pages(top, 1); - top += getpagesize(); + mem = atoi(argv[i]) * 1024 * 1024; + /* We start by mapping anonymous pages over all of + * guest-physical memory range. This fills it with 0, + * and ensures that the Guest won't be killed when it + * tries to access it. */ + guest_base = map_zeroed_pages(mem / getpagesize() + + DEVICE_PAGES); + guest_limit = mem; + guest_max = mem + DEVICE_PAGES*getpagesize(); + devices.descpage = get_pages(1); break; } } @@ -1437,14 +1571,11 @@ int main(int argc, char *argv[]) case 'v': verbose = true; break; - case 's': - setup_net_file(optarg, &device_list); - break; case 't': - setup_tun_net(optarg, &device_list); + setup_tun_net(optarg); break; case 'b': - setup_block_file(optarg, &device_list); + setup_block_file(optarg); break; case 'i': initrd_name = optarg; @@ -1459,56 +1590,60 @@ int main(int argc, char *argv[]) if (optind + 2 > argc) usage(); - /* We always have a console device */ - setup_console(&device_list); + verbose("Guest base is at %p\n", guest_base); - /* We start by mapping anonymous pages over all of guest-physical - * memory range. This fills it with 0, and ensures that the Guest - * won't be killed when it tries to access it. */ - map_zeroed_pages(0, mem / getpagesize()); + /* We always have a console device */ + setup_console(); /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), - &page_offset); + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); + + /* Boot information is stashed at physical address 0 */ + boot = from_guest_phys(0); /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); /* These are the location in the Linux boot header where the * start and size of the initrd are expected to be found. */ - *(unsigned long *)(boot+0x218) = mem - initrd_size; - *(unsigned long *)(boot+0x21c) = initrd_size; + boot->hdr.ramdisk_image = mem - initrd_size; + boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ - *(unsigned char *)(boot+0x210) = 0xFF; + boot->hdr.type_of_loader = 0xFF; } /* Set up the initial linear pagetables, starting below the initrd. */ - pgdir = setup_pagetables(mem, initrd_size, page_offset); + pgdir = setup_pagetables(mem, initrd_size); /* The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ - *(char*)(boot+E820NR) = 1; - *((struct e820entry *)(boot+E820MAP)) - = ((struct e820entry) { 0, mem, E820_RAM }); + boot->e820_entries = 1; + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); /* The boot header contains a command line pointer: we put the command - * line after the boot header (at address 4096) */ - *(void **)(boot + 0x228) = boot + 4096; - concat(boot + 4096, argv+optind+2); + * line after the boot header. */ + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); + concat((char *)(boot + 1), argv+optind+2); + + /* Boot protocol version: 2.07 supports the fields for lguest. */ + boot->hdr.version = 0x207; + + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = 1; - /* The guest type value of "1" tells the Guest it's under lguest. */ - *(int *)(boot + 0x23c) = 1; + /* Tell the entry path not to try to reload segment registers. */ + boot->hdr.loadflags |= KEEP_SEGMENTS; /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(pgdir, start, page_offset); + lguest_fd = tell_kernel(pgdir, start); /* We fork off a child process, which wakes the Launcher whenever one * of the input file descriptors needs attention. Otherwise we would * run the Guest until it tries to output something. */ - waker_fd = setup_waker(lguest_fd, &device_list); + waker_fd = setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ - run_guest(lguest_fd, &device_list); + run_guest(lguest_fd); } /*:*/ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 821617bd6c04..7885ab2d5f53 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for Linux developers and users to experiment with virtualization with the minimum of complexity. Nonetheless, it should have sufficient features to make it useful for specific tasks, and, of course, you are -encouraged to fork and enhance it. +encouraged to fork and enhance it (see drivers/lguest/README). Features: @@ -23,19 +23,30 @@ Developer features: Running Lguest: -- Lguest runs the same kernel as guest and host. You can configure - them differently, but usually it's easiest not to. +- The easiest way to run lguest is to use same kernel as guest and host. + You can configure them differently, but usually it's easiest not to. You will need to configure your kernel with the following options: - CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] - CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") - CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") - CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") - CONFIG_LGUEST=y/m ("Linux hypervisor example code") - - and I recommend: - CONFIG_HZ=100 ("Timer frequency")[2] + "General setup": + "Prompt for development and/or incomplete code/drivers" = Y + (CONFIG_EXPERIMENTAL=y) + + "Processor type and features": + "Paravirtualized guest support" = Y + "Lguest guest support" = Y + "High Memory Support" = off/4GB + "Alignment value to which kernel should be aligned" = 0x100000 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and + CONFIG_PHYSICAL_ALIGN=0x100000) + + "Device Drivers": + "Network device support" + "Universal TUN/TAP device driver support" = M/Y + (CONFIG_TUN=m) + "Virtualization" + "Linux hypervisor example code" = M/Y + (CONFIG_LGUEST=m) - A tool called "lguest" is available in this directory: type "make" to build it. If you didn't build your kernel in-tree, use "make @@ -51,14 +62,17 @@ Running Lguest: dd if=/dev/zero of=rootfile bs=1M count=2048 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d + Make sure that you install a getty on /dev/hvc0 if you want to log in on the + console! + - "modprobe lg" if you built it as a module. - Run an lguest as root: - Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba + Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda Explanation: - 64m: the amount of memory to use. + 64: the amount of memory to use, in MB. vmlinux: the kernel image found in the top of your build directory. You can also use a standard bzImage. @@ -66,10 +80,10 @@ Running Lguest: --tunnet=192.168.19.1: configures a "tap" device for networking with this IP address. - --block=rootfile: a file or block device which becomes /dev/lgba + --block=rootfile: a file or block device which becomes /dev/vda inside the guest. - root=/dev/lgba: this (and anything else on the command line) are + root=/dev/vda: this (and anything else on the command line) are kernel boot parameters. - Configuring networking. I usually have the host masquerade, using @@ -99,31 +113,7 @@ Running Lguest: "--sharenet=<filename>": any two guests using the same file are on the same network. This file is created if it does not exist. -Lguest I/O model: - -Lguest uses a simplified DMA model plus shared memory for I/O. Guests -can communicate with each other if they share underlying memory -(usually by the lguest program mmaping the same file), but they can -use any non-shared memory to communicate with the lguest process. - -Guests can register DMA buffers at any key (must be a valid physical -address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) -hypercall. "dmabufs" is the physical address of an array of "num" -"struct lguest_dma": each contains a used_len, and an array of -physical addresses and lengths. When a transfer occurs, the -"used_len" field of one of the buffers which has used_len 0 will be -set to the length transferred and the irq will fire. +There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest -Using an irq value of 0 unbinds the dma buffers. - -To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, -and the bytes used is written to the used_len field. This can be 0 if -noone else has bound a DMA buffer to that key or some other error. -DMA buffers bound by the same guest are ignored. - -Cheers! +Good luck! Rusty Russell rusty@rustcorp.com.au. - -[1] These are on various places on the TODO list, waiting for you to - get annoyed enough at the limitation to fix it. -[2] Lguest is not yet tickless when idle. See [1]. diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index 8a523f6af48a..248589e8bcf5 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt @@ -890,10 +890,7 @@ Syntax: nosync:0 5.5.2) noasync -------------- -Syntax: noasync:0 - - Disables async and sync negotiation for all devices. Any value - after the colon is acceptable (and has the same effect). +[OBSOLETE, REMOVED] 5.5.3) nodisconnect ------------------- diff --git a/Documentation/markers.txt b/Documentation/markers.txt new file mode 100644 index 000000000000..295a71bc301e --- /dev/null +++ b/Documentation/markers.txt @@ -0,0 +1,81 @@ + Using the Linux Kernel Markers + + Mathieu Desnoyers + + +This document introduces Linux Kernel Markers and their use. It provides +examples of how to insert markers in the kernel and connect probe functions to +them and provides some examples of probe functions. + + +* Purpose of markers + +A marker placed in code provides a hook to call a function (probe) that you can +provide at runtime. A marker can be "on" (a probe is connected to it) or "off" +(no probe is attached). When a marker is "off" it has no effect, except for +adding a tiny time penalty (checking a condition for a branch) and space +penalty (adding a few bytes for the function call at the end of the +instrumented function and adds a data structure in a separate section). When a +marker is "on", the function you provide is called each time the marker is +executed, in the execution context of the caller. When the function provided +ends its execution, it returns to the caller (continuing from the marker site). + +You can put markers at important locations in the code. Markers are +lightweight hooks that can pass an arbitrary number of parameters, +described in a printk-like format string, to the attached probe function. + +They can be used for tracing and performance accounting. + + +* Usage + +In order to use the macro trace_mark, you should include linux/marker.h. + +#include <linux/marker.h> + +And, + +trace_mark(subsystem_event, "%d %s", someint, somestring); +Where : +- subsystem_event is an identifier unique to your event + - subsystem is the name of your subsystem. + - event is the name of the event to mark. +- "%d %s" is the formatted string for the serializer. +- someint is an integer. +- somestring is a char pointer. + +Connecting a function (probe) to a marker is done by providing a probe (function +to call) for the specific marker through marker_probe_register() and can be +activated by calling marker_arm(). Marker deactivation can be done by calling +marker_disarm() as many times as marker_arm() has been called. Removing a probe +is done through marker_probe_unregister(); it will disarm the probe and make +sure there is no caller left using the probe when it returns. Probe removal is +preempt-safe because preemption is disabled around the probe call. See the +"Probe example" section below for a sample probe module. + +The marker mechanism supports inserting multiple instances of the same marker. +Markers can be put in inline functions, inlined static functions, and +unrolled loops as well as regular functions. + +The naming scheme "subsystem_event" is suggested here as a convention intended +to limit collisions. Marker names are global to the kernel: they are considered +as being the same whether they are in the core kernel image or in modules. +Conflicting format strings for markers with the same name will cause the markers +to be detected to have a different format string not to be armed and will output +a printk warning which identifies the inconsistency: + +"Format mismatch for probe probe_name (format), marker (format)" + + +* Probe / marker example + +See the example provided in samples/markers/src + +Compile them with your kernel. + +Run, as root : +modprobe marker-example (insmod order is not important) +modprobe probe-example +cat /proc/marker-example (returns an expected error) +rmmod marker-example probe-example +dmesg diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 650657c54733..4e17beba2379 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1479,7 +1479,8 @@ kernel. Any atomic operation that modifies some state in memory and returns information about the state (old or new) implies an SMP-conditional general memory barrier -(smp_mb()) on each side of the actual operation. These include: +(smp_mb()) on each side of the actual operation (with the exception of +explicit lock operations, described later). These include: xchg(); cmpxchg(); @@ -1536,10 +1537,19 @@ If they're used for constructing a lock of some description, then they probably do need memory barriers as a lock primitive generally has to do things in a specific order. - Basically, each usage case has to be carefully considered as to whether memory barriers are needed or not. +The following operations are special locking primitives: + + test_and_set_bit_lock(); + clear_bit_unlock(); + __clear_bit_unlock(); + +These implement LOCK-class and UNLOCK-class operations. These should be used in +preference to other operations when implementing locking primitives, because +their implementations can be optimised on many architectures. + [!] Note that special memory barrier primitives are available for these situations because on some CPUs the atomic instructions used imply full memory barriers, and so barrier instructions are superfluous in conjunction with them, diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 5fbcc22c98e9..168117bd6ee8 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -2,7 +2,8 @@ Memory Hotplug ============== -Last Updated: Jul 28 2007 +Created: Jul 28 2007 +Add description of notifier of memory hotplug Oct 11 2007 This document is about memory hotplug including how-to-use and current status. Because Memory Hotplug is still under development, contents of this text will @@ -24,7 +25,8 @@ be changed often. 6.1 Memory offline and ZONE_MOVABLE 6.2. How to offline memory 7. Physical memory remove -8. Future Work List +8. Memory hotplug event notifier +9. Future Work List Note(1): x86_64's has special implementation for memory hotplug. This text does not describe it. @@ -307,8 +309,58 @@ Need more implementation yet.... - Notification completion of remove works by OS to firmware. - Guard from remove if not yet. +-------------------------------- +8. Memory hotplug event notifier +-------------------------------- +Memory hotplug has event notifer. There are 6 types of notification. + +MEMORY_GOING_ONLINE + Generated before new memory becomes available in order to be able to + prepare subsystems to handle memory. The page allocator is still unable + to allocate from the new memory. + +MEMORY_CANCEL_ONLINE + Generated if MEMORY_GOING_ONLINE fails. + +MEMORY_ONLINE + Generated when memory has succesfully brought online. The callback may + allocate pages from the new memory. + +MEMORY_GOING_OFFLINE + Generated to begin the process of offlining memory. Allocations are no + longer possible from the memory but some of the memory to be offlined + is still in use. The callback can be used to free memory known to a + subsystem from the indicated memory section. + +MEMORY_CANCEL_OFFLINE + Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from + the section that we attempted to offline. + +MEMORY_OFFLINE + Generated after offlining memory is complete. + +A callback routine can be registered by + hotplug_memory_notifier(callback_func, priority) + +The second argument of callback function (action) is event types of above. +The third argument is passed by pointer of struct memory_notify. + +struct memory_notify { + unsigned long start_pfn; + unsigned long nr_pages; + int status_cahnge_nid; +} + +start_pfn is start_pfn of online/offline memory. +nr_pages is # of pages of online/offline memory. +status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be) +set/clear. It means a new(memoryless) node gets new memory by online and a +node loses all memory. If this is -1, then nodemask status is not changed. +If status_changed_nid >= 0, callback should create/discard structures for the +node if necessary. + -------------- -8. Future Work +9. Future Work -------------- - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. diff --git a/Documentation/mips/00-INDEX b/Documentation/mips/00-INDEX index 9df8a2eac7b4..3f13bf8043d2 100644 --- a/Documentation/mips/00-INDEX +++ b/Documentation/mips/00-INDEX @@ -4,5 +4,3 @@ AU1xxx_IDE.README - README for MIPS AU1XXX IDE driver. GT64120.README - README for dir with info on MIPS boards using GT-64120 or GT-64120A. -time.README - - README for MIPS time services. diff --git a/Documentation/mips/AU1xxx_IDE.README b/Documentation/mips/AU1xxx_IDE.README index afb31c141d9d..5c8334123f4f 100644 --- a/Documentation/mips/AU1xxx_IDE.README +++ b/Documentation/mips/AU1xxx_IDE.README @@ -59,7 +59,7 @@ Four configs variables are introduced: CONFIG_BLK_DEV_IDE_AU1XXX_PIO_DBDMA - enable the PIO+DBDMA mode CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA - enable the MWDMA mode CONFIG_BLK_DEV_IDE_AU1XXX_BURSTABLE_ON - set Burstable FIFO in DBDMA - controler + controller CONFIG_BLK_DEV_IDE_AU1XXX_SEQTS_PER_RQ - maximum transfer size per descriptor diff --git a/Documentation/mips/time.README b/Documentation/mips/time.README deleted file mode 100644 index a4ce603ed3b3..000000000000 --- a/Documentation/mips/time.README +++ /dev/null @@ -1,173 +0,0 @@ -README for MIPS time services - -Jun Sun -jsun@mvista.com or jsun@junsun.net - - -ABOUT ------ -This file describes the new arch/mips/kernel/time.c, related files and the -services they provide. - -If you are short in patience and just want to know how to use time.c for a -new board or convert an existing board, go to the last section. - - -FILES, COMPATABILITY AND CONFIGS ---------------------------------- - -The old arch/mips/kernel/time.c is renamed to old-time.c. - -A new time.c is put there, together with include/asm-mips/time.h. - -Two configs variables are introduced, CONFIG_OLD_TIME_C and CONFIG_NEW_TIME_C. -So we allow boards using - - 1) old time.c (CONFIG_OLD_TIME_C) - 2) new time.c (CONFIG_NEW_TIME_C) - 3) neither (their own private time.c) - -However, it is expected every board will move to the new time.c in the near -future. - - -WHAT THE NEW CODE PROVIDES? ---------------------------- - -The new time code provide the following services: - - a) Implements functions required by Linux common code: - time_init - - b) provides an abstraction of RTC and null RTC implementation as default. - extern unsigned long (*rtc_get_time)(void); - extern int (*rtc_set_time)(unsigned long); - - c) high-level and low-level timer interrupt routines where the timer - interrupt source may or may not be the CPU timer. The high-level - routine is dispatched through do_IRQ() while the low-level is - dispatched in assemably code (usually int-handler.S) - - -WHAT THE NEW CODE REQUIRES? ---------------------------- - -For the new code to work properly, each board implementation needs to supply -the following functions or values: - - a) board_time_init - a function pointer. Invoked at the beginnig of - time_init(). It is optional. - 1. (optional) set up RTC routines - 2. (optional) calibrate and set the mips_hpt_frequency - - b) plat_timer_setup - a function pointer. Invoked at the end of time_init() - 1. (optional) over-ride any decisions made in time_init() - 2. set up the irqaction for timer interrupt. - 3. enable the timer interrupt - - c) (optional) board-specific RTC routines. - - d) (optional) mips_hpt_frequency - It must be definied if the board - is using CPU counter for timer interrupt. - - -PORTING GUIDE -------------- - -Step 1: decide how you like to implement the time services. - - a) does this board have a RTC? If yes, implement the two RTC funcs. - - b) does the CPU have counter/compare registers? - - If the answer is no, you need a timer to provide the timer interrupt - at 100 HZ speed. - - c) The following sub steps assume your CPU has counter register. - Do you plan to use the CPU counter register as the timer interrupt - or use an exnternal timer? - - In order to use CPU counter register as the timer interrupt source, you - must know the counter speed (mips_hpt_frequency). It is usually the - same as the CPU speed or an integral divisor of it. - - d) decide on whether you want to use high-level or low-level timer - interrupt routines. The low-level one is presumably faster, but should - not make too mcuh difference. - - -Step 2: the machine setup() function - - If you supply board_time_init(), set the function poointer. - - -Step 3: implement rtc routines, board_time_init() and plat_timer_setup() - if needed. - - board_time_init() - - a) (optional) set up RTC routines, - b) (optional) calibrate and set the mips_hpt_frequency - (only needed if you intended to use cpu counter as timer interrupt - source) - - plat_timer_setup() - - a) (optional) over-write any choices made above by time_init(). - b) machine specific code should setup the timer irqaction. - c) enable the timer interrupt - - - If the RTC chip is a common chip, I suggest the routines are put under - arch/mips/libs. For example, for DS1386 chip, one would create - rtc-ds1386.c under arch/mips/lib directory. Add the following line to - the arch/mips/lib/Makefile: - - obj-$(CONFIG_DDB5476) += rtc-ds1386.o - -Step 4: if you are using low-level timer interrupt, change your interrupt - dispathcing code to check for timer interrupt and jump to - ll_timer_interrupt() directly if one is detected. - -Step 5: Modify arch/mips/config.in and add CONFIG_NEW_TIME_C to your machine. - Modify the appropriate defconfig if applicable. - -Final notes: - -For some tricky cases, you may need to add your own wrapper functions -for some of the functions in time.c. - -For example, you may define your own timer interrupt routine, which does -some of its own processing and then calls timer_interrupt(). - -You can also over-ride any of the built-in functions (RTC routines -and/or timer interrupt routine). - - -PORTING NOTES FOR SMP ----------------------- - -If you have a SMP box, things are slightly more complicated. - -The time service running every jiffy is logically divided into two parts: - - 1) the one for the whole system (defined in timer_interrupt()) - 2) the one that should run for each CPU (defined in local_timer_interrupt()) - -You need to decide on your timer interrupt sources. - - case 1) - whole system has only one timer interrupt delivered to one CPU - - In this case, you set up timer interrupt as in UP systems. In addtion, - you need to set emulate_local_timer_interrupt to 1 so that other - CPUs get to call local_timer_interrupt(). - - THIS IS CURRENTLY NOT IMPLEMNETED. However, it is rather easy to write - one should such a need arise. You simply make a IPI call. - - case 2) - each CPU has a separate timer interrupt - - In this case, you need to set up IRQ such that each of them will - call local_timer_interrupt(). In addition, you need to arrange - one and only one of them to call timer_interrupt(). - - You can also do the low-level version of those interrupt routines, - following similar dispatching routes described above. diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index 51f935191ae5..aa60d1f627e5 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt @@ -133,4 +133,6 @@ the APIs of 'struct mutex' have been streamlined: int mutex_trylock(struct mutex *lock); void mutex_unlock(struct mutex *lock); int mutex_is_locked(struct mutex *lock); - + void mutex_lock_nested(struct mutex *lock, unsigned int subclass); + int mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass); diff --git a/Documentation/networking/bcm43xx.txt b/Documentation/networking/bcm43xx.txt index a136721499bf..d602c8d6ff3e 100644 --- a/Documentation/networking/bcm43xx.txt +++ b/Documentation/networking/bcm43xx.txt @@ -37,7 +37,7 @@ all, distributions. There is, however, additional software that is required. The firmware used by the chip is the intellectual property of Broadcom and they have not given the bcm43xx team redistribution rights to this firmware. Since we cannot legally redistribute -the firwmare we cannot include it with the driver. Furthermore, it +the firmware we cannot include it with the driver. Furthermore, it cannot be placed in the downloadable archives of any distributing organization; therefore, the user is responsible for obtaining the firmware and placing it in the appropriate location so that the driver diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6ae2feff3087..747a5d15d529 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -293,7 +293,7 @@ tcp_no_metrics_save - BOOLEAN when the connection closes, so that connections established in the near future can use these to set initial conditions. Usually, this increases overall performance, but may sometimes cause performance - degredation. If set, TCP will not cache metrics on closing + degradation. If set, TCP will not cache metrics on closing connections. tcp_orphan_retries - INTEGER diff --git a/Documentation/networking/rxrpc.txt b/Documentation/networking/rxrpc.txt index c36b64b0020f..c3669a3fb4af 100644 --- a/Documentation/networking/rxrpc.txt +++ b/Documentation/networking/rxrpc.txt @@ -689,7 +689,7 @@ such as the AFS filesystem. This permits such a utility to: buffers manipulated directly. To use the RxRPC facility, a kernel utility must still open an AF_RXRPC socket, -bind an addess as appropriate and listen if it's to be a server socket, but +bind an address as appropriate and listen if it's to be a server socket, but then it passes this to the kernel interface functions. The kernel interface functions are as follows: diff --git a/Documentation/networking/udplite.txt b/Documentation/networking/udplite.txt index 6be09ba24a36..b6409cab075c 100644 --- a/Documentation/networking/udplite.txt +++ b/Documentation/networking/udplite.txt @@ -12,7 +12,7 @@ For in-depth information, you can consult: o The UDP-Lite Homepage: http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/ - Fom here you can also download some example application source code. + From here you can also download some example application source code. o The UDP-Lite HOWTO on http://www.erg.abdn.ac.uk/users/gerrit/udp-lite/files/UDP-Lite-HOWTO.txt @@ -223,7 +223,7 @@ While it is important that such cases are dealt with correctly, they are (annoyingly) rare: UDP-Lite is designed for optimising multimedia performance over wireless (or generally noisy) links and thus smaller - coverage lenghts are likely to be expected. + coverage lengths are likely to be expected. V) UDP-LITE RUNTIME STATISTICS AND THEIR MEANING @@ -259,7 +259,7 @@ VI) IPTABLES There is packet match support for UDP-Lite as well as support for the LOG target. - If you copy and paste the following line into /etc/protcols, + If you copy and paste the following line into /etc/protocols, udplite 136 UDP-Lite # UDP-Lite [RFC 3828] diff --git a/Documentation/parport-lowlevel.txt b/Documentation/parport-lowlevel.txt index 8f2302415eff..265fcdcb8e5f 100644 --- a/Documentation/parport-lowlevel.txt +++ b/Documentation/parport-lowlevel.txt @@ -25,7 +25,6 @@ Global functions: parport_open parport_close parport_device_id - parport_device_num parport_device_coords parport_find_class parport_find_device @@ -735,7 +734,7 @@ NULL is returned. SEE ALSO -parport_register_device, parport_device_num +parport_register_device parport_close - unregister device for particular device number ------------- @@ -787,29 +786,7 @@ Many devices have ill-formed IEEE 1284 Device IDs. SEE ALSO -parport_find_class, parport_find_device, parport_device_num - -parport_device_num - convert device coordinates to device number ------------------- - -SYNOPSIS - -#include <linux/parport.h> - -int parport_device_num (int parport, int mux, int daisy); - -DESCRIPTION - -Convert between device coordinates (port, multiplexor, daisy chain -address) and device number (zero-based). - -RETURN VALUE - -Device number, or -1 if no device at given coordinates. - -SEE ALSO - -parport_device_coords, parport_open, parport_device_id +parport_find_class, parport_find_device parport_device_coords - convert device number to device coordinates ------------------ @@ -833,7 +810,7 @@ Zero on success, in which case the coordinates are (*parport, *mux, SEE ALSO -parport_device_num, parport_open, parport_device_id +parport_open, parport_device_id parport_find_class - find a device by its class ------------------ diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt index 1a85e2b964dc..57aef2f6e0de 100644 --- a/Documentation/power/basic-pm-debugging.txt +++ b/Documentation/power/basic-pm-debugging.txt @@ -78,8 +78,8 @@ c) Advanced debugging In case the STD does not work on your system even in the minimal configuration and compiling more drivers as modules is not practical or some modules cannot be unloaded, you can use one of the more advanced debugging techniques to find -the problem. First, if there is a serial port in your box, you can set the -CONFIG_DISABLE_CONSOLE_SUSPEND kernel configuration option and try to log kernel +the problem. First, if there is a serial port in your box, you can boot the +kernel with the 'no_console_suspend' parameter and try to log kernel messages using the serial console. This may provide you with some information about the reasons of the suspend (resume) failure. Alternatively, it may be possible to use a FireWire port for debugging with firescope diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt index 04dc1cf9d215..38b57248fd61 100644 --- a/Documentation/power/freezing-of-tasks.txt +++ b/Documentation/power/freezing-of-tasks.txt @@ -19,12 +19,13 @@ we only consider hibernation, but the description also applies to suspend). Namely, as the first step of the hibernation procedure the function freeze_processes() (defined in kernel/power/process.c) is called. It executes try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and -sends a fake signal to each of them. A task that receives such a signal and has -TIF_FREEZE set, should react to it by calling the refrigerator() function -(defined in kernel/power/process.c), which sets the task's PF_FROZEN flag, -changes its state to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is -cleared for it. Then, we say that the task is 'frozen' and therefore the set of -functions handling this mechanism is called 'the freezer' (these functions are +either wakes them up, if they are kernel threads, or sends fake signals to them, +if they are user space processes. A task that has TIF_FREEZE set, should react +to it by calling the function called refrigerator() (defined in +kernel/power/process.c), which sets the task's PF_FROZEN flag, changes its state +to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it. +Then, we say that the task is 'frozen' and therefore the set of functions +handling this mechanism is referred to as 'the freezer' (these functions are defined in kernel/power/process.c and include/linux/freezer.h). User space processes are generally frozen before kernel threads. @@ -35,21 +36,27 @@ task enter refrigerator() if the flag is set. For user space processes try_to_freeze() is called automatically from the signal-handling code, but the freezable kernel threads need to call it -explicitly in suitable places. The code to do this may look like the following: +explicitly in suitable places or use the wait_event_freezable() or +wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) +that combine interruptible sleep with checking if TIF_FREEZE is set and calling +try_to_freeze(). The main loop of a freezable kernel thread may look like the +following one: + set_freezable(); do { hub_events(); - wait_event_interruptible(khubd_wait, - !list_empty(&hub_event_list)); - try_to_freeze(); - } while (!signal_pending(current)); + wait_event_freezable(khubd_wait, + !list_empty(&hub_event_list) || + kthread_should_stop()); + } while (!kthread_should_stop() || !list_empty(&hub_event_list)); (from drivers/usb/core/hub.c::hub_thread()). If a freezable kernel thread fails to call try_to_freeze() after the freezer has set TIF_FREEZE for it, the freezing of tasks will fail and the entire hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere. +threads must call try_to_freeze() somewhere or use one of the +wait_event_freezable() and wait_event_freezable_timeout() macros. After the system memory state has been restored from a hibernation image and devices have been reinitialized, the function thaw_processes() is called in @@ -81,7 +88,16 @@ hibernation image has been created and before the system is finally powered off. The majority of these are user space processes, but if any of the kernel threads may cause something like this to happen, they have to be freezable. -2. The second reason is to prevent user space processes and some kernel threads +2. Next, to create the hibernation image we need to free a sufficient amount of +memory (approximately 50% of available RAM) and we need to do that before +devices are deactivated, because we generally need them for swapping out. Then, +after the memory for the image has been freed, we don't want tasks to allocate +additional memory and we prevent them from doing that by freezing them earlier. +[Of course, this also means that device drivers should not allocate substantial +amounts of memory from their .suspend() callbacks before hibernation, but this +is e separate issue.] + +3. The third reason is to prevent user space processes and some kernel threads from interfering with the suspending and resuming of devices. A user space process running on a second CPU while we are suspending devices may, for example, be troublesome and without the freezing of tasks we would need some @@ -111,7 +127,7 @@ frozen before the driver's .suspend() callback is executed and it will be thawed after the driver's .resume() callback has run, so it won't be accessing the device while it's suspended. -3. Another reason for freezing tasks is to prevent user space processes from +4. Another reason for freezing tasks is to prevent user space processes from realizing that hibernation (or suspend) operation takes place. Ideally, user space processes should not notice that such a system-wide operation has occurred and should continue running without any problems after the restore (or resume diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt index fd5192a8fa8a..e67211fe0ee2 100644 --- a/Documentation/power/interface.txt +++ b/Documentation/power/interface.txt @@ -20,7 +20,7 @@ states. /sys/power/disk controls the operating mode of the suspend-to-disk mechanism. Suspend-to-disk can be handled in several ways. We have a few options for putting the system to sleep - using the platform driver -(e.g. ACPI or other pm_ops), powering off the system or rebooting the +(e.g. ACPI or other suspend_ops), powering off the system or rebooting the system (for testing). Additionally, /sys/power/disk can be used to turn on one of the two testing diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt index 06f911a5f885..f281886de490 100644 --- a/Documentation/power/swsusp-and-swap-files.txt +++ b/Documentation/power/swsusp-and-swap-files.txt @@ -39,7 +39,7 @@ resume=<swap_file_partition> resume_offset=<swap_file_offset> where <swap_file_partition> is the partition on which the swap file is located and <swap_file_offset> is the offset of the swap header determined by the application in 2) (of course, this step may be carried out automatically -by the same application that determies the swap file's header offset using the +by the same application that determines the swap file's header offset using the FIBMAP ioctl) OR diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt index 4530d1bf0286..df7afe43d462 100644 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ b/Documentation/powerpc/eeh-pci-error-recovery.txt @@ -36,8 +36,8 @@ Causes of EEH Errors EEH was originally designed to guard against hardware failure, such as PCI cards dying from heat, humidity, dust, vibration and bad electrical connections. The vast majority of EEH errors seen in -"real life" are due to eithr poorly seated PCI cards, or, -unfortunately quite commonly, due device driver bugs, device firmware +"real life" are due to either poorly seated PCI cards, or, +unfortunately quite commonly, due to device driver bugs, device firmware bugs, and sometimes PCI card hardware bugs. The most common software bug, is one that causes the device to diff --git a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt index e59fcbbe338c..5e03610e186f 100644 --- a/Documentation/powerpc/mpc52xx-device-tree-bindings.txt +++ b/Documentation/powerpc/mpc52xx-device-tree-bindings.txt @@ -17,12 +17,12 @@ passed by the boot loader to the kernel at boot time. The device tree describes what devices are present on the board and how they are connected. The device tree can either be passed as a binary blob (as described in Documentation/powerpc/booting-without-of.txt), or passed -by Open Firmare (IEEE 1275) compatible firmware using an OF compatible +by Open Firmware (IEEE 1275) compatible firmware using an OF compatible client interface API. This document specifies the requirements on the device-tree for mpc5200 based boards. These requirements are above and beyond the details -specified in either the OpenFirmware spec or booting-without-of.txt +specified in either the Open Firmware spec or booting-without-of.txt All new mpc5200-based boards are expected to match this document. In cases where this document is not sufficient to support a new board port, @@ -73,8 +73,8 @@ match on the compatible list; the 'most compatible' driver should be selected. The split between the MPC5200 and the MPC5200B leaves a bit of a -connundrum. How should the compatible property be set up to provide -maximum compatability information; but still acurately describe the +conundrum. How should the compatible property be set up to provide +maximum compatibility information; but still accurately describe the chip? For the MPC5200; the answer is easy. Most of the SoC devices originally appeared on the MPC5200. Since they didn't exist anywhere else; the 5200 compatible properties will contain only one item; @@ -84,7 +84,7 @@ The 5200B is almost the same as the 5200, but not quite. It fixes silicon bugs and it adds a small number of enhancements. Most of the devices either provide exactly the same interface as on the 5200. A few devices have extra functions but still have a backwards compatible mode. -To express this infomation as completely as possible, 5200B device trees +To express this information as completely as possible, 5200B device trees should have two items in the compatible list; "mpc5200b-<device>\0mpc5200-<device>". It is *strongly* recommended that 5200B device trees follow this convention (instead of only listing @@ -185,7 +185,7 @@ bestcomm@<addr> dma-controller mpc5200-bestcomm 5200 pic also requires Recommended soc5200 child nodes; populate as needed for your board name device_type compatible Description ---- ----------- ---------- ----------- -gpt@<addr> gpt mpc5200-gpt General purpose timers +gpt@<addr> gpt fsl,mpc5200-gpt General purpose timers rtc@<addr> rtc mpc5200-rtc Real time clock mscan@<addr> mscan mpc5200-mscan CAN bus controller pci@<addr> pci mpc5200-pci PCI bridge @@ -199,7 +199,7 @@ ethernet@<addr> network mpc5200-fec MPC5200 ethernet device ata@<addr> ata mpc5200-ata IDE ATA interface i2c@<addr> i2c mpc5200-i2c I2C controller usb@<addr> usb-ohci-be mpc5200-ohci,ohci-be USB controller -xlb@<addr> xlb mpc5200-xlb XLB arbritrator +xlb@<addr> xlb mpc5200-xlb XLB arbitrator Important child node properties name type description @@ -213,7 +213,7 @@ cell-index int When multiple devices are present, is the 5) General Purpose Timer nodes (child of soc5200 node) On the mpc5200 and 5200b, GPT0 has a watchdog timer function. If the board design supports the internal wdt, then the device node for GPT0 should -include the empty property 'has-wdt'. +include the empty property 'fsl,has-wdt'. 6) PSC nodes (child of soc5200 node) PSC nodes can define the optional 'port-number' property to force assignment diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt index 6aa9a891f3d0..683ccae00ad4 100644 --- a/Documentation/scsi/aic79xx.txt +++ b/Documentation/scsi/aic79xx.txt @@ -120,7 +120,7 @@ The following information is available in this file: list size to avoid SCSI malloc pool fragmentation. - Cleanup channel display in our /proc output. - Workaround duplicate device entries in the mid-layer - devlice list during add-single-device. + device list during add-single-device. 1.3.6 (March 28th, 2003) - Correct a double free in the Domain Validation code. diff --git a/Documentation/scsi/aic7xxx.txt b/Documentation/scsi/aic7xxx.txt index 5f34d2ba69b4..b7e238cbb5a7 100644 --- a/Documentation/scsi/aic7xxx.txt +++ b/Documentation/scsi/aic7xxx.txt @@ -159,7 +159,7 @@ The following information is available in this file: - Add support for 2.5.X's scsi_report_device_reset(). 6.2.34 (May 5th, 2003) - - Fix locking regression instroduced in 6.2.29 that + - Fix locking regression introduced in 6.2.29 that could cause a lock order reversal between the io_request_lock and our per-softc lock. This was only possible on RH9, SuSE, and kernel.org 2.4.X kernels. @@ -264,7 +264,7 @@ The following information is available in this file: Option: tag_info:{{value[,value...]}[,{value[,value...]}...]} Definition: Set the per-target tagged queue depth on a per controller basis. Both controllers and targets - may be ommitted indicating that they should retain + may be omitted indicating that they should retain the default tag depth. Examples: tag_info:{{16,32,32,64,8,8,,32,32,32,32,32,32,32,32,32} On Controller 0 @@ -290,7 +290,7 @@ The following information is available in this file: ----------------------------------------------------------------- Option: dv: {value[,value...]} Definition: Set Domain Validation Policy on a per-controller basis. - Controllers may be ommitted indicating that + Controllers may be omitted indicating that they should retain the default read streaming setting. Example: dv:{-1,0,,1,1,0} On Controller 0 leave DV at its default setting. diff --git a/Documentation/scsi/arcmsr_spec.txt b/Documentation/scsi/arcmsr_spec.txt index 5e0042340fd3..45d9482c1517 100644 --- a/Documentation/scsi/arcmsr_spec.txt +++ b/Documentation/scsi/arcmsr_spec.txt @@ -3,7 +3,7 @@ ******************************************************************************* ** Usage of IOP331 adapter ** (All In/Out is in IOP331's view) -** 1. Message 0 --> InitThread message and retrun code +** 1. Message 0 --> InitThread message and return code ** 2. Doorbell is used for RS-232 emulation ** inDoorBell : bit0 -- data in ready ** (DRIVER DATA WRITE OK) diff --git a/Documentation/scsi/ibmmca.txt b/Documentation/scsi/ibmmca.txt index a08e225653d6..a810421f1fb3 100644 --- a/Documentation/scsi/ibmmca.txt +++ b/Documentation/scsi/ibmmca.txt @@ -21,7 +21,7 @@ versions older than 4.0 do not work with kernels 2.4.0 or later! If you try to compile your kernel with the wrong driver source, the compilation is aborted and you get a corresponding error message. This is - no bug in the driver. It prevents you from using the wrong sourcecode + no bug in the driver; it prevents you from using the wrong source code with the wrong kernel version. Authors of this Driver @@ -58,7 +58,7 @@ 5 Users' Manual 5.1 Commandline Parameters 5.2 Troubleshooting - 5.3 Bugreports + 5.3 Bug reports 5.4 Support WWW-page 6 References 7 Credits to @@ -71,13 +71,13 @@ 1 Abstract ---------- - This README-file describes the IBM SCSI-subsystem low level driver for - Linux. The descriptions which were formerly kept in the source-code have - been taken out to this file to easify the codes' readability. The driver + This README-file describes the IBM SCSI-subsystem low level driver for + Linux. The descriptions which were formerly kept in the source code have + been taken out of this file to simplify the codes readability. The driver description has been updated, as most of the former description was already - quite outdated. The history of the driver development is also kept inside - here. Multiple historical developments have been summarized to shorten the - textsize a bit. At the end of this file you can find a small manual for + quite outdated. The history of the driver development is also kept inside + here. Multiple historical developments have been summarized to shorten the + text size a bit. At the end of this file you can find a small manual for this driver and hints to get it running on your machine. 2 Driver Description @@ -186,7 +186,7 @@ between 0 and 7). The IBM SCSI-2 F/W adapter offers this on up to two busses and provides support for 30 logical devices at the same time, where in wide-addressing mode you can have 16 puns with 32 luns on each device. - This section dexribes you the handling of devices on non-F/W adapters. + This section describes the handling of devices on non-F/W adapters. Just imagine, that you can have 16 * 32 = 512 devices on a F/W adapter which means a lot of possible devices for such a small machine. @@ -209,10 +209,10 @@ -------------------------------------------------------- One consequence of information hiding is that the real (pun,lun) numbers are also hidden. The two possibilities to get around this problem - is to offer fake pun/lun combinations to the operating system or to + are to offer fake pun/lun combinations to the operating system or to delete the whole mapping of the adapter and to reassign the ldns, using the immediate assign command of the SCSI-subsystem for probing through - all possible pun/lun combinations. a ldn is a "logical device number" + all possible pun/lun combinations. An ldn is a "logical device number" which is used by IBM SCSI-subsystems to access some valid SCSI-device. At the beginning of the development of this driver, the following approach was used: @@ -251,9 +251,9 @@ lun>0 or to non-existing devices, in order to satisfy the subsystem, if there are less than 15 SCSI-devices connected. In the case of more than 15 devices, the dynamical mapping goes active. If the get_scsi[][] reports a - device to be existant, but it has no ldn assigned, it gets a ldn out of 7 - to 14. The numbers are assigned in cyclic order. Therefore it takes 8 - dynamical reassignments on the SCSI-devices, until a certain device + device to be existent, but it has no ldn assigned, it gets an ldn out of 7 + to 14. The numbers are assigned in cyclic order, therefore it takes 8 + dynamical reassignments on the SCSI-devices until a certain device loses its ldn again. This assures that dynamical remapping is avoided during intense I/O between up to 15 SCSI-devices (means pun,lun combinations). A further advantage of this method is that people who @@ -551,7 +551,7 @@ than devices are available, they are assigned to non existing pun,lun combinations to satisfy the adapter. With this, the dynamical mapping was possible to implement. (For further info see the text in the - source-code and in the description below. Read the description + source code and in the description below. Read the description below BEFORE installing this driver on your system!) 2) Changed the name IBMMCA_DRIVER_VERSION to IBMMCA_SCSI_DRIVER_VERSION. 3) The LED-display shows on PS/2-95 no longer the ldn, but the SCSI-ID @@ -762,9 +762,9 @@ - Michael Lang Apr 23, 2000 (v3.2pre1) - 1) During a very long time, I collected a huge amount of bugreports from + 1) During a very long time, I collected a huge amount of bug reports from various people, trying really quite different things on their SCSI- - PS/2s. Today, all these bugreports are taken into account and should be + PS/2s. Today, all these bug reports are taken into account and should be mostly solved. The major topics were: - Driver crashes during boottime by no obvious reason. - Driver panics while the midlevel-SCSI-driver is trying to inquire @@ -819,7 +819,7 @@ - Michael Lang July 17, 2000 (v3.2pre8) - A long period of collecting bugreports from all corners of the world + A long period of collecting bug reports from all corners of the world now lead to the following corrections to the code: 1) SCSI-2 F/W support crashed with a COMMAND ERROR. The reason for this was that it is possible to disable Fast-SCSI for the external bus. @@ -873,7 +873,7 @@ July 26, 2000 (v3.2pre11) 1) I passed a horrible weekend getting mad with NMIs on kernel 2.2.14 and a model 9595. Asking around in the community, nobody except of me has - seen such errors. Weired, but I am trying to recompile everything on + seen such errors. Weird, but I am trying to recompile everything on the model 9595. Maybe, as I use a specially modified gcc, that could cause problems. But, it was not the reason. The true background was, that the kernel was compiled for i386 and the 9595 has a 486DX-2. @@ -886,7 +886,7 @@ alive rotator during boottime. This makes sense, when no monitor is connected to the system. You can get rid of all display activity, if you do not use any parameter or just ibmmcascsi=activity, for the - harddrive activity LED, existant on all PS/2, except models 8595-XXX. + harddrive activity LED, existent on all PS/2, except models 8595-XXX. If no monitor is available, please use ibmmcascsi=display, which works fine together with the linuxinfo utility for the LED-panel. - Michael Lang @@ -1115,7 +1115,7 @@ If this really happens, do also send e-mail to the maintainer, as forced detection should be never necessary. Forced detection is in principal some flaw of the driver adapter detection and goes into - bugreports. + bug reports. Q: The driver screws up, if it starts to probe SCSI-devices, is there some way out of it? A: Yes, that was some recognition problem of the correct SCSI-adapter @@ -1172,7 +1172,7 @@ recommended version is 3.2 or later. Here, the F/W support is in a stable and reliable condition. Wide-addressing is in addition supported. - Q: I get a Ooops message and something like "killing interrupt". + Q: I get an Oops message and something like "killing interrupt". A: The reason for this is that the IBM SCSI-subsystem only sends a termination status back, if some error appeared. In former releases of the driver, it was not checked, if the termination status block @@ -1213,21 +1213,21 @@ problem. Not yet tried, but guessing that it could work. To get this, set unchecked_isa_dma argument of ibmmca.h from 0 to 1. - 5.3 Bugreports + 5.3 Bug reports -------------- - If you really find bugs in the sourcecode or the driver will successfully + If you really find bugs in the source code or the driver will successfully refuse to work on your machine, you should send a bug report to me. The best for this is to follow the instructions on the WWW-page for this driver. Fill out the bug-report form, placed on the WWW-page and ship it, so the bugs can be taken into account with maximum efforts. But, please do not send bug reports about this driver to Linus Torvalds or Leonard - Zubkoff, as Linus is burried in E-Mail and Leonard is supervising all + Zubkoff, as Linus is buried in E-Mail and Leonard is supervising all SCSI-drivers and won't have the time left to look inside every single driver to fix a bug and especially DO NOT send modified code to Linus Torvalds or Alan J. Cox which has not been checked here!!! They are both - quite burried in E-mail (as me, sometimes, too) and one should first check + quite buried in E-mail (as me, sometimes, too) and one should first check for problems on my local teststand. Recently, I got a lot of - bugreports for errors in the ibmmca.c code, which I could not imagine, but + bug reports for errors in the ibmmca.c code, which I could not imagine, but a look inside some Linux-distribution showed me quite often some modified code, which did no longer work on most other machines than the one of the modifier. Ok, so now that there is maintenance service available for this @@ -1261,7 +1261,7 @@ some e-mail directly, but at least with the same information as required by the formular. - If you have extensive bugreports, including Ooops messages and + If you have extensive bug reports, including Oops messages and screen-shots, please feel free to send it directly to the address of the maintainer, too. The current address of the maintainer is: @@ -1318,7 +1318,7 @@ detailed bug reports and ideas for this driver (and his patience ;-)). Alan J. Cox - for his bugreports and his bold activities in cross-checking + for his bug reports and his bold activities in cross-checking the driver-code with his teststand. 7.2 Sponsors & Supporters diff --git a/Documentation/sharedsubtree.txt b/Documentation/sharedsubtree.txt index ccf1cebe744f..736540045dc7 100644 --- a/Documentation/sharedsubtree.txt +++ b/Documentation/sharedsubtree.txt @@ -153,6 +153,7 @@ replicas continue to be exactly same. #include <stdio.h> #include <stdlib.h> #include <unistd.h> + #include <string.h> #include <sys/mount.h> #include <sys/fsuid.h> diff --git a/Documentation/sound/alsa/soc/DAI.txt b/Documentation/sound/alsa/soc/DAI.txt index 58cbfd01ea8f..3feeb9ecdec4 100644 --- a/Documentation/sound/alsa/soc/DAI.txt +++ b/Documentation/sound/alsa/soc/DAI.txt @@ -20,12 +20,12 @@ I2S === I2S is a common 4 wire DAI used in HiFi, STB and portable devices. The Tx and -Rx lines are used for audio transmision, whilst the bit clock (BCLK) and +Rx lines are used for audio transmission, whilst the bit clock (BCLK) and left/right clock (LRC) synchronise the link. I2S is flexible in that either the controller or CODEC can drive (master) the BCLK and LRC clock lines. Bit clock usually varies depending on the sample rate and the master system clock (SYSCLK). LRCLK is the same as the sample rate. A few devices support separate -ADC and DAC LRCLK's, this allows for similtanious capture and playback at +ADC and DAC LRCLK's, this allows for simultaneous capture and playback at different sample rates. I2S has several different operating modes:- @@ -41,12 +41,12 @@ I2S has several different operating modes:- PCM === -PCM is another 4 wire interface, very similar to I2S, that can support a more +PCM is another 4 wire interface, very similar to I2S, which can support a more flexible protocol. It has bit clock (BCLK) and sync (SYNC) lines that are used to synchronise the link whilst the Tx and Rx lines are used to transmit and receive the audio data. Bit clock usually varies depending on sample rate whilst sync runs at the sample rate. PCM also supports Time Division -Multiplexing (TDM) in that several devices can use the bus similtaniuosly (This +Multiplexing (TDM) in that several devices can use the bus simultaneously (this is sometimes referred to as network mode). Common PCM operating modes:- diff --git a/Documentation/sound/alsa/soc/clocking.txt b/Documentation/sound/alsa/soc/clocking.txt index e93960d53a1e..14930887c25f 100644 --- a/Documentation/sound/alsa/soc/clocking.txt +++ b/Documentation/sound/alsa/soc/clocking.txt @@ -2,20 +2,20 @@ Audio Clocking ============== This text describes the audio clocking terms in ASoC and digital audio in -general. Note: Audio clocking can be complex ! +general. Note: Audio clocking can be complex! Master Clock ------------ -Every audio subsystem is driven by a master clock (sometimes refered to as MCLK +Every audio subsystem is driven by a master clock (sometimes referred to as MCLK or SYSCLK). This audio master clock can be derived from a number of sources (e.g. crystal, PLL, CPU clock) and is responsible for producing the correct audio playback and capture sample rates. -Some master clocks (e.g. PLL's and CPU based clocks) are configuarble in that +Some master clocks (e.g. PLL's and CPU based clocks) are configurable in that their speed can be altered by software (depending on the system use and to save -power). Other master clocks are fixed at at set frequency (i.e. crystals). +power). Other master clocks are fixed at a set frequency (i.e. crystals). DAI Clocks @@ -44,7 +44,7 @@ This relationship depends on the codec or SoC CPU in particular. In general it's best to configure BCLK to the lowest possible speed (depending on your rate, number of channels and wordsize) to save on power. -It's also desireable to use the codec (if possible) to drive (or master) the +It's also desirable to use the codec (if possible) to drive (or master) the audio clocks as it's usually gives more accurate sample rates than the CPU. diff --git a/Documentation/sound/alsa/soc/codec.txt b/Documentation/sound/alsa/soc/codec.txt index 48983c75aad9..1e766ad0ebd1 100644 --- a/Documentation/sound/alsa/soc/codec.txt +++ b/Documentation/sound/alsa/soc/codec.txt @@ -19,7 +19,7 @@ Optionally, codec drivers can also provide:- 6) DAPM event handler. 7) DAC Digital mute control. -It's probably best to use this guide in conjuction with the existing codec +It's probably best to use this guide in conjunction with the existing codec driver code in sound/soc/codecs/ ASoC Codec driver breakdown @@ -28,7 +28,7 @@ ASoC Codec driver breakdown 1 - Codec DAI and PCM configuration ----------------------------------- Each codec driver must have a struct snd_soc_codec_dai to define it's DAI and -PCM's capablities and operations. This struct is exported so that it can be +PCM's capabilities and operations. This struct is exported so that it can be registered with the core by your machine driver. e.g. @@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(wm8731_dai); 2 - Codec control IO -------------------- -The codec can ususally be controlled via an I2C or SPI style interface (AC97 +The codec can usually be controlled via an I2C or SPI style interface (AC97 combines control with data in the DAI). The codec drivers will have to provide functions to read and write the codec registers along with supplying a register cache:- diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt index c11877f5b4a1..ab0766fd7869 100644 --- a/Documentation/sound/alsa/soc/dapm.txt +++ b/Documentation/sound/alsa/soc/dapm.txt @@ -11,7 +11,7 @@ other PM systems. DAPM is also completely transparent to all user space applications as all power switching is done within the ASoC core. No code changes or recompiling are -required for user space applications. DAPM makes power switching descisions based +required for user space applications. DAPM makes power switching decisions based upon any audio stream (capture/playback) activity and audio mixer settings within the device. @@ -38,7 +38,7 @@ There are 4 power domains within DAPM Enabled and disabled when stream playback/capture is started and stopped respectively. e.g. aplay, arecord. -All DAPM power switching descisons are made automatically by consulting an audio +All DAPM power switching decisions are made automatically by consulting an audio routing map of the whole machine. This map is specific to each machine and consists of the interconnections between every audio component (including internal codec components). All audio components that effect power are called diff --git a/Documentation/sound/alsa/soc/overview.txt b/Documentation/sound/alsa/soc/overview.txt index 753c5cc5984a..c47ce9530677 100644 --- a/Documentation/sound/alsa/soc/overview.txt +++ b/Documentation/sound/alsa/soc/overview.txt @@ -2,18 +2,19 @@ ALSA SoC Layer ============== The overall project goal of the ALSA System on Chip (ASoC) layer is to provide -better ALSA support for embedded system on chip procesors (e.g. pxa2xx, au1x00, +better ALSA support for embedded system-on-chip processors (e.g. pxa2xx, au1x00, iMX, etc) and portable audio codecs. Currently there is some support in the kernel for SoC audio, however it has some limitations:- * Currently, codec drivers are often tightly coupled to the underlying SoC - cpu. This is not ideal and leads to code duplication i.e. Linux now has 4 + CPU. This is not ideal and leads to code duplication i.e. Linux now has 4 different wm8731 drivers for 4 different SoC platforms. - * There is no standard method to signal user initiated audio events. - e.g. Headphone/Mic insertion, Headphone/Mic detection after an insertion - event. These are quite common events on portable devices and ofter require - machine specific code to re route audio, enable amps etc after such an event. + * There is no standard method to signal user initiated audio events (e.g. + Headphone/Mic insertion, Headphone/Mic detection after an insertion + event). These are quite common events on portable devices and often require + machine specific code to re-route audio, enable amps, etc., after such an + event. * Current drivers tend to power up the entire codec when playing (or recording) audio. This is fine for a PC, but tends to waste a lot of @@ -44,7 +45,7 @@ features :- signals the codec when to change power states. * Machine specific controls: Allow machines to add controls to the sound card - e.g. volume control for speaker amp. + (e.g. volume control for speaker amp). To achieve all this, ASoC basically splits an embedded audio system into 3 components :- @@ -57,7 +58,7 @@ components :- interface drivers (e.g. I2S, AC97, PCM) for that platform. * Machine driver: The machine driver handles any machine specific controls and - audio events. i.e. turing on an amp at start of playback. + audio events (e.g. turning on an amp at start of playback). Documentation diff --git a/Documentation/sound/alsa/soc/platform.txt b/Documentation/sound/alsa/soc/platform.txt index e95b16d5a53b..d4678b4dc6c6 100644 --- a/Documentation/sound/alsa/soc/platform.txt +++ b/Documentation/sound/alsa/soc/platform.txt @@ -20,7 +20,7 @@ struct snd_soc_ops { int (*trigger)(struct snd_pcm_substream *, int); }; -The platform driver exports it's DMA functionailty via struct snd_soc_platform:- +The platform driver exports its DMA functionality via struct snd_soc_platform:- struct snd_soc_platform { char *name; diff --git a/Documentation/sound/alsa/soc/pops_clicks.txt b/Documentation/sound/alsa/soc/pops_clicks.txt index 2cf7ee5b3d74..3371bd9d7cfa 100644 --- a/Documentation/sound/alsa/soc/pops_clicks.txt +++ b/Documentation/sound/alsa/soc/pops_clicks.txt @@ -2,7 +2,7 @@ Audio Pops and Clicks ===================== Pops and clicks are unwanted audio artifacts caused by the powering up and down -of components within the audio subsystem. This is noticable on PC's when an +of components within the audio subsystem. This is noticeable on PCs when an audio module is either loaded or unloaded (at module load time the sound card is powered up and causes a popping noise on the speakers). @@ -16,7 +16,7 @@ Minimising Playback Pops and Clicks =================================== Playback pops in portable audio subsystems cannot be completely eliminated atm, -however future audio codec hardware will have better pop and click supression. +however future audio codec hardware will have better pop and click suppression. Pops can be reduced within playback by powering the audio components in a specific order. This order is different for startup and shutdown and follows some basic rules:- @@ -33,7 +33,7 @@ Minimising Capture Pops and Clicks ================================== Capture artifacts are somewhat easier to get rid as we can delay activating the -ADC until all the pops have occured. This follows similar power rules to +ADC until all the pops have occurred. This follows similar power rules to playback in that components are powered in a sequence depending upon stream startup or shutdown. diff --git a/Documentation/sound/oss/es1371 b/Documentation/sound/oss/es1371 deleted file mode 100644 index c3151266771c..000000000000 --- a/Documentation/sound/oss/es1371 +++ /dev/null @@ -1,64 +0,0 @@ -/proc/sound, /dev/sndstat -------------------------- - -/proc/sound and /dev/sndstat is not supported by the -driver. To find out whether the driver succeeded loading, -check the kernel log (dmesg). - - -ALaw/uLaw sample formats ------------------------- - -This driver does not support the ALaw/uLaw sample formats. -ALaw is the default mode when opening a sound device -using OSS/Free. The reason for the lack of support is -that the hardware does not support these formats, and adding -conversion routines to the kernel would lead to very ugly -code in the presence of the mmap interface to the driver. -And since xquake uses mmap, mmap is considered important :-) -and no sane application uses ALaw/uLaw these days anyway. -In short, playing a Sun .au file as follows: - -cat my_file.au > /dev/dsp - -does not work. Instead, you may use the play script from -Chris Bagwell's sox-12.14 package (available from the URL -below) to play many different audio file formats. -The script automatically determines the audio format -and does do audio conversions if necessary. -http://home.sprynet.com/sprynet/cbagwell/projects.html - - -Blocking vs. nonblocking IO ---------------------------- - -Unlike OSS/Free this driver honours the O_NONBLOCK file flag -not only during open, but also during read and write. -This is an effort to make the sound driver interface more -regular. Timidity has problems with this; a patch -is available from http://www.ife.ee.ethz.ch/~sailer/linux/pciaudio.html. -(Timidity patched will also run on OSS/Free). - - -MIDI UART ---------- - -The driver supports a simple MIDI UART interface, with -no ioctl's supported. - - -MIDI synthesizer ----------------- - -This soundcard does not have any hardware MIDI synthesizer; -MIDI synthesis has to be done in software. To allow this -the driver/soundcard supports two PCM (/dev/dsp) interfaces. - -There is a freely available software package that allows -MIDI file playback on this soundcard called Timidity. -See http://www.cgs.fi/~tt/timidity/. - - - -Thomas Sailer -t.sailer@alumni.ethz.ch diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx index 215e3b8e7266..f3853cc37bde 100644 --- a/Documentation/spi/pxa2xx +++ b/Documentation/spi/pxa2xx @@ -1,4 +1,4 @@ -PXA2xx SPI on SSP driver HOWTO +PXA2xx SPI on SSP driver HOWTO =================================================== This a mini howto on the pxa2xx_spi driver. The driver turns a PXA2xx synchronous serial port into a SPI master controller diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt index 60953d6c919d..ec499265deca 100644 --- a/Documentation/thinkpad-acpi.txt +++ b/Documentation/thinkpad-acpi.txt @@ -105,10 +105,15 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver as a driver attribute (see below). Sysfs driver attributes are on the driver's sysfs attribute space, -for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/. +for 2.6.23 this is /sys/bus/platform/drivers/thinkpad_acpi/ and +/sys/bus/platform/drivers/thinkpad_hwmon/ -Sysfs device attributes are on the driver's sysfs attribute space, -for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/. +Sysfs device attributes are on the thinkpad_acpi device sysfs attribute +space, for 2.6.23 this is /sys/devices/platform/thinkpad_acpi/. + +Sysfs device attributes for the sensors and fan are on the +thinkpad_hwmon device's sysfs attribute space, but you should locate it +looking for a hwmon device with the name attribute of "thinkpad". Driver version -------------- @@ -766,7 +771,7 @@ Temperature sensors ------------------- procfs: /proc/acpi/ibm/thermal -sysfs device attributes: (hwmon) temp*_input +sysfs device attributes: (hwmon "thinkpad") temp*_input Most ThinkPads include six or more separate temperature sensors but only expose the CPU temperature through the standard ACPI methods. This @@ -989,7 +994,9 @@ Fan control and monitoring: fan speed, fan enable/disable --------------------------------------------------------- procfs: /proc/acpi/ibm/fan -sysfs device attributes: (hwmon) fan_input, pwm1, pwm1_enable +sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, + pwm1_enable +sysfs hwmon driver attributes: fan_watchdog NOTE NOTE NOTE: fan control operations are disabled by default for safety reasons. To enable them, the module parameter "fan_control=1" @@ -1028,7 +1035,7 @@ enable it if necessary to avoid overheating. An enabled fan in level "auto" may stop spinning if the EC decides the ThinkPad is cool enough and doesn't need the extra airflow. This is -normal, and the EC will spin the fan up if the varios thermal readings +normal, and the EC will spin the fan up if the various thermal readings rise too much. On the X40, this seems to depend on the CPU and HDD temperatures. @@ -1131,7 +1138,7 @@ hwmon device attribute fan1_input: which can take up to two minutes. May return rubbish on older ThinkPads. -driver attribute fan_watchdog: +hwmon driver attribute fan_watchdog: Fan safety watchdog timer interval, in seconds. Minimum is 1 second, maximum is 120 seconds. 0 disables the watchdog. @@ -1196,7 +1203,7 @@ for example: Enabling debugging output ------------------------- -The module takes a debug paramater which can be used to selectively +The module takes a debug parameter which can be used to selectively enable various classes of debugging output, for example: modprobe ibm_acpi debug=0xffff @@ -1233,3 +1240,9 @@ Sysfs interface changelog: layer, the radio switch generates input event EV_RADIO, and the driver enables hot key handling by default in the firmware. + +0x020000: ABI fix: added a separate hwmon platform device and + driver, which must be located by name (thinkpad) + and the hwmon class for libsensors4 (lm-sensors 3) + compatibility. Moved all hwmon attributes to this + new platform device. diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.txt index 4e0b62b8566f..8b077e43eee7 100644 --- a/Documentation/usb/usb-serial.txt +++ b/Documentation/usb/usb-serial.txt @@ -338,7 +338,7 @@ MCT USB Single Port Serial Adapter U232 This driver is for the MCT USB-RS232 Converter (25 pin, Model No. U232-P25) from Magic Control Technology Corp. (there is also a 9 pin Model No. U232-P9). More information about this device can be found at - the manufacture's web-site: http://www.mct.com.tw. + the manufacturer's web-site: http://www.mct.com.tw. The driver is generally working, though it still needs some more testing. It is derived from the Belkin USB Serial Adapter F5U103 driver and its diff --git a/Documentation/watchdog/src/watchdog-simple.c b/Documentation/watchdog/src/watchdog-simple.c index 47801bc7e742..4cf72f3fa8e9 100644 --- a/Documentation/watchdog/src/watchdog-simple.c +++ b/Documentation/watchdog/src/watchdog-simple.c @@ -3,15 +3,25 @@ #include <unistd.h> #include <fcntl.h> -int main(int argc, const char *argv[]) { +int main(void) +{ int fd = open("/dev/watchdog", O_WRONLY); + int ret = 0; if (fd == -1) { perror("watchdog"); - exit(1); + exit(EXIT_FAILURE); } while (1) { - write(fd, "\0", 1); - fsync(fd); + ret = write(fd, "\0", 1); + if (ret != 1) { + ret = -1; + break; + } + ret = fsync(fd); + if (ret) + break; sleep(10); } + close(fd); + return ret; } |