diff options
352 files changed, 13209 insertions, 4409 deletions
@@ -66,6 +66,7 @@ Kenneth W Chen <kenneth.w.chen@intel.com> Koushik <raghavendra.koushik@neterion.com> Leonid I Ananiev <leonid.i.ananiev@intel.com> Linas Vepstas <linas@austin.ibm.com> +Mark Brown <broonie@sirena.org.uk> Matthieu CASTET <castet.matthieu@free.fr> Michael Buesch <mb@bu3sch.de> Michael Buesch <mbuesch@freenet.de> diff --git a/Documentation/cgroups.txt b/Documentation/cgroups/cgroups.txt index d9014aa0eb68..d9014aa0eb68 100644 --- a/Documentation/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt new file mode 100644 index 000000000000..c50ab58b72eb --- /dev/null +++ b/Documentation/cgroups/freezer-subsystem.txt @@ -0,0 +1,99 @@ + The cgroup freezer is useful to batch job management system which start +and stop sets of tasks in order to schedule the resources of a machine +according to the desires of a system administrator. This sort of program +is often used on HPC clusters to schedule access to the cluster as a +whole. The cgroup freezer uses cgroups to describe the set of tasks to +be started/stopped by the batch job management system. It also provides +a means to start and stop the tasks composing the job. + + The cgroup freezer will also be useful for checkpointing running groups +of tasks. The freezer allows the checkpoint code to obtain a consistent +image of the tasks by attempting to force the tasks in a cgroup into a +quiescent state. Once the tasks are quiescent another task can +walk /proc or invoke a kernel interface to gather information about the +quiesced tasks. Checkpointed tasks can be restarted later should a +recoverable error occur. This also allows the checkpointed tasks to be +migrated between nodes in a cluster by copying the gathered information +to another node and restarting the tasks there. + + Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +and resuming tasks in userspace. Both of these signals are observable +from within the tasks we wish to freeze. While SIGSTOP cannot be caught, +blocked, or ignored it can be seen by waiting or ptracing parent tasks. +SIGCONT is especially unsuitable since it can be caught by the task. Any +programs designed to watch for SIGSTOP and SIGCONT could be broken by +attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can +demonstrate this problem using nested bash shells: + + $ echo $$ + 16644 + $ bash + $ echo $$ + 16690 + + From a second, unrelated bash shell: + $ kill -SIGSTOP 16690 + $ kill -SIGCONT 16990 + + <at this point 16990 exits and causes 16644 to exit too> + + This happens because bash can observe both signals and choose how it +responds to them. + + Another example of a program which catches and responds to these +signals is gdb. In fact any program designed to use ptrace is likely to +have a problem with this method of stopping and resuming tasks. + + In contrast, the cgroup freezer uses the kernel freezer code to +prevent the freeze/unfreeze cycle from becoming visible to the tasks +being frozen. This allows the bash example above and gdb to run as +expected. + + The freezer subsystem in the container filesystem defines a file named +freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the +cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. +Reading will return the current state. + +* Examples of usage : + + # mkdir /containers/freezer + # mount -t cgroup -ofreezer freezer /containers + # mkdir /containers/0 + # echo $some_pid > /containers/0/tasks + +to get status of the freezer subsystem : + + # cat /containers/0/freezer.state + THAWED + +to freeze all tasks in the container : + + # echo FROZEN > /containers/0/freezer.state + # cat /containers/0/freezer.state + FREEZING + # cat /containers/0/freezer.state + FROZEN + +to unfreeze all tasks in the container : + + # echo THAWED > /containers/0/freezer.state + # cat /containers/0/freezer.state + THAWED + +This is the basic mechanism which should do the right thing for user space task +in a simple scenario. + +It's important to note that freezing can be incomplete. In that case we return +EBUSY. This means that some tasks in the cgroup are busy doing something that +prevents us from completely freezing the cgroup at this time. After EBUSY, +the cgroup will remain partially frozen -- reflected by freezer.state reporting +"FREEZING" when read. The state will remain "FREEZING" until one of these +things happens: + + 1) Userspace cancels the freezing operation by writing "THAWED" to + the freezer.state file + 2) Userspace retries the freezing operation by writing "FROZEN" to + the freezer.state file (writing "FREEZING" is not legal + and returns EIO) + 3) The tasks that blocked the cgroup from entering the "FROZEN" + state disappear from the cgroup's set of tasks. diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index 9b53d5827361..1c07547d3f81 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -112,14 +112,22 @@ the per cgroup LRU. 2.2.1 Accounting details -All mapped pages (RSS) and unmapped user pages (Page Cache) are accounted. -RSS pages are accounted at the time of page_add_*_rmap() unless they've already -been accounted for earlier. A file page will be accounted for as Page Cache; -it's mapped into the page tables of a process, duplicate accounting is carefully -avoided. Page Cache pages are accounted at the time of add_to_page_cache(). -The corresponding routines that remove a page from the page tables or removes -a page from Page Cache is used to decrement the accounting counters of the -cgroup. +All mapped anon pages (RSS) and cache pages (Page Cache) are accounted. +(some pages which never be reclaimable and will not be on global LRU + are not accounted. we just accounts pages under usual vm management.) + +RSS pages are accounted at page_fault unless they've already been accounted +for earlier. A file page will be accounted for as Page Cache when it's +inserted into inode (radix-tree). While it's mapped into the page tables of +processes, duplicate accounting is carefully avoided. + +A RSS page is unaccounted when it's fully unmapped. A PageCache page is +unaccounted when it's removed from radix-tree. + +At page migration, accounting information is kept. + +Note: we just account pages-on-lru because our purpose is to control amount +of used pages. not-on-lru pages are tend to be out-of-control from vm view. 2.3 Shared Page Accounting diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 47e568a9370a..5c86c258c791 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -48,7 +48,7 @@ hooks, beyond what is already present, required to manage dynamic job placement on large systems. Cpusets use the generic cgroup subsystem described in -Documentation/cgroup.txt. +Documentation/cgroups/cgroups.txt. Requests by a task, using the sched_setaffinity(2) system call to include CPUs in its CPU affinity mask, and using the mbind(2) and diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 295f26cd895a..9dd2a3bb2acc 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -96,6 +96,11 @@ errors=remount-ro(*) Remount the filesystem read-only on an error. errors=continue Keep going on a filesystem error. errors=panic Panic and halt the machine if an error occurs. +data_err=ignore(*) Just print an error message if an error occurs + in a file data buffer in ordered mode. +data_err=abort Abort the journal if an error occurs in a file + data buffer in ordered mode. + grpid Give objects the same group ID as their creator. bsdgroups diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index c032bf39e8b9..bcceb99b81dd 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1384,15 +1384,18 @@ causes the kernel to prefer to reclaim dentries and inodes. dirty_background_ratio ---------------------- -Contains, as a percentage of total system memory, the number of pages at which -the pdflush background writeback daemon will start writing out dirty data. +Contains, as a percentage of the dirtyable system memory (free pages + mapped +pages + file cache, not including locked pages and HugePages), the number of +pages at which the pdflush background writeback daemon will start writing out +dirty data. dirty_ratio ----------------- -Contains, as a percentage of total system memory, the number of pages at which -a process which is generating disk writes will itself start writing out dirty -data. +Contains, as a percentage of the dirtyable system memory (free pages + mapped +pages + file cache, not including locked pages and HugePages), the number of +pages at which a process which is generating disk writes will itself start +writing out dirty data. dirty_writeback_centisecs ------------------------- @@ -2412,24 +2415,29 @@ will be dumped when the <pid> process is dumped. coredump_filter is a bitmask of memory types. If a bit of the bitmask is set, memory segments of the corresponding memory type are dumped, otherwise they are not dumped. -The following 4 memory types are supported: +The following 7 memory types are supported: - (bit 0) anonymous private memory - (bit 1) anonymous shared memory - (bit 2) file-backed private memory - (bit 3) file-backed shared memory - (bit 4) ELF header pages in file-backed private memory areas (it is effective only if the bit 2 is cleared) + - (bit 5) hugetlb private memory + - (bit 6) hugetlb shared memory Note that MMIO pages such as frame buffer are never dumped and vDSO pages are always dumped regardless of the bitmask status. -Default value of coredump_filter is 0x3; this means all anonymous memory -segments are dumped. + Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only + effected by bit 5-6. + +Default value of coredump_filter is 0x23; this means all anonymous memory +segments and hugetlb private memory are dumped. If you don't want to dump all shared memory segments attached to pid 1234, -write 1 to the process's proc file. +write 0x21 to the process's proc file. - $ echo 0x1 > /proc/1234/coredump_filter + $ echo 0x21 > /proc/1234/coredump_filter When a new process is created, the process inherits the bitmask status from its parent. It is useful to set up coredump_filter before the program runs. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bcecfaa1e770..0f1544f67400 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -690,7 +690,7 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/block/as-iosched.txt and Documentation/block/deadline-iosched.txt for details. - elfcorehdr= [X86-32, X86_64] + elfcorehdr= [IA64,PPC,SH,X86-32,X86_64] Specifies physical address of start of kernel core image elf header. Generally kexec loader will pass this option to capture kernel. diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt new file mode 100644 index 000000000000..bdf93b7f0f24 --- /dev/null +++ b/Documentation/mtd/nand_ecc.txt @@ -0,0 +1,714 @@ +Introduction +============ + +Having looked at the linux mtd/nand driver and more specific at nand_ecc.c +I felt there was room for optimisation. I bashed the code for a few hours +performing tricks like table lookup removing superfluous code etc. +After that the speed was increased by 35-40%. +Still I was not too happy as I felt there was additional room for improvement. + +Bad! I was hooked. +I decided to annotate my steps in this file. Perhaps it is useful to someone +or someone learns something from it. + + +The problem +=========== + +NAND flash (at least SLC one) typically has sectors of 256 bytes. +However NAND flash is not extremely reliable so some error detection +(and sometimes correction) is needed. + +This is done by means of a Hamming code. I'll try to explain it in +laymans terms (and apologies to all the pro's in the field in case I do +not use the right terminology, my coding theory class was almost 30 +years ago, and I must admit it was not one of my favourites). + +As I said before the ecc calculation is performed on sectors of 256 +bytes. This is done by calculating several parity bits over the rows and +columns. The parity used is even parity which means that the parity bit = 1 +if the data over which the parity is calculated is 1 and the parity bit = 0 +if the data over which the parity is calculated is 0. So the total +number of bits over the data over which the parity is calculated + the +parity bit is even. (see wikipedia if you can't follow this). +Parity is often calculated by means of an exclusive or operation, +sometimes also referred to as xor. In C the operator for xor is ^ + +Back to ecc. +Let's give a small figure: + +byte 0: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp4 ... rp14 +byte 1: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp2 rp4 ... rp14 +byte 2: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp4 ... rp14 +byte 3: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp4 ... rp14 +byte 4: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp5 ... rp14 +.... +byte 254: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp5 ... rp15 +byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15 + cp1 cp0 cp1 cp0 cp1 cp0 cp1 cp0 + cp3 cp3 cp2 cp2 cp3 cp3 cp2 cp2 + cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4 + +This figure represents a sector of 256 bytes. +cp is my abbreviaton for column parity, rp for row parity. + +Let's start to explain column parity. +cp0 is the parity that belongs to all bit0, bit2, bit4, bit6. +so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even. +Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7. +cp2 is the parity over bit0, bit1, bit4 and bit5 +cp3 is the parity over bit2, bit3, bit6 and bit7. +cp4 is the parity over bit0, bit1, bit2 and bit3. +cp5 is the parity over bit4, bit5, bit6 and bit7. +Note that each of cp0 .. cp5 is exactly one bit. + +Row parity actually works almost the same. +rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254) +rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255) +rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ... +(so handle two bytes, then skip 2 bytes). +rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...) +for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc. +so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...) +and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, .. +The story now becomes quite boring. I guess you get the idea. +rp6 covers 8 bytes then skips 8 etc +rp7 skips 8 bytes then covers 8 etc +rp8 covers 16 bytes then skips 16 etc +rp9 skips 16 bytes then covers 16 etc +rp10 covers 32 bytes then skips 32 etc +rp11 skips 32 bytes then covers 32 etc +rp12 covers 64 bytes then skips 64 etc +rp13 skips 64 bytes then covers 64 etc +rp14 covers 128 bytes then skips 128 +rp15 skips 128 bytes then covers 128 + +In the end the parity bits are grouped together in three bytes as +follows: +ECC Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0 +ECC 0 rp07 rp06 rp05 rp04 rp03 rp02 rp01 rp00 +ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08 +ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1 + +I detected after writing this that ST application note AN1823 +(http://www.st.com/stonline/books/pdf/docs/10123.pdf) gives a much +nicer picture.(but they use line parity as term where I use row parity) +Oh well, I'm graphically challenged, so suffer with me for a moment :-) +And I could not reuse the ST picture anyway for copyright reasons. + + +Attempt 0 +========= + +Implementing the parity calculation is pretty simple. +In C pseudocode: +for (i = 0; i < 256; i++) +{ + if (i & 0x01) + rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; + else + rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1; + if (i & 0x02) + rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3; + else + rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2; + if (i & 0x04) + rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5; + else + rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4; + if (i & 0x08) + rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7; + else + rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6; + if (i & 0x10) + rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9; + else + rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8; + if (i & 0x20) + rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11; + else + rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10; + if (i & 0x40) + rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13; + else + rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12; + if (i & 0x80) + rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15; + else + rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14; + cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0; + cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1; + cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2; + cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3 + cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4 + cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5 +} + + +Analysis 0 +========== + +C does have bitwise operators but not really operators to do the above +efficiently (and most hardware has no such instructions either). +Therefore without implementing this it was clear that the code above was +not going to bring me a Nobel prize :-) + +Fortunately the exclusive or operation is commutative, so we can combine +the values in any order. So instead of calculating all the bits +individually, let us try to rearrange things. +For the column parity this is easy. We can just xor the bytes and in the +end filter out the relevant bits. This is pretty nice as it will bring +all cp calculation out of the if loop. + +Similarly we can first xor the bytes for the various rows. +This leads to: + + +Attempt 1 +========= + +const char parity[256] = { + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 +}; + +void ecc1(const unsigned char *buf, unsigned char *code) +{ + int i; + const unsigned char *bp = buf; + unsigned char cur; + unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; + unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15; + unsigned char par; + + par = 0; + rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0; + rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0; + rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0; + rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0; + + for (i = 0; i < 256; i++) + { + cur = *bp++; + par ^= cur; + if (i & 0x01) rp1 ^= cur; else rp0 ^= cur; + if (i & 0x02) rp3 ^= cur; else rp2 ^= cur; + if (i & 0x04) rp5 ^= cur; else rp4 ^= cur; + if (i & 0x08) rp7 ^= cur; else rp6 ^= cur; + if (i & 0x10) rp9 ^= cur; else rp8 ^= cur; + if (i & 0x20) rp11 ^= cur; else rp10 ^= cur; + if (i & 0x40) rp13 ^= cur; else rp12 ^= cur; + if (i & 0x80) rp15 ^= cur; else rp14 ^= cur; + } + code[0] = + (parity[rp7] << 7) | + (parity[rp6] << 6) | + (parity[rp5] << 5) | + (parity[rp4] << 4) | + (parity[rp3] << 3) | + (parity[rp2] << 2) | + (parity[rp1] << 1) | + (parity[rp0]); + code[1] = + (parity[rp15] << 7) | + (parity[rp14] << 6) | + (parity[rp13] << 5) | + (parity[rp12] << 4) | + (parity[rp11] << 3) | + (parity[rp10] << 2) | + (parity[rp9] << 1) | + (parity[rp8]); + code[2] = + (parity[par & 0xf0] << 7) | + (parity[par & 0x0f] << 6) | + (parity[par & 0xcc] << 5) | + (parity[par & 0x33] << 4) | + (parity[par & 0xaa] << 3) | + (parity[par & 0x55] << 2); + code[0] = ~code[0]; + code[1] = ~code[1]; + code[2] = ~code[2]; +} + +Still pretty straightforward. The last three invert statements are there to +give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash +all data is 0xff, so the checksum then matches. + +I also introduced the parity lookup. I expected this to be the fastest +way to calculate the parity, but I will investigate alternatives later +on. + + +Analysis 1 +========== + +The code works, but is not terribly efficient. On my system it took +almost 4 times as much time as the linux driver code. But hey, if it was +*that* easy this would have been done long before. +No pain. no gain. + +Fortunately there is plenty of room for improvement. + +In step 1 we moved from bit-wise calculation to byte-wise calculation. +However in C we can also use the unsigned long data type and virtually +every modern microprocessor supports 32 bit operations, so why not try +to write our code in such a way that we process data in 32 bit chunks. + +Of course this means some modification as the row parity is byte by +byte. A quick analysis: +for the column parity we use the par variable. When extending to 32 bits +we can in the end easily calculate p0 and p1 from it. +(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0 +respectively) +also rp2 and rp3 can be easily retrieved from par as rp3 covers the +first two bytes and rp2 the last two bytes. + +Note that of course now the loop is executed only 64 times (256/4). +And note that care must taken wrt byte ordering. The way bytes are +ordered in a long is machine dependent, and might affect us. +Anyway, if there is an issue: this code is developed on x86 (to be +precise: a DELL PC with a D920 Intel CPU) + +And of course the performance might depend on alignment, but I expect +that the I/O buffers in the nand driver are aligned properly (and +otherwise that should be fixed to get maximum performance). + +Let's give it a try... + + +Attempt 2 +========= + +extern const char parity[256]; + +void ecc2(const unsigned char *buf, unsigned char *code) +{ + int i; + const unsigned long *bp = (unsigned long *)buf; + unsigned long cur; + unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; + unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15; + unsigned long par; + + par = 0; + rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0; + rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0; + rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0; + rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0; + + for (i = 0; i < 64; i++) + { + cur = *bp++; + par ^= cur; + if (i & 0x01) rp5 ^= cur; else rp4 ^= cur; + if (i & 0x02) rp7 ^= cur; else rp6 ^= cur; + if (i & 0x04) rp9 ^= cur; else rp8 ^= cur; + if (i & 0x08) rp11 ^= cur; else rp10 ^= cur; + if (i & 0x10) rp13 ^= cur; else rp12 ^= cur; + if (i & 0x20) rp15 ^= cur; else rp14 ^= cur; + } + /* + we need to adapt the code generation for the fact that rp vars are now + long; also the column parity calculation needs to be changed. + we'll bring rp4 to 15 back to single byte entities by shifting and + xoring + */ + rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff; + rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff; + rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff; + rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff; + rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff; + rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff; + rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff; + rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff; + rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff; + rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff; + rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff; + rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff; + rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff; + rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff; + par ^= (par >> 16); + rp1 = (par >> 8); rp1 &= 0xff; + rp0 = (par & 0xff); + par ^= (par >> 8); par &= 0xff; + + code[0] = + (parity[rp7] << 7) | + (parity[rp6] << 6) | + (parity[rp5] << 5) | + (parity[rp4] << 4) | + (parity[rp3] << 3) | + (parity[rp2] << 2) | + (parity[rp1] << 1) | + (parity[rp0]); + code[1] = + (parity[rp15] << 7) | + (parity[rp14] << 6) | + (parity[rp13] << 5) | + (parity[rp12] << 4) | + (parity[rp11] << 3) | + (parity[rp10] << 2) | + (parity[rp9] << 1) | + (parity[rp8]); + code[2] = + (parity[par & 0xf0] << 7) | + (parity[par & 0x0f] << 6) | + (parity[par & 0xcc] << 5) | + (parity[par & 0x33] << 4) | + (parity[par & 0xaa] << 3) | + (parity[par & 0x55] << 2); + code[0] = ~code[0]; + code[1] = ~code[1]; + code[2] = ~code[2]; +} + +The parity array is not shown any more. Note also that for these +examples I kinda deviated from my regular programming style by allowing +multiple statements on a line, not using { } in then and else blocks +with only a single statement and by using operators like ^= + + +Analysis 2 +========== + +The code (of course) works, and hurray: we are a little bit faster than +the linux driver code (about 15%). But wait, don't cheer too quickly. +THere is more to be gained. +If we look at e.g. rp14 and rp15 we see that we either xor our data with +rp14 or with rp15. However we also have par which goes over all data. +This means there is no need to calculate rp14 as it can be calculated from +rp15 through rp14 = par ^ rp15; +(or if desired we can avoid calculating rp15 and calculate it from +rp14). That is why some places refer to inverse parity. +Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13. +Effectively this means we can eliminate the else clause from the if +statements. Also we can optimise the calculation in the end a little bit +by going from long to byte first. Actually we can even avoid the table +lookups + +Attempt 3 +========= + +Odd replaced: + if (i & 0x01) rp5 ^= cur; else rp4 ^= cur; + if (i & 0x02) rp7 ^= cur; else rp6 ^= cur; + if (i & 0x04) rp9 ^= cur; else rp8 ^= cur; + if (i & 0x08) rp11 ^= cur; else rp10 ^= cur; + if (i & 0x10) rp13 ^= cur; else rp12 ^= cur; + if (i & 0x20) rp15 ^= cur; else rp14 ^= cur; +with + if (i & 0x01) rp5 ^= cur; + if (i & 0x02) rp7 ^= cur; + if (i & 0x04) rp9 ^= cur; + if (i & 0x08) rp11 ^= cur; + if (i & 0x10) rp13 ^= cur; + if (i & 0x20) rp15 ^= cur; + + and outside the loop added: + rp4 = par ^ rp5; + rp6 = par ^ rp7; + rp8 = par ^ rp9; + rp10 = par ^ rp11; + rp12 = par ^ rp13; + rp14 = par ^ rp15; + +And after that the code takes about 30% more time, although the number of +statements is reduced. This is also reflected in the assembly code. + + +Analysis 3 +========== + +Very weird. Guess it has to do with caching or instruction parallellism +or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting +observation was that this one is only 30% slower (according to time) +executing the code as my 3Ghz D920 processor. + +Well, it was expected not to be easy so maybe instead move to a +different track: let's move back to the code from attempt2 and do some +loop unrolling. This will eliminate a few if statements. I'll try +different amounts of unrolling to see what works best. + + +Attempt 4 +========= + +Unrolled the loop 1, 2, 3 and 4 times. +For 4 the code starts with: + + for (i = 0; i < 4; i++) + { + cur = *bp++; + par ^= cur; + rp4 ^= cur; + rp6 ^= cur; + rp8 ^= cur; + rp10 ^= cur; + if (i & 0x1) rp13 ^= cur; else rp12 ^= cur; + if (i & 0x2) rp15 ^= cur; else rp14 ^= cur; + cur = *bp++; + par ^= cur; + rp5 ^= cur; + rp6 ^= cur; + ... + + +Analysis 4 +========== + +Unrolling once gains about 15% +Unrolling twice keeps the gain at about 15% +Unrolling three times gives a gain of 30% compared to attempt 2. +Unrolling four times gives a marginal improvement compared to unrolling +three times. + +I decided to proceed with a four time unrolled loop anyway. It was my gut +feeling that in the next steps I would obtain additional gain from it. + +The next step was triggered by the fact that par contains the xor of all +bytes and rp4 and rp5 each contain the xor of half of the bytes. +So in effect par = rp4 ^ rp5. But as xor is commutative we can also say +that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can +eliminate rp5 (or rp4, but I already foresaw another optimisation). +The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15. + + +Attempt 5 +========= + +Effectively so all odd digit rp assignments in the loop were removed. +This included the else clause of the if statements. +Of course after the loop we need to correct things by adding code like: + rp5 = par ^ rp4; +Also the initial assignments (rp5 = 0; etc) could be removed. +Along the line I also removed the initialisation of rp0/1/2/3. + + +Analysis 5 +========== + +Measurements showed this was a good move. The run-time roughly halved +compared with attempt 4 with 4 times unrolled, and we only require 1/3rd +of the processor time compared to the current code in the linux kernel. + +However, still I thought there was more. I didn't like all the if +statements. Why not keep a running parity and only keep the last if +statement. Time for yet another version! + + +Attempt 6 +========= + +THe code within the for loop was changed to: + + for (i = 0; i < 4; i++) + { + cur = *bp++; tmppar = cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= tmppar; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp8 ^= tmppar; + + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; + + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur; + cur = *bp++; tmppar ^= cur; rp8 ^= cur; + + cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; + + par ^= tmppar; + if ((i & 0x1) == 0) rp12 ^= tmppar; + if ((i & 0x2) == 0) rp14 ^= tmppar; + } + +As you can see tmppar is used to accumulate the parity within a for +iteration. In the last 3 statements is is added to par and, if needed, +to rp12 and rp14. + +While making the changes I also found that I could exploit that tmppar +contains the running parity for this iteration. So instead of having: +rp4 ^= cur; rp6 = cur; +I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next +statement. A similar change was done for rp8 and rp10 + + +Analysis 6 +========== + +Measuring this code again showed big gain. When executing the original +linux code 1 million times, this took about 1 second on my system. +(using time to measure the performance). After this iteration I was back +to 0.075 sec. Actually I had to decide to start measuring over 10 +million interations in order not to loose too much accuracy. This one +definitely seemed to be the jackpot! + +There is a little bit more room for improvement though. There are three +places with statements: +rp4 ^= cur; rp6 ^= cur; +It seems more efficient to also maintain a variable rp4_6 in the while +loop; This eliminates 3 statements per loop. Of course after the loop we +need to correct by adding: + rp4 ^= rp4_6; + rp6 ^= rp4_6 +Furthermore there are 4 sequential assingments to rp8. This can be +encoded slightly more efficient by saving tmppar before those 4 lines +and later do rp8 = rp8 ^ tmppar ^ notrp8; +(where notrp8 is the value of rp8 before those 4 lines). +Again a use of the commutative property of xor. +Time for a new test! + + +Attempt 7 +========= + +The new code now looks like: + + for (i = 0; i < 4; i++) + { + cur = *bp++; tmppar = cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= tmppar; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp8 ^= tmppar; + + cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; rp10 ^= tmppar; + + notrp8 = tmppar; + cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; + rp8 = rp8 ^ tmppar ^ notrp8; + + cur = *bp++; tmppar ^= cur; rp4_6 ^= cur; + cur = *bp++; tmppar ^= cur; rp6 ^= cur; + cur = *bp++; tmppar ^= cur; rp4 ^= cur; + cur = *bp++; tmppar ^= cur; + + par ^= tmppar; + if ((i & 0x1) == 0) rp12 ^= tmppar; + if ((i & 0x2) == 0) rp14 ^= tmppar; + } + rp4 ^= rp4_6; + rp6 ^= rp4_6; + + +Not a big change, but every penny counts :-) + + +Analysis 7 +========== + +Acutally this made things worse. Not very much, but I don't want to move +into the wrong direction. Maybe something to investigate later. Could +have to do with caching again. + +Guess that is what there is to win within the loop. Maybe unrolling one +more time will help. I'll keep the optimisations from 7 for now. + + +Attempt 8 +========= + +Unrolled the loop one more time. + + +Analysis 8 +========== + +This makes things worse. Let's stick with attempt 6 and continue from there. +Although it seems that the code within the loop cannot be optimised +further there is still room to optimize the generation of the ecc codes. +We can simply calcualate the total parity. If this is 0 then rp4 = rp5 +etc. If the parity is 1, then rp4 = !rp5; +But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits +in the result byte and then do something like + code[0] |= (code[0] << 1); +Lets test this. + + +Attempt 9 +========= + +Changed the code but again this slightly degrades performance. Tried all +kind of other things, like having dedicated parity arrays to avoid the +shift after parity[rp7] << 7; No gain. +Change the lookup using the parity array by using shift operators (e.g. +replace parity[rp7] << 7 with: +rp7 ^= (rp7 << 4); +rp7 ^= (rp7 << 2); +rp7 ^= (rp7 << 1); +rp7 &= 0x80; +No gain. + +The only marginal change was inverting the parity bits, so we can remove +the last three invert statements. + +Ah well, pity this does not deliver more. Then again 10 million +iterations using the linux driver code takes between 13 and 13.5 +seconds, whereas my code now takes about 0.73 seconds for those 10 +million iterations. So basically I've improved the performance by a +factor 18 on my system. Not that bad. Of course on different hardware +you will get different results. No warranties! + +But of course there is no such thing as a free lunch. The codesize almost +tripled (from 562 bytes to 1434 bytes). Then again, it is not that much. + + +Correcting errors +================= + +For correcting errors I again used the ST application note as a starter, +but I also peeked at the existing code. +The algorithm itself is pretty straightforward. Just xor the given and +the calculated ecc. If all bytes are 0 there is no problem. If 11 bits +are 1 we have one correctable bit error. If there is 1 bit 1, we have an +error in the given ecc code. +It proved to be fastest to do some table lookups. Performance gain +introduced by this is about a factor 2 on my system when a repair had to +be done, and 1% or so if no repair had to be done. +Code size increased from 330 bytes to 686 bytes for this function. +(gcc 4.2, -O3) + + +Conclusion +========== + +The gain when calculating the ecc is tremendous. Om my development hardware +a speedup of a factor of 18 for ecc calculation was achieved. On a test on an +embedded system with a MIPS core a factor 7 was obtained. +On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor +5 (big endian mode, gcc 4.1.2, -O3) +For correction not much gain could be obtained (as bitflips are rare). Then +again there are also much less cycles spent there. + +It seems there is not much more gain possible in this, at least when +programmed in C. Of course it might be possible to squeeze something more +out of it with an assembler program, but due to pipeline behaviour etc +this is very tricky (at least for intel hw). + +Author: Frans Meulenbroeks +Copyright (C) 2008 Koninklijke Philips Electronics NV. diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 5ce0952aa065..49378a9f2b5f 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -95,7 +95,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'p' - Will dump the current registers and flags to your console. -'q' - Will dump a list of all running timers. +'q' - Will dump a list of all running hrtimers. + WARNING: Does not cover any other timers 'r' - Turns off keyboard raw mode and sets it to XLATE. diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt new file mode 100644 index 000000000000..125eed560e5a --- /dev/null +++ b/Documentation/vm/unevictable-lru.txt @@ -0,0 +1,615 @@ + +This document describes the Linux memory management "Unevictable LRU" +infrastructure and the use of this infrastructure to manage several types +of "unevictable" pages. The document attempts to provide the overall +rationale behind this mechanism and the rationale for some of the design +decisions that drove the implementation. The latter design rationale is +discussed in the context of an implementation description. Admittedly, one +can obtain the implementation details--the "what does it do?"--by reading the +code. One hopes that the descriptions below add value by provide the answer +to "why does it do that?". + +Unevictable LRU Infrastructure: + +The Unevictable LRU adds an additional LRU list to track unevictable pages +and to hide these pages from vmscan. This mechanism is based on a patch by +Larry Woodman of Red Hat to address several scalability problems with page +reclaim in Linux. The problems have been observed at customer sites on large +memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB +of main memory will have over 32 million 4k pages in a single zone. When a +large fraction of these pages are not evictable for any reason [see below], +vmscan will spend a lot of time scanning the LRU lists looking for the small +fraction of pages that are evictable. This can result in a situation where +all cpus are spending 100% of their time in vmscan for hours or days on end, +with the system completely unresponsive. + +The Unevictable LRU infrastructure addresses the following classes of +unevictable pages: + ++ page owned by ramfs ++ page mapped into SHM_LOCKed shared memory regions ++ page mapped into VM_LOCKED [mlock()ed] vmas + +The infrastructure might be able to handle other conditions that make pages +unevictable, either by definition or by circumstance, in the future. + + +The Unevictable LRU List + +The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list +called the "unevictable" list and an associated page flag, PG_unevictable, to +indicate that the page is being managed on the unevictable list. The +PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active +flag in that it indicates on which LRU list a page resides when PG_lru is set. +The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU +Kconfig option. + +The Unevictable LRU infrastructure maintains unevictable pages on an additional +LRU list for a few reasons: + +1) We get to "treat unevictable pages just like we treat other pages in the + system, which means we get to use the same code to manipulate them, the + same code to isolate them (for migrate, etc.), the same code to keep track + of the statistics, etc..." [Rik van Riel] + +2) We want to be able to migrate unevictable pages between nodes--for memory + defragmentation, workload management and memory hotplug. The linux kernel + can only migrate pages that it can successfully isolate from the lru lists. + If we were to maintain pages elsewise than on an lru-like list, where they + can be found by isolate_lru_page(), we would prevent their migration, unless + we reworked migration code to find the unevictable pages. + + +The unevictable LRU list does not differentiate between file backed and swap +backed [anon] pages. This differentiation is only important while the pages +are, in fact, evictable. + +The unevictable LRU list benefits from the "arrayification" of the per-zone +LRU lists and statistics originally proposed and posted by Christoph Lameter. + +The unevictable list does not use the lru pagevec mechanism. Rather, +unevictable pages are placed directly on the page's zone's unevictable +list under the zone lru_lock. The reason for this is to prevent stranding +of pages on the unevictable list when one task has the page isolated from the +lru and other tasks are changing the "evictability" state of the page. + + +Unevictable LRU and Memory Controller Interaction + +The memory controller data structure automatically gets a per zone unevictable +lru list as a result of the "arrayification" of the per-zone LRU lists. The +memory controller tracks the movement of pages to and from the unevictable list. +When a memory control group comes under memory pressure, the controller will +not attempt to reclaim pages on the unevictable list. This has a couple of +effects. Because the pages are "hidden" from reclaim on the unevictable list, +the reclaim process can be more efficient, dealing only with pages that have +a chance of being reclaimed. On the other hand, if too many of the pages +charged to the control group are unevictable, the evictable portion of the +working set of the tasks in the control group may not fit into the available +memory. This can cause the control group to thrash or to oom-kill tasks. + + +Unevictable LRU: Detecting Unevictable Pages + +The function page_evictable(page, vma) in vmscan.c determines whether a +page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, +page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's +address space using a wrapper function. Wrapper functions are used to set, +clear and test the flag to reduce the requirement for #ifdef's throughout the +source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. +This flag remains for the life of the inode. + +For shared memory regions, AS_UNEVICTABLE is set when an application +successfully SHM_LOCKs the region and is removed when the region is +SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page +tables for the region as does, for example, mlock(). So, we make no special +effort to push any pages in the SHM_LOCKed region to the unevictable list. +Vmscan will do this when/if it encounters the pages during reclaim. On +SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the +unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed +region is destroyed, the pages are also "rescued" from the unevictable list in +the process of freeing them. + +page_evictable() detects mlock()ed pages by testing an additional page flag, +PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a +non-NULL vma is supplied, page_evictable() will check whether the vma is +VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and +update the appropriate statistics if the vma is VM_LOCKED. This method allows +efficient "culling" of pages in the fault path that are being faulted in to +VM_LOCKED vmas. + + +Unevictable Pages and Vmscan [shrink_*_list()] + +If unevictable pages are culled in the fault path, or moved to the unevictable +list at mlock() or mmap() time, vmscan will never encounter the pages until +they have become evictable again, for example, via munlock() and have been +"rescued" from the unevictable list. However, there may be situations where we +decide, for the sake of expediency, to leave a unevictable page on one of the +regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for +such pages in all of the shrink_{active|inactive|page}_list() functions and +will "cull" such pages that it encounters--that is, it diverts those pages to +the unevictable list for the zone being scanned. + +There may be situations where a page is mapped into a VM_LOCKED vma, but the +page is not marked as PageMlocked. Such pages will make it all the way to +shrink_page_list() where they will be detected when vmscan walks the reverse +map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() +will cull the page at that point. + +Note that for anonymous pages, shrink_page_list() attempts to add the page to +the swap cache before it tries to unmap the page. To avoid this unnecessary +consumption of swap space, shrink_page_list() calls try_to_munlock() to check +whether any VM_LOCKED vmas map the page without attempting to unmap the page. +If try_to_munlock() returns SWAP_MLOCK, shrink_page_list() will cull the page +without consuming swap space. try_to_munlock() will be described below. + +To "cull" an unevictable page, vmscan simply puts the page back on the lru +list using putback_lru_page()--the inverse operation to isolate_lru_page()-- +after dropping the page lock. Because the condition which makes the page +unevictable may change once the page is unlocked, putback_lru_page() will +recheck the unevictable state of a page that it places on the unevictable lru +list. If the page has become unevictable, putback_lru_page() removes it from +the list and retries, including the page_unevictable() test. Because such a +race is a rare event and movement of pages onto the unevictable list should be +rare, these extra evictabilty checks should not occur in the majority of calls +to putback_lru_page(). + + +Mlocked Page: Prior Work + +The "Unevictable Mlocked Pages" infrastructure is based on work originally +posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". +Nick posted his patch as an alternative to a patch posted by Christoph +Lameter to achieve the same objective--hiding mlocked pages from vmscan. +In Nick's patch, he used one of the struct page lru list link fields as a count +of VM_LOCKED vmas that map the page. This use of the link field for a count +prevented the management of the pages on an LRU list. Thus, mlocked pages were +not migratable as isolate_lru_page() could not find them and the lru list link +field was not available to the migration subsystem. Nick resolved this by +putting mlocked pages back on the lru list before attempting to isolate them, +thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated +with the Unevictable LRU work, the count was replaced by walking the reverse +map to determine whether any VM_LOCKED vmas mapped the page. More on this +below. + + +Mlocked Pages: Basic Management + +Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of +unevictable pages. When such a page has been "noticed" by the memory +management subsystem, the page is marked with the PG_mlocked [PageMlocked()] +flag. A PageMlocked() page will be placed on the unevictable LRU list when +it is added to the LRU. Pages can be "noticed" by memory management in +several places: + +1) in the mlock()/mlockall() system call handlers. +2) in the mmap() system call handler when mmap()ing a region with the + MAP_LOCKED flag, or mmap()ing a region in a task that has called + mlockall() with the MCL_FUTURE flag. Both of these conditions result + in the VM_LOCKED flag being set for the vma. +3) in the fault path, if mlocked pages are "culled" in the fault path, + and when a VM_LOCKED stack segment is expanded. +4) as mentioned above, in vmscan:shrink_page_list() with attempting to + reclaim a page in a VM_LOCKED vma--via try_to_unmap() or try_to_munlock(). + +Mlocked pages become unlocked and rescued from the unevictable list when: + +1) mapped in a range unlocked via the munlock()/munlockall() system calls. +2) munmapped() out of the last VM_LOCKED vma that maps the page, including + unmapping at task exit. +3) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. +4) before a page is COWed in a VM_LOCKED vma. + + +Mlocked Pages: mlock()/mlockall() System Call Handling + +Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() +for each vma in the range specified by the call. In the case of mlockall(), +this is the entire active address space of the task. Note that mlock_fixup() +is used for both mlock()ing and munlock()ing a range of memory. A call to +mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED +is treated as a no-op--mlock_fixup() simply returns. + +If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" +below, mlock_fixup() will attempt to merge the vma with its neighbors or split +off a subset of the vma if the range does not cover the entire vma. Once the +vma has been merged or split or neither, mlock_fixup() will call +__mlock_vma_pages_range() to fault in the pages via get_user_pages() and +to mark the pages as mlocked via mlock_vma_page(). + +Note that the vma being mlocked might be mapped with PROT_NONE. In this case, +get_user_pages() will be unable to fault in the pages. That's OK. If pages +do end up getting faulted into this VM_LOCKED vma, we'll handle them in the +fault path or in vmscan. + +Also note that a page returned by get_user_pages() could be truncated or +migrated out from under us, while we're trying to mlock it. To detect +this, __mlock_vma_pages_range() tests the page_mapping after acquiring +the page lock. If the page is still associated with its mapping, we'll +go ahead and call mlock_vma_page(). If the mapping is gone, we just +unlock the page and move on. Worse case, this results in page mapped +in a VM_LOCKED vma remaining on a normal LRU list without being +PageMlocked(). Again, vmscan will detect and cull such pages. + +mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will +TestSetPageMlocked() for each page returned by get_user_pages(). We use +TestSetPageMlocked() because the page might already be mlocked by another +task/vma and we don't want to do extra work. We especially do not want to +count an mlocked page more than once in the statistics. If the page was +already mlocked, mlock_vma_page() is done. + +If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the +page from the LRU, as it is likely on the appropriate active or inactive list +at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will +putback the page--putback_lru_page()--which will notice that the page is now +mlocked and divert the page to the zone's unevictable LRU list. If +mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle +it later if/when it attempts to reclaim the page. + + +Mlocked Pages: Filtering Special Vmas + +mlock_fixup() filters several classes of "special" vmas: + +1) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind + these mappings are inherently pinned, so we don't need to mark them as + mlocked. In any case, most of the pages have no struct page in which to + so mark the page. Because of this, get_user_pages() will fail for these + vmas, so there is no sense in attempting to visit them. + +2) vmas mapping hugetlbfs page are already effectively pinned into memory. + We don't need nor want to mlock() these pages. However, to preserve the + prior behavior of mlock()--before the unevictable/mlock changes--mlock_fixup() + will call make_pages_present() in the hugetlbfs vma range to allocate the + huge pages and populate the ptes. + +3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of + kernel pages, such as the vdso page, relay channel pages, etc. These pages + are inherently unevictable and are not managed on the LRU lists. + mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls + make_pages_present() to populate the ptes. + +Note that for all of these special vmas, mlock_fixup() does not set the +VM_LOCKED flag. Therefore, we won't have to deal with them later during +munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() +account these vmas against the task's "locked_vm". + +Mlocked Pages: Downgrading the Mmap Semaphore. + +mlock_fixup() must be called with the mmap semaphore held for write, because +it may have to merge or split vmas. However, mlocking a large region of +memory can take a long time--especially if vmscan must reclaim pages to +satisfy the regions requirements. Faulting in a large region with the mmap +semaphore held for write can hold off other faults on the address space, in +the case of a multi-threaded task. It can also hold off scans of the task's +address space via /proc. While testing under heavy load, it was observed that +the ps(1) command could be held off for many minutes while a large segment was +mlock()ed down. + +To address this issue, and to make the system more responsive during mlock()ing +of large segments, mlock_fixup() downgrades the mmap semaphore to read mode +during the call to __mlock_vma_pages_range(). This works fine. However, the +callers of mlock_fixup() expect the semaphore to be returned in write mode. +So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not +support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore +and reacquire it in write mode. In a multi-threaded task, it is possible for +the task memory map to change while the semaphore is dropped. Therefore, +mlock_fixup() looks up the vma at the range start address after reacquiring +the semaphore in write mode and verifies that it still covers the original +range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of +mlock_fixup() have been changed to deal with this new error condition. + +Note: when munlocking a region, all of the pages should already be resident-- +unless we have racing threads mlocking() and munlocking() regions. So, +unlocking should not have to wait for page allocations nor faults of any kind. +Therefore mlock_fixup() does not downgrade the semaphore for munlock(). + + +Mlocked Pages: munlock()/munlockall() System Call Handling + +The munlock() and munlockall() system calls are handled by the same functions-- +do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock +vs lock operation indicated by an argument. So, these system calls are also +handled by mlock_fixup(). Again, if called for an already munlock()ed vma, +mlock_fixup() simply returns. Because of the vma filtering discussed above, +VM_LOCKED will not be set in any "special" vmas. So, these vmas will be +ignored for munlock. + +If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off +the specified range. The range is then munlocked via the function +__mlock_vma_pages_range()--the same function used to mlock a vma range-- +passing a flag to indicate that munlock() is being performed. + +Because the vma access protections could have been changed to PROT_NONE after +faulting in and mlocking some pages, get_user_pages() was unreliable for visiting +these pages for munlocking. Because we don't want to leave pages mlocked(), +get_user_pages() was enhanced to accept a flag to ignore the permissions when +fetching the pages--all of which should be resident as a result of previous +mlock()ing. + +For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling +munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked +flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() +use the Test*PageMlocked() function to handle the case where the page might +have already been unlocked by another task. If the page was mlocked, +munlock_vma_page() updates that zone statistics for the number of mlocked +pages. Note, however, that at this point we haven't checked whether the page +is mapped by other VM_LOCKED vmas. + +We can't call try_to_munlock(), the function that walks the reverse map to check +for other VM_LOCKED vmas, without first isolating the page from the LRU. +try_to_munlock() is a variant of try_to_unmap() and thus requires that the page +not be on an lru list. [More on these below.] However, the call to +isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). +So, we go ahead and clear PG_mlocked up front, as this might be the only chance +we have. If we can successfully isolate the page, we go ahead and +try_to_munlock(), which will restore the PG_mlocked flag and update the zone +page statistics if it finds another vma holding the page mlocked. If we fail +to isolate the page, we'll have left a potentially mlocked page on the LRU. +This is fine, because we'll catch it later when/if vmscan tries to reclaim the +page. This should be relatively rare. + +Mlocked Pages: Migrating Them... + +A page that is being migrated has been isolated from the lru lists and is +held locked across unmapping of the page, updating the page's mapping +[address_space] entry and copying the contents and state, until the +page table entry has been replaced with an entry that refers to the new +page. Linux supports migration of mlocked pages and other unevictable +pages. This involves simply moving the PageMlocked and PageUnevictable states +from the old page to the new page. + +Note that page migration can race with mlocking or munlocking of the same +page. This has been discussed from the mlock/munlock perspective in the +respective sections above. Both processes [migration, m[un]locking], hold +the page locked. This provides the first level of synchronization. Page +migration zeros out the page_mapping of the old page before unlocking it, +so m[un]lock can skip these pages by testing the page mapping under page +lock. + +When completing page migration, we place the new and old pages back onto the +lru after dropping the page lock. The "unneeded" page--old page on success, +new page on failure--will be freed when the reference count held by the +migration process is released. To ensure that we don't strand pages on the +unevictable list because of a race between munlock and migration, page +migration uses the putback_lru_page() function to add migrated pages back to +the lru. + + +Mlocked Pages: mmap(MAP_LOCKED) System Call Handling + +In addition the the mlock()/mlockall() system calls, an application can request +that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() +call. Furthermore, any mmap() call or brk() call that expands the heap by a +task that has previously called mlockall() with the MCL_FUTURE flag will result +in the newly mapped memory being mlocked. Before the unevictable/mlock changes, +the kernel simply called make_pages_present() to allocate pages and populate +the page table. + +To mlock a range of memory under the unevictable/mlock infrastructure, the +mmap() handler and task address space expansion functions call +mlock_vma_pages_range() specifying the vma and the address range to mlock. +mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in +"Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will +have already been set by the caller, in filtered vmas. Thus these vma's need +not be visited for munlock when the region is unmapped. + +For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to +fault/allocate the pages and mlock them. Again, like mlock_fixup(), +mlock_vma_pages_range() downgrades the mmap semaphore to read mode before +attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore +back to write mode before returning. + +The callers of mlock_vma_pages_range() will have already added the memory +range to be mlocked to the task's "locked_vm". To account for filtered vmas, +mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the +callers then subtract a non-negative return value from the task's locked_vm. +A negative return value represent an error--for example, from get_user_pages() +attempting to fault in a vma with PROT_NONE access. In this case, we leave +the memory range accounted as locked_vm, as the protections could be changed +later and pages allocated into that region. + + +Mlocked Pages: munmap()/exit()/exec() System Call Handling + +When unmapping an mlocked region of memory, whether by an explicit call to +munmap() or via an internal unmap from exit() or exec() processing, we must +munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. +Before the unevictable/mlock changes, mlocking did not mark the pages in any way, +so unmapping them required no processing. + +To munlock a range of memory under the unevictable/mlock infrastructure, the +munmap() hander and task address space tear down function call +munlock_vma_pages_all(). The name reflects the observation that one always +specifies the entire vma range when munlock()ing during unmap of a region. +Because of the vma filtering when mlocking() regions, only "normal" vmas that +actually contain mlocked pages will be passed to munlock_vma_pages_all(). + +munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() +for the munlock case, calls __munlock_vma_pages_range() to walk the page table +for the vma's memory range and munlock_vma_page() each resident page mapped by +the vma. This effectively munlocks the page, only if this is the last +VM_LOCKED vma that maps the page. + + +Mlocked Page: try_to_unmap() + +[Note: the code changes represented by this section are really quite small +compared to the text to describe what happening and why, and to discuss the +implications.] + +Pages can, of course, be mapped into multiple vmas. Some of these vmas may +have VM_LOCKED flag set. It is possible for a page mapped into one or more +VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one +of the active or inactive LRU lists. This could happen if, for example, a +task in the process of munlock()ing the page could not isolate the page from +the LRU. As a result, vmscan/shrink_page_list() might encounter such a page +as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To +handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED +vmas while it is walking a page's reverse map. + +try_to_unmap() is always called, by either vmscan for reclaim or for page +migration, with the argument page locked and isolated from the LRU. BUG_ON() +assertions enforce this requirement. Separate functions handle anonymous and +mapped file pages, as these types of pages have different reverse map +mechanisms. + + try_to_unmap_anon() + +To unmap anonymous pages, each vma in the list anchored in the anon_vma must be +visited--at least until a VM_LOCKED vma is encountered. If the page is being +unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked +pages are migratable. However, for reclaim, if the page is mapped into a +VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap +semphore of the mm_struct to which the vma belongs in read mode. If this is +successful, try_to_unmap() will mlock the page via mlock_vma_page()--we +wouldn't have gotten to try_to_unmap() if the page were already mlocked--and +will return SWAP_MLOCK, indicating that the page is unevictable. If the +mmap semaphore cannot be acquired, we are not sure whether the page is really +unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. + + try_to_unmap_file() -- linear mappings + +Unmapping of a mapped file page works the same, except that the scan visits +all vmas that maps the page's index/page offset in the page's mapping's +reverse map priority search tree. It must also visit each vma in the page's +mapping's non-linear list, if the list is non-empty. As for anonymous pages, +on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will +attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, +returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. + + try_to_unmap_file() -- non-linear mappings + +If a page's mapping contains a non-empty non-linear mapping vma list, then +try_to_un{map|lock}() must also visit each vma in that list to determine +whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit +all vmas in the non-linear list to ensure that the pages is not/should not be +mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. +However, there is no easy way to determine whether the page is actually mapped +in a given vma--either for unmapping or testing whether the VM_LOCKED vma +actually pins the page. + +So, try_to_unmap_file() handles non-linear mappings by scanning a certain +number of pages--a "cluster"--in each non-linear vma associated with the page's +mapping, for each file mapped page that vmscan tries to unmap. If this happens +to unmap the page we're trying to unmap, try_to_unmap() will notice this on +return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it +will return SWAP_AGAIN, causing vmscan to recirculate this page. We take +advantage of the cluster scan in try_to_unmap_cluster() as follows: + +For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap +semaphore of the associated mm_struct for read without blocking. If this +attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will +retain the mmap semaphore for the scan; otherwise it drops it here. Then, +for each page in the cluster, if we're holding the mmap semaphore for a locked +vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This +call is a no-op if the page is already locked, but will mlock any pages in +the non-linear mapping that happen to be unlocked. If one of the pages so +mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will +return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan +to cull the page, rather than recirculating it on the inactive list. Again, +if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns +SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but +couldn't be mlocked. + + +Mlocked pages: try_to_munlock() Reverse Map Scan + +TODO/FIXME: a better name might be page_mlocked()--analogous to the +page_referenced() reverse map walker--especially if we continue to call this +from shrink_page_list(). See related TODO/FIXME below. + +When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() System +Call Handling" above--tries to munlock a page, or when shrink_page_list() +encounters an anonymous page that is not yet in the swap cache, they need to +determine whether or not the page is mapped by any VM_LOCKED vma, without +actually attempting to unmap all ptes from the page. For this purpose, the +unevictable/mlock infrastructure introduced a variant of try_to_unmap() called +try_to_munlock(). + +try_to_munlock() calls the same functions as try_to_unmap() for anonymous and +mapped file pages with an additional argument specifing unlock versus unmap +processing. Again, these functions walk the respective reverse maps looking +for VM_LOCKED vmas. When such a vma is found for anonymous pages and file +pages mapped in linear VMAs, as in the try_to_unmap() case, the functions +attempt to acquire the associated mmap semphore, mlock the page via +mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the +pre-clearing of the page's PG_mlocked done by munlock_vma_page() and informs +shrink_page_list() that the anonymous page should be culled rather than added +to the swap cache in preparation for a try_to_unmap() that will almost +certainly fail. + +If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap +semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() +to recycle the page on the inactive list and hope that it has better luck +with the page next time. + +For file pages mapped into non-linear vmas, the try_to_munlock() logic works +slightly differently. On encountering a VM_LOCKED non-linear vma that might +map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking +the page. munlock_vma_page() will just leave the page unlocked and let +vmscan deal with it--the usual fallback position. + +Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' +reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. +However, the scan can terminate when it encounters a VM_LOCKED vma and can +successfully acquire the vma's mmap semphore for read and mlock the page. +Although try_to_munlock() can be called many [very many!] times when +munlock()ing a large region or tearing down a large address space that has been +mlocked via mlockall(), overall this is a fairly rare event. In addition, +although shrink_page_list() calls try_to_munlock() for every anonymous page that +it handles that is not yet in the swap cache, on average anonymous pages will +have very short reverse map lists. + +Mlocked Page: Page Reclaim in shrink_*_list() + +shrink_active_list() culls any obviously unevictable pages--i.e., +!page_evictable(page, NULL)--diverting these to the unevictable lru +list. However, shrink_active_list() only sees unevictable pages that +made it onto the active/inactive lru lists. Note that these pages do not +have PageUnevictable set--otherwise, they would be on the unevictable list and +shrink_active_list would never see them. + +Some examples of these unevictable pages on the LRU lists are: + +1) ramfs pages that have been placed on the lru lists when first allocated. + +2) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to + allocate or fault in the pages in the shared memory region. This happens + when an application accesses the page the first time after SHM_LOCKing + the segment. + +3) Mlocked pages that could not be isolated from the lru and moved to the + unevictable list in mlock_vma_page(). + +3) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't + acquire the vma's mmap semaphore to test the flags and set PageMlocked. + munlock_vma_page() was forced to let the page back on to the normal + LRU list for vmscan to handle. + +shrink_inactive_list() also culls any unevictable pages that it finds +on the inactive lists, again diverting them to the appropriate zone's unevictable +lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became +SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or +pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from +the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice +the latter, but will pass on to shrink_page_list(). + +shrink_page_list() again culls obviously unevictable pages that it could +encounter for similar reason to shrink_inactive_list(). As already discussed, +shrink_page_list() proactively looks for anonymous pages that should have +PG_mlocked set but don't--these would not be detected by page_evictable()--to +avoid adding them to the swap cache unnecessarily. File pages mapped into +VM_LOCKED vmas but without PG_mlocked set will make it all the way to +try_to_unmap(). shrink_page_list() will divert them to the unevictable list when +try_to_unmap() returns SWAP_MLOCK, as discussed above. + +TODO/FIXME: If we can enhance the swap cache to reliably remove entries +with page_count(page) > 2, as long as all ptes are mapped to the page and +not the swap entry, we can probably remove the call to try_to_munlock() in +shrink_page_list() and just remove the page from the swap cache when +try_to_unmap() returns SWAP_MLOCK. Currently, remove_exclusive_swap_page() +doesn't seem to allow that. + + diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a0f642b6a4b9..6110197757a3 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,7 @@ config AUTO_IRQ_AFFINITY default y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "System setup" diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 15fda4344424..d069526bd767 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -74,12 +74,14 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_UAC_SIGBUS 7 #define TIF_MEMDIE 8 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ +#define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1<<TIF_FREEZE) /* Work to do on interrupt/exception return. */ #define _TIF_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED) diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index 04dcc5e5d4c1..9cd8dca742a7 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -655,7 +655,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write) case 0x71: /* RTC_PORT(1) */ rtc_access.index = index; - rtc_access.data = BCD_TO_BIN(b); + rtc_access.data = bcd2bin(b); rtc_access.function = 0x48 + !write; /* GET/PUT_TOY */ #ifdef CONFIG_SMP @@ -668,7 +668,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write) #else __marvel_access_rtc(&rtc_access); #endif - ret = BIN_TO_BCD(rtc_access.data); + ret = bin2bcd(rtc_access.data); break; default: diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 75480cab0893..e6a231435cba 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -346,12 +346,12 @@ time_init(void) year = CMOS_READ(RTC_YEAR); if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } /* PC-like is standard; used for year >= 70 */ @@ -525,7 +525,7 @@ set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -543,8 +543,8 @@ set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4853f9df37bd..df39d20f7425 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -192,6 +192,8 @@ config VECTORS_BASE source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type" choice diff --git a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h index eb4b190b6657..eb35fca9aea5 100644 --- a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h +++ b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h @@ -4,6 +4,43 @@ #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> +struct pxa3xx_nand_timing { + unsigned int tCH; /* Enable signal hold time */ + unsigned int tCS; /* Enable signal setup time */ + unsigned int tWH; /* ND_nWE high duration */ + unsigned int tWP; /* ND_nWE pulse time */ + unsigned int tRH; /* ND_nRE high duration */ + unsigned int tRP; /* ND_nRE pulse width */ + unsigned int tR; /* ND_nWE high to ND_nRE low for read */ + unsigned int tWHR; /* ND_nWE high to ND_nRE low for status read */ + unsigned int tAR; /* ND_ALE low to ND_nRE low delay */ +}; + +struct pxa3xx_nand_cmdset { + uint16_t read1; + uint16_t read2; + uint16_t program; + uint16_t read_status; + uint16_t read_id; + uint16_t erase; + uint16_t reset; + uint16_t lock; + uint16_t unlock; + uint16_t lock_status; +}; + +struct pxa3xx_nand_flash { + const struct pxa3xx_nand_timing *timing; /* NAND Flash timing */ + const struct pxa3xx_nand_cmdset *cmdset; + + uint32_t page_per_block;/* Pages per block (PG_PER_BLK) */ + uint32_t page_size; /* Page size in bytes (PAGE_SZ) */ + uint32_t flash_width; /* Width of Flash memory (DWIDTH_M) */ + uint32_t dfc_width; /* Width of flash controller(DWIDTH_C) */ + uint32_t num_blocks; /* Number of physical blocks in Flash */ + uint32_t chip_id; +}; + struct pxa3xx_nand_platform_data { /* the data flash bus is shared between the Static Memory @@ -12,8 +49,11 @@ struct pxa3xx_nand_platform_data { */ int enable_arbiter; - struct mtd_partition *parts; - unsigned int nr_parts; + const struct mtd_partition *parts; + unsigned int nr_parts; + + const struct pxa3xx_nand_flash * flash; + size_t num_flash; }; extern void pxa3xx_set_nand_info(struct pxa3xx_nand_platform_data *info); diff --git a/arch/arm/plat-mxc/include/mach/mxc_nand.h b/arch/arm/plat-mxc/include/mach/mxc_nand.h new file mode 100644 index 000000000000..2b972df22d12 --- /dev/null +++ b/arch/arm/plat-mxc/include/mach/mxc_nand.h @@ -0,0 +1,27 @@ +/* + * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved. + * Copyright 2008 Sascha Hauer, kernel@pengutronix.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef __ASM_ARCH_NAND_H +#define __ASM_ARCH_NAND_H + +struct mxc_nand_platform_data { + int width; /* data bus width in bytes */ + int hw_ecc; /* 0 if supress hardware ECC */ +}; +#endif /* __ASM_ARCH_NAND_H */ diff --git a/arch/arm/plat-omap/include/mach/onenand.h b/arch/arm/plat-omap/include/mach/onenand.h index d57f20226b28..4649d302c263 100644 --- a/arch/arm/plat-omap/include/mach/onenand.h +++ b/arch/arm/plat-omap/include/mach/onenand.h @@ -16,6 +16,10 @@ struct omap_onenand_platform_data { int gpio_irq; struct mtd_partition *parts; int nr_parts; - int (*onenand_setup)(void __iomem *); + int (*onenand_setup)(void __iomem *, int freq); int dma_channel; }; + +int omap2_onenand_rephase(void); + +#define ONENAND_MAX_PARTITIONS 8 diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 7c239a916275..33a5b2969eb4 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -72,6 +72,8 @@ config GENERIC_BUG source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type and features" source "kernel/time/Kconfig" diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index 294b25f9323d..4442f8d2d423 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -96,6 +96,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) +#define _TIF_FREEZE (1 << TIF_FREEZE) /* Note: The masks below must never span more than 16 bits! */ diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 8102c79aaa94..29e71ed6b8a7 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -64,8 +64,11 @@ config HARDWARE_PM depends on OPROFILE source "init/Kconfig" + source "kernel/Kconfig.preempt" +source "kernel/Kconfig.freezer" + menu "Blackfin Processor Options" comment "Processor and Board Settings" diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 9389d38f222f..07335e719bf8 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -62,6 +62,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General setup" source "fs/Kconfig.binfmt" diff --git a/arch/cris/arch-v10/drivers/ds1302.c b/arch/cris/arch-v10/drivers/ds1302.c index c9aa3904be05..3bdfaf43390c 100644 --- a/arch/cris/arch-v10/drivers/ds1302.c +++ b/arch/cris/arch-v10/drivers/ds1302.c @@ -215,12 +215,12 @@ get_rtc_time(struct rtc_time *rtc_tm) local_irq_restore(flags); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values @@ -295,12 +295,12 @@ rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd, else yrs -= 1900; /* RTC (70, 71, ... 99) */ - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); local_irq_save(flags); CMOS_WRITE(yrs, RTC_YEAR); diff --git a/arch/cris/arch-v10/drivers/pcf8563.c b/arch/cris/arch-v10/drivers/pcf8563.c index 8769dc914073..1e90c1a9c849 100644 --- a/arch/cris/arch-v10/drivers/pcf8563.c +++ b/arch/cris/arch-v10/drivers/pcf8563.c @@ -122,7 +122,7 @@ get_rtc_time(struct rtc_time *tm) "information is no longer guaranteed!\n", PCF8563_NAME); } - tm->tm_year = BCD_TO_BIN(tm->tm_year) + + tm->tm_year = bcd2bin(tm->tm_year) + ((tm->tm_mon & 0x80) ? 100 : 0); tm->tm_sec &= 0x7F; tm->tm_min &= 0x7F; @@ -131,11 +131,11 @@ get_rtc_time(struct rtc_time *tm) tm->tm_wday &= 0x07; /* Not coded in BCD. */ tm->tm_mon &= 0x1F; - BCD_TO_BIN(tm->tm_sec); - BCD_TO_BIN(tm->tm_min); - BCD_TO_BIN(tm->tm_hour); - BCD_TO_BIN(tm->tm_mday); - BCD_TO_BIN(tm->tm_mon); + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon); tm->tm_mon--; /* Month is 1..12 in RTC but 0..11 in linux */ } @@ -282,12 +282,12 @@ int pcf8563_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, century = (tm.tm_year >= 2000) ? 0x80 : 0; tm.tm_year = tm.tm_year % 100; - BIN_TO_BCD(tm.tm_year); - BIN_TO_BCD(tm.tm_mon); - BIN_TO_BCD(tm.tm_mday); - BIN_TO_BCD(tm.tm_hour); - BIN_TO_BCD(tm.tm_min); - BIN_TO_BCD(tm.tm_sec); + tm.tm_year = bin2bcd(tm.tm_year); + tm.tm_mon = bin2bcd(tm.tm_mon); + tm.tm_mday = bin2bcd(tm.tm_mday); + tm.tm_hour = bin2bcd(tm.tm_hour); + tm.tm_min = bin2bcd(tm.tm_min); + tm.tm_sec = bin2bcd(tm.tm_sec); tm.tm_mon |= century; mutex_lock(&rtc_lock); diff --git a/arch/cris/arch-v32/drivers/pcf8563.c b/arch/cris/arch-v32/drivers/pcf8563.c index f263ab571221..f4478506e52c 100644 --- a/arch/cris/arch-v32/drivers/pcf8563.c +++ b/arch/cris/arch-v32/drivers/pcf8563.c @@ -118,7 +118,7 @@ get_rtc_time(struct rtc_time *tm) "information is no longer guaranteed!\n", PCF8563_NAME); } - tm->tm_year = BCD_TO_BIN(tm->tm_year) + + tm->tm_year = bcd2bin(tm->tm_year) + ((tm->tm_mon & 0x80) ? 100 : 0); tm->tm_sec &= 0x7F; tm->tm_min &= 0x7F; @@ -127,11 +127,11 @@ get_rtc_time(struct rtc_time *tm) tm->tm_wday &= 0x07; /* Not coded in BCD. */ tm->tm_mon &= 0x1F; - BCD_TO_BIN(tm->tm_sec); - BCD_TO_BIN(tm->tm_min); - BCD_TO_BIN(tm->tm_hour); - BCD_TO_BIN(tm->tm_mday); - BCD_TO_BIN(tm->tm_mon); + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon); tm->tm_mon--; /* Month is 1..12 in RTC but 0..11 in linux */ } @@ -279,12 +279,12 @@ int pcf8563_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, century = (tm.tm_year >= 2000) ? 0x80 : 0; tm.tm_year = tm.tm_year % 100; - BIN_TO_BCD(tm.tm_year); - BIN_TO_BCD(tm.tm_mon); - BIN_TO_BCD(tm.tm_mday); - BIN_TO_BCD(tm.tm_hour); - BIN_TO_BCD(tm.tm_min); - BIN_TO_BCD(tm.tm_sec); + tm.tm_year = bin2bcd(tm.tm_year); + tm.tm_mon = bin2bcd(tm.tm_mon); + tm.tm_mday = bin2bcd(tm.tm_mday); + tm.tm_hour = bin2bcd(tm.tm_hour); + tm.tm_min = bin2bcd(tm.tm_min); + tm.tm_sec = bin2bcd(tm.tm_sec); tm.tm_mon |= century; mutex_lock(&rtc_lock); diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index ff4c6aa75def..074fe7dea96b 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -127,7 +127,7 @@ int set_rtc_mmss(unsigned long nowtime) return 0; cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -142,8 +142,8 @@ int set_rtc_mmss(unsigned long nowtime) real_minutes %= 60; if (abs(real_minutes - cmos_minutes) < 30) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); } else { @@ -170,12 +170,12 @@ get_cmos_time(void) mon = CMOS_READ(RTC_MONTH); year = CMOS_READ(RTC_YEAR); - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); if ((year += 1900) < 1970) year += 100; diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index a5aac1b07562..9d1552a9ee2c 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Fujitsu FR-V system setup" diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c7966746fbfe..bd1995403c67 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -90,6 +90,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/h8300/Kconfig.cpu" menu "Executable file formats" diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index aafd4d322ec3..700014d2155f 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -89,6 +89,7 @@ static inline struct thread_info *current_thread_info(void) TIF_NEED_RESCHED */ #define TIF_MEMDIE 4 #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ +#define TIF_FREEZE 16 /* is freezing for suspend */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -96,6 +97,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3b7aa38254a8..912c57db2d21 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" config IA64 diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 4956be40d7b5..d98f0f4ff83f 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -2070,14 +2070,13 @@ sba_init(void) if (!ia64_platform_is("hpzx1") && !ia64_platform_is("hpzx1_swiotlb")) return 0; -#if defined(CONFIG_IA64_GENERIC) && defined(CONFIG_CRASH_DUMP) && \ - defined(CONFIG_PROC_FS) +#if defined(CONFIG_IA64_GENERIC) /* If we are booting a kdump kernel, the sba_iommu will * cause devices that were not shutdown properly to MCA * as soon as they are turned back on. Our only option for * a successful kdump kernel boot is to use the swiotlb. */ - if (elfcorehdr_addr < ELFCORE_ADDR_MAX) { + if (is_kdump_kernel()) { if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0) panic("Unable to initialize software I/O TLB:" " Try machvec=dig boot option"); diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index da60e90eeeb1..23e91290e41f 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -8,10 +8,14 @@ #include <linux/errno.h> #include <linux/types.h> +#include <linux/crash_dump.h> #include <asm/page.h> #include <asm/uaccess.h> +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 51b75cea7018..efaff15d8cf1 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -1335,7 +1335,7 @@ kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n) } #endif -#ifdef CONFIG_PROC_VMCORE +#ifdef CONFIG_CRASH_DUMP /* locate the size find a the descriptor at a certain address */ unsigned long __init vmcore_find_descriptor_size (unsigned long address) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index de636b215677..916ba898237f 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -352,7 +352,7 @@ reserve_memory (void) } #endif -#ifdef CONFIG_PROC_VMCORE +#ifdef CONFIG_CRASH_KERNEL if (reserve_elfcorehdr(&rsvd_region[n].start, &rsvd_region[n].end) == 0) n++; @@ -478,7 +478,12 @@ static __init int setup_nomca(char *s) } early_param("nomca", setup_nomca); -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. */ @@ -502,11 +507,11 @@ int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end) * to work properly. */ - if (elfcorehdr_addr >= ELFCORE_ADDR_MAX) + if (!is_vmcore_usable()) return -EINVAL; if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) { - elfcorehdr_addr = ELFCORE_ADDR_MAX; + vmcore_unusable(); return -EINVAL; } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index f482a9098e32..054bcd9439aa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -700,23 +700,6 @@ int arch_add_memory(int nid, u64 start, u64 size) return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE -int remove_memory(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - unsigned long timeout = 120 * HZ; - int ret; - start_pfn = start >> PAGE_SHIFT; - end_pfn = start_pfn + (size >> PAGE_SHIFT); - ret = offline_pages(start_pfn, end_pfn, timeout); - if (ret) - goto out; - /* we can free mem_map at this point */ -out: - return ret; -} -EXPORT_SYMBOL_GPL(remove_memory); -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 00289c178f89..dbaed4a63815 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -42,6 +42,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 677c93a490f6..836fb66f080d 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -62,6 +62,8 @@ mainmenu "Linux/68k Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Platform dependent setup" config EISA diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c index 808c9018b115..c50bec8aabb1 100644 --- a/arch/m68k/bvme6000/rtc.c +++ b/arch/m68k/bvme6000/rtc.c @@ -18,7 +18,6 @@ #include <linux/poll.h> #include <linux/module.h> #include <linux/mc146818rtc.h> /* For struct rtc_time and ioctls, etc */ -#include <linux/smp_lock.h> #include <linux/bcd.h> #include <asm/bvme6000hw.h> diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 0a8998315e5e..76b66feb74df 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -75,6 +75,8 @@ config NO_IOPORT source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" choice diff --git a/arch/m68knommu/include/asm/thread_info.h b/arch/m68knommu/include/asm/thread_info.h index 0c9bc095f3f0..82529f424ea3 100644 --- a/arch/m68knommu/include/asm/thread_info.h +++ b/arch/m68knommu/include/asm/thread_info.h @@ -84,12 +84,14 @@ static inline struct thread_info *current_thread_info(void) #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 4 +#define TIF_FREEZE 16 /* is freezing for suspend */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) +#define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b905744d7915..5f149b030c0f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER add initrd or initramfs image to the kernel image. Otherwise, say N. +source "kernel/Kconfig.freezer" + menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" config HW_HAS_EISA diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 3965fda94a89..1359c03ded51 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -45,12 +45,12 @@ unsigned long read_persistent_clock(void) spin_unlock_irqrestore(&rtc_lock, flags); if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - sec = BCD2BIN(sec); - min = BCD2BIN(min); - hour = BCD2BIN(hour); - day = BCD2BIN(day); - mon = BCD2BIN(mon); - year = BCD2BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } year += real_year - 72 + 2000; @@ -83,7 +83,7 @@ int rtc_mips_set_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - cmos_minutes = BCD2BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -99,8 +99,8 @@ int rtc_mips_set_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - real_seconds = BIN2BCD(real_seconds); - real_minutes = BIN2BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); diff --git a/arch/mips/include/asm/mc146818-time.h b/arch/mips/include/asm/mc146818-time.h index cdc379a0a94e..199b45733a95 100644 --- a/arch/mips/include/asm/mc146818-time.h +++ b/arch/mips/include/asm/mc146818-time.h @@ -44,7 +44,7 @@ static inline int mc146818_set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -60,8 +60,8 @@ static inline int mc146818_set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); @@ -103,12 +103,12 @@ static inline unsigned long mc146818_get_cmos_time(void) } while (sec != CMOS_READ(RTC_SECONDS)); if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } spin_unlock_irqrestore(&rtc_lock, flags); year = mc146818_decode_year(year); diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 6537d90a25bb..2d3c0dca275d 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -79,14 +79,14 @@ unsigned long read_persistent_clock(void) /* Stop the update to the time */ m48t37_base->control = 0x40; - year = BCD2BIN(m48t37_base->year); - year += BCD2BIN(m48t37_base->century) * 100; + year = bcd2bin(m48t37_base->year); + year += bcd2bin(m48t37_base->century) * 100; - month = BCD2BIN(m48t37_base->month); - day = BCD2BIN(m48t37_base->date); - hour = BCD2BIN(m48t37_base->hour); - min = BCD2BIN(m48t37_base->min); - sec = BCD2BIN(m48t37_base->sec); + month = bcd2bin(m48t37_base->month); + day = bcd2bin(m48t37_base->date); + hour = bcd2bin(m48t37_base->hour); + min = bcd2bin(m48t37_base->min); + sec = bcd2bin(m48t37_base->sec); /* Start the update to the time again */ m48t37_base->control = 0x00; @@ -113,22 +113,22 @@ int rtc_mips_set_time(unsigned long tim) m48t37_base->control = 0x80; /* year */ - m48t37_base->year = BIN2BCD(tm.tm_year % 100); - m48t37_base->century = BIN2BCD(tm.tm_year / 100); + m48t37_base->year = bin2bcd(tm.tm_year % 100); + m48t37_base->century = bin2bcd(tm.tm_year / 100); /* month */ - m48t37_base->month = BIN2BCD(tm.tm_mon); + m48t37_base->month = bin2bcd(tm.tm_mon); /* day */ - m48t37_base->date = BIN2BCD(tm.tm_mday); + m48t37_base->date = bin2bcd(tm.tm_mday); /* hour/min/sec */ - m48t37_base->hour = BIN2BCD(tm.tm_hour); - m48t37_base->min = BIN2BCD(tm.tm_min); - m48t37_base->sec = BIN2BCD(tm.tm_sec); + m48t37_base->hour = bin2bcd(tm.tm_hour); + m48t37_base->min = bin2bcd(tm.tm_min); + m48t37_base->sec = bin2bcd(tm.tm_sec); /* day of week -- not really used, but let's keep it up-to-date */ - m48t37_base->day = BIN2BCD(tm.tm_wday + 1); + m48t37_base->day = bin2bcd(tm.tm_wday + 1); /* disable writing */ m48t37_base->control = 0x00; diff --git a/arch/mips/sibyte/swarm/rtc_m41t81.c b/arch/mips/sibyte/swarm/rtc_m41t81.c index 26fbff4c15b1..b732600b47f5 100644 --- a/arch/mips/sibyte/swarm/rtc_m41t81.c +++ b/arch/mips/sibyte/swarm/rtc_m41t81.c @@ -156,32 +156,32 @@ int m41t81_set_time(unsigned long t) */ spin_lock_irqsave(&rtc_lock, flags); - tm.tm_sec = BIN2BCD(tm.tm_sec); + tm.tm_sec = bin2bcd(tm.tm_sec); m41t81_write(M41T81REG_SC, tm.tm_sec); - tm.tm_min = BIN2BCD(tm.tm_min); + tm.tm_min = bin2bcd(tm.tm_min); m41t81_write(M41T81REG_MN, tm.tm_min); - tm.tm_hour = BIN2BCD(tm.tm_hour); + tm.tm_hour = bin2bcd(tm.tm_hour); tm.tm_hour = (tm.tm_hour & 0x3f) | (m41t81_read(M41T81REG_HR) & 0xc0); m41t81_write(M41T81REG_HR, tm.tm_hour); /* tm_wday starts from 0 to 6 */ if (tm.tm_wday == 0) tm.tm_wday = 7; - tm.tm_wday = BIN2BCD(tm.tm_wday); + tm.tm_wday = bin2bcd(tm.tm_wday); m41t81_write(M41T81REG_DY, tm.tm_wday); - tm.tm_mday = BIN2BCD(tm.tm_mday); + tm.tm_mday = bin2bcd(tm.tm_mday); m41t81_write(M41T81REG_DT, tm.tm_mday); /* tm_mon starts from 0, *ick* */ tm.tm_mon ++; - tm.tm_mon = BIN2BCD(tm.tm_mon); + tm.tm_mon = bin2bcd(tm.tm_mon); m41t81_write(M41T81REG_MO, tm.tm_mon); /* we don't do century, everything is beyond 2000 */ tm.tm_year %= 100; - tm.tm_year = BIN2BCD(tm.tm_year); + tm.tm_year = bin2bcd(tm.tm_year); m41t81_write(M41T81REG_YR, tm.tm_year); spin_unlock_irqrestore(&rtc_lock, flags); @@ -209,12 +209,12 @@ unsigned long m41t81_get_time(void) year = m41t81_read(M41T81REG_YR); spin_unlock_irqrestore(&rtc_lock, flags); - sec = BCD2BIN(sec); - min = BCD2BIN(min); - hour = BCD2BIN(hour); - day = BCD2BIN(day); - mon = BCD2BIN(mon); - year = BCD2BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); year += 2000; diff --git a/arch/mips/sibyte/swarm/rtc_xicor1241.c b/arch/mips/sibyte/swarm/rtc_xicor1241.c index ff3e5dabb348..4438b2195c44 100644 --- a/arch/mips/sibyte/swarm/rtc_xicor1241.c +++ b/arch/mips/sibyte/swarm/rtc_xicor1241.c @@ -124,18 +124,18 @@ int xicor_set_time(unsigned long t) xicor_write(X1241REG_SR, X1241REG_SR_WEL | X1241REG_SR_RWEL); /* trivial ones */ - tm.tm_sec = BIN2BCD(tm.tm_sec); + tm.tm_sec = bin2bcd(tm.tm_sec); xicor_write(X1241REG_SC, tm.tm_sec); - tm.tm_min = BIN2BCD(tm.tm_min); + tm.tm_min = bin2bcd(tm.tm_min); xicor_write(X1241REG_MN, tm.tm_min); - tm.tm_mday = BIN2BCD(tm.tm_mday); + tm.tm_mday = bin2bcd(tm.tm_mday); xicor_write(X1241REG_DT, tm.tm_mday); /* tm_mon starts from 0, *ick* */ tm.tm_mon ++; - tm.tm_mon = BIN2BCD(tm.tm_mon); + tm.tm_mon = bin2bcd(tm.tm_mon); xicor_write(X1241REG_MO, tm.tm_mon); /* year is split */ @@ -148,7 +148,7 @@ int xicor_set_time(unsigned long t) tmp = xicor_read(X1241REG_HR); if (tmp & X1241REG_HR_MIL) { /* 24 hour format */ - tm.tm_hour = BIN2BCD(tm.tm_hour); + tm.tm_hour = bin2bcd(tm.tm_hour); tmp = (tmp & ~0x3f) | (tm.tm_hour & 0x3f); } else { /* 12 hour format, with 0x2 for pm */ @@ -157,7 +157,7 @@ int xicor_set_time(unsigned long t) tmp |= 0x20; tm.tm_hour -= 12; } - tm.tm_hour = BIN2BCD(tm.tm_hour); + tm.tm_hour = bin2bcd(tm.tm_hour); tmp |= tm.tm_hour; } xicor_write(X1241REG_HR, tmp); @@ -191,13 +191,13 @@ unsigned long xicor_get_time(void) y2k = xicor_read(X1241REG_Y2K); spin_unlock_irqrestore(&rtc_lock, flags); - sec = BCD2BIN(sec); - min = BCD2BIN(min); - hour = BCD2BIN(hour); - day = BCD2BIN(day); - mon = BCD2BIN(mon); - year = BCD2BIN(year); - y2k = BCD2BIN(y2k); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); + y2k = bcd2bin(y2k); year += (y2k * 100); diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index dd557c9cf001..9a9f43358879 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -68,6 +68,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Matsushita MN10300 system setup" diff --git a/arch/mn10300/kernel/rtc.c b/arch/mn10300/kernel/rtc.c index 042f792d8430..7978470b5749 100644 --- a/arch/mn10300/kernel/rtc.c +++ b/arch/mn10300/kernel/rtc.c @@ -67,7 +67,7 @@ static int set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -84,8 +84,8 @@ static int set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8313fccced5e..2bd1f6ef5db0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -90,6 +90,8 @@ config ARCH_MAY_HAVE_PC_FDC source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 380baa1780e9..9391199d9e77 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -230,6 +230,8 @@ config PPC_OF_PLATFORM_PCI source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/powerpc/sysdev/Kconfig" source "arch/powerpc/platforms/Kconfig" diff --git a/arch/powerpc/include/asm/ps3av.h b/arch/powerpc/include/asm/ps3av.h index fda98715cd35..5aa22cffdbd6 100644 --- a/arch/powerpc/include/asm/ps3av.h +++ b/arch/powerpc/include/asm/ps3av.h @@ -678,6 +678,8 @@ struct ps3av_pkt_avb_param { u8 buf[PS3AV_PKT_AVB_PARAM_MAX_BUF_SIZE]; }; +/* channel status */ +extern u8 ps3av_mode_cs_info[]; /** command status **/ #define PS3AV_STATUS_SUCCESS 0x0000 /* success */ @@ -735,6 +737,7 @@ extern int ps3av_get_mode(void); extern int ps3av_video_mode2res(u32, u32 *, u32 *); extern int ps3av_video_mute(int); extern int ps3av_audio_mute(int); +extern int ps3av_audio_mute_analog(int); extern int ps3av_dev_open(void); extern int ps3av_dev_close(void); extern void ps3av_register_flip_ctl(void (*flip_ctl)(int on, void *data), diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index a323c9b32ee1..97e056379728 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -27,6 +27,9 @@ #define DBG(fmt...) #endif +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + void __init reserve_kdump_trampoline(void) { lmb_reserve(0, KDUMP_RESERVE_LIMIT); @@ -66,7 +69,11 @@ void __init setup_kdump_trampoline(void) DBG(" <- setup_kdump_trampoline()\n"); } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ static int __init parse_elfcorehdr(char *p) { if (p) @@ -75,7 +82,6 @@ static int __init parse_elfcorehdr(char *p) return 1; } __setup("elfcorehdr=", parse_elfcorehdr); -#endif static int __init parse_savemaxmem(char *p) { diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 98d7bf99533a..b9e1a1da6e52 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -134,23 +134,6 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } - -#ifdef CONFIG_MEMORY_HOTREMOVE -int remove_memory(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - int ret; - - start_pfn = start >> PAGE_SHIFT; - end_pfn = start_pfn + (size >> PAGE_SHIFT); - ret = offline_pages(start_pfn, end_pfn, 120 * HZ); - if (ret) - goto out; - /* Arch-specific calls go here - next patch */ -out: - return ret; -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ /* diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bc581d8a7cd9..70b7645ce745 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -78,6 +78,8 @@ config S390 source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Base setup" comment "Processor type and features" diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index ea40a9d690fc..de3fad60c682 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -99,6 +99,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_31BIT 18 /* 32bit process */ #define TIF_MEMDIE 19 #define TIF_RESTORE_SIGMASK 20 /* restore signal mask in do_signal() */ +#define TIF_FREEZE 21 /* thread is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) @@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_USEDFPU (1<<TIF_USEDFPU) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_31BIT (1<<TIF_31BIT) +#define _TIF_FREEZE (1<<TIF_FREEZE) #endif /* __KERNEL__ */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 1169130a97ef..158b0d6d7046 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -189,14 +189,3 @@ int arch_add_memory(int nid, u64 start, u64 size) return rc; } #endif /* CONFIG_MEMORY_HOTPLUG */ - -#ifdef CONFIG_MEMORY_HOTREMOVE -int remove_memory(u64 start, u64 size) -{ - unsigned long start_pfn, end_pfn; - - start_pfn = PFN_DOWN(start); - end_pfn = start_pfn + PFN_DOWN(size); - return offline_pages(start_pfn, end_pfn, 120 * HZ); -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b4aa2a03e19b..cb2c87df70ce 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -121,6 +121,8 @@ config IO_TRAPPED source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System type" # diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 4a2ecbe27d8e..95d216255565 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -10,6 +10,9 @@ #include <linux/io.h> #include <asm/uaccess.h> +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 97671dac12a6..e594559c8dba 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -37,6 +37,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General machine setup" config SMP diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 29899fd5b1b2..80fe547c3f45 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -135,6 +135,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) #define TIF_POLLING_NRFLAG 9 /* true if poll_idle() is polling * TIF_NEED_RESCHED */ #define TIF_MEMDIE 10 +#define TIF_FREEZE 11 /* is freezing for suspend */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -148,6 +149,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *) #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ _TIF_SIGPENDING | \ _TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1<<TIF_FREEZE) #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index c0a737d7292c..639ac805448a 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -237,6 +237,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_ABI_PENDING 12 #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 +#define TIF_FREEZE 15 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) @@ -249,6 +250,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) +#define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \ _TIF_DO_NOTIFY_RESUME_MASK | \ diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 5446e2a499b1..035b15af90d8 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -96,6 +96,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ def_bool y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 6976812cfb18..393bccfe1785 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -229,6 +229,8 @@ endmenu source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "drivers/block/Kconfig" source "arch/um/Kconfig.char" diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index fd0c25ad6af3..129647375a6c 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -179,7 +179,8 @@ static int copy_sc_from_user(struct pt_regs *regs, if (have_fpx_regs) { struct user_fxsr_struct fpx; - err = copy_from_user(&fpx, &sc.fpstate->_fxsr_env[0], + err = copy_from_user(&fpx, + &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], sizeof(struct user_fxsr_struct)); if (err) return 1; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bd3c2c53873e..49349ba77d80 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -193,6 +193,7 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 72d0c56c1b48..f7cdb3b457aa 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,6 +13,9 @@ static void *kdump_buf_page; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e90a60ef10c2..045b36cada65 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,9 @@ #include <linux/uaccess.h> #include <linux/io.h> +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 0a23b5795b25..dd6f2b71561b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); @@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void) WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } if (century) { - BCD_TO_BIN(century); + century = bcd2bin(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); } else diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2255782e8d4b..b2c97874ec0f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void) } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ + +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed * by kexec loader to the capture kernel. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a9ec89c3fbca..407d8784f669 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); + vm_unmap_aliases(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0013a729b41d..b61534c7a4c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l /* make sure there are no stray mappings of this page */ kmap_flush_unused(); + vm_unmap_aliases(); } } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ae173f6edd8b..d4d52f5a1cf7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); + vm_unmap_aliases(); xen_mc_batch(); } diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 02e417d3d8e9..a213260b51e5 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -55,6 +55,7 @@ config HZ default 100 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index b1c723f9f58d..70f7f60929ca 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -431,7 +431,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev, } static struct device_attribute alarm_attr = { - .attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE}, + .attr = {.name = "alarm", .mode = 0644}, .show = acpi_battery_alarm_show, .store = acpi_battery_alarm_store, }; diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c index 10a36512647c..7b011e7e29fe 100644 --- a/drivers/acpi/sbs.c +++ b/drivers/acpi/sbs.c @@ -463,7 +463,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev, } static struct device_attribute alarm_attr = { - .attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE}, + .attr = {.name = "alarm", .mode = 0644}, .show = acpi_battery_alarm_show, .store = acpi_battery_alarm_store, }; diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c index bf5b04de02d1..631ee2ee2ca0 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/sleep/proc.c @@ -120,13 +120,13 @@ static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset) spin_unlock_irqrestore(&rtc_lock, flags); if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hr); - BCD_TO_BIN(day); - BCD_TO_BIN(mo); - BCD_TO_BIN(yr); - BCD_TO_BIN(cent); + sec = bcd2bin(sec); + min = bcd2bin(min); + hr = bcd2bin(hr); + day = bcd2bin(day); + mo = bcd2bin(mo); + yr = bcd2bin(yr); + cent = bcd2bin(cent); } /* we're trusting the FADT (see above) */ @@ -204,7 +204,7 @@ static u32 cmos_bcd_read(int offset, int rtc_control) { u32 val = CMOS_READ(offset); if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(val); + val = bcd2bin(val); return val; } @@ -212,7 +212,7 @@ static u32 cmos_bcd_read(int offset, int rtc_control) static void cmos_bcd_write(u32 val, int offset, int rtc_control) { if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BIN_TO_BCD(val); + val = bin2bcd(val); CMOS_WRITE(val, offset); } diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 91dec448b3ed..24e80fd927e2 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -115,7 +115,6 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr, table_attr->attr.read = acpi_table_show; table_attr->attr.attr.name = table_attr->name; table_attr->attr.attr.mode = 0444; - table_attr->attr.attr.owner = THIS_MODULE; return; } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index af0d175c025d..5260e9e0df48 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -21,6 +21,8 @@ #include <linux/memory_hotplug.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/stat.h> + #include <asm/atomic.h> #include <asm/uaccess.h> @@ -325,7 +327,7 @@ memory_probe_store(struct class *class, const char *buf, size_t count) return count; } -static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); +static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store); static int memory_probe_init(void) { diff --git a/drivers/base/node.c b/drivers/base/node.c index 5116b78c6325..f5207090885a 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -13,6 +13,7 @@ #include <linux/nodemask.h> #include <linux/cpu.h> #include <linux/device.h> +#include <linux/swap.h> static struct sysdev_class node_class = { .name = "node", @@ -61,34 +62,52 @@ static ssize_t node_read_meminfo(struct sys_device * dev, si_meminfo_node(&i, nid); n = sprintf(buf, "\n" - "Node %d MemTotal: %8lu kB\n" - "Node %d MemFree: %8lu kB\n" - "Node %d MemUsed: %8lu kB\n" - "Node %d Active: %8lu kB\n" - "Node %d Inactive: %8lu kB\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d Active: %8lu kB\n" + "Node %d Inactive: %8lu kB\n" + "Node %d Active(anon): %8lu kB\n" + "Node %d Inactive(anon): %8lu kB\n" + "Node %d Active(file): %8lu kB\n" + "Node %d Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Node %d Unevictable: %8lu kB\n" + "Node %d Mlocked: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - "Node %d HighTotal: %8lu kB\n" - "Node %d HighFree: %8lu kB\n" - "Node %d LowTotal: %8lu kB\n" - "Node %d LowFree: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n" #endif - "Node %d Dirty: %8lu kB\n" - "Node %d Writeback: %8lu kB\n" - "Node %d FilePages: %8lu kB\n" - "Node %d Mapped: %8lu kB\n" - "Node %d AnonPages: %8lu kB\n" - "Node %d PageTables: %8lu kB\n" - "Node %d NFS_Unstable: %8lu kB\n" - "Node %d Bounce: %8lu kB\n" - "Node %d WritebackTmp: %8lu kB\n" - "Node %d Slab: %8lu kB\n" - "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d Dirty: %8lu kB\n" + "Node %d Writeback: %8lu kB\n" + "Node %d FilePages: %8lu kB\n" + "Node %d Mapped: %8lu kB\n" + "Node %d AnonPages: %8lu kB\n" + "Node %d PageTables: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" + "Node %d Bounce: %8lu kB\n" + "Node %d WritebackTmp: %8lu kB\n" + "Node %d Slab: %8lu kB\n" + "Node %d SReclaimable: %8lu kB\n" + "Node %d SUnreclaim: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, K(node_page_state(nid, NR_ACTIVE)), - nid, K(node_page_state(nid, NR_INACTIVE)), + nid, K(node_page_state(nid, NR_ACTIVE_ANON) + + node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON) + + node_page_state(nid, NR_INACTIVE_FILE)), + nid, K(node_page_state(nid, NR_ACTIVE_ANON)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON)), + nid, K(node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + nid, K(node_page_state(nid, NR_UNEVICTABLE)), + nid, K(node_page_state(nid, NR_MLOCK)), +#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), @@ -173,6 +192,8 @@ int register_node(struct node *node, int num, struct node *parent) sysdev_create_file(&node->sysdev, &attr_meminfo); sysdev_create_file(&node->sysdev, &attr_numastat); sysdev_create_file(&node->sysdev, &attr_distance); + + scan_unevictable_register_node(node); } return error; } @@ -192,6 +213,8 @@ void unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_numastat); sysdev_remove_file(&node->sysdev, &attr_distance); + scan_unevictable_unregister_node(node); + sysdev_unregister(&node->sysdev); } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b82654e883a7..d876ad861237 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -90,7 +90,7 @@ static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); static struct device_attribute dev_attr_firmware_version = { - .attr = { .name = "firmware-version", .mode = S_IRUGO, .owner = THIS_MODULE }, + .attr = { .name = "firmware-version", .mode = S_IRUGO }, .show = aoedisk_show_fwver, }; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7b3351260d56..9034ca585afd 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -391,7 +391,7 @@ static ssize_t pid_show(struct device *dev, } static struct device_attribute pid_attr = { - .attr = { .name = "pid", .mode = S_IRUGO, .owner = THIS_MODULE }, + .attr = { .name = "pid", .mode = S_IRUGO}, .show = pid_show, }; diff --git a/drivers/char/ds1286.c b/drivers/char/ds1286.c index 5329d482b582..0a826d7be10e 100644 --- a/drivers/char/ds1286.c +++ b/drivers/char/ds1286.c @@ -210,8 +210,8 @@ static int ds1286_ioctl(struct inode *inode, struct file *file, if (sec != 0) return -EINVAL; - min = BIN2BCD(min); - min = BIN2BCD(hrs); + min = bin2bcd(min); + min = bin2bcd(hrs); spin_lock(&ds1286_lock); rtc_write(hrs, RTC_HOURS_ALARM); @@ -353,7 +353,7 @@ static int ds1286_proc_output(char *buf) ds1286_get_time(&tm); hundredth = rtc_read(RTC_HUNDREDTH_SECOND); - BCD_TO_BIN(hundredth); + hundredth = bcd2bin(hundredth); p += sprintf(p, "rtc_time\t: %02d:%02d:%02d.%02d\n" @@ -477,12 +477,12 @@ static void ds1286_get_time(struct rtc_time *rtc_tm) rtc_write(save_control, RTC_CMD); spin_unlock_irqrestore(&ds1286_lock, flags); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values @@ -531,12 +531,12 @@ static int ds1286_set_time(struct rtc_time *rtc_tm) if (yrs >= 100) yrs -= 100; - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); spin_lock_irqsave(&ds1286_lock, flags); save_control = rtc_read(RTC_CMD); @@ -572,8 +572,8 @@ static void ds1286_get_alm_time(struct rtc_time *alm_tm) cmd = rtc_read(RTC_CMD); spin_unlock_irqrestore(&ds1286_lock, flags); - BCD_TO_BIN(alm_tm->tm_min); - BCD_TO_BIN(alm_tm->tm_hour); + alm_tm->tm_min = bcd2bin(alm_tm->tm_min); + alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); alm_tm->tm_sec = 0; } diff --git a/drivers/char/ds1302.c b/drivers/char/ds1302.c index c5e67a623951..170693c93c73 100644 --- a/drivers/char/ds1302.c +++ b/drivers/char/ds1302.c @@ -131,12 +131,12 @@ get_rtc_time(struct rtc_time *rtc_tm) local_irq_restore(flags); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values @@ -211,12 +211,12 @@ static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) else yrs -= 1900; /* RTC (70, 71, ... 99) */ - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); lock_kernel(); local_irq_save(flags); diff --git a/drivers/char/ip27-rtc.c b/drivers/char/ip27-rtc.c index ec9d0443d92c..2abd881b4cbc 100644 --- a/drivers/char/ip27-rtc.c +++ b/drivers/char/ip27-rtc.c @@ -130,12 +130,12 @@ static long rtc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (yrs >= 100) yrs -= 100; - sec = BIN2BCD(sec); - min = BIN2BCD(min); - hrs = BIN2BCD(hrs); - day = BIN2BCD(day); - mon = BIN2BCD(mon); - yrs = BIN2BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); spin_lock_irq(&rtc_lock); rtc->control |= M48T35_RTC_SET; @@ -311,12 +311,12 @@ static void get_rtc_time(struct rtc_time *rtc_tm) rtc->control &= ~M48T35_RTC_READ; spin_unlock_irq(&rtc_lock); - rtc_tm->tm_sec = BCD2BIN(rtc_tm->tm_sec); - rtc_tm->tm_min = BCD2BIN(rtc_tm->tm_min); - rtc_tm->tm_hour = BCD2BIN(rtc_tm->tm_hour); - rtc_tm->tm_mday = BCD2BIN(rtc_tm->tm_mday); - rtc_tm->tm_mon = BCD2BIN(rtc_tm->tm_mon); - rtc_tm->tm_year = BCD2BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); /* * Account for differences between how the RTC uses the values diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c index b930de50407a..3f7da8cf3a80 100644 --- a/drivers/char/pc8736x_gpio.c +++ b/drivers/char/pc8736x_gpio.c @@ -41,7 +41,8 @@ static u8 pc8736x_gpio_shadow[4]; #define SIO_BASE2 0x4E /* alt command-reg to check */ #define SIO_SID 0x20 /* SuperI/O ID Register */ -#define SIO_SID_VALUE 0xe9 /* Expected value in SuperI/O ID Register */ +#define SIO_SID_PC87365 0xe5 /* Expected value in ID Register for PC87365 */ +#define SIO_SID_PC87366 0xe9 /* Expected value in ID Register for PC87366 */ #define SIO_CF1 0x21 /* chip config, bit0 is chip enable */ @@ -91,13 +92,17 @@ static inline int superio_inb(int addr) static int pc8736x_superio_present(void) { + int id; + /* try the 2 possible values, read a hardware reg to verify */ superio_cmd = SIO_BASE1; - if (superio_inb(SIO_SID) == SIO_SID_VALUE) + id = superio_inb(SIO_SID); + if (id == SIO_SID_PC87365 || id == SIO_SID_PC87366) return superio_cmd; superio_cmd = SIO_BASE2; - if (superio_inb(SIO_SID) == SIO_SID_VALUE) + id = superio_inb(SIO_SID); + if (id == SIO_SID_PC87365 || id == SIO_SID_PC87366) return superio_cmd; return 0; diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 17683de95717..32dc89720d58 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -518,17 +518,17 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { if (sec < 60) - BIN_TO_BCD(sec); + sec = bin2bcd(sec); else sec = 0xff; if (min < 60) - BIN_TO_BCD(min); + min = bin2bcd(min); else min = 0xff; if (hrs < 24) - BIN_TO_BCD(hrs); + hrs = bin2bcd(hrs); else hrs = 0xff; } @@ -614,12 +614,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); } save_control = CMOS_READ(RTC_CONTROL); @@ -1099,7 +1099,7 @@ no_irq: spin_unlock_irq(&rtc_lock); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(year); /* This should never happen... */ + year = bcd2bin(year); /* This should never happen... */ if (year < 20) { epoch = 2000; @@ -1352,13 +1352,13 @@ static void rtc_get_rtc_time(struct rtc_time *rtc_tm) spin_unlock_irqrestore(&rtc_lock, flags); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); - BCD_TO_BIN(rtc_tm->tm_wday); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); + rtc_tm->tm_wday = bcd2bin(rtc_tm->tm_wday); } #ifdef CONFIG_MACH_DECSTATION @@ -1392,9 +1392,9 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm) spin_unlock_irq(&rtc_lock); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(alm_tm->tm_sec); - BCD_TO_BIN(alm_tm->tm_min); - BCD_TO_BIN(alm_tm->tm_hour); + alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec); + alm_tm->tm_min = bcd2bin(alm_tm->tm_min); + alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); } } diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 5b8d7a1aa3e6..ba4e86281fbf 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -2504,7 +2504,7 @@ static void __devexit sx_remove_card(struct sx_board *board, del_timer(&board->timer); if (pdev) { #ifdef CONFIG_PCI - pci_iounmap(pdev, board->base2); + iounmap(board->base2); pci_release_region(pdev, IS_CF_BOARD(board) ? 3 : 2); #endif } else { @@ -2677,7 +2677,7 @@ static int __devinit sx_pci_probe(struct pci_dev *pdev, } board->hw_base = pci_resource_start(pdev, reg); board->base2 = - board->base = pci_iomap(pdev, reg, WINDOW_LEN(board)); + board->base = ioremap_nocache(board->hw_base, WINDOW_LEN(board)); if (!board->base) { dev_err(&pdev->dev, "ioremap failed\n"); goto err_reg; @@ -2703,7 +2703,7 @@ static int __devinit sx_pci_probe(struct pci_dev *pdev, return 0; err_unmap: - pci_iounmap(pdev, board->base2); + iounmap(board->base2); err_reg: pci_release_region(pdev, reg); err_flag: diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index dce4cc0e6953..d0c0d64ed366 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -168,7 +168,7 @@ static void sysrq_handle_show_timers(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_show_timers_op = { .handler = sysrq_handle_show_timers, .help_msg = "show-all-timers(Q)", - .action_msg = "Show Pending Timers", + .action_msg = "Show pending hrtimers (no others)", }; static void sysrq_handle_mountro(int key, struct tty_struct *tty) diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index e70d13defde4..9c47dc48c9fd 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -1157,7 +1157,7 @@ EXPORT_SYMBOL_GPL(tpm_dev_vendor_release); * Once all references to platform device are down to 0, * release all allocated structures. */ -static void tpm_dev_release(struct device *dev) +void tpm_dev_release(struct device *dev) { struct tpm_chip *chip = dev_get_drvdata(dev); diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c index 0e024fe2d8c4..887072f5dc8b 100644 --- a/drivers/edac/cell_edac.c +++ b/drivers/edac/cell_edac.c @@ -142,7 +142,7 @@ static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci) csrow->nr_pages = (r.end - r.start + 1) >> PAGE_SHIFT; csrow->last_page = csrow->first_page + csrow->nr_pages - 1; csrow->mtype = MEM_XDR; - csrow->edac_mode = EDAC_FLAG_EC | EDAC_FLAG_SECDED; + csrow->edac_mode = EDAC_SECDED; dev_dbg(mci->dev, "Initialized on node %d, chanmask=0x%x," " first_page=0x%lx, nr_pages=0x%x\n", diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index deb154aa47c4..4353414a0b77 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -732,7 +732,6 @@ static int __init ibft_create_attribute(struct ibft_kobject *kobj_data, attr->attr.name = name; attr->attr.mode = S_IRUSR; - attr->attr.owner = THIS_MODULE; attr->hdr = hdr; attr->show = show; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 9112830107a5..22edc4273ef6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -248,7 +248,7 @@ static ssize_t gpio_value_show(struct device *dev, if (!test_bit(FLAG_EXPORT, &desc->flags)) status = -EIO; else - status = sprintf(buf, "%d\n", gpio_get_value_cansleep(gpio)); + status = sprintf(buf, "%d\n", !!gpio_get_value_cansleep(gpio)); mutex_unlock(&sysfs_lock); return status; @@ -1105,7 +1105,7 @@ int gpio_get_value_cansleep(unsigned gpio) might_sleep_if(extra_checks); chip = gpio_to_chip(gpio); - return chip->get(chip, gpio - chip->base); + return chip->get ? chip->get(chip, gpio - chip->base) : 0; } EXPORT_SYMBOL_GPL(gpio_get_value_cansleep); diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c index b06b8e090a27..bc011da79e14 100644 --- a/drivers/hwmon/applesmc.c +++ b/drivers/hwmon/applesmc.c @@ -49,6 +49,9 @@ #define APPLESMC_MAX_DATA_LENGTH 32 +#define APPLESMC_MIN_WAIT 0x0040 +#define APPLESMC_MAX_WAIT 0x8000 + #define APPLESMC_STATUS_MASK 0x0f #define APPLESMC_READ_CMD 0x10 #define APPLESMC_WRITE_CMD 0x11 @@ -57,8 +60,8 @@ #define KEY_COUNT_KEY "#KEY" /* r-o ui32 */ -#define LIGHT_SENSOR_LEFT_KEY "ALV0" /* r-o {alv (6 bytes) */ -#define LIGHT_SENSOR_RIGHT_KEY "ALV1" /* r-o {alv (6 bytes) */ +#define LIGHT_SENSOR_LEFT_KEY "ALV0" /* r-o {alv (6-10 bytes) */ +#define LIGHT_SENSOR_RIGHT_KEY "ALV1" /* r-o {alv (6-10 bytes) */ #define BACKLIGHT_KEY "LKSB" /* w-o {lkb (2 bytes) */ #define CLAMSHELL_KEY "MSLD" /* r-o ui8 (unused) */ @@ -104,6 +107,15 @@ static const char* temperature_sensors_sets[][36] = { /* Set 6: Macbook3 set */ { "TB0T", "TC0D", "TC0P", "TM0P", "TN0P", "TTF0", "TW0P", "Th0H", "Th0S", "Th1H", NULL }, +/* Set 7: Macbook Air */ + { "TB0T", "TB1S", "TB1T", "TB2S", "TB2T", "TC0D", "TC0P", "TCFP", + "TTF0", "TW0P", "Th0H", "Tp0P", "TpFP", "Ts0P", "Ts0S", NULL }, +/* Set 8: Macbook Pro 4,1 (Penryn) */ + { "TB0T", "TC0D", "TC0P", "TG0D", "TG0H", "TTF0", "TW0P", "Th0H", + "Th1H", "Th2H", "Tm0P", "Ts0P", NULL }, +/* Set 9: Macbook Pro 3,1 (Santa Rosa) */ + { "TALP", "TB0T", "TC0D", "TC0P", "TG0D", "TG0H", "TTF0", "TW0P", + "Th0H", "Th1H", "Th2H", "Tm0P", "Ts0P", NULL }, }; /* List of keys used to read/write fan speeds */ @@ -163,25 +175,25 @@ static unsigned int key_at_index; static struct workqueue_struct *applesmc_led_wq; /* - * __wait_status - Wait up to 2ms for the status port to get a certain value + * __wait_status - Wait up to 32ms for the status port to get a certain value * (masked with 0x0f), returning zero if the value is obtained. Callers must * hold applesmc_lock. */ static int __wait_status(u8 val) { - unsigned int i; + int us; val = val & APPLESMC_STATUS_MASK; - for (i = 0; i < 200; i++) { + for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { + udelay(us); if ((inb(APPLESMC_CMD_PORT) & APPLESMC_STATUS_MASK) == val) { if (debug) printk(KERN_DEBUG - "Waited %d us for status %x\n", - i*10, val); + "Waited %d us for status %x\n", + 2 * us - APPLESMC_MIN_WAIT, val); return 0; } - udelay(10); } printk(KERN_WARNING "applesmc: wait status failed: %x != %x\n", @@ -191,6 +203,25 @@ static int __wait_status(u8 val) } /* + * special treatment of command port - on newer macbooks, it seems necessary + * to resend the command byte before polling the status again. Callers must + * hold applesmc_lock. + */ +static int send_command(u8 cmd) +{ + int us; + for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) { + outb(cmd, APPLESMC_CMD_PORT); + udelay(us); + if ((inb(APPLESMC_CMD_PORT) & APPLESMC_STATUS_MASK) == 0x0c) + return 0; + } + printk(KERN_WARNING "applesmc: command failed: %x -> %x\n", + cmd, inb(APPLESMC_CMD_PORT)); + return -EIO; +} + +/* * applesmc_read_key - reads len bytes from a given key, and put them in buffer. * Returns zero on success or a negative error on failure. Callers must * hold applesmc_lock. @@ -205,8 +236,7 @@ static int applesmc_read_key(const char* key, u8* buffer, u8 len) return -EINVAL; } - outb(APPLESMC_READ_CMD, APPLESMC_CMD_PORT); - if (__wait_status(0x0c)) + if (send_command(APPLESMC_READ_CMD)) return -EIO; for (i = 0; i < 4; i++) { @@ -249,8 +279,7 @@ static int applesmc_write_key(const char* key, u8* buffer, u8 len) return -EINVAL; } - outb(APPLESMC_WRITE_CMD, APPLESMC_CMD_PORT); - if (__wait_status(0x0c)) + if (send_command(APPLESMC_WRITE_CMD)) return -EIO; for (i = 0; i < 4; i++) { @@ -284,8 +313,7 @@ static int applesmc_get_key_at_index(int index, char* key) readkey[2] = index >> 8; readkey[3] = index; - outb(APPLESMC_GET_KEY_BY_INDEX_CMD, APPLESMC_CMD_PORT); - if (__wait_status(0x0c)) + if (send_command(APPLESMC_GET_KEY_BY_INDEX_CMD)) return -EIO; for (i = 0; i < 4; i++) { @@ -315,8 +343,7 @@ static int applesmc_get_key_type(char* key, char* type) { int i; - outb(APPLESMC_GET_KEY_TYPE_CMD, APPLESMC_CMD_PORT); - if (__wait_status(0x0c)) + if (send_command(APPLESMC_GET_KEY_TYPE_CMD)) return -EIO; for (i = 0; i < 4; i++) { @@ -325,7 +352,7 @@ static int applesmc_get_key_type(char* key, char* type) return -EIO; } - outb(5, APPLESMC_DATA_PORT); + outb(6, APPLESMC_DATA_PORT); for (i = 0; i < 6; i++) { if (__wait_status(0x05)) @@ -527,17 +554,27 @@ out: static ssize_t applesmc_light_show(struct device *dev, struct device_attribute *attr, char *sysfsbuf) { + static int data_length; int ret; u8 left = 0, right = 0; - u8 buffer[6]; + u8 buffer[10], query[6]; mutex_lock(&applesmc_lock); - ret = applesmc_read_key(LIGHT_SENSOR_LEFT_KEY, buffer, 6); + if (!data_length) { + ret = applesmc_get_key_type(LIGHT_SENSOR_LEFT_KEY, query); + if (ret) + goto out; + data_length = clamp_val(query[0], 0, 10); + printk(KERN_INFO "applesmc: light sensor data length set to " + "%d\n", data_length); + } + + ret = applesmc_read_key(LIGHT_SENSOR_LEFT_KEY, buffer, data_length); left = buffer[2]; if (ret) goto out; - ret = applesmc_read_key(LIGHT_SENSOR_RIGHT_KEY, buffer, 6); + ret = applesmc_read_key(LIGHT_SENSOR_RIGHT_KEY, buffer, data_length); right = buffer[2]; out: @@ -1233,39 +1270,57 @@ static __initdata struct dmi_match_data applesmc_dmi_data[] = { { .accelerometer = 0, .light = 0, .temperature_set = 5 }, /* MacBook3: accelerometer and temperature set 6 */ { .accelerometer = 1, .light = 0, .temperature_set = 6 }, +/* MacBook Air: accelerometer, backlight and temperature set 7 */ + { .accelerometer = 1, .light = 1, .temperature_set = 7 }, +/* MacBook Pro 4: accelerometer, backlight and temperature set 8 */ + { .accelerometer = 1, .light = 1, .temperature_set = 8 }, +/* MacBook Pro 3: accelerometer, backlight and temperature set 9 */ + { .accelerometer = 1, .light = 1, .temperature_set = 9 }, }; /* Note that DMI_MATCH(...,"MacBook") will match "MacBookPro1,1". * So we need to put "Apple MacBook Pro" before "Apple MacBook". */ static __initdata struct dmi_system_id applesmc_whitelist[] = { + { applesmc_dmi_match, "Apple MacBook Air", { + DMI_MATCH(DMI_BOARD_VENDOR, "Apple"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir") }, + &applesmc_dmi_data[7]}, + { applesmc_dmi_match, "Apple MacBook Pro 4", { + DMI_MATCH(DMI_BOARD_VENDOR, "Apple"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro4") }, + &applesmc_dmi_data[8]}, + { applesmc_dmi_match, "Apple MacBook Pro 3", { + DMI_MATCH(DMI_BOARD_VENDOR, "Apple"), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro3") }, + &applesmc_dmi_data[9]}, { applesmc_dmi_match, "Apple MacBook Pro", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"MacBookPro") }, - (void*)&applesmc_dmi_data[0]}, + &applesmc_dmi_data[0]}, { applesmc_dmi_match, "Apple MacBook (v2)", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"MacBook2") }, - (void*)&applesmc_dmi_data[1]}, + &applesmc_dmi_data[1]}, { applesmc_dmi_match, "Apple MacBook (v3)", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"MacBook3") }, - (void*)&applesmc_dmi_data[6]}, + &applesmc_dmi_data[6]}, { applesmc_dmi_match, "Apple MacBook", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"MacBook") }, - (void*)&applesmc_dmi_data[2]}, + &applesmc_dmi_data[2]}, { applesmc_dmi_match, "Apple Macmini", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"Macmini") }, - (void*)&applesmc_dmi_data[3]}, + &applesmc_dmi_data[3]}, { applesmc_dmi_match, "Apple MacPro2", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"MacPro2") }, - (void*)&applesmc_dmi_data[4]}, + &applesmc_dmi_data[4]}, { applesmc_dmi_match, "Apple iMac", { DMI_MATCH(DMI_BOARD_VENDOR,"Apple"), DMI_MATCH(DMI_PRODUCT_NAME,"iMac") }, - (void*)&applesmc_dmi_data[5]}, + &applesmc_dmi_data[5]}, { .ident = NULL } }; diff --git a/drivers/hwmon/pc87360.c b/drivers/hwmon/pc87360.c index 9b462bb13fa3..5fbfa34c110e 100644 --- a/drivers/hwmon/pc87360.c +++ b/drivers/hwmon/pc87360.c @@ -75,7 +75,8 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID"); #define FSCM 0x09 /* Logical device: fans */ #define VLM 0x0d /* Logical device: voltages */ #define TMS 0x0e /* Logical device: temperatures */ -static const u8 logdev[3] = { FSCM, VLM, TMS }; +#define LDNI_MAX 3 +static const u8 logdev[LDNI_MAX] = { FSCM, VLM, TMS }; #define LD_FAN 0 #define LD_IN 1 @@ -489,11 +490,66 @@ static struct sensor_device_attribute in_max[] = { SENSOR_ATTR(in10_max, S_IWUSR | S_IRUGO, show_in_max, set_in_max, 10), }; +/* (temp & vin) channel status register alarm bits (pdf sec.11.5.12) */ +#define CHAN_ALM_MIN 0x02 /* min limit crossed */ +#define CHAN_ALM_MAX 0x04 /* max limit exceeded */ +#define TEMP_ALM_CRIT 0x08 /* temp crit exceeded (temp only) */ + +/* show_in_min/max_alarm() reads data from the per-channel status + register (sec 11.5.12), not the vin event status registers (sec + 11.5.2) that (legacy) show_in_alarm() resds (via data->in_alarms) */ + +static ssize_t show_in_min_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MIN)); +} +static ssize_t show_in_max_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MAX)); +} + +static struct sensor_device_attribute in_min_alarm[] = { + SENSOR_ATTR(in0_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 0), + SENSOR_ATTR(in1_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 1), + SENSOR_ATTR(in2_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 2), + SENSOR_ATTR(in3_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 3), + SENSOR_ATTR(in4_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 4), + SENSOR_ATTR(in5_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 5), + SENSOR_ATTR(in6_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 6), + SENSOR_ATTR(in7_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 7), + SENSOR_ATTR(in8_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 8), + SENSOR_ATTR(in9_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 9), + SENSOR_ATTR(in10_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 10), +}; +static struct sensor_device_attribute in_max_alarm[] = { + SENSOR_ATTR(in0_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 0), + SENSOR_ATTR(in1_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 1), + SENSOR_ATTR(in2_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 2), + SENSOR_ATTR(in3_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 3), + SENSOR_ATTR(in4_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 4), + SENSOR_ATTR(in5_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 5), + SENSOR_ATTR(in6_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 6), + SENSOR_ATTR(in7_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 7), + SENSOR_ATTR(in8_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 8), + SENSOR_ATTR(in9_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 9), + SENSOR_ATTR(in10_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 10), +}; + #define VIN_UNIT_ATTRS(X) \ &in_input[X].dev_attr.attr, \ &in_status[X].dev_attr.attr, \ &in_min[X].dev_attr.attr, \ - &in_max[X].dev_attr.attr + &in_max[X].dev_attr.attr, \ + &in_min_alarm[X].dev_attr.attr, \ + &in_max_alarm[X].dev_attr.attr static ssize_t show_vid(struct device *dev, struct device_attribute *attr, char *buf) { @@ -658,12 +714,68 @@ static struct sensor_device_attribute therm_crit[] = { show_therm_crit, set_therm_crit, 2+11), }; +/* show_therm_min/max_alarm() reads data from the per-channel voltage + status register (sec 11.5.12) */ + +static ssize_t show_therm_min_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MIN)); +} +static ssize_t show_therm_max_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MAX)); +} +static ssize_t show_therm_crit_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->in_status[nr] & TEMP_ALM_CRIT)); +} + +static struct sensor_device_attribute therm_min_alarm[] = { + SENSOR_ATTR(temp4_min_alarm, S_IRUGO, + show_therm_min_alarm, NULL, 0+11), + SENSOR_ATTR(temp5_min_alarm, S_IRUGO, + show_therm_min_alarm, NULL, 1+11), + SENSOR_ATTR(temp6_min_alarm, S_IRUGO, + show_therm_min_alarm, NULL, 2+11), +}; +static struct sensor_device_attribute therm_max_alarm[] = { + SENSOR_ATTR(temp4_max_alarm, S_IRUGO, + show_therm_max_alarm, NULL, 0+11), + SENSOR_ATTR(temp5_max_alarm, S_IRUGO, + show_therm_max_alarm, NULL, 1+11), + SENSOR_ATTR(temp6_max_alarm, S_IRUGO, + show_therm_max_alarm, NULL, 2+11), +}; +static struct sensor_device_attribute therm_crit_alarm[] = { + SENSOR_ATTR(temp4_crit_alarm, S_IRUGO, + show_therm_crit_alarm, NULL, 0+11), + SENSOR_ATTR(temp5_crit_alarm, S_IRUGO, + show_therm_crit_alarm, NULL, 1+11), + SENSOR_ATTR(temp6_crit_alarm, S_IRUGO, + show_therm_crit_alarm, NULL, 2+11), +}; + #define THERM_UNIT_ATTRS(X) \ &therm_input[X].dev_attr.attr, \ &therm_status[X].dev_attr.attr, \ &therm_min[X].dev_attr.attr, \ &therm_max[X].dev_attr.attr, \ - &therm_crit[X].dev_attr.attr + &therm_crit[X].dev_attr.attr, \ + &therm_min_alarm[X].dev_attr.attr, \ + &therm_max_alarm[X].dev_attr.attr, \ + &therm_crit_alarm[X].dev_attr.attr static struct attribute * pc8736x_therm_attr_array[] = { THERM_UNIT_ATTRS(0), @@ -790,12 +902,76 @@ static ssize_t show_temp_alarms(struct device *dev, struct device_attribute *att } static DEVICE_ATTR(alarms_temp, S_IRUGO, show_temp_alarms, NULL); +/* show_temp_min/max_alarm() reads data from the per-channel status + register (sec 12.3.7), not the temp event status registers (sec + 12.3.2) that show_temp_alarm() reads (via data->temp_alarms) */ + +static ssize_t show_temp_min_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->temp_status[nr] & CHAN_ALM_MIN)); +} +static ssize_t show_temp_max_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->temp_status[nr] & CHAN_ALM_MAX)); +} +static ssize_t show_temp_crit_alarm(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->temp_status[nr] & TEMP_ALM_CRIT)); +} + +static struct sensor_device_attribute temp_min_alarm[] = { + SENSOR_ATTR(temp1_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 0), + SENSOR_ATTR(temp2_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 1), + SENSOR_ATTR(temp3_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 2), +}; +static struct sensor_device_attribute temp_max_alarm[] = { + SENSOR_ATTR(temp1_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 0), + SENSOR_ATTR(temp2_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 1), + SENSOR_ATTR(temp3_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 2), +}; +static struct sensor_device_attribute temp_crit_alarm[] = { + SENSOR_ATTR(temp1_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 0), + SENSOR_ATTR(temp2_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 1), + SENSOR_ATTR(temp3_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 2), +}; + +#define TEMP_FAULT 0x40 /* open diode */ +static ssize_t show_temp_fault(struct device *dev, + struct device_attribute *devattr, char *buf) +{ + struct pc87360_data *data = pc87360_update_device(dev); + unsigned nr = to_sensor_dev_attr(devattr)->index; + + return sprintf(buf, "%u\n", !!(data->temp_status[nr] & TEMP_FAULT)); +} +static struct sensor_device_attribute temp_fault[] = { + SENSOR_ATTR(temp1_fault, S_IRUGO, show_temp_fault, NULL, 0), + SENSOR_ATTR(temp2_fault, S_IRUGO, show_temp_fault, NULL, 1), + SENSOR_ATTR(temp3_fault, S_IRUGO, show_temp_fault, NULL, 2), +}; + #define TEMP_UNIT_ATTRS(X) \ &temp_input[X].dev_attr.attr, \ &temp_status[X].dev_attr.attr, \ &temp_min[X].dev_attr.attr, \ &temp_max[X].dev_attr.attr, \ - &temp_crit[X].dev_attr.attr + &temp_crit[X].dev_attr.attr, \ + &temp_min_alarm[X].dev_attr.attr, \ + &temp_max_alarm[X].dev_attr.attr, \ + &temp_crit_alarm[X].dev_attr.attr, \ + &temp_fault[X].dev_attr.attr static struct attribute * pc8736x_temp_attr_array[] = { TEMP_UNIT_ATTRS(0), @@ -809,8 +985,8 @@ static const struct attribute_group pc8736x_temp_group = { .attrs = pc8736x_temp_attr_array, }; -static ssize_t show_name(struct device *dev, struct device_attribute - *devattr, char *buf) +static ssize_t show_name(struct device *dev, + struct device_attribute *devattr, char *buf) { struct pc87360_data *data = dev_get_drvdata(dev); return sprintf(buf, "%s\n", data->name); @@ -955,7 +1131,7 @@ static int __devinit pc87360_probe(struct platform_device *pdev) mutex_init(&data->update_lock); platform_set_drvdata(pdev, data); - for (i = 0; i < 3; i++) { + for (i = 0; i < LDNI_MAX; i++) { if (((data->address[i] = extra_isa[i])) && !request_region(extra_isa[i], PC87360_EXTENT, pc87360_driver.driver.name)) { @@ -1031,7 +1207,15 @@ static int __devinit pc87360_probe(struct platform_device *pdev) || (err = device_create_file(dev, &temp_crit[i].dev_attr)) || (err = device_create_file(dev, - &temp_status[i].dev_attr))) + &temp_status[i].dev_attr)) + || (err = device_create_file(dev, + &temp_min_alarm[i].dev_attr)) + || (err = device_create_file(dev, + &temp_max_alarm[i].dev_attr)) + || (err = device_create_file(dev, + &temp_crit_alarm[i].dev_attr)) + || (err = device_create_file(dev, + &temp_fault[i].dev_attr))) goto ERROR3; } if ((err = device_create_file(dev, &dev_attr_alarms_temp))) @@ -1131,6 +1315,16 @@ static void pc87360_write_value(struct pc87360_data *data, u8 ldi, u8 bank, mutex_unlock(&(data->lock)); } +/* (temp & vin) channel conversion status register flags (pdf sec.11.5.12) */ +#define CHAN_CNVRTD 0x80 /* new data ready */ +#define CHAN_ENA 0x01 /* enabled channel (temp or vin) */ +#define CHAN_ALM_ENA 0x10 /* propagate to alarms-reg ?? (chk val!) */ +#define CHAN_READY (CHAN_ENA|CHAN_CNVRTD) /* sample ready mask */ + +#define TEMP_OTS_OE 0x20 /* OTS Output Enable */ +#define VIN_RW1C_MASK (CHAN_READY|CHAN_ALM_MAX|CHAN_ALM_MIN) /* 0x87 */ +#define TEMP_RW1C_MASK (VIN_RW1C_MASK|TEMP_ALM_CRIT|TEMP_FAULT) /* 0xCF */ + static void pc87360_init_device(struct platform_device *pdev, int use_thermistors) { @@ -1152,11 +1346,12 @@ static void pc87360_init_device(struct platform_device *pdev, nr = data->innr < 11 ? data->innr : 11; for (i = 0; i < nr; i++) { + reg = pc87360_read_value(data, LD_IN, i, + PC87365_REG_IN_STATUS); + dev_dbg(&pdev->dev, "bios in%d status:0x%02x\n", i, reg); if (init >= init_in[i]) { /* Forcibly enable voltage channel */ - reg = pc87360_read_value(data, LD_IN, i, - PC87365_REG_IN_STATUS); - if (!(reg & 0x01)) { + if (!(reg & CHAN_ENA)) { dev_dbg(&pdev->dev, "Forcibly " "enabling in%d\n", i); pc87360_write_value(data, LD_IN, i, @@ -1168,19 +1363,24 @@ static void pc87360_init_device(struct platform_device *pdev, /* We can't blindly trust the Super-I/O space configuration bit, most BIOS won't set it properly */ + dev_dbg(&pdev->dev, "bios thermistors:%d\n", use_thermistors); for (i = 11; i < data->innr; i++) { reg = pc87360_read_value(data, LD_IN, i, PC87365_REG_TEMP_STATUS); - use_thermistors = use_thermistors || (reg & 0x01); + use_thermistors = use_thermistors || (reg & CHAN_ENA); + /* thermistors are temp[4-6], measured on vin[11-14] */ + dev_dbg(&pdev->dev, "bios temp%d_status:0x%02x\n", i-7, reg); } + dev_dbg(&pdev->dev, "using thermistors:%d\n", use_thermistors); i = use_thermistors ? 2 : 0; for (; i < data->tempnr; i++) { + reg = pc87360_read_value(data, LD_TEMP, i, + PC87365_REG_TEMP_STATUS); + dev_dbg(&pdev->dev, "bios temp%d_status:0x%02x\n", i+1, reg); if (init >= init_temp[i]) { /* Forcibly enable temperature channel */ - reg = pc87360_read_value(data, LD_TEMP, i, - PC87365_REG_TEMP_STATUS); - if (!(reg & 0x01)) { + if (!(reg & CHAN_ENA)) { dev_dbg(&pdev->dev, "Forcibly " "enabling temp%d\n", i+1); pc87360_write_value(data, LD_TEMP, i, @@ -1197,7 +1397,7 @@ static void pc87360_init_device(struct platform_device *pdev, diodes */ reg = pc87360_read_value(data, LD_TEMP, (i-11)/2, PC87365_REG_TEMP_STATUS); - if (reg & 0x01) { + if (reg & CHAN_ENA) { dev_dbg(&pdev->dev, "Skipping " "temp%d, pin already in use " "by temp%d\n", i-7, (i-11)/2); @@ -1207,7 +1407,7 @@ static void pc87360_init_device(struct platform_device *pdev, /* Forcibly enable thermistor channel */ reg = pc87360_read_value(data, LD_IN, i, PC87365_REG_IN_STATUS); - if (!(reg & 0x01)) { + if (!(reg & CHAN_ENA)) { dev_dbg(&pdev->dev, "Forcibly " "enabling temp%d\n", i-7); pc87360_write_value(data, LD_IN, i, @@ -1221,7 +1421,8 @@ static void pc87360_init_device(struct platform_device *pdev, if (data->innr) { reg = pc87360_read_value(data, LD_IN, NO_BANK, PC87365_REG_IN_CONFIG); - if (reg & 0x01) { + dev_dbg(&pdev->dev, "bios vin-cfg:0x%02x\n", reg); + if (reg & CHAN_ENA) { dev_dbg(&pdev->dev, "Forcibly " "enabling monitoring (VLM)\n"); pc87360_write_value(data, LD_IN, NO_BANK, @@ -1233,7 +1434,8 @@ static void pc87360_init_device(struct platform_device *pdev, if (data->tempnr) { reg = pc87360_read_value(data, LD_TEMP, NO_BANK, PC87365_REG_TEMP_CONFIG); - if (reg & 0x01) { + dev_dbg(&pdev->dev, "bios temp-cfg:0x%02x\n", reg); + if (reg & CHAN_ENA) { dev_dbg(&pdev->dev, "Forcibly enabling " "monitoring (TMS)\n"); pc87360_write_value(data, LD_TEMP, NO_BANK, @@ -1336,11 +1538,11 @@ static struct pc87360_data *pc87360_update_device(struct device *dev) pc87360_write_value(data, LD_IN, i, PC87365_REG_IN_STATUS, data->in_status[i]); - if ((data->in_status[i] & 0x81) == 0x81) { + if ((data->in_status[i] & CHAN_READY) == CHAN_READY) { data->in[i] = pc87360_read_value(data, LD_IN, i, PC87365_REG_IN); } - if (data->in_status[i] & 0x01) { + if (data->in_status[i] & CHAN_ENA) { data->in_min[i] = pc87360_read_value(data, LD_IN, i, PC87365_REG_IN_MIN); @@ -1373,12 +1575,12 @@ static struct pc87360_data *pc87360_update_device(struct device *dev) pc87360_write_value(data, LD_TEMP, i, PC87365_REG_TEMP_STATUS, data->temp_status[i]); - if ((data->temp_status[i] & 0x81) == 0x81) { + if ((data->temp_status[i] & CHAN_READY) == CHAN_READY) { data->temp[i] = pc87360_read_value(data, LD_TEMP, i, PC87365_REG_TEMP); } - if (data->temp_status[i] & 0x01) { + if (data->temp_status[i] & CHAN_ENA) { data->temp_min[i] = pc87360_read_value(data, LD_TEMP, i, PC87365_REG_TEMP_MIN); diff --git a/drivers/i2c/chips/at24.c b/drivers/i2c/chips/at24.c index 2a4acb269569..d4775528abc6 100644 --- a/drivers/i2c/chips/at24.c +++ b/drivers/i2c/chips/at24.c @@ -460,7 +460,6 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) */ at24->bin.attr.name = "eeprom"; at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR; - at24->bin.attr.owner = THIS_MODULE; at24->bin.read = at24_bin_read; at24->bin.size = chip.byte_len; diff --git a/drivers/i2c/chips/ds1682.c b/drivers/i2c/chips/ds1682.c index 23be4d42cb02..f3ee4a1abb77 100644 --- a/drivers/i2c/chips/ds1682.c +++ b/drivers/i2c/chips/ds1682.c @@ -190,7 +190,6 @@ static struct bin_attribute ds1682_eeprom_attr = { .attr = { .name = "eeprom", .mode = S_IRUGO | S_IWUSR, - .owner = THIS_MODULE, }, .size = DS1682_EEPROM_SIZE, .read = ds1682_eeprom_read, diff --git a/drivers/i2c/chips/menelaus.c b/drivers/i2c/chips/menelaus.c index 176126d3a01d..4b364bae6b3e 100644 --- a/drivers/i2c/chips/menelaus.c +++ b/drivers/i2c/chips/menelaus.c @@ -832,52 +832,52 @@ static irqreturn_t menelaus_irq(int irq, void *_menelaus) static void menelaus_to_time(char *regs, struct rtc_time *t) { - t->tm_sec = BCD2BIN(regs[0]); - t->tm_min = BCD2BIN(regs[1]); + t->tm_sec = bcd2bin(regs[0]); + t->tm_min = bcd2bin(regs[1]); if (the_menelaus->rtc_control & RTC_CTRL_MODE12) { - t->tm_hour = BCD2BIN(regs[2] & 0x1f) - 1; + t->tm_hour = bcd2bin(regs[2] & 0x1f) - 1; if (regs[2] & RTC_HR_PM) t->tm_hour += 12; } else - t->tm_hour = BCD2BIN(regs[2] & 0x3f); - t->tm_mday = BCD2BIN(regs[3]); - t->tm_mon = BCD2BIN(regs[4]) - 1; - t->tm_year = BCD2BIN(regs[5]) + 100; + t->tm_hour = bcd2bin(regs[2] & 0x3f); + t->tm_mday = bcd2bin(regs[3]); + t->tm_mon = bcd2bin(regs[4]) - 1; + t->tm_year = bcd2bin(regs[5]) + 100; } static int time_to_menelaus(struct rtc_time *t, int regnum) { int hour, status; - status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_sec)); + status = menelaus_write_reg(regnum++, bin2bcd(t->tm_sec)); if (status < 0) goto fail; - status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_min)); + status = menelaus_write_reg(regnum++, bin2bcd(t->tm_min)); if (status < 0) goto fail; if (the_menelaus->rtc_control & RTC_CTRL_MODE12) { hour = t->tm_hour + 1; if (hour > 12) - hour = RTC_HR_PM | BIN2BCD(hour - 12); + hour = RTC_HR_PM | bin2bcd(hour - 12); else - hour = BIN2BCD(hour); + hour = bin2bcd(hour); } else - hour = BIN2BCD(t->tm_hour); + hour = bin2bcd(t->tm_hour); status = menelaus_write_reg(regnum++, hour); if (status < 0) goto fail; - status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_mday)); + status = menelaus_write_reg(regnum++, bin2bcd(t->tm_mday)); if (status < 0) goto fail; - status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_mon + 1)); + status = menelaus_write_reg(regnum++, bin2bcd(t->tm_mon + 1)); if (status < 0) goto fail; - status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_year - 100)); + status = menelaus_write_reg(regnum++, bin2bcd(t->tm_year - 100)); if (status < 0) goto fail; @@ -914,7 +914,7 @@ static int menelaus_read_time(struct device *dev, struct rtc_time *t) } menelaus_to_time(regs, t); - t->tm_wday = BCD2BIN(regs[6]); + t->tm_wday = bcd2bin(regs[6]); return 0; } @@ -927,7 +927,7 @@ static int menelaus_set_time(struct device *dev, struct rtc_time *t) status = time_to_menelaus(t, MENELAUS_RTC_SEC); if (status < 0) return status; - status = menelaus_write_reg(MENELAUS_RTC_WKDAY, BIN2BCD(t->tm_wday)); + status = menelaus_write_reg(MENELAUS_RTC_WKDAY, bin2bcd(t->tm_wday)); if (status < 0) { dev_err(&the_menelaus->client->dev, "rtc write reg %02x " "err %d\n", MENELAUS_RTC_WKDAY, status); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index a78d35aecee3..f1e82a92e61e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -122,7 +122,7 @@ struct cm_counter_attribute { #define CM_COUNTER_ATTR(_name, _index) \ struct cm_counter_attribute cm_##_name##_counter_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ .index = _index \ } diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c index c7c770c28988..aa1ff524256e 100644 --- a/drivers/media/dvb/ttpci/av7110.c +++ b/drivers/media/dvb/ttpci/av7110.c @@ -36,7 +36,6 @@ #include <linux/fs.h> #include <linux/timer.h> #include <linux/poll.h> -#include <linux/byteorder/swabb.h> #include <linux/smp_lock.h> #include <linux/kernel.h> @@ -52,6 +51,7 @@ #include <linux/i2c.h> #include <linux/kthread.h> #include <asm/unaligned.h> +#include <asm/byteorder.h> #include <asm/system.h> diff --git a/drivers/media/video/cx18/cx18-driver.h b/drivers/media/video/cx18/cx18-driver.h index fa8be0731a3f..a4b1708fafe7 100644 --- a/drivers/media/video/cx18/cx18-driver.h +++ b/drivers/media/video/cx18/cx18-driver.h @@ -41,6 +41,7 @@ #include <linux/pagemap.h> #include <linux/workqueue.h> #include <linux/mutex.h> +#include <asm/byteorder.h> #include <linux/dvb/video.h> #include <linux/dvb/audio.h> diff --git a/drivers/media/video/ivtv/ivtv-driver.h b/drivers/media/video/ivtv/ivtv-driver.h index bc29436e8a3c..3733b2afec5f 100644 --- a/drivers/media/video/ivtv/ivtv-driver.h +++ b/drivers/media/video/ivtv/ivtv-driver.h @@ -55,6 +55,7 @@ #include <linux/mutex.h> #include <asm/uaccess.h> #include <asm/system.h> +#include <asm/byteorder.h> #include <linux/dvb/video.h> #include <linux/dvb/audio.h> diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 6e291bf8237a..5263913e0c69 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1044,7 +1044,6 @@ static int mspro_block_read_attributes(struct memstick_dev *card) s_attr->dev_attr.attr.name = s_attr->name; s_attr->dev_attr.attr.mode = S_IRUGO; - s_attr->dev_attr.attr.owner = THIS_MODULE; s_attr->dev_attr.show = mspro_block_attr_show(s_attr->id); if (!rc) diff --git a/drivers/misc/hp-wmi.c b/drivers/misc/hp-wmi.c index 5dabfb69ee53..4b7c24c519c3 100644 --- a/drivers/misc/hp-wmi.c +++ b/drivers/misc/hp-wmi.c @@ -82,6 +82,7 @@ static struct key_entry hp_wmi_keymap[] = { {KE_KEY, 0x03, KEY_BRIGHTNESSDOWN}, {KE_KEY, 0x20e6, KEY_PROG1}, {KE_KEY, 0x2142, KEY_MEDIA}, + {KE_KEY, 0x213b, KEY_INFO}, {KE_KEY, 0x231b, KEY_HELP}, {KE_END, 0} }; diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 14f11f8b9e5f..a90d50c2c3e5 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -172,6 +172,11 @@ config MTD_CHAR memory chips, and also use ioctl() to obtain information about the device, or to erase parts of it. +config HAVE_MTD_OTP + bool + help + Enable access to OTP regions using MTD_CHAR. + config MTD_BLKDEVS tristate "Common interface to block layer for MTD 'translation layers'" depends on BLOCK diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig index 479d32b57a1e..9408099eec48 100644 --- a/drivers/mtd/chips/Kconfig +++ b/drivers/mtd/chips/Kconfig @@ -6,6 +6,7 @@ menu "RAM/ROM/Flash chip drivers" config MTD_CFI tristate "Detect flash chips by Common Flash Interface (CFI) probe" select MTD_GEN_PROBE + select MTD_CFI_UTIL help The Common Flash Interface specification was developed by Intel, AMD and other flash manufactures that provides a universal method @@ -154,6 +155,7 @@ config MTD_CFI_I8 config MTD_OTP bool "Protection Registers aka one-time programmable (OTP) bits" depends on MTD_CFI_ADV_OPTIONS + select HAVE_MTD_OTP default n help This enables support for reading, writing and locking so called @@ -187,7 +189,7 @@ config MTD_CFI_INTELEXT StrataFlash and other parts. config MTD_CFI_AMDSTD - tristate "Support for AMD/Fujitsu flash chips" + tristate "Support for AMD/Fujitsu/Spansion flash chips" depends on MTD_GEN_PROBE select MTD_CFI_UTIL help diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index 5f1b472137a0..c93a8be5d5f1 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -478,6 +478,28 @@ struct mtd_info *cfi_cmdset_0001(struct map_info *map, int primary) else cfi->chips[i].erase_time = 2000000; + if (cfi->cfiq->WordWriteTimeoutTyp && + cfi->cfiq->WordWriteTimeoutMax) + cfi->chips[i].word_write_time_max = + 1<<(cfi->cfiq->WordWriteTimeoutTyp + + cfi->cfiq->WordWriteTimeoutMax); + else + cfi->chips[i].word_write_time_max = 50000 * 8; + + if (cfi->cfiq->BufWriteTimeoutTyp && + cfi->cfiq->BufWriteTimeoutMax) + cfi->chips[i].buffer_write_time_max = + 1<<(cfi->cfiq->BufWriteTimeoutTyp + + cfi->cfiq->BufWriteTimeoutMax); + + if (cfi->cfiq->BlockEraseTimeoutTyp && + cfi->cfiq->BlockEraseTimeoutMax) + cfi->chips[i].erase_time_max = + 1000<<(cfi->cfiq->BlockEraseTimeoutTyp + + cfi->cfiq->BlockEraseTimeoutMax); + else + cfi->chips[i].erase_time_max = 2000000 * 8; + cfi->chips[i].ref_point_counter = 0; init_waitqueue_head(&(cfi->chips[i].wq)); } @@ -703,6 +725,10 @@ static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long struct cfi_pri_intelext *cfip = cfi->cmdset_priv; unsigned long timeo = jiffies + HZ; + /* Prevent setting state FL_SYNCING for chip in suspended state. */ + if (mode == FL_SYNCING && chip->oldstate != FL_READY) + goto sleep; + switch (chip->state) { case FL_STATUS: @@ -808,8 +834,9 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr DECLARE_WAITQUEUE(wait, current); retry: - if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING - || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) { + if (chip->priv && + (mode == FL_WRITING || mode == FL_ERASING || mode == FL_OTP_WRITE + || mode == FL_SHUTDOWN) && chip->state != FL_SYNCING) { /* * OK. We have possibility for contention on the write/erase * operations which are global to the real chip and not per @@ -859,6 +886,14 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr return ret; } spin_lock(&shared->lock); + + /* We should not own chip if it is already + * in FL_SYNCING state. Put contender and retry. */ + if (chip->state == FL_SYNCING) { + put_chip(map, contender, contender->start); + spin_unlock(contender->mutex); + goto retry; + } spin_unlock(contender->mutex); } @@ -1012,7 +1047,7 @@ static void __xipram xip_enable(struct map_info *map, struct flchip *chip, static int __xipram xip_wait_for_operation( struct map_info *map, struct flchip *chip, - unsigned long adr, unsigned int chip_op_time ) + unsigned long adr, unsigned int chip_op_time_max) { struct cfi_private *cfi = map->fldrv_priv; struct cfi_pri_intelext *cfip = cfi->cmdset_priv; @@ -1021,7 +1056,7 @@ static int __xipram xip_wait_for_operation( flstate_t oldstate, newstate; start = xip_currtime(); - usec = chip_op_time * 8; + usec = chip_op_time_max; if (usec == 0) usec = 500000; done = 0; @@ -1131,8 +1166,8 @@ static int __xipram xip_wait_for_operation( #define XIP_INVAL_CACHED_RANGE(map, from, size) \ INVALIDATE_CACHED_RANGE(map, from, size) -#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec) \ - xip_wait_for_operation(map, chip, cmd_adr, usec) +#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec, usec_max) \ + xip_wait_for_operation(map, chip, cmd_adr, usec_max) #else @@ -1144,7 +1179,7 @@ static int __xipram xip_wait_for_operation( static int inval_cache_and_wait_for_operation( struct map_info *map, struct flchip *chip, unsigned long cmd_adr, unsigned long inval_adr, int inval_len, - unsigned int chip_op_time) + unsigned int chip_op_time, unsigned int chip_op_time_max) { struct cfi_private *cfi = map->fldrv_priv; map_word status, status_OK = CMD(0x80); @@ -1156,8 +1191,7 @@ static int inval_cache_and_wait_for_operation( INVALIDATE_CACHED_RANGE(map, inval_adr, inval_len); spin_lock(chip->mutex); - /* set our timeout to 8 times the expected delay */ - timeo = chip_op_time * 8; + timeo = chip_op_time_max; if (!timeo) timeo = 500000; reset_timeo = timeo; @@ -1217,8 +1251,8 @@ static int inval_cache_and_wait_for_operation( #endif -#define WAIT_TIMEOUT(map, chip, adr, udelay) \ - INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay); +#define WAIT_TIMEOUT(map, chip, adr, udelay, udelay_max) \ + INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay, udelay_max); static int do_point_onechip (struct map_info *map, struct flchip *chip, loff_t adr, size_t len) @@ -1452,7 +1486,8 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip, ret = INVAL_CACHE_AND_WAIT(map, chip, adr, adr, map_bankwidth(map), - chip->word_write_time); + chip->word_write_time, + chip->word_write_time_max); if (ret) { xip_enable(map, chip, adr); printk(KERN_ERR "%s: word write error (status timeout)\n", map->name); @@ -1623,7 +1658,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip, chip->state = FL_WRITING_TO_BUFFER; map_write(map, write_cmd, cmd_adr); - ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0); + ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0, 0); if (ret) { /* Argh. Not ready for write to buffer */ map_word Xstatus = map_read(map, cmd_adr); @@ -1640,7 +1675,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip, /* Figure out the number of words to write */ word_gap = (-adr & (map_bankwidth(map)-1)); - words = (len - word_gap + map_bankwidth(map) - 1) / map_bankwidth(map); + words = DIV_ROUND_UP(len - word_gap, map_bankwidth(map)); if (!word_gap) { words--; } else { @@ -1692,7 +1727,8 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip, ret = INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, initial_adr, initial_len, - chip->buffer_write_time); + chip->buffer_write_time, + chip->buffer_write_time_max); if (ret) { map_write(map, CMD(0x70), cmd_adr); chip->state = FL_STATUS; @@ -1827,7 +1863,8 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip, ret = INVAL_CACHE_AND_WAIT(map, chip, adr, adr, len, - chip->erase_time); + chip->erase_time, + chip->erase_time_max); if (ret) { map_write(map, CMD(0x70), adr); chip->state = FL_STATUS; @@ -2006,7 +2043,7 @@ static int __xipram do_xxlock_oneblock(struct map_info *map, struct flchip *chip */ udelay = (!extp || !(extp->FeatureSupport & (1 << 5))) ? 1000000/HZ : 0; - ret = WAIT_TIMEOUT(map, chip, adr, udelay); + ret = WAIT_TIMEOUT(map, chip, adr, udelay, udelay * 100); if (ret) { map_write(map, CMD(0x70), adr); chip->state = FL_STATUS; diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index a972cc6be436..3e6f5d8609e8 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -13,6 +13,8 @@ * XIP support hooks by Vitaly Wool (based on code for Intel flash * by Nicolas Pitre) * + * 25/09/2008 Christopher Moore: TopBottom fixup for many Macronix with CFI V1.0 + * * Occasionally maintained by Thayne Harbaugh tharbaugh at lnxi dot com * * This code is GPL @@ -43,6 +45,7 @@ #define MANUFACTURER_AMD 0x0001 #define MANUFACTURER_ATMEL 0x001F +#define MANUFACTURER_MACRONIX 0x00C2 #define MANUFACTURER_SST 0x00BF #define SST49LF004B 0x0060 #define SST49LF040B 0x0050 @@ -144,12 +147,44 @@ static void fixup_amd_bootblock(struct mtd_info *mtd, void* param) if (((major << 8) | minor) < 0x3131) { /* CFI version 1.0 => don't trust bootloc */ + + DEBUG(MTD_DEBUG_LEVEL1, + "%s: JEDEC Vendor ID is 0x%02X Device ID is 0x%02X\n", + map->name, cfi->mfr, cfi->id); + + /* AFAICS all 29LV400 with a bottom boot block have a device ID + * of 0x22BA in 16-bit mode and 0xBA in 8-bit mode. + * These were badly detected as they have the 0x80 bit set + * so treat them as a special case. + */ + if (((cfi->id == 0xBA) || (cfi->id == 0x22BA)) && + + /* Macronix added CFI to their 2nd generation + * MX29LV400C B/T but AFAICS no other 29LV400 (AMD, + * Fujitsu, Spansion, EON, ESI and older Macronix) + * has CFI. + * + * Therefore also check the manufacturer. + * This reduces the risk of false detection due to + * the 8-bit device ID. + */ + (cfi->mfr == MANUFACTURER_MACRONIX)) { + DEBUG(MTD_DEBUG_LEVEL1, + "%s: Macronix MX29LV400C with bottom boot block" + " detected\n", map->name); + extp->TopBottom = 2; /* bottom boot */ + } else if (cfi->id & 0x80) { printk(KERN_WARNING "%s: JEDEC Device ID is 0x%02X. Assuming broken CFI table.\n", map->name, cfi->id); extp->TopBottom = 3; /* top boot */ } else { extp->TopBottom = 2; /* bottom boot */ } + + DEBUG(MTD_DEBUG_LEVEL1, + "%s: AMD CFI PRI V%c.%c has no boot block field;" + " deduced %s from Device ID\n", map->name, major, minor, + extp->TopBottom == 2 ? "bottom" : "top"); } } #endif @@ -178,10 +213,18 @@ static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param) if (atmel_pri.Features & 0x02) extp->EraseSuspend = 2; - if (atmel_pri.BottomBoot) - extp->TopBottom = 2; - else - extp->TopBottom = 3; + /* Some chips got it backwards... */ + if (cfi->id == AT49BV6416) { + if (atmel_pri.BottomBoot) + extp->TopBottom = 3; + else + extp->TopBottom = 2; + } else { + if (atmel_pri.BottomBoot) + extp->TopBottom = 2; + else + extp->TopBottom = 3; + } /* burst write mode not supported */ cfi->cfiq->BufWriteTimeoutTyp = 0; @@ -243,6 +286,7 @@ static struct cfi_fixup cfi_fixup_table[] = { { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri, NULL }, #ifdef AMD_BOOTLOC_BUG { CFI_MFR_AMD, CFI_ID_ANY, fixup_amd_bootblock, NULL }, + { MANUFACTURER_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock, NULL }, #endif { CFI_MFR_AMD, 0x0050, fixup_use_secsi, NULL, }, { CFI_MFR_AMD, 0x0053, fixup_use_secsi, NULL, }, diff --git a/drivers/mtd/chips/cfi_probe.c b/drivers/mtd/chips/cfi_probe.c index c418e92e1d92..e63e6749429a 100644 --- a/drivers/mtd/chips/cfi_probe.c +++ b/drivers/mtd/chips/cfi_probe.c @@ -44,17 +44,14 @@ do { \ #define xip_enable(base, map, cfi) \ do { \ - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \ - cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \ + cfi_qry_mode_off(base, map, cfi); \ xip_allowed(base, map); \ } while (0) #define xip_disable_qry(base, map, cfi) \ do { \ xip_disable(); \ - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \ - cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \ - cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); \ + cfi_qry_mode_on(base, map, cfi); \ } while (0) #else @@ -70,32 +67,6 @@ do { \ in: interleave,type,mode ret: table index, <0 for error */ -static int __xipram qry_present(struct map_info *map, __u32 base, - struct cfi_private *cfi) -{ - int osf = cfi->interleave * cfi->device_type; // scale factor - map_word val[3]; - map_word qry[3]; - - qry[0] = cfi_build_cmd('Q', map, cfi); - qry[1] = cfi_build_cmd('R', map, cfi); - qry[2] = cfi_build_cmd('Y', map, cfi); - - val[0] = map_read(map, base + osf*0x10); - val[1] = map_read(map, base + osf*0x11); - val[2] = map_read(map, base + osf*0x12); - - if (!map_word_equal(map, qry[0], val[0])) - return 0; - - if (!map_word_equal(map, qry[1], val[1])) - return 0; - - if (!map_word_equal(map, qry[2], val[2])) - return 0; - - return 1; // "QRY" found -} static int __xipram cfi_probe_chip(struct map_info *map, __u32 base, unsigned long *chip_map, struct cfi_private *cfi) @@ -116,11 +87,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base, } xip_disable(); - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); - - if (!qry_present(map,base,cfi)) { + if (!cfi_qry_mode_on(base, map, cfi)) { xip_enable(base, map, cfi); return 0; } @@ -141,14 +108,13 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base, start = i << cfi->chipshift; /* This chip should be in read mode if it's one we've already touched. */ - if (qry_present(map, start, cfi)) { + if (cfi_qry_present(map, start, cfi)) { /* Eep. This chip also had the QRY marker. * Is it an alias for the new one? */ - cfi_send_gen_cmd(0xF0, 0, start, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL); + cfi_qry_mode_off(start, map, cfi); /* If the QRY marker goes away, it's an alias */ - if (!qry_present(map, start, cfi)) { + if (!cfi_qry_present(map, start, cfi)) { xip_allowed(base, map); printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n", map->name, base, start); @@ -158,10 +124,9 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base, * unfortunate. Stick the new chip in read mode * too and if it's the same, assume it's an alias. */ /* FIXME: Use other modes to do a proper check */ - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL); + cfi_qry_mode_off(base, map, cfi); - if (qry_present(map, base, cfi)) { + if (cfi_qry_present(map, base, cfi)) { xip_allowed(base, map); printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n", map->name, base, start); @@ -176,8 +141,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base, cfi->numchips++; /* Put it back into Read Mode */ - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); + cfi_qry_mode_off(base, map, cfi); xip_allowed(base, map); printk(KERN_INFO "%s: Found %d x%d devices at 0x%x in %d-bit bank\n", @@ -237,9 +201,7 @@ static int __xipram cfi_chip_setup(struct map_info *map, cfi_read_query(map, base + 0xf * ofs_factor); /* Put it back into Read Mode */ - cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); - /* ... even if it's an Intel chip */ - cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); + cfi_qry_mode_off(base, map, cfi); xip_allowed(base, map); /* Do any necessary byteswapping */ diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c index 0ee457018016..34d40e25d312 100644 --- a/drivers/mtd/chips/cfi_util.c +++ b/drivers/mtd/chips/cfi_util.c @@ -24,6 +24,66 @@ #include <linux/mtd/cfi.h> #include <linux/mtd/compatmac.h> +int __xipram cfi_qry_present(struct map_info *map, __u32 base, + struct cfi_private *cfi) +{ + int osf = cfi->interleave * cfi->device_type; /* scale factor */ + map_word val[3]; + map_word qry[3]; + + qry[0] = cfi_build_cmd('Q', map, cfi); + qry[1] = cfi_build_cmd('R', map, cfi); + qry[2] = cfi_build_cmd('Y', map, cfi); + + val[0] = map_read(map, base + osf*0x10); + val[1] = map_read(map, base + osf*0x11); + val[2] = map_read(map, base + osf*0x12); + + if (!map_word_equal(map, qry[0], val[0])) + return 0; + + if (!map_word_equal(map, qry[1], val[1])) + return 0; + + if (!map_word_equal(map, qry[2], val[2])) + return 0; + + return 1; /* "QRY" found */ +} +EXPORT_SYMBOL_GPL(cfi_qry_present); + +int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map, + struct cfi_private *cfi) +{ + cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); + cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); + if (cfi_qry_present(map, base, cfi)) + return 1; + /* QRY not found probably we deal with some odd CFI chips */ + /* Some revisions of some old Intel chips? */ + cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); + cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); + cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); + if (cfi_qry_present(map, base, cfi)) + return 1; + /* ST M29DW chips */ + cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); + cfi_send_gen_cmd(0x98, 0x555, base, map, cfi, cfi->device_type, NULL); + if (cfi_qry_present(map, base, cfi)) + return 1; + /* QRY not found */ + return 0; +} +EXPORT_SYMBOL_GPL(cfi_qry_mode_on); + +void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map, + struct cfi_private *cfi) +{ + cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); + cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); +} +EXPORT_SYMBOL_GPL(cfi_qry_mode_off); + struct cfi_extquery * __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* name) { @@ -48,8 +108,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n #endif /* Switch it into Query Mode */ - cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); - + cfi_qry_mode_on(base, map, cfi); /* Read in the Extended Query Table */ for (i=0; i<size; i++) { ((unsigned char *)extp)[i] = @@ -57,8 +116,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n } /* Make sure it returns to read mode */ - cfi_send_gen_cmd(0xf0, 0, base, map, cfi, cfi->device_type, NULL); - cfi_send_gen_cmd(0xff, 0, base, map, cfi, cfi->device_type, NULL); + cfi_qry_mode_off(base, map, cfi); #ifdef CONFIG_MTD_XIP (void) map_read(map, base); diff --git a/drivers/mtd/chips/gen_probe.c b/drivers/mtd/chips/gen_probe.c index f061885b2812..e2dc96441e05 100644 --- a/drivers/mtd/chips/gen_probe.c +++ b/drivers/mtd/chips/gen_probe.c @@ -111,7 +111,7 @@ static struct cfi_private *genprobe_ident_chips(struct map_info *map, struct chi max_chips = 1; } - mapsize = sizeof(long) * ( (max_chips + BITS_PER_LONG-1) / BITS_PER_LONG ); + mapsize = sizeof(long) * DIV_ROUND_UP(max_chips, BITS_PER_LONG); chip_map = kzalloc(mapsize, GFP_KERNEL); if (!chip_map) { printk(KERN_WARNING "%s: kmalloc failed for CFI chip map\n", map->name); diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c index 71bc07f149b7..50a340388e74 100644 --- a/drivers/mtd/cmdlinepart.c +++ b/drivers/mtd/cmdlinepart.c @@ -7,6 +7,7 @@ * * mtdparts=<mtddef>[;<mtddef] * <mtddef> := <mtd-id>:<partdef>[,<partdef>] + * where <mtd-id> is the name from the "cat /proc/mtd" command * <partdef> := <size>[@offset][<name>][ro][lk] * <mtd-id> := unique name used in mapping driver/device (mtd->name) * <size> := standard linux memsize OR "-" to denote all remaining space diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig index 9c613f06623c..6fde0a2e3567 100644 --- a/drivers/mtd/devices/Kconfig +++ b/drivers/mtd/devices/Kconfig @@ -59,6 +59,27 @@ config MTD_DATAFLASH Sometimes DataFlash chips are packaged inside MMC-format cards; at this writing, the MMC stack won't handle those. +config MTD_DATAFLASH_WRITE_VERIFY + bool "Verify DataFlash page writes" + depends on MTD_DATAFLASH + help + This adds an extra check when data is written to the flash. + It may help if you are verifying chip setup (timings etc) on + your board. There is a rare possibility that even though the + device thinks the write was successful, a bit could have been + flipped accidentally due to device wear or something else. + +config MTD_DATAFLASH_OTP + bool "DataFlash OTP support (Security Register)" + depends on MTD_DATAFLASH + select HAVE_MTD_OTP + help + Newer DataFlash chips (revisions C and D) support 128 bytes of + one-time-programmable (OTP) data. The first half may be written + (once) with up to 64 bytes of data, such as a serial number or + other key product data. The second half is programmed with a + unique-to-each-chip bit pattern at the factory. + config MTD_M25P80 tristate "Support most SPI Flash chips (AT26DF, M25P, W25X, ...)" depends on SPI_MASTER && EXPERIMENTAL diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index b35c3333e210..76a76751da36 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -39,6 +39,7 @@ #define OPCODE_PP 0x02 /* Page program (up to 256 bytes) */ #define OPCODE_BE_4K 0x20 /* Erase 4KiB block */ #define OPCODE_BE_32K 0x52 /* Erase 32KiB block */ +#define OPCODE_BE 0xc7 /* Erase whole flash block */ #define OPCODE_SE 0xd8 /* Sector erase (usually 64KiB) */ #define OPCODE_RDID 0x9f /* Read JEDEC ID */ @@ -161,6 +162,31 @@ static int wait_till_ready(struct m25p *flash) return 1; } +/* + * Erase the whole flash memory + * + * Returns 0 if successful, non-zero otherwise. + */ +static int erase_block(struct m25p *flash) +{ + DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %dKiB\n", + flash->spi->dev.bus_id, __func__, + flash->mtd.size / 1024); + + /* Wait until finished previous write command. */ + if (wait_till_ready(flash)) + return 1; + + /* Send write enable, then erase commands. */ + write_enable(flash); + + /* Set up command buffer. */ + flash->command[0] = OPCODE_BE; + + spi_write(flash->spi, flash->command, 1); + + return 0; +} /* * Erase one sector of flash memory at offset ``offset'' which is any @@ -229,15 +255,21 @@ static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr) */ /* now erase those sectors */ - while (len) { - if (erase_sector(flash, addr)) { - instr->state = MTD_ERASE_FAILED; - mutex_unlock(&flash->lock); - return -EIO; - } + if (len == flash->mtd.size && erase_block(flash)) { + instr->state = MTD_ERASE_FAILED; + mutex_unlock(&flash->lock); + return -EIO; + } else { + while (len) { + if (erase_sector(flash, addr)) { + instr->state = MTD_ERASE_FAILED; + mutex_unlock(&flash->lock); + return -EIO; + } - addr += mtd->erasesize; - len -= mtd->erasesize; + addr += mtd->erasesize; + len -= mtd->erasesize; + } } mutex_unlock(&flash->lock); @@ -437,6 +469,7 @@ struct flash_info { * then a two byte device id. */ u32 jedec_id; + u16 ext_id; /* The size listed here is what works with OPCODE_SE, which isn't * necessarily called a "sector" by the vendor. @@ -456,72 +489,75 @@ struct flash_info { static struct flash_info __devinitdata m25p_data [] = { /* Atmel -- some are (confusingly) marketed as "DataFlash" */ - { "at25fs010", 0x1f6601, 32 * 1024, 4, SECT_4K, }, - { "at25fs040", 0x1f6604, 64 * 1024, 8, SECT_4K, }, + { "at25fs010", 0x1f6601, 0, 32 * 1024, 4, SECT_4K, }, + { "at25fs040", 0x1f6604, 0, 64 * 1024, 8, SECT_4K, }, - { "at25df041a", 0x1f4401, 64 * 1024, 8, SECT_4K, }, - { "at25df641", 0x1f4800, 64 * 1024, 128, SECT_4K, }, + { "at25df041a", 0x1f4401, 0, 64 * 1024, 8, SECT_4K, }, + { "at25df641", 0x1f4800, 0, 64 * 1024, 128, SECT_4K, }, - { "at26f004", 0x1f0400, 64 * 1024, 8, SECT_4K, }, - { "at26df081a", 0x1f4501, 64 * 1024, 16, SECT_4K, }, - { "at26df161a", 0x1f4601, 64 * 1024, 32, SECT_4K, }, - { "at26df321", 0x1f4701, 64 * 1024, 64, SECT_4K, }, + { "at26f004", 0x1f0400, 0, 64 * 1024, 8, SECT_4K, }, + { "at26df081a", 0x1f4501, 0, 64 * 1024, 16, SECT_4K, }, + { "at26df161a", 0x1f4601, 0, 64 * 1024, 32, SECT_4K, }, + { "at26df321", 0x1f4701, 0, 64 * 1024, 64, SECT_4K, }, /* Spansion -- single (large) sector size only, at least * for the chips listed here (without boot sectors). */ - { "s25sl004a", 0x010212, 64 * 1024, 8, }, - { "s25sl008a", 0x010213, 64 * 1024, 16, }, - { "s25sl016a", 0x010214, 64 * 1024, 32, }, - { "s25sl032a", 0x010215, 64 * 1024, 64, }, - { "s25sl064a", 0x010216, 64 * 1024, 128, }, + { "s25sl004a", 0x010212, 0, 64 * 1024, 8, }, + { "s25sl008a", 0x010213, 0, 64 * 1024, 16, }, + { "s25sl016a", 0x010214, 0, 64 * 1024, 32, }, + { "s25sl032a", 0x010215, 0, 64 * 1024, 64, }, + { "s25sl064a", 0x010216, 0, 64 * 1024, 128, }, + { "s25sl12800", 0x012018, 0x0300, 256 * 1024, 64, }, + { "s25sl12801", 0x012018, 0x0301, 64 * 1024, 256, }, /* SST -- large erase sizes are "overlays", "sectors" are 4K */ - { "sst25vf040b", 0xbf258d, 64 * 1024, 8, SECT_4K, }, - { "sst25vf080b", 0xbf258e, 64 * 1024, 16, SECT_4K, }, - { "sst25vf016b", 0xbf2541, 64 * 1024, 32, SECT_4K, }, - { "sst25vf032b", 0xbf254a, 64 * 1024, 64, SECT_4K, }, + { "sst25vf040b", 0xbf258d, 0, 64 * 1024, 8, SECT_4K, }, + { "sst25vf080b", 0xbf258e, 0, 64 * 1024, 16, SECT_4K, }, + { "sst25vf016b", 0xbf2541, 0, 64 * 1024, 32, SECT_4K, }, + { "sst25vf032b", 0xbf254a, 0, 64 * 1024, 64, SECT_4K, }, /* ST Microelectronics -- newer production may have feature updates */ - { "m25p05", 0x202010, 32 * 1024, 2, }, - { "m25p10", 0x202011, 32 * 1024, 4, }, - { "m25p20", 0x202012, 64 * 1024, 4, }, - { "m25p40", 0x202013, 64 * 1024, 8, }, - { "m25p80", 0, 64 * 1024, 16, }, - { "m25p16", 0x202015, 64 * 1024, 32, }, - { "m25p32", 0x202016, 64 * 1024, 64, }, - { "m25p64", 0x202017, 64 * 1024, 128, }, - { "m25p128", 0x202018, 256 * 1024, 64, }, - - { "m45pe80", 0x204014, 64 * 1024, 16, }, - { "m45pe16", 0x204015, 64 * 1024, 32, }, - - { "m25pe80", 0x208014, 64 * 1024, 16, }, - { "m25pe16", 0x208015, 64 * 1024, 32, SECT_4K, }, + { "m25p05", 0x202010, 0, 32 * 1024, 2, }, + { "m25p10", 0x202011, 0, 32 * 1024, 4, }, + { "m25p20", 0x202012, 0, 64 * 1024, 4, }, + { "m25p40", 0x202013, 0, 64 * 1024, 8, }, + { "m25p80", 0, 0, 64 * 1024, 16, }, + { "m25p16", 0x202015, 0, 64 * 1024, 32, }, + { "m25p32", 0x202016, 0, 64 * 1024, 64, }, + { "m25p64", 0x202017, 0, 64 * 1024, 128, }, + { "m25p128", 0x202018, 0, 256 * 1024, 64, }, + + { "m45pe80", 0x204014, 0, 64 * 1024, 16, }, + { "m45pe16", 0x204015, 0, 64 * 1024, 32, }, + + { "m25pe80", 0x208014, 0, 64 * 1024, 16, }, + { "m25pe16", 0x208015, 0, 64 * 1024, 32, SECT_4K, }, /* Winbond -- w25x "blocks" are 64K, "sectors" are 4KiB */ - { "w25x10", 0xef3011, 64 * 1024, 2, SECT_4K, }, - { "w25x20", 0xef3012, 64 * 1024, 4, SECT_4K, }, - { "w25x40", 0xef3013, 64 * 1024, 8, SECT_4K, }, - { "w25x80", 0xef3014, 64 * 1024, 16, SECT_4K, }, - { "w25x16", 0xef3015, 64 * 1024, 32, SECT_4K, }, - { "w25x32", 0xef3016, 64 * 1024, 64, SECT_4K, }, - { "w25x64", 0xef3017, 64 * 1024, 128, SECT_4K, }, + { "w25x10", 0xef3011, 0, 64 * 1024, 2, SECT_4K, }, + { "w25x20", 0xef3012, 0, 64 * 1024, 4, SECT_4K, }, + { "w25x40", 0xef3013, 0, 64 * 1024, 8, SECT_4K, }, + { "w25x80", 0xef3014, 0, 64 * 1024, 16, SECT_4K, }, + { "w25x16", 0xef3015, 0, 64 * 1024, 32, SECT_4K, }, + { "w25x32", 0xef3016, 0, 64 * 1024, 64, SECT_4K, }, + { "w25x64", 0xef3017, 0, 64 * 1024, 128, SECT_4K, }, }; static struct flash_info *__devinit jedec_probe(struct spi_device *spi) { int tmp; u8 code = OPCODE_RDID; - u8 id[3]; + u8 id[5]; u32 jedec; + u16 ext_jedec; struct flash_info *info; /* JEDEC also defines an optional "extended device information" * string for after vendor-specific data, after the three bytes * we use here. Supporting some chips might require using it. */ - tmp = spi_write_then_read(spi, &code, 1, id, 3); + tmp = spi_write_then_read(spi, &code, 1, id, 5); if (tmp < 0) { DEBUG(MTD_DEBUG_LEVEL0, "%s: error %d reading JEDEC ID\n", spi->dev.bus_id, tmp); @@ -533,10 +569,14 @@ static struct flash_info *__devinit jedec_probe(struct spi_device *spi) jedec = jedec << 8; jedec |= id[2]; + ext_jedec = id[3] << 8 | id[4]; + for (tmp = 0, info = m25p_data; tmp < ARRAY_SIZE(m25p_data); tmp++, info++) { if (info->jedec_id == jedec) + if (ext_jedec != 0 && info->ext_id != ext_jedec) + continue; return info; } dev_err(&spi->dev, "unrecognized JEDEC id %06x\n", jedec); diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 8bd0dea6885f..6dd9aff8bb2d 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -30,12 +30,10 @@ * doesn't (yet) use these for any kind of i/o overlap or prefetching. * * Sometimes DataFlash is packaged in MMC-format cards, although the - * MMC stack can't use SPI (yet), or distinguish between MMC and DataFlash + * MMC stack can't (yet?) distinguish between MMC and DataFlash * protocols during enumeration. */ -#define CONFIG_DATAFLASH_WRITE_VERIFY - /* reads can bypass the buffers */ #define OP_READ_CONTINUOUS 0xE8 #define OP_READ_PAGE 0xD2 @@ -80,7 +78,8 @@ */ #define OP_READ_ID 0x9F #define OP_READ_SECURITY 0x77 -#define OP_WRITE_SECURITY 0x9A /* OTP bits */ +#define OP_WRITE_SECURITY_REVC 0x9A +#define OP_WRITE_SECURITY 0x9B /* revision D */ struct dataflash { @@ -402,7 +401,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, (void) dataflash_waitready(priv->spi); -#ifdef CONFIG_DATAFLASH_WRITE_VERIFY +#ifdef CONFIG_MTD_DATAFLASH_VERIFY_WRITE /* (3) Compare to Buffer1 */ addr = pageaddr << priv->page_offset; @@ -431,7 +430,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, } else status = 0; -#endif /* CONFIG_DATAFLASH_WRITE_VERIFY */ +#endif /* CONFIG_MTD_DATAFLASH_VERIFY_WRITE */ remaining = remaining - writelen; pageaddr++; @@ -451,16 +450,192 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len, /* ......................................................................... */ +#ifdef CONFIG_MTD_DATAFLASH_OTP + +static int dataflash_get_otp_info(struct mtd_info *mtd, + struct otp_info *info, size_t len) +{ + /* Report both blocks as identical: bytes 0..64, locked. + * Unless the user block changed from all-ones, we can't + * tell whether it's still writable; so we assume it isn't. + */ + info->start = 0; + info->length = 64; + info->locked = 1; + return sizeof(*info); +} + +static ssize_t otp_read(struct spi_device *spi, unsigned base, + uint8_t *buf, loff_t off, size_t len) +{ + struct spi_message m; + size_t l; + uint8_t *scratch; + struct spi_transfer t; + int status; + + if (off > 64) + return -EINVAL; + + if ((off + len) > 64) + len = 64 - off; + if (len == 0) + return len; + + spi_message_init(&m); + + l = 4 + base + off + len; + scratch = kzalloc(l, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + + /* OUT: OP_READ_SECURITY, 3 don't-care bytes, zeroes + * IN: ignore 4 bytes, data bytes 0..N (max 127) + */ + scratch[0] = OP_READ_SECURITY; + + memset(&t, 0, sizeof t); + t.tx_buf = scratch; + t.rx_buf = scratch; + t.len = l; + spi_message_add_tail(&t, &m); + + dataflash_waitready(spi); + + status = spi_sync(spi, &m); + if (status >= 0) { + memcpy(buf, scratch + 4 + base + off, len); + status = len; + } + + kfree(scratch); + return status; +} + +static int dataflash_read_fact_otp(struct mtd_info *mtd, + loff_t from, size_t len, size_t *retlen, u_char *buf) +{ + struct dataflash *priv = (struct dataflash *)mtd->priv; + int status; + + /* 64 bytes, from 0..63 ... start at 64 on-chip */ + mutex_lock(&priv->lock); + status = otp_read(priv->spi, 64, buf, from, len); + mutex_unlock(&priv->lock); + + if (status < 0) + return status; + *retlen = status; + return 0; +} + +static int dataflash_read_user_otp(struct mtd_info *mtd, + loff_t from, size_t len, size_t *retlen, u_char *buf) +{ + struct dataflash *priv = (struct dataflash *)mtd->priv; + int status; + + /* 64 bytes, from 0..63 ... start at 0 on-chip */ + mutex_lock(&priv->lock); + status = otp_read(priv->spi, 0, buf, from, len); + mutex_unlock(&priv->lock); + + if (status < 0) + return status; + *retlen = status; + return 0; +} + +static int dataflash_write_user_otp(struct mtd_info *mtd, + loff_t from, size_t len, size_t *retlen, u_char *buf) +{ + struct spi_message m; + const size_t l = 4 + 64; + uint8_t *scratch; + struct spi_transfer t; + struct dataflash *priv = (struct dataflash *)mtd->priv; + int status; + + if (len > 64) + return -EINVAL; + + /* Strictly speaking, we *could* truncate the write ... but + * let's not do that for the only write that's ever possible. + */ + if ((from + len) > 64) + return -EINVAL; + + /* OUT: OP_WRITE_SECURITY, 3 zeroes, 64 data-or-zero bytes + * IN: ignore all + */ + scratch = kzalloc(l, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + scratch[0] = OP_WRITE_SECURITY; + memcpy(scratch + 4 + from, buf, len); + + spi_message_init(&m); + + memset(&t, 0, sizeof t); + t.tx_buf = scratch; + t.len = l; + spi_message_add_tail(&t, &m); + + /* Write the OTP bits, if they've not yet been written. + * This modifies SRAM buffer1. + */ + mutex_lock(&priv->lock); + dataflash_waitready(priv->spi); + status = spi_sync(priv->spi, &m); + mutex_unlock(&priv->lock); + + kfree(scratch); + + if (status >= 0) { + status = 0; + *retlen = len; + } + return status; +} + +static char *otp_setup(struct mtd_info *device, char revision) +{ + device->get_fact_prot_info = dataflash_get_otp_info; + device->read_fact_prot_reg = dataflash_read_fact_otp; + device->get_user_prot_info = dataflash_get_otp_info; + device->read_user_prot_reg = dataflash_read_user_otp; + + /* rev c parts (at45db321c and at45db1281 only!) use a + * different write procedure; not (yet?) implemented. + */ + if (revision > 'c') + device->write_user_prot_reg = dataflash_write_user_otp; + + return ", OTP"; +} + +#else + +static char *otp_setup(struct mtd_info *device, char revision) +{ + return " (OTP)"; +} + +#endif + +/* ......................................................................... */ + /* * Register DataFlash device with MTD subsystem. */ static int __devinit -add_dataflash(struct spi_device *spi, char *name, - int nr_pages, int pagesize, int pageoffset) +add_dataflash_otp(struct spi_device *spi, char *name, + int nr_pages, int pagesize, int pageoffset, char revision) { struct dataflash *priv; struct mtd_info *device; struct flash_platform_data *pdata = spi->dev.platform_data; + char *otp_tag = ""; priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) @@ -489,8 +664,12 @@ add_dataflash(struct spi_device *spi, char *name, device->write = dataflash_write; device->priv = priv; - dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes\n", - name, DIV_ROUND_UP(device->size, 1024), pagesize); + if (revision >= 'c') + otp_tag = otp_setup(device, revision); + + dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes%s\n", + name, DIV_ROUND_UP(device->size, 1024), + pagesize, otp_tag); dev_set_drvdata(&spi->dev, priv); if (mtd_has_partitions()) { @@ -519,6 +698,14 @@ add_dataflash(struct spi_device *spi, char *name, return add_mtd_device(device) == 1 ? -ENODEV : 0; } +static inline int __devinit +add_dataflash(struct spi_device *spi, char *name, + int nr_pages, int pagesize, int pageoffset) +{ + return add_dataflash_otp(spi, name, nr_pages, pagesize, + pageoffset, 0); +} + struct flash_info { char *name; @@ -664,13 +851,16 @@ static int __devinit dataflash_probe(struct spi_device *spi) * Try to detect dataflash by JEDEC ID. * If it succeeds we know we have either a C or D part. * D will support power of 2 pagesize option. + * Both support the security register, though with different + * write procedures. */ info = jedec_probe(spi); if (IS_ERR(info)) return PTR_ERR(info); if (info != NULL) - return add_dataflash(spi, info->name, info->nr_pages, - info->pagesize, info->pageoffset); + return add_dataflash_otp(spi, info->name, info->nr_pages, + info->pagesize, info->pageoffset, + (info->flags & SUP_POW2PS) ? 'd' : 'c'); /* * Older chips support only legacy commands, identifing diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index c4f9d3378b24..50ce13887f63 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -388,6 +388,10 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned if (thisEUN == targetEUN) break; + /* Unlink the last block from the chain. */ + inftl->PUtable[prevEUN] = BLOCK_NIL; + + /* Now try to erase it. */ if (INFTL_formatblock(inftl, thisEUN) < 0) { /* * Could not erase : mark block as reserved. @@ -396,7 +400,6 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned } else { /* Correctly erased : mark it as free */ inftl->PUtable[thisEUN] = BLOCK_FREE; - inftl->PUtable[prevEUN] = BLOCK_NIL; inftl->numfreeEUNs++; } } diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index df8e00bba07b..5ea169362164 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -332,30 +332,6 @@ config MTD_CFI_FLAGADM Mapping for the Flaga digital module. If you don't have one, ignore this setting. -config MTD_WALNUT - tristate "Flash device mapped on IBM 405GP Walnut" - depends on MTD_JEDECPROBE && WALNUT && !PPC_MERGE - help - This enables access routines for the flash chips on the IBM 405GP - Walnut board. If you have one of these boards and would like to - use the flash chips on it, say 'Y'. - -config MTD_EBONY - tristate "Flash devices mapped on IBM 440GP Ebony" - depends on MTD_JEDECPROBE && EBONY && !PPC_MERGE - help - This enables access routines for the flash chips on the IBM 440GP - Ebony board. If you have one of these boards and would like to - use the flash chips on it, say 'Y'. - -config MTD_OCOTEA - tristate "Flash devices mapped on IBM 440GX Ocotea" - depends on MTD_CFI && OCOTEA && !PPC_MERGE - help - This enables access routines for the flash chips on the IBM 440GX - Ocotea board. If you have one of these boards and would like to - use the flash chips on it, say 'Y'. - config MTD_REDWOOD tristate "CFI Flash devices mapped on IBM Redwood" depends on MTD_CFI && ( REDWOOD_4 || REDWOOD_5 || REDWOOD_6 ) @@ -458,13 +434,6 @@ config MTD_CEIVA PhotoMax Digital Picture Frame. If you have such a device, say 'Y'. -config MTD_NOR_TOTO - tristate "NOR Flash device on TOTO board" - depends on ARCH_OMAP && OMAP_TOTO - help - This enables access to the NOR flash on the Texas Instruments - TOTO board. - config MTD_H720X tristate "Hynix evaluation board mappings" depends on MTD_CFI && ( ARCH_H7201 || ARCH_H7202 ) @@ -522,7 +491,7 @@ config MTD_BFIN_ASYNC config MTD_UCLINUX tristate "Generic uClinux RAM/ROM filesystem support" - depends on MTD_PARTITIONS && !MMU + depends on MTD_PARTITIONS && MTD_RAM && !MMU help Map driver to support image based filesystems for uClinux. diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index 6cda6df973e5..6d9ba35caf11 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -50,12 +50,8 @@ obj-$(CONFIG_MTD_REDWOOD) += redwood.o obj-$(CONFIG_MTD_UCLINUX) += uclinux.o obj-$(CONFIG_MTD_NETtel) += nettel.o obj-$(CONFIG_MTD_SCB2_FLASH) += scb2_flash.o -obj-$(CONFIG_MTD_EBONY) += ebony.o -obj-$(CONFIG_MTD_OCOTEA) += ocotea.o -obj-$(CONFIG_MTD_WALNUT) += walnut.o obj-$(CONFIG_MTD_H720X) += h720x-flash.o obj-$(CONFIG_MTD_SBC8240) += sbc8240.o -obj-$(CONFIG_MTD_NOR_TOTO) += omap-toto-flash.o obj-$(CONFIG_MTD_IXP4XX) += ixp4xx.o obj-$(CONFIG_MTD_IXP2000) += ixp2000.o obj-$(CONFIG_MTD_WRSBC8260) += wr_sbc82xx_flash.o diff --git a/drivers/mtd/maps/ebony.c b/drivers/mtd/maps/ebony.c deleted file mode 100644 index d92b7c70d3ed..000000000000 --- a/drivers/mtd/maps/ebony.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Mapping for Ebony user flash - * - * Matt Porter <mporter@kernel.crashing.org> - * - * Copyright 2002-2004 MontaVista Software Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mtd/mtd.h> -#include <linux/mtd/map.h> -#include <linux/mtd/partitions.h> -#include <asm/io.h> -#include <asm/ibm44x.h> -#include <platforms/4xx/ebony.h> - -static struct mtd_info *flash; - -static struct map_info ebony_small_map = { - .name = "Ebony small flash", - .size = EBONY_SMALL_FLASH_SIZE, - .bankwidth = 1, -}; - -static struct map_info ebony_large_map = { - .name = "Ebony large flash", - .size = EBONY_LARGE_FLASH_SIZE, - .bankwidth = 1, -}; - -static struct mtd_partition ebony_small_partitions[] = { - { - .name = "OpenBIOS", - .offset = 0x0, - .size = 0x80000, - } -}; - -static struct mtd_partition ebony_large_partitions[] = { - { - .name = "fs", - .offset = 0, - .size = 0x380000, - }, - { - .name = "firmware", - .offset = 0x380000, - .size = 0x80000, - } -}; - -int __init init_ebony(void) -{ - u8 fpga0_reg; - u8 __iomem *fpga0_adr; - unsigned long long small_flash_base, large_flash_base; - - fpga0_adr = ioremap64(EBONY_FPGA_ADDR, 16); - if (!fpga0_adr) - return -ENOMEM; - - fpga0_reg = readb(fpga0_adr); - iounmap(fpga0_adr); - - if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) && - !EBONY_FLASH_SEL(fpga0_reg)) - small_flash_base = EBONY_SMALL_FLASH_HIGH2; - else if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) && - EBONY_FLASH_SEL(fpga0_reg)) - small_flash_base = EBONY_SMALL_FLASH_HIGH1; - else if (!EBONY_BOOT_SMALL_FLASH(fpga0_reg) && - !EBONY_FLASH_SEL(fpga0_reg)) - small_flash_base = EBONY_SMALL_FLASH_LOW2; - else - small_flash_base = EBONY_SMALL_FLASH_LOW1; - - if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) && - !EBONY_ONBRD_FLASH_EN(fpga0_reg)) - large_flash_base = EBONY_LARGE_FLASH_LOW; - else - large_flash_base = EBONY_LARGE_FLASH_HIGH; - - ebony_small_map.phys = small_flash_base; - ebony_small_map.virt = ioremap64(small_flash_base, - ebony_small_map.size); - - if (!ebony_small_map.virt) { - printk("Failed to ioremap flash\n"); - return -EIO; - } - - simple_map_init(&ebony_small_map); - - flash = do_map_probe("jedec_probe", &ebony_small_map); - if (flash) { - flash->owner = THIS_MODULE; - add_mtd_partitions(flash, ebony_small_partitions, - ARRAY_SIZE(ebony_small_partitions)); - } else { - printk("map probe failed for flash\n"); - iounmap(ebony_small_map.virt); - return -ENXIO; - } - - ebony_large_map.phys = large_flash_base; - ebony_large_map.virt = ioremap64(large_flash_base, - ebony_large_map.size); - - if (!ebony_large_map.virt) { - printk("Failed to ioremap flash\n"); - iounmap(ebony_small_map.virt); - return -EIO; - } - - simple_map_init(&ebony_large_map); - - flash = do_map_probe("jedec_probe", &ebony_large_map); - if (flash) { - flash->owner = THIS_MODULE; - add_mtd_partitions(flash, ebony_large_partitions, - ARRAY_SIZE(ebony_large_partitions)); - } else { - printk("map probe failed for flash\n"); - iounmap(ebony_small_map.virt); - iounmap(ebony_large_map.virt); - return -ENXIO; - } - - return 0; -} - -static void __exit cleanup_ebony(void) -{ - if (flash) { - del_mtd_partitions(flash); - map_destroy(flash); - } - - if (ebony_small_map.virt) { - iounmap(ebony_small_map.virt); - ebony_small_map.virt = NULL; - } - - if (ebony_large_map.virt) { - iounmap(ebony_large_map.virt); - ebony_large_map.virt = NULL; - } -} - -module_init(init_ebony); -module_exit(cleanup_ebony); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Matt Porter <mporter@kernel.crashing.org>"); -MODULE_DESCRIPTION("MTD map and partitions for IBM 440GP Ebony boards"); diff --git a/drivers/mtd/maps/ocotea.c b/drivers/mtd/maps/ocotea.c deleted file mode 100644 index 5522eac8c980..000000000000 --- a/drivers/mtd/maps/ocotea.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Mapping for Ocotea user flash - * - * Matt Porter <mporter@kernel.crashing.org> - * - * Copyright 2002-2004 MontaVista Software Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mtd/mtd.h> -#include <linux/mtd/map.h> -#include <linux/mtd/partitions.h> -#include <asm/io.h> -#include <asm/ibm44x.h> -#include <platforms/4xx/ocotea.h> - -static struct mtd_info *flash; - -static struct map_info ocotea_small_map = { - .name = "Ocotea small flash", - .size = OCOTEA_SMALL_FLASH_SIZE, - .buswidth = 1, -}; - -static struct map_info ocotea_large_map = { - .name = "Ocotea large flash", - .size = OCOTEA_LARGE_FLASH_SIZE, - .buswidth = 1, -}; - -static struct mtd_partition ocotea_small_partitions[] = { - { - .name = "pibs", - .offset = 0x0, - .size = 0x100000, - } -}; - -static struct mtd_partition ocotea_large_partitions[] = { - { - .name = "fs", - .offset = 0, - .size = 0x300000, - }, - { - .name = "firmware", - .offset = 0x300000, - .size = 0x100000, - } -}; - -int __init init_ocotea(void) -{ - u8 fpga0_reg; - u8 *fpga0_adr; - unsigned long long small_flash_base, large_flash_base; - - fpga0_adr = ioremap64(OCOTEA_FPGA_ADDR, 16); - if (!fpga0_adr) - return -ENOMEM; - - fpga0_reg = readb((unsigned long)fpga0_adr); - iounmap(fpga0_adr); - - if (OCOTEA_BOOT_LARGE_FLASH(fpga0_reg)) { - small_flash_base = OCOTEA_SMALL_FLASH_HIGH; - large_flash_base = OCOTEA_LARGE_FLASH_LOW; - } - else { - small_flash_base = OCOTEA_SMALL_FLASH_LOW; - large_flash_base = OCOTEA_LARGE_FLASH_HIGH; - } - - ocotea_small_map.phys = small_flash_base; - ocotea_small_map.virt = ioremap64(small_flash_base, - ocotea_small_map.size); - - if (!ocotea_small_map.virt) { - printk("Failed to ioremap flash\n"); - return -EIO; - } - - simple_map_init(&ocotea_small_map); - - flash = do_map_probe("map_rom", &ocotea_small_map); - if (flash) { - flash->owner = THIS_MODULE; - add_mtd_partitions(flash, ocotea_small_partitions, - ARRAY_SIZE(ocotea_small_partitions)); - } else { - printk("map probe failed for flash\n"); - iounmap(ocotea_small_map.virt); - return -ENXIO; - } - - ocotea_large_map.phys = large_flash_base; - ocotea_large_map.virt = ioremap64(large_flash_base, - ocotea_large_map.size); - - if (!ocotea_large_map.virt) { - printk("Failed to ioremap flash\n"); - iounmap(ocotea_small_map.virt); - return -EIO; - } - - simple_map_init(&ocotea_large_map); - - flash = do_map_probe("cfi_probe", &ocotea_large_map); - if (flash) { - flash->owner = THIS_MODULE; - add_mtd_partitions(flash, ocotea_large_partitions, - ARRAY_SIZE(ocotea_large_partitions)); - } else { - printk("map probe failed for flash\n"); - iounmap(ocotea_small_map.virt); - iounmap(ocotea_large_map.virt); - return -ENXIO; - } - - return 0; -} - -static void __exit cleanup_ocotea(void) -{ - if (flash) { - del_mtd_partitions(flash); - map_destroy(flash); - } - - if (ocotea_small_map.virt) { - iounmap((void *)ocotea_small_map.virt); - ocotea_small_map.virt = 0; - } - - if (ocotea_large_map.virt) { - iounmap((void *)ocotea_large_map.virt); - ocotea_large_map.virt = 0; - } -} - -module_init(init_ocotea); -module_exit(cleanup_ocotea); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Matt Porter <mporter@kernel.crashing.org>"); -MODULE_DESCRIPTION("MTD map and partitions for IBM 440GX Ocotea boards"); diff --git a/drivers/mtd/maps/omap-toto-flash.c b/drivers/mtd/maps/omap-toto-flash.c deleted file mode 100644 index 0a60ebbc2175..000000000000 --- a/drivers/mtd/maps/omap-toto-flash.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * NOR Flash memory access on TI Toto board - * - * jzhang@ti.com (C) 2003 Texas Instruments. - * - * (C) 2002 MontVista Software, Inc. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <linux/mtd/mtd.h> -#include <linux/mtd/map.h> -#include <linux/mtd/partitions.h> - -#include <asm/hardware.h> -#include <asm/io.h> - - -#ifndef CONFIG_ARCH_OMAP -#error This is for OMAP architecture only -#endif - -//these lines need be moved to a hardware header file -#define OMAP_TOTO_FLASH_BASE 0xd8000000 -#define OMAP_TOTO_FLASH_SIZE 0x80000 - -static struct map_info omap_toto_map_flash = { - .name = "OMAP Toto flash", - .bankwidth = 2, - .virt = (void __iomem *)OMAP_TOTO_FLASH_BASE, -}; - - -static struct mtd_partition toto_flash_partitions[] = { - { - .name = "BootLoader", - .size = 0x00040000, /* hopefully u-boot will stay 128k + 128*/ - .offset = 0, - .mask_flags = MTD_WRITEABLE, /* force read-only */ - }, { - .name = "ReservedSpace", - .size = 0x00030000, - .offset = MTDPART_OFS_APPEND, - //mask_flags: MTD_WRITEABLE, /* force read-only */ - }, { - .name = "EnvArea", /* bottom 64KiB for env vars */ - .size = MTDPART_SIZ_FULL, - .offset = MTDPART_OFS_APPEND, - } -}; - -static struct mtd_partition *parsed_parts; - -static struct mtd_info *flash_mtd; - -static int __init init_flash (void) -{ - - struct mtd_partition *parts; - int nb_parts = 0; - int parsed_nr_parts = 0; - const char *part_type; - - /* - * Static partition definition selection - */ - part_type = "static"; - - parts = toto_flash_partitions; - nb_parts = ARRAY_SIZE(toto_flash_partitions); - omap_toto_map_flash.size = OMAP_TOTO_FLASH_SIZE; - omap_toto_map_flash.phys = virt_to_phys(OMAP_TOTO_FLASH_BASE); - - simple_map_init(&omap_toto_map_flash); - /* - * Now let's probe for the actual flash. Do it here since - * specific machine settings might have been set above. - */ - printk(KERN_NOTICE "OMAP toto flash: probing %d-bit flash bus\n", - omap_toto_map_flash.bankwidth*8); - flash_mtd = do_map_probe("jedec_probe", &omap_toto_map_flash); - if (!flash_mtd) - return -ENXIO; - - if (parsed_nr_parts > 0) { - parts = parsed_parts; - nb_parts = parsed_nr_parts; - } - - if (nb_parts == 0) { - printk(KERN_NOTICE "OMAP toto flash: no partition info available," - "registering whole flash at once\n"); - if (add_mtd_device(flash_mtd)){ - return -ENXIO; - } - } else { - printk(KERN_NOTICE "Using %s partition definition\n", - part_type); - return add_mtd_partitions(flash_mtd, parts, nb_parts); - } - return 0; -} - -int __init omap_toto_mtd_init(void) -{ - int status; - - if (status = init_flash()) { - printk(KERN_ERR "OMAP Toto Flash: unable to init map for toto flash\n"); - } - return status; -} - -static void __exit omap_toto_mtd_cleanup(void) -{ - if (flash_mtd) { - del_mtd_partitions(flash_mtd); - map_destroy(flash_mtd); - kfree(parsed_parts); - } -} - -module_init(omap_toto_mtd_init); -module_exit(omap_toto_mtd_cleanup); - -MODULE_AUTHOR("Jian Zhang"); -MODULE_DESCRIPTION("OMAP Toto board map driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/maps/pci.c b/drivers/mtd/maps/pci.c index 5c6a25c90380..48f4cf5cb9d1 100644 --- a/drivers/mtd/maps/pci.c +++ b/drivers/mtd/maps/pci.c @@ -203,15 +203,8 @@ intel_dc21285_init(struct pci_dev *dev, struct map_pci_info *map) * not enabled, should we be allocating a new resource for it * or simply enabling it? */ - if (!(pci_resource_flags(dev, PCI_ROM_RESOURCE) & - IORESOURCE_ROM_ENABLE)) { - u32 val; - pci_resource_flags(dev, PCI_ROM_RESOURCE) |= IORESOURCE_ROM_ENABLE; - pci_read_config_dword(dev, PCI_ROM_ADDRESS, &val); - val |= PCI_ROM_ADDRESS_ENABLE; - pci_write_config_dword(dev, PCI_ROM_ADDRESS, val); - printk("%s: enabling expansion ROM\n", pci_name(dev)); - } + pci_enable_rom(dev); + printk("%s: enabling expansion ROM\n", pci_name(dev)); } if (!len || !base) @@ -232,18 +225,13 @@ intel_dc21285_init(struct pci_dev *dev, struct map_pci_info *map) static void intel_dc21285_exit(struct pci_dev *dev, struct map_pci_info *map) { - u32 val; - if (map->base) iounmap(map->base); /* * We need to undo the PCI BAR2/PCI ROM BAR address alteration. */ - pci_resource_flags(dev, PCI_ROM_RESOURCE) &= ~IORESOURCE_ROM_ENABLE; - pci_read_config_dword(dev, PCI_ROM_ADDRESS, &val); - val &= ~PCI_ROM_ADDRESS_ENABLE; - pci_write_config_dword(dev, PCI_ROM_ADDRESS, val); + pci_disable_rom(dev); } static unsigned long diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c index 49acd4171893..5fcfec034a94 100644 --- a/drivers/mtd/maps/physmap_of.c +++ b/drivers/mtd/maps/physmap_of.c @@ -230,8 +230,7 @@ static int __devinit of_flash_probe(struct of_device *dev, #ifdef CONFIG_MTD_OF_PARTS if (err == 0) { - err = of_mtd_parse_partitions(&dev->dev, info->mtd, - dp, &info->parts); + err = of_mtd_parse_partitions(&dev->dev, dp, &info->parts); if (err < 0) return err; } diff --git a/drivers/mtd/maps/walnut.c b/drivers/mtd/maps/walnut.c deleted file mode 100644 index e243476c8171..000000000000 --- a/drivers/mtd/maps/walnut.c +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Mapping for Walnut flash - * (used ebony.c as a "framework") - * - * Heikki Lindholm <holindho@infradead.org> - * - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mtd/mtd.h> -#include <linux/mtd/map.h> -#include <linux/mtd/partitions.h> -#include <asm/io.h> -#include <asm/ibm4xx.h> -#include <platforms/4xx/walnut.h> - -/* these should be in platforms/4xx/walnut.h ? */ -#define WALNUT_FLASH_ONBD_N(x) (x & 0x02) -#define WALNUT_FLASH_SRAM_SEL(x) (x & 0x01) -#define WALNUT_FLASH_LOW 0xFFF00000 -#define WALNUT_FLASH_HIGH 0xFFF80000 -#define WALNUT_FLASH_SIZE 0x80000 - -static struct mtd_info *flash; - -static struct map_info walnut_map = { - .name = "Walnut flash", - .size = WALNUT_FLASH_SIZE, - .bankwidth = 1, -}; - -/* Actually, OpenBIOS is the last 128 KiB of the flash - better - * partitioning could be made */ -static struct mtd_partition walnut_partitions[] = { - { - .name = "OpenBIOS", - .offset = 0x0, - .size = WALNUT_FLASH_SIZE, - /*.mask_flags = MTD_WRITEABLE, */ /* force read-only */ - } -}; - -int __init init_walnut(void) -{ - u8 fpga_brds1; - void *fpga_brds1_adr; - void *fpga_status_adr; - unsigned long flash_base; - - /* this should already be mapped (platform/4xx/walnut.c) */ - fpga_status_adr = ioremap(WALNUT_FPGA_BASE, 8); - if (!fpga_status_adr) - return -ENOMEM; - - fpga_brds1_adr = fpga_status_adr+5; - fpga_brds1 = readb(fpga_brds1_adr); - /* iounmap(fpga_status_adr); */ - - if (WALNUT_FLASH_ONBD_N(fpga_brds1)) { - printk("The on-board flash is disabled (U79 sw 5)!"); - iounmap(fpga_status_adr); - return -EIO; - } - if (WALNUT_FLASH_SRAM_SEL(fpga_brds1)) - flash_base = WALNUT_FLASH_LOW; - else - flash_base = WALNUT_FLASH_HIGH; - - walnut_map.phys = flash_base; - walnut_map.virt = - (void __iomem *)ioremap(flash_base, walnut_map.size); - - if (!walnut_map.virt) { - printk("Failed to ioremap flash.\n"); - iounmap(fpga_status_adr); - return -EIO; - } - - simple_map_init(&walnut_map); - - flash = do_map_probe("jedec_probe", &walnut_map); - if (flash) { - flash->owner = THIS_MODULE; - add_mtd_partitions(flash, walnut_partitions, - ARRAY_SIZE(walnut_partitions)); - } else { - printk("map probe failed for flash\n"); - iounmap(fpga_status_adr); - return -ENXIO; - } - - iounmap(fpga_status_adr); - return 0; -} - -static void __exit cleanup_walnut(void) -{ - if (flash) { - del_mtd_partitions(flash); - map_destroy(flash); - } - - if (walnut_map.virt) { - iounmap((void *)walnut_map.virt); - walnut_map.virt = 0; - } -} - -module_init(init_walnut); -module_exit(cleanup_walnut); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Heikki Lindholm <holindho@infradead.org>"); -MODULE_DESCRIPTION("MTD map and partitions for IBM 405GP Walnut boards"); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 1c74762dec89..963840e9b5bf 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -348,7 +348,7 @@ static void mtdchar_erase_callback (struct erase_info *instr) wake_up((wait_queue_head_t *)instr->priv); } -#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP) +#ifdef CONFIG_HAVE_MTD_OTP static int otp_select_filemode(struct mtd_file_info *mfi, int mode) { struct mtd_info *mtd = mfi->mtd; @@ -665,7 +665,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file, break; } -#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP) +#ifdef CONFIG_HAVE_MTD_OTP case OTPSELECT: { int mode; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 2972a5edb73d..789842d0e6f2 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -444,7 +444,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) return -EINVAL; } - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; /* make a local copy of instr to avoid modifying the caller's struct */ erase = kmalloc(sizeof (struct erase_info), GFP_KERNEL); @@ -493,7 +493,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) /* sanity check: should never happen since * block alignment has been checked above */ BUG_ON(err == -EINVAL); - if (erase->fail_addr != 0xffffffff) + if (erase->fail_addr != MTD_FAIL_ADDR_UNKNOWN) instr->fail_addr = erase->fail_addr + offset; break; } diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 5a680e1e61f1..aebb3b27edbd 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -33,6 +33,7 @@ #include <linux/interrupt.h> #include <linux/mtd/mtd.h> +#define MTDOOPS_KERNMSG_MAGIC 0x5d005d00 #define OOPS_PAGE_SIZE 4096 static struct mtdoops_context { @@ -99,7 +100,7 @@ static void mtdoops_inc_counter(struct mtdoops_context *cxt) int ret; cxt->nextpage++; - if (cxt->nextpage > cxt->oops_pages) + if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; cxt->nextcount++; if (cxt->nextcount == 0xffffffff) @@ -141,7 +142,7 @@ static void mtdoops_workfunc_erase(struct work_struct *work) mod = (cxt->nextpage * OOPS_PAGE_SIZE) % mtd->erasesize; if (mod != 0) { cxt->nextpage = cxt->nextpage + ((mtd->erasesize - mod) / OOPS_PAGE_SIZE); - if (cxt->nextpage > cxt->oops_pages) + if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; } @@ -158,7 +159,7 @@ badblock: cxt->nextpage * OOPS_PAGE_SIZE); i++; cxt->nextpage = cxt->nextpage + (mtd->erasesize / OOPS_PAGE_SIZE); - if (cxt->nextpage > cxt->oops_pages) + if (cxt->nextpage >= cxt->oops_pages) cxt->nextpage = 0; if (i == (cxt->oops_pages / (mtd->erasesize / OOPS_PAGE_SIZE))) { printk(KERN_ERR "mtdoops: All blocks bad!\n"); @@ -224,40 +225,40 @@ static void find_next_position(struct mtdoops_context *cxt) { struct mtd_info *mtd = cxt->mtd; int ret, page, maxpos = 0; - u32 count, maxcount = 0xffffffff; + u32 count[2], maxcount = 0xffffffff; size_t retlen; for (page = 0; page < cxt->oops_pages; page++) { - ret = mtd->read(mtd, page * OOPS_PAGE_SIZE, 4, &retlen, (u_char *) &count); - if ((retlen != 4) || ((ret < 0) && (ret != -EUCLEAN))) { - printk(KERN_ERR "mtdoops: Read failure at %d (%td of 4 read)" + ret = mtd->read(mtd, page * OOPS_PAGE_SIZE, 8, &retlen, (u_char *) &count[0]); + if ((retlen != 8) || ((ret < 0) && (ret != -EUCLEAN))) { + printk(KERN_ERR "mtdoops: Read failure at %d (%td of 8 read)" ", err %d.\n", page * OOPS_PAGE_SIZE, retlen, ret); continue; } - if (count == 0xffffffff) + if (count[1] != MTDOOPS_KERNMSG_MAGIC) + continue; + if (count[0] == 0xffffffff) continue; if (maxcount == 0xffffffff) { - maxcount = count; + maxcount = count[0]; maxpos = page; - } else if ((count < 0x40000000) && (maxcount > 0xc0000000)) { - maxcount = count; + } else if ((count[0] < 0x40000000) && (maxcount > 0xc0000000)) { + maxcount = count[0]; maxpos = page; - } else if ((count > maxcount) && (count < 0xc0000000)) { - maxcount = count; + } else if ((count[0] > maxcount) && (count[0] < 0xc0000000)) { + maxcount = count[0]; maxpos = page; - } else if ((count > maxcount) && (count > 0xc0000000) + } else if ((count[0] > maxcount) && (count[0] > 0xc0000000) && (maxcount > 0x80000000)) { - maxcount = count; + maxcount = count[0]; maxpos = page; } } if (maxcount == 0xffffffff) { cxt->nextpage = 0; cxt->nextcount = 1; - cxt->ready = 1; - printk(KERN_DEBUG "mtdoops: Ready %d, %d (first init)\n", - cxt->nextpage, cxt->nextcount); + schedule_work(&cxt->work_erase); return; } @@ -358,8 +359,9 @@ mtdoops_console_write(struct console *co, const char *s, unsigned int count) if (cxt->writecount == 0) { u32 *stamp = cxt->oops_buf; - *stamp = cxt->nextcount; - cxt->writecount = 4; + *stamp++ = cxt->nextcount; + *stamp = MTDOOPS_KERNMSG_MAGIC; + cxt->writecount = 8; } if ((count + cxt->writecount) > OOPS_PAGE_SIZE) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 9a06dc93ee0d..3728913fa5fa 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -214,7 +214,7 @@ static int part_erase(struct mtd_info *mtd, struct erase_info *instr) instr->addr += part->offset; ret = part->master->erase(part->master, instr); if (ret) { - if (instr->fail_addr != 0xffffffff) + if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN) instr->fail_addr -= part->offset; instr->addr -= part->offset; } @@ -226,7 +226,7 @@ void mtd_erase_callback(struct erase_info *instr) if (instr->mtd->erase == part_erase) { struct mtd_part *part = PART(instr->mtd); - if (instr->fail_addr != 0xffffffff) + if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN) instr->fail_addr -= part->offset; instr->addr -= part->offset; } diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 41f361c49b32..1c2e9450d663 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -56,6 +56,12 @@ config MTD_NAND_H1900 help This enables the driver for the iPAQ h1900 flash. +config MTD_NAND_GPIO + tristate "GPIO NAND Flash driver" + depends on GENERIC_GPIO && ARM + help + This enables a GPIO based NAND flash driver. + config MTD_NAND_SPIA tristate "NAND Flash device on SPIA board" depends on ARCH_P720T @@ -68,12 +74,6 @@ config MTD_NAND_AMS_DELTA help Support for NAND flash on Amstrad E3 (Delta). -config MTD_NAND_TOTO - tristate "NAND Flash device on TOTO board" - depends on ARCH_OMAP && BROKEN - help - Support for NAND flash on Texas Instruments Toto platform. - config MTD_NAND_TS7250 tristate "NAND Flash device on TS-7250 board" depends on MACH_TS72XX @@ -163,13 +163,6 @@ config MTD_NAND_S3C2410_HWECC incorrect ECC generation, and if using these, the default of software ECC is preferable. -config MTD_NAND_NDFC - tristate "NDFC NanD Flash Controller" - depends on 4xx && !PPC_MERGE - select MTD_NAND_ECC_SMC - help - NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs - config MTD_NAND_S3C2410_CLKSTOP bool "S3C2410 NAND IDLE clock stop" depends on MTD_NAND_S3C2410 @@ -340,6 +333,13 @@ config MTD_NAND_PXA3xx This enables the driver for the NAND flash device found on PXA3xx processors +config MTD_NAND_PXA3xx_BUILTIN + bool "Use builtin definitions for some NAND chips (deprecated)" + depends on MTD_NAND_PXA3xx + help + This enables builtin definitions for some NAND chips. This + is deprecated in favor of platform specific data. + config MTD_NAND_CM_X270 tristate "Support for NAND Flash on CM-X270 modules" depends on MTD_NAND && MACH_ARMCORE @@ -400,10 +400,24 @@ config MTD_NAND_FSL_ELBC config MTD_NAND_FSL_UPM tristate "Support for NAND on Freescale UPM" - depends on MTD_NAND && OF_GPIO && (PPC_83xx || PPC_85xx) + depends on MTD_NAND && (PPC_83xx || PPC_85xx) select FSL_LBC help Enables support for NAND Flash chips wired onto Freescale PowerPC processor localbus with User-Programmable Machine support. +config MTD_NAND_MXC + tristate "MXC NAND support" + depends on ARCH_MX2 + help + This enables the driver for the NAND flash controller on the + MXC processors. + +config MTD_NAND_SH_FLCTL + tristate "Support for NAND on Renesas SuperH FLCTL" + depends on MTD_NAND && SUPERH && CPU_SUBTYPE_SH7723 + help + Several Renesas SuperH CPU has FLCTL. This option enables support + for NAND Flash using FLCTL. This driver support SH7723. + endif # MTD_NAND diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index b786c5da82da..b661586afbfc 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_MTD_NAND_IDS) += nand_ids.o obj-$(CONFIG_MTD_NAND_CAFE) += cafe_nand.o obj-$(CONFIG_MTD_NAND_SPIA) += spia.o obj-$(CONFIG_MTD_NAND_AMS_DELTA) += ams-delta.o -obj-$(CONFIG_MTD_NAND_TOTO) += toto.o obj-$(CONFIG_MTD_NAND_AUTCPU12) += autcpu12.o obj-$(CONFIG_MTD_NAND_EDB7312) += edb7312.o obj-$(CONFIG_MTD_NAND_AU1550) += au1550nd.o @@ -24,6 +23,7 @@ obj-$(CONFIG_MTD_NAND_NANDSIM) += nandsim.o obj-$(CONFIG_MTD_NAND_CS553X) += cs553x_nand.o obj-$(CONFIG_MTD_NAND_NDFC) += ndfc.o obj-$(CONFIG_MTD_NAND_ATMEL) += atmel_nand.o +obj-$(CONFIG_MTD_NAND_GPIO) += gpio.o obj-$(CONFIG_MTD_NAND_CM_X270) += cmx270_nand.o obj-$(CONFIG_MTD_NAND_BASLER_EXCITE) += excite_nandflash.o obj-$(CONFIG_MTD_NAND_PXA3xx) += pxa3xx_nand.o @@ -34,5 +34,7 @@ obj-$(CONFIG_MTD_NAND_PASEMI) += pasemi_nand.o obj-$(CONFIG_MTD_NAND_ORION) += orion_nand.o obj-$(CONFIG_MTD_NAND_FSL_ELBC) += fsl_elbc_nand.o obj-$(CONFIG_MTD_NAND_FSL_UPM) += fsl_upm.o +obj-$(CONFIG_MTD_NAND_SH_FLCTL) += sh_flctl.o +obj-$(CONFIG_MTD_NAND_MXC) += mxc_nand.o nand-objs := nand_base.o nand_bbt.o diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c index 3387e0d5076b..c98c1570a40b 100644 --- a/drivers/mtd/nand/atmel_nand.c +++ b/drivers/mtd/nand/atmel_nand.c @@ -174,48 +174,6 @@ static void atmel_write_buf16(struct mtd_info *mtd, const u8 *buf, int len) } /* - * write oob for small pages - */ -static int atmel_nand_write_oob_512(struct mtd_info *mtd, - struct nand_chip *chip, int page) -{ - int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad; - int eccsize = chip->ecc.size, length = mtd->oobsize; - int len, pos, status = 0; - const uint8_t *bufpoi = chip->oob_poi; - - pos = eccsize + chunk; - - chip->cmdfunc(mtd, NAND_CMD_SEQIN, pos, page); - len = min_t(int, length, chunk); - chip->write_buf(mtd, bufpoi, len); - bufpoi += len; - length -= len; - if (length > 0) - chip->write_buf(mtd, bufpoi, length); - - chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1); - status = chip->waitfunc(mtd, chip); - - return status & NAND_STATUS_FAIL ? -EIO : 0; - -} - -/* - * read oob for small pages - */ -static int atmel_nand_read_oob_512(struct mtd_info *mtd, - struct nand_chip *chip, int page, int sndcmd) -{ - if (sndcmd) { - chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page); - sndcmd = 0; - } - chip->read_buf(mtd, chip->oob_poi, mtd->oobsize); - return sndcmd; -} - -/* * Calculate HW ECC * * function called after a write @@ -235,14 +193,14 @@ static int atmel_nand_calculate(struct mtd_info *mtd, /* get the first 2 ECC bytes */ ecc_value = ecc_readl(host->ecc, PR); - ecc_code[eccpos[0]] = ecc_value & 0xFF; - ecc_code[eccpos[1]] = (ecc_value >> 8) & 0xFF; + ecc_code[0] = ecc_value & 0xFF; + ecc_code[1] = (ecc_value >> 8) & 0xFF; /* get the last 2 ECC bytes */ ecc_value = ecc_readl(host->ecc, NPR) & ATMEL_ECC_NPARITY; - ecc_code[eccpos[2]] = ecc_value & 0xFF; - ecc_code[eccpos[3]] = (ecc_value >> 8) & 0xFF; + ecc_code[2] = ecc_value & 0xFF; + ecc_code[3] = (ecc_value >> 8) & 0xFF; return 0; } @@ -476,14 +434,12 @@ static int __init atmel_nand_probe(struct platform_device *pdev) res = -EIO; goto err_ecc_ioremap; } - nand_chip->ecc.mode = NAND_ECC_HW_SYNDROME; + nand_chip->ecc.mode = NAND_ECC_HW; nand_chip->ecc.calculate = atmel_nand_calculate; nand_chip->ecc.correct = atmel_nand_correct; nand_chip->ecc.hwctl = atmel_nand_hwctl; nand_chip->ecc.read_page = atmel_nand_read_page; nand_chip->ecc.bytes = 4; - nand_chip->ecc.prepad = 0; - nand_chip->ecc.postpad = 0; } nand_chip->chip_delay = 20; /* 20us command delay time */ @@ -514,7 +470,7 @@ static int __init atmel_nand_probe(struct platform_device *pdev) goto err_scan_ident; } - if (nand_chip->ecc.mode == NAND_ECC_HW_SYNDROME) { + if (nand_chip->ecc.mode == NAND_ECC_HW) { /* ECC is calculated for the whole page (1 step) */ nand_chip->ecc.size = mtd->writesize; @@ -522,8 +478,6 @@ static int __init atmel_nand_probe(struct platform_device *pdev) switch (mtd->writesize) { case 512: nand_chip->ecc.layout = &atmel_oobinfo_small; - nand_chip->ecc.read_oob = atmel_nand_read_oob_512; - nand_chip->ecc.write_oob = atmel_nand_write_oob_512; ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_528); break; case 1024: diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c index 3370a800fd36..9f1b451005ca 100644 --- a/drivers/mtd/nand/cs553x_nand.c +++ b/drivers/mtd/nand/cs553x_nand.c @@ -289,8 +289,10 @@ static int __init cs553x_init(void) int i; uint64_t val; +#ifdef CONFIG_MTD_PARTITIONS int mtd_parts_nb = 0; struct mtd_partition *mtd_parts = NULL; +#endif /* If the CPU isn't a Geode GX or LX, abort */ if (!is_geode()) diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c index 98ad3cefcaf4..4aa5bd6158da 100644 --- a/drivers/mtd/nand/fsl_elbc_nand.c +++ b/drivers/mtd/nand/fsl_elbc_nand.c @@ -918,8 +918,7 @@ static int __devinit fsl_elbc_chip_probe(struct fsl_elbc_ctrl *ctrl, #ifdef CONFIG_MTD_OF_PARTS if (ret == 0) { - ret = of_mtd_parse_partitions(priv->dev, &priv->mtd, - node, &parts); + ret = of_mtd_parse_partitions(priv->dev, node, &parts); if (ret < 0) goto err; } diff --git a/drivers/mtd/nand/fsl_upm.c b/drivers/mtd/nand/fsl_upm.c index 1ebfd87f00b4..024e3fffd4bb 100644 --- a/drivers/mtd/nand/fsl_upm.c +++ b/drivers/mtd/nand/fsl_upm.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/delay.h> #include <linux/mtd/nand.h> #include <linux/mtd/nand_ecc.h> #include <linux/mtd/partitions.h> @@ -36,8 +37,6 @@ struct fsl_upm_nand { uint8_t upm_cmd_offset; void __iomem *io_base; int rnb_gpio; - const uint32_t *wait_pattern; - const uint32_t *wait_write; int chip_delay; }; @@ -61,10 +60,11 @@ static void fun_wait_rnb(struct fsl_upm_nand *fun) if (fun->rnb_gpio >= 0) { while (--cnt && !fun_chip_ready(&fun->mtd)) cpu_relax(); + if (!cnt) + dev_err(fun->dev, "tired waiting for RNB\n"); + } else { + ndelay(100); } - - if (!cnt) - dev_err(fun->dev, "tired waiting for RNB\n"); } static void fun_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl) @@ -89,8 +89,7 @@ static void fun_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl) fsl_upm_run_pattern(&fun->upm, fun->io_base, cmd); - if (fun->wait_pattern) - fun_wait_rnb(fun); + fun_wait_rnb(fun); } static uint8_t fun_read_byte(struct mtd_info *mtd) @@ -116,14 +115,16 @@ static void fun_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) for (i = 0; i < len; i++) { out_8(fun->chip.IO_ADDR_W, buf[i]); - if (fun->wait_write) - fun_wait_rnb(fun); + fun_wait_rnb(fun); } } -static int __devinit fun_chip_init(struct fsl_upm_nand *fun) +static int __devinit fun_chip_init(struct fsl_upm_nand *fun, + const struct device_node *upm_np, + const struct resource *io_res) { int ret; + struct device_node *flash_np; #ifdef CONFIG_MTD_PARTITIONS static const char *part_types[] = { "cmdlinepart", NULL, }; #endif @@ -143,18 +144,37 @@ static int __devinit fun_chip_init(struct fsl_upm_nand *fun) fun->mtd.priv = &fun->chip; fun->mtd.owner = THIS_MODULE; + flash_np = of_get_next_child(upm_np, NULL); + if (!flash_np) + return -ENODEV; + + fun->mtd.name = kasprintf(GFP_KERNEL, "%x.%s", io_res->start, + flash_np->name); + if (!fun->mtd.name) { + ret = -ENOMEM; + goto err; + } + ret = nand_scan(&fun->mtd, 1); if (ret) - return ret; - - fun->mtd.name = fun->dev->bus_id; + goto err; #ifdef CONFIG_MTD_PARTITIONS ret = parse_mtd_partitions(&fun->mtd, part_types, &fun->parts, 0); + +#ifdef CONFIG_MTD_OF_PARTS + if (ret == 0) + ret = of_mtd_parse_partitions(fun->dev, &fun->mtd, + flash_np, &fun->parts); +#endif if (ret > 0) - return add_mtd_partitions(&fun->mtd, fun->parts, ret); + ret = add_mtd_partitions(&fun->mtd, fun->parts, ret); + else #endif - return add_mtd_device(&fun->mtd); + ret = add_mtd_device(&fun->mtd); +err: + of_node_put(flash_np); + return ret; } static int __devinit fun_probe(struct of_device *ofdev, @@ -211,6 +231,12 @@ static int __devinit fun_probe(struct of_device *ofdev, goto err2; } + prop = of_get_property(ofdev->node, "chip-delay", NULL); + if (prop) + fun->chip_delay = *prop; + else + fun->chip_delay = 50; + fun->io_base = devm_ioremap_nocache(&ofdev->dev, io_res.start, io_res.end - io_res.start + 1); if (!fun->io_base) { @@ -220,17 +246,8 @@ static int __devinit fun_probe(struct of_device *ofdev, fun->dev = &ofdev->dev; fun->last_ctrl = NAND_CLE; - fun->wait_pattern = of_get_property(ofdev->node, "fsl,wait-pattern", - NULL); - fun->wait_write = of_get_property(ofdev->node, "fsl,wait-write", NULL); - - prop = of_get_property(ofdev->node, "chip-delay", NULL); - if (prop) - fun->chip_delay = *prop; - else - fun->chip_delay = 50; - ret = fun_chip_init(fun); + ret = fun_chip_init(fun, ofdev->node, &io_res); if (ret) goto err2; @@ -251,6 +268,7 @@ static int __devexit fun_remove(struct of_device *ofdev) struct fsl_upm_nand *fun = dev_get_drvdata(&ofdev->dev); nand_release(&fun->mtd); + kfree(fun->mtd.name); if (fun->rnb_gpio >= 0) gpio_free(fun->rnb_gpio); diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c new file mode 100644 index 000000000000..8f902e75aa85 --- /dev/null +++ b/drivers/mtd/nand/gpio.c @@ -0,0 +1,375 @@ +/* + * drivers/mtd/nand/gpio.c + * + * Updated, and converted to generic GPIO based driver by Russell King. + * + * Written by Ben Dooks <ben@simtec.co.uk> + * Based on 2.4 version by Mark Whittaker + * + * © 2004 Simtec Electronics + * + * Device driver for NAND connected via GPIO + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/gpio.h> +#include <linux/io.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> +#include <linux/mtd/partitions.h> +#include <linux/mtd/nand-gpio.h> + +struct gpiomtd { + void __iomem *io_sync; + struct mtd_info mtd_info; + struct nand_chip nand_chip; + struct gpio_nand_platdata plat; +}; + +#define gpio_nand_getpriv(x) container_of(x, struct gpiomtd, mtd_info) + + +#ifdef CONFIG_ARM +/* gpio_nand_dosync() + * + * Make sure the GPIO state changes occur in-order with writes to NAND + * memory region. + * Needed on PXA due to bus-reordering within the SoC itself (see section on + * I/O ordering in PXA manual (section 2.3, p35) + */ +static void gpio_nand_dosync(struct gpiomtd *gpiomtd) +{ + unsigned long tmp; + + if (gpiomtd->io_sync) { + /* + * Linux memory barriers don't cater for what's required here. + * What's required is what's here - a read from a separate + * region with a dependency on that read. + */ + tmp = readl(gpiomtd->io_sync); + asm volatile("mov %1, %0\n" : "=r" (tmp) : "r" (tmp)); + } +} +#else +static inline void gpio_nand_dosync(struct gpiomtd *gpiomtd) {} +#endif + +static void gpio_nand_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl) +{ + struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd); + + gpio_nand_dosync(gpiomtd); + + if (ctrl & NAND_CTRL_CHANGE) { + gpio_set_value(gpiomtd->plat.gpio_nce, !(ctrl & NAND_NCE)); + gpio_set_value(gpiomtd->plat.gpio_cle, !!(ctrl & NAND_CLE)); + gpio_set_value(gpiomtd->plat.gpio_ale, !!(ctrl & NAND_ALE)); + gpio_nand_dosync(gpiomtd); + } + if (cmd == NAND_CMD_NONE) + return; + + writeb(cmd, gpiomtd->nand_chip.IO_ADDR_W); + gpio_nand_dosync(gpiomtd); +} + +static void gpio_nand_writebuf(struct mtd_info *mtd, const u_char *buf, int len) +{ + struct nand_chip *this = mtd->priv; + + writesb(this->IO_ADDR_W, buf, len); +} + +static void gpio_nand_readbuf(struct mtd_info *mtd, u_char *buf, int len) +{ + struct nand_chip *this = mtd->priv; + + readsb(this->IO_ADDR_R, buf, len); +} + +static int gpio_nand_verifybuf(struct mtd_info *mtd, const u_char *buf, int len) +{ + struct nand_chip *this = mtd->priv; + unsigned char read, *p = (unsigned char *) buf; + int i, err = 0; + + for (i = 0; i < len; i++) { + read = readb(this->IO_ADDR_R); + if (read != p[i]) { + pr_debug("%s: err at %d (read %04x vs %04x)\n", + __func__, i, read, p[i]); + err = -EFAULT; + } + } + return err; +} + +static void gpio_nand_writebuf16(struct mtd_info *mtd, const u_char *buf, + int len) +{ + struct nand_chip *this = mtd->priv; + + if (IS_ALIGNED((unsigned long)buf, 2)) { + writesw(this->IO_ADDR_W, buf, len>>1); + } else { + int i; + unsigned short *ptr = (unsigned short *)buf; + + for (i = 0; i < len; i += 2, ptr++) + writew(*ptr, this->IO_ADDR_W); + } +} + +static void gpio_nand_readbuf16(struct mtd_info *mtd, u_char *buf, int len) +{ + struct nand_chip *this = mtd->priv; + + if (IS_ALIGNED((unsigned long)buf, 2)) { + readsw(this->IO_ADDR_R, buf, len>>1); + } else { + int i; + unsigned short *ptr = (unsigned short *)buf; + + for (i = 0; i < len; i += 2, ptr++) + *ptr = readw(this->IO_ADDR_R); + } +} + +static int gpio_nand_verifybuf16(struct mtd_info *mtd, const u_char *buf, + int len) +{ + struct nand_chip *this = mtd->priv; + unsigned short read, *p = (unsigned short *) buf; + int i, err = 0; + len >>= 1; + + for (i = 0; i < len; i++) { + read = readw(this->IO_ADDR_R); + if (read != p[i]) { + pr_debug("%s: err at %d (read %04x vs %04x)\n", + __func__, i, read, p[i]); + err = -EFAULT; + } + } + return err; +} + + +static int gpio_nand_devready(struct mtd_info *mtd) +{ + struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd); + return gpio_get_value(gpiomtd->plat.gpio_rdy); +} + +static int __devexit gpio_nand_remove(struct platform_device *dev) +{ + struct gpiomtd *gpiomtd = platform_get_drvdata(dev); + struct resource *res; + + nand_release(&gpiomtd->mtd_info); + + res = platform_get_resource(dev, IORESOURCE_MEM, 1); + iounmap(gpiomtd->io_sync); + if (res) + release_mem_region(res->start, res->end - res->start + 1); + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + iounmap(gpiomtd->nand_chip.IO_ADDR_R); + release_mem_region(res->start, res->end - res->start + 1); + + if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) + gpio_set_value(gpiomtd->plat.gpio_nwp, 0); + gpio_set_value(gpiomtd->plat.gpio_nce, 1); + + gpio_free(gpiomtd->plat.gpio_cle); + gpio_free(gpiomtd->plat.gpio_ale); + gpio_free(gpiomtd->plat.gpio_nce); + if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) + gpio_free(gpiomtd->plat.gpio_nwp); + gpio_free(gpiomtd->plat.gpio_rdy); + + kfree(gpiomtd); + + return 0; +} + +static void __iomem *request_and_remap(struct resource *res, size_t size, + const char *name, int *err) +{ + void __iomem *ptr; + + if (!request_mem_region(res->start, res->end - res->start + 1, name)) { + *err = -EBUSY; + return NULL; + } + + ptr = ioremap(res->start, size); + if (!ptr) { + release_mem_region(res->start, res->end - res->start + 1); + *err = -ENOMEM; + } + return ptr; +} + +static int __devinit gpio_nand_probe(struct platform_device *dev) +{ + struct gpiomtd *gpiomtd; + struct nand_chip *this; + struct resource *res0, *res1; + int ret; + + if (!dev->dev.platform_data) + return -EINVAL; + + res0 = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res0) + return -EINVAL; + + gpiomtd = kzalloc(sizeof(*gpiomtd), GFP_KERNEL); + if (gpiomtd == NULL) { + dev_err(&dev->dev, "failed to create NAND MTD\n"); + return -ENOMEM; + } + + this = &gpiomtd->nand_chip; + this->IO_ADDR_R = request_and_remap(res0, 2, "NAND", &ret); + if (!this->IO_ADDR_R) { + dev_err(&dev->dev, "unable to map NAND\n"); + goto err_map; + } + + res1 = platform_get_resource(dev, IORESOURCE_MEM, 1); + if (res1) { + gpiomtd->io_sync = request_and_remap(res1, 4, "NAND sync", &ret); + if (!gpiomtd->io_sync) { + dev_err(&dev->dev, "unable to map sync NAND\n"); + goto err_sync; + } + } + + memcpy(&gpiomtd->plat, dev->dev.platform_data, sizeof(gpiomtd->plat)); + + ret = gpio_request(gpiomtd->plat.gpio_nce, "NAND NCE"); + if (ret) + goto err_nce; + gpio_direction_output(gpiomtd->plat.gpio_nce, 1); + if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) { + ret = gpio_request(gpiomtd->plat.gpio_nwp, "NAND NWP"); + if (ret) + goto err_nwp; + gpio_direction_output(gpiomtd->plat.gpio_nwp, 1); + } + ret = gpio_request(gpiomtd->plat.gpio_ale, "NAND ALE"); + if (ret) + goto err_ale; + gpio_direction_output(gpiomtd->plat.gpio_ale, 0); + ret = gpio_request(gpiomtd->plat.gpio_cle, "NAND CLE"); + if (ret) + goto err_cle; + gpio_direction_output(gpiomtd->plat.gpio_cle, 0); + ret = gpio_request(gpiomtd->plat.gpio_rdy, "NAND RDY"); + if (ret) + goto err_rdy; + gpio_direction_input(gpiomtd->plat.gpio_rdy); + + + this->IO_ADDR_W = this->IO_ADDR_R; + this->ecc.mode = NAND_ECC_SOFT; + this->options = gpiomtd->plat.options; + this->chip_delay = gpiomtd->plat.chip_delay; + + /* install our routines */ + this->cmd_ctrl = gpio_nand_cmd_ctrl; + this->dev_ready = gpio_nand_devready; + + if (this->options & NAND_BUSWIDTH_16) { + this->read_buf = gpio_nand_readbuf16; + this->write_buf = gpio_nand_writebuf16; + this->verify_buf = gpio_nand_verifybuf16; + } else { + this->read_buf = gpio_nand_readbuf; + this->write_buf = gpio_nand_writebuf; + this->verify_buf = gpio_nand_verifybuf; + } + + /* set the mtd private data for the nand driver */ + gpiomtd->mtd_info.priv = this; + gpiomtd->mtd_info.owner = THIS_MODULE; + + if (nand_scan(&gpiomtd->mtd_info, 1)) { + dev_err(&dev->dev, "no nand chips found?\n"); + ret = -ENXIO; + goto err_wp; + } + + if (gpiomtd->plat.adjust_parts) + gpiomtd->plat.adjust_parts(&gpiomtd->plat, + gpiomtd->mtd_info.size); + + add_mtd_partitions(&gpiomtd->mtd_info, gpiomtd->plat.parts, + gpiomtd->plat.num_parts); + platform_set_drvdata(dev, gpiomtd); + + return 0; + +err_wp: + if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) + gpio_set_value(gpiomtd->plat.gpio_nwp, 0); + gpio_free(gpiomtd->plat.gpio_rdy); +err_rdy: + gpio_free(gpiomtd->plat.gpio_cle); +err_cle: + gpio_free(gpiomtd->plat.gpio_ale); +err_ale: + if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) + gpio_free(gpiomtd->plat.gpio_nwp); +err_nwp: + gpio_free(gpiomtd->plat.gpio_nce); +err_nce: + iounmap(gpiomtd->io_sync); + if (res1) + release_mem_region(res1->start, res1->end - res1->start + 1); +err_sync: + iounmap(gpiomtd->nand_chip.IO_ADDR_R); + release_mem_region(res0->start, res0->end - res0->start + 1); +err_map: + kfree(gpiomtd); + return ret; +} + +static struct platform_driver gpio_nand_driver = { + .probe = gpio_nand_probe, + .remove = gpio_nand_remove, + .driver = { + .name = "gpio-nand", + }, +}; + +static int __init gpio_nand_init(void) +{ + printk(KERN_INFO "GPIO NAND driver, © 2004 Simtec Electronics\n"); + + return platform_driver_register(&gpio_nand_driver); +} + +static void __exit gpio_nand_exit(void) +{ + platform_driver_unregister(&gpio_nand_driver); +} + +module_init(gpio_nand_init); +module_exit(gpio_nand_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>"); +MODULE_DESCRIPTION("GPIO NAND Driver"); diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c new file mode 100644 index 000000000000..21fd4f1c4806 --- /dev/null +++ b/drivers/mtd/nand/mxc_nand.c @@ -0,0 +1,1077 @@ +/* + * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved. + * Copyright 2008 Sascha Hauer, kernel@pengutronix.de + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> +#include <linux/mtd/partitions.h> +#include <linux/interrupt.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/clk.h> +#include <linux/err.h> +#include <linux/io.h> + +#include <asm/mach/flash.h> +#include <mach/mxc_nand.h> + +#define DRIVER_NAME "mxc_nand" + +/* Addresses for NFC registers */ +#define NFC_BUF_SIZE 0xE00 +#define NFC_BUF_ADDR 0xE04 +#define NFC_FLASH_ADDR 0xE06 +#define NFC_FLASH_CMD 0xE08 +#define NFC_CONFIG 0xE0A +#define NFC_ECC_STATUS_RESULT 0xE0C +#define NFC_RSLTMAIN_AREA 0xE0E +#define NFC_RSLTSPARE_AREA 0xE10 +#define NFC_WRPROT 0xE12 +#define NFC_UNLOCKSTART_BLKADDR 0xE14 +#define NFC_UNLOCKEND_BLKADDR 0xE16 +#define NFC_NF_WRPRST 0xE18 +#define NFC_CONFIG1 0xE1A +#define NFC_CONFIG2 0xE1C + +/* Addresses for NFC RAM BUFFER Main area 0 */ +#define MAIN_AREA0 0x000 +#define MAIN_AREA1 0x200 +#define MAIN_AREA2 0x400 +#define MAIN_AREA3 0x600 + +/* Addresses for NFC SPARE BUFFER Spare area 0 */ +#define SPARE_AREA0 0x800 +#define SPARE_AREA1 0x810 +#define SPARE_AREA2 0x820 +#define SPARE_AREA3 0x830 + +/* Set INT to 0, FCMD to 1, rest to 0 in NFC_CONFIG2 Register + * for Command operation */ +#define NFC_CMD 0x1 + +/* Set INT to 0, FADD to 1, rest to 0 in NFC_CONFIG2 Register + * for Address operation */ +#define NFC_ADDR 0x2 + +/* Set INT to 0, FDI to 1, rest to 0 in NFC_CONFIG2 Register + * for Input operation */ +#define NFC_INPUT 0x4 + +/* Set INT to 0, FDO to 001, rest to 0 in NFC_CONFIG2 Register + * for Data Output operation */ +#define NFC_OUTPUT 0x8 + +/* Set INT to 0, FD0 to 010, rest to 0 in NFC_CONFIG2 Register + * for Read ID operation */ +#define NFC_ID 0x10 + +/* Set INT to 0, FDO to 100, rest to 0 in NFC_CONFIG2 Register + * for Read Status operation */ +#define NFC_STATUS 0x20 + +/* Set INT to 1, rest to 0 in NFC_CONFIG2 Register for Read + * Status operation */ +#define NFC_INT 0x8000 + +#define NFC_SP_EN (1 << 2) +#define NFC_ECC_EN (1 << 3) +#define NFC_INT_MSK (1 << 4) +#define NFC_BIG (1 << 5) +#define NFC_RST (1 << 6) +#define NFC_CE (1 << 7) +#define NFC_ONE_CYCLE (1 << 8) + +struct mxc_nand_host { + struct mtd_info mtd; + struct nand_chip nand; + struct mtd_partition *parts; + struct device *dev; + + void __iomem *regs; + int spare_only; + int status_request; + int pagesize_2k; + uint16_t col_addr; + struct clk *clk; + int clk_act; + int irq; + + wait_queue_head_t irq_waitq; +}; + +/* Define delays in microsec for NAND device operations */ +#define TROP_US_DELAY 2000 +/* Macros to get byte and bit positions of ECC */ +#define COLPOS(x) ((x) >> 3) +#define BITPOS(x) ((x) & 0xf) + +/* Define single bit Error positions in Main & Spare area */ +#define MAIN_SINGLEBIT_ERROR 0x4 +#define SPARE_SINGLEBIT_ERROR 0x1 + +/* OOB placement block for use with hardware ecc generation */ +static struct nand_ecclayout nand_hw_eccoob_8 = { + .eccbytes = 5, + .eccpos = {6, 7, 8, 9, 10}, + .oobfree = {{0, 5}, {11, 5}, } +}; + +static struct nand_ecclayout nand_hw_eccoob_16 = { + .eccbytes = 5, + .eccpos = {6, 7, 8, 9, 10}, + .oobfree = {{0, 6}, {12, 4}, } +}; + +#ifdef CONFIG_MTD_PARTITIONS +static const char *part_probes[] = { "RedBoot", "cmdlinepart", NULL }; +#endif + +static irqreturn_t mxc_nfc_irq(int irq, void *dev_id) +{ + struct mxc_nand_host *host = dev_id; + + uint16_t tmp; + + tmp = readw(host->regs + NFC_CONFIG1); + tmp |= NFC_INT_MSK; /* Disable interrupt */ + writew(tmp, host->regs + NFC_CONFIG1); + + wake_up(&host->irq_waitq); + + return IRQ_HANDLED; +} + +/* This function polls the NANDFC to wait for the basic operation to + * complete by checking the INT bit of config2 register. + */ +static void wait_op_done(struct mxc_nand_host *host, int max_retries, + uint16_t param, int useirq) +{ + uint32_t tmp; + + if (useirq) { + if ((readw(host->regs + NFC_CONFIG2) & NFC_INT) == 0) { + + tmp = readw(host->regs + NFC_CONFIG1); + tmp &= ~NFC_INT_MSK; /* Enable interrupt */ + writew(tmp, host->regs + NFC_CONFIG1); + + wait_event(host->irq_waitq, + readw(host->regs + NFC_CONFIG2) & NFC_INT); + + tmp = readw(host->regs + NFC_CONFIG2); + tmp &= ~NFC_INT; + writew(tmp, host->regs + NFC_CONFIG2); + } + } else { + while (max_retries-- > 0) { + if (readw(host->regs + NFC_CONFIG2) & NFC_INT) { + tmp = readw(host->regs + NFC_CONFIG2); + tmp &= ~NFC_INT; + writew(tmp, host->regs + NFC_CONFIG2); + break; + } + udelay(1); + } + if (max_retries <= 0) + DEBUG(MTD_DEBUG_LEVEL0, "%s(%d): INT not set\n", + __func__, param); + } +} + +/* This function issues the specified command to the NAND device and + * waits for completion. */ +static void send_cmd(struct mxc_nand_host *host, uint16_t cmd, int useirq) +{ + DEBUG(MTD_DEBUG_LEVEL3, "send_cmd(host, 0x%x, %d)\n", cmd, useirq); + + writew(cmd, host->regs + NFC_FLASH_CMD); + writew(NFC_CMD, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, cmd, useirq); +} + +/* This function sends an address (or partial address) to the + * NAND device. The address is used to select the source/destination for + * a NAND command. */ +static void send_addr(struct mxc_nand_host *host, uint16_t addr, int islast) +{ + DEBUG(MTD_DEBUG_LEVEL3, "send_addr(host, 0x%x %d)\n", addr, islast); + + writew(addr, host->regs + NFC_FLASH_ADDR); + writew(NFC_ADDR, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, addr, islast); +} + +/* This function requests the NANDFC to initate the transfer + * of data currently in the NANDFC RAM buffer to the NAND device. */ +static void send_prog_page(struct mxc_nand_host *host, uint8_t buf_id, + int spare_only) +{ + DEBUG(MTD_DEBUG_LEVEL3, "send_prog_page (%d)\n", spare_only); + + /* NANDFC buffer 0 is used for page read/write */ + writew(buf_id, host->regs + NFC_BUF_ADDR); + + /* Configure spare or page+spare access */ + if (!host->pagesize_2k) { + uint16_t config1 = readw(host->regs + NFC_CONFIG1); + if (spare_only) + config1 |= NFC_SP_EN; + else + config1 &= ~(NFC_SP_EN); + writew(config1, host->regs + NFC_CONFIG1); + } + + writew(NFC_INPUT, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, spare_only, true); +} + +/* Requests NANDFC to initated the transfer of data from the + * NAND device into in the NANDFC ram buffer. */ +static void send_read_page(struct mxc_nand_host *host, uint8_t buf_id, + int spare_only) +{ + DEBUG(MTD_DEBUG_LEVEL3, "send_read_page (%d)\n", spare_only); + + /* NANDFC buffer 0 is used for page read/write */ + writew(buf_id, host->regs + NFC_BUF_ADDR); + + /* Configure spare or page+spare access */ + if (!host->pagesize_2k) { + uint32_t config1 = readw(host->regs + NFC_CONFIG1); + if (spare_only) + config1 |= NFC_SP_EN; + else + config1 &= ~NFC_SP_EN; + writew(config1, host->regs + NFC_CONFIG1); + } + + writew(NFC_OUTPUT, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, spare_only, true); +} + +/* Request the NANDFC to perform a read of the NAND device ID. */ +static void send_read_id(struct mxc_nand_host *host) +{ + struct nand_chip *this = &host->nand; + uint16_t tmp; + + /* NANDFC buffer 0 is used for device ID output */ + writew(0x0, host->regs + NFC_BUF_ADDR); + + /* Read ID into main buffer */ + tmp = readw(host->regs + NFC_CONFIG1); + tmp &= ~NFC_SP_EN; + writew(tmp, host->regs + NFC_CONFIG1); + + writew(NFC_ID, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, 0, true); + + if (this->options & NAND_BUSWIDTH_16) { + void __iomem *main_buf = host->regs + MAIN_AREA0; + /* compress the ID info */ + writeb(readb(main_buf + 2), main_buf + 1); + writeb(readb(main_buf + 4), main_buf + 2); + writeb(readb(main_buf + 6), main_buf + 3); + writeb(readb(main_buf + 8), main_buf + 4); + writeb(readb(main_buf + 10), main_buf + 5); + } +} + +/* This function requests the NANDFC to perform a read of the + * NAND device status and returns the current status. */ +static uint16_t get_dev_status(struct mxc_nand_host *host) +{ + void __iomem *main_buf = host->regs + MAIN_AREA1; + uint32_t store; + uint16_t ret, tmp; + /* Issue status request to NAND device */ + + /* store the main area1 first word, later do recovery */ + store = readl(main_buf); + /* NANDFC buffer 1 is used for device status to prevent + * corruption of read/write buffer on status requests. */ + writew(1, host->regs + NFC_BUF_ADDR); + + /* Read status into main buffer */ + tmp = readw(host->regs + NFC_CONFIG1); + tmp &= ~NFC_SP_EN; + writew(tmp, host->regs + NFC_CONFIG1); + + writew(NFC_STATUS, host->regs + NFC_CONFIG2); + + /* Wait for operation to complete */ + wait_op_done(host, TROP_US_DELAY, 0, true); + + /* Status is placed in first word of main buffer */ + /* get status, then recovery area 1 data */ + ret = readw(main_buf); + writel(store, main_buf); + + return ret; +} + +/* This functions is used by upper layer to checks if device is ready */ +static int mxc_nand_dev_ready(struct mtd_info *mtd) +{ + /* + * NFC handles R/B internally. Therefore, this function + * always returns status as ready. + */ + return 1; +} + +static void mxc_nand_enable_hwecc(struct mtd_info *mtd, int mode) +{ + /* + * If HW ECC is enabled, we turn it on during init. There is + * no need to enable again here. + */ +} + +static int mxc_nand_correct_data(struct mtd_info *mtd, u_char *dat, + u_char *read_ecc, u_char *calc_ecc) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + + /* + * 1-Bit errors are automatically corrected in HW. No need for + * additional correction. 2-Bit errors cannot be corrected by + * HW ECC, so we need to return failure + */ + uint16_t ecc_status = readw(host->regs + NFC_ECC_STATUS_RESULT); + + if (((ecc_status & 0x3) == 2) || ((ecc_status >> 2) == 2)) { + DEBUG(MTD_DEBUG_LEVEL0, + "MXC_NAND: HWECC uncorrectable 2-bit ECC error\n"); + return -1; + } + + return 0; +} + +static int mxc_nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat, + u_char *ecc_code) +{ + return 0; +} + +static u_char mxc_nand_read_byte(struct mtd_info *mtd) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + uint8_t ret = 0; + uint16_t col, rd_word; + uint16_t __iomem *main_buf = host->regs + MAIN_AREA0; + uint16_t __iomem *spare_buf = host->regs + SPARE_AREA0; + + /* Check for status request */ + if (host->status_request) + return get_dev_status(host) & 0xFF; + + /* Get column for 16-bit access */ + col = host->col_addr >> 1; + + /* If we are accessing the spare region */ + if (host->spare_only) + rd_word = readw(&spare_buf[col]); + else + rd_word = readw(&main_buf[col]); + + /* Pick upper/lower byte of word from RAM buffer */ + if (host->col_addr & 0x1) + ret = (rd_word >> 8) & 0xFF; + else + ret = rd_word & 0xFF; + + /* Update saved column address */ + host->col_addr++; + + return ret; +} + +static uint16_t mxc_nand_read_word(struct mtd_info *mtd) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + uint16_t col, rd_word, ret; + uint16_t __iomem *p; + + DEBUG(MTD_DEBUG_LEVEL3, + "mxc_nand_read_word(col = %d)\n", host->col_addr); + + col = host->col_addr; + /* Adjust saved column address */ + if (col < mtd->writesize && host->spare_only) + col += mtd->writesize; + + if (col < mtd->writesize) + p = (host->regs + MAIN_AREA0) + (col >> 1); + else + p = (host->regs + SPARE_AREA0) + ((col - mtd->writesize) >> 1); + + if (col & 1) { + rd_word = readw(p); + ret = (rd_word >> 8) & 0xff; + rd_word = readw(&p[1]); + ret |= (rd_word << 8) & 0xff00; + + } else + ret = readw(p); + + /* Update saved column address */ + host->col_addr = col + 2; + + return ret; +} + +/* Write data of length len to buffer buf. The data to be + * written on NAND Flash is first copied to RAMbuffer. After the Data Input + * Operation by the NFC, the data is written to NAND Flash */ +static void mxc_nand_write_buf(struct mtd_info *mtd, + const u_char *buf, int len) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + int n, col, i = 0; + + DEBUG(MTD_DEBUG_LEVEL3, + "mxc_nand_write_buf(col = %d, len = %d)\n", host->col_addr, + len); + + col = host->col_addr; + + /* Adjust saved column address */ + if (col < mtd->writesize && host->spare_only) + col += mtd->writesize; + + n = mtd->writesize + mtd->oobsize - col; + n = min(len, n); + + DEBUG(MTD_DEBUG_LEVEL3, + "%s:%d: col = %d, n = %d\n", __func__, __LINE__, col, n); + + while (n) { + void __iomem *p; + + if (col < mtd->writesize) + p = host->regs + MAIN_AREA0 + (col & ~3); + else + p = host->regs + SPARE_AREA0 - + mtd->writesize + (col & ~3); + + DEBUG(MTD_DEBUG_LEVEL3, "%s:%d: p = %p\n", __func__, + __LINE__, p); + + if (((col | (int)&buf[i]) & 3) || n < 16) { + uint32_t data = 0; + + if (col & 3 || n < 4) + data = readl(p); + + switch (col & 3) { + case 0: + if (n) { + data = (data & 0xffffff00) | + (buf[i++] << 0); + n--; + col++; + } + case 1: + if (n) { + data = (data & 0xffff00ff) | + (buf[i++] << 8); + n--; + col++; + } + case 2: + if (n) { + data = (data & 0xff00ffff) | + (buf[i++] << 16); + n--; + col++; + } + case 3: + if (n) { + data = (data & 0x00ffffff) | + (buf[i++] << 24); + n--; + col++; + } + } + + writel(data, p); + } else { + int m = mtd->writesize - col; + + if (col >= mtd->writesize) + m += mtd->oobsize; + + m = min(n, m) & ~3; + + DEBUG(MTD_DEBUG_LEVEL3, + "%s:%d: n = %d, m = %d, i = %d, col = %d\n", + __func__, __LINE__, n, m, i, col); + + memcpy(p, &buf[i], m); + col += m; + i += m; + n -= m; + } + } + /* Update saved column address */ + host->col_addr = col; +} + +/* Read the data buffer from the NAND Flash. To read the data from NAND + * Flash first the data output cycle is initiated by the NFC, which copies + * the data to RAMbuffer. This data of length len is then copied to buffer buf. + */ +static void mxc_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + int n, col, i = 0; + + DEBUG(MTD_DEBUG_LEVEL3, + "mxc_nand_read_buf(col = %d, len = %d)\n", host->col_addr, len); + + col = host->col_addr; + + /* Adjust saved column address */ + if (col < mtd->writesize && host->spare_only) + col += mtd->writesize; + + n = mtd->writesize + mtd->oobsize - col; + n = min(len, n); + + while (n) { + void __iomem *p; + + if (col < mtd->writesize) + p = host->regs + MAIN_AREA0 + (col & ~3); + else + p = host->regs + SPARE_AREA0 - + mtd->writesize + (col & ~3); + + if (((col | (int)&buf[i]) & 3) || n < 16) { + uint32_t data; + + data = readl(p); + switch (col & 3) { + case 0: + if (n) { + buf[i++] = (uint8_t) (data); + n--; + col++; + } + case 1: + if (n) { + buf[i++] = (uint8_t) (data >> 8); + n--; + col++; + } + case 2: + if (n) { + buf[i++] = (uint8_t) (data >> 16); + n--; + col++; + } + case 3: + if (n) { + buf[i++] = (uint8_t) (data >> 24); + n--; + col++; + } + } + } else { + int m = mtd->writesize - col; + + if (col >= mtd->writesize) + m += mtd->oobsize; + + m = min(n, m) & ~3; + memcpy(&buf[i], p, m); + col += m; + i += m; + n -= m; + } + } + /* Update saved column address */ + host->col_addr = col; + +} + +/* Used by the upper layer to verify the data in NAND Flash + * with the data in the buf. */ +static int mxc_nand_verify_buf(struct mtd_info *mtd, + const u_char *buf, int len) +{ + return -EFAULT; +} + +/* This function is used by upper layer for select and + * deselect of the NAND chip */ +static void mxc_nand_select_chip(struct mtd_info *mtd, int chip) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + +#ifdef CONFIG_MTD_NAND_MXC_FORCE_CE + if (chip > 0) { + DEBUG(MTD_DEBUG_LEVEL0, + "ERROR: Illegal chip select (chip = %d)\n", chip); + return; + } + + if (chip == -1) { + writew(readw(host->regs + NFC_CONFIG1) & ~NFC_CE, + host->regs + NFC_CONFIG1); + return; + } + + writew(readw(host->regs + NFC_CONFIG1) | NFC_CE, + host->regs + NFC_CONFIG1); +#endif + + switch (chip) { + case -1: + /* Disable the NFC clock */ + if (host->clk_act) { + clk_disable(host->clk); + host->clk_act = 0; + } + break; + case 0: + /* Enable the NFC clock */ + if (!host->clk_act) { + clk_enable(host->clk); + host->clk_act = 1; + } + break; + + default: + break; + } +} + +/* Used by the upper layer to write command to NAND Flash for + * different operations to be carried out on NAND Flash */ +static void mxc_nand_command(struct mtd_info *mtd, unsigned command, + int column, int page_addr) +{ + struct nand_chip *nand_chip = mtd->priv; + struct mxc_nand_host *host = nand_chip->priv; + int useirq = true; + + DEBUG(MTD_DEBUG_LEVEL3, + "mxc_nand_command (cmd = 0x%x, col = 0x%x, page = 0x%x)\n", + command, column, page_addr); + + /* Reset command state information */ + host->status_request = false; + + /* Command pre-processing step */ + switch (command) { + + case NAND_CMD_STATUS: + host->col_addr = 0; + host->status_request = true; + break; + + case NAND_CMD_READ0: + host->col_addr = column; + host->spare_only = false; + useirq = false; + break; + + case NAND_CMD_READOOB: + host->col_addr = column; + host->spare_only = true; + useirq = false; + if (host->pagesize_2k) + command = NAND_CMD_READ0; /* only READ0 is valid */ + break; + + case NAND_CMD_SEQIN: + if (column >= mtd->writesize) { + /* + * FIXME: before send SEQIN command for write OOB, + * We must read one page out. + * For K9F1GXX has no READ1 command to set current HW + * pointer to spare area, we must write the whole page + * including OOB together. + */ + if (host->pagesize_2k) + /* call ourself to read a page */ + mxc_nand_command(mtd, NAND_CMD_READ0, 0, + page_addr); + + host->col_addr = column - mtd->writesize; + host->spare_only = true; + + /* Set program pointer to spare region */ + if (!host->pagesize_2k) + send_cmd(host, NAND_CMD_READOOB, false); + } else { + host->spare_only = false; + host->col_addr = column; + + /* Set program pointer to page start */ + if (!host->pagesize_2k) + send_cmd(host, NAND_CMD_READ0, false); + } + useirq = false; + break; + + case NAND_CMD_PAGEPROG: + send_prog_page(host, 0, host->spare_only); + + if (host->pagesize_2k) { + /* data in 4 areas datas */ + send_prog_page(host, 1, host->spare_only); + send_prog_page(host, 2, host->spare_only); + send_prog_page(host, 3, host->spare_only); + } + + break; + + case NAND_CMD_ERASE1: + useirq = false; + break; + } + + /* Write out the command to the device. */ + send_cmd(host, command, useirq); + + /* Write out column address, if necessary */ + if (column != -1) { + /* + * MXC NANDFC can only perform full page+spare or + * spare-only read/write. When the upper layers + * layers perform a read/write buf operation, + * we will used the saved column adress to index into + * the full page. + */ + send_addr(host, 0, page_addr == -1); + if (host->pagesize_2k) + /* another col addr cycle for 2k page */ + send_addr(host, 0, false); + } + + /* Write out page address, if necessary */ + if (page_addr != -1) { + /* paddr_0 - p_addr_7 */ + send_addr(host, (page_addr & 0xff), false); + + if (host->pagesize_2k) { + send_addr(host, (page_addr >> 8) & 0xFF, false); + if (mtd->size >= 0x40000000) + send_addr(host, (page_addr >> 16) & 0xff, true); + } else { + /* One more address cycle for higher density devices */ + if (mtd->size >= 0x4000000) { + /* paddr_8 - paddr_15 */ + send_addr(host, (page_addr >> 8) & 0xff, false); + send_addr(host, (page_addr >> 16) & 0xff, true); + } else + /* paddr_8 - paddr_15 */ + send_addr(host, (page_addr >> 8) & 0xff, true); + } + } + + /* Command post-processing step */ + switch (command) { + + case NAND_CMD_RESET: + break; + + case NAND_CMD_READOOB: + case NAND_CMD_READ0: + if (host->pagesize_2k) { + /* send read confirm command */ + send_cmd(host, NAND_CMD_READSTART, true); + /* read for each AREA */ + send_read_page(host, 0, host->spare_only); + send_read_page(host, 1, host->spare_only); + send_read_page(host, 2, host->spare_only); + send_read_page(host, 3, host->spare_only); + } else + send_read_page(host, 0, host->spare_only); + break; + + case NAND_CMD_READID: + send_read_id(host); + break; + + case NAND_CMD_PAGEPROG: + break; + + case NAND_CMD_STATUS: + break; + + case NAND_CMD_ERASE2: + break; + } +} + +static int __init mxcnd_probe(struct platform_device *pdev) +{ + struct nand_chip *this; + struct mtd_info *mtd; + struct mxc_nand_platform_data *pdata = pdev->dev.platform_data; + struct mxc_nand_host *host; + struct resource *res; + uint16_t tmp; + int err = 0, nr_parts = 0; + + /* Allocate memory for MTD device structure and private data */ + host = kzalloc(sizeof(struct mxc_nand_host), GFP_KERNEL); + if (!host) + return -ENOMEM; + + host->dev = &pdev->dev; + /* structures must be linked */ + this = &host->nand; + mtd = &host->mtd; + mtd->priv = this; + mtd->owner = THIS_MODULE; + + /* 50 us command delay time */ + this->chip_delay = 5; + + this->priv = host; + this->dev_ready = mxc_nand_dev_ready; + this->cmdfunc = mxc_nand_command; + this->select_chip = mxc_nand_select_chip; + this->read_byte = mxc_nand_read_byte; + this->read_word = mxc_nand_read_word; + this->write_buf = mxc_nand_write_buf; + this->read_buf = mxc_nand_read_buf; + this->verify_buf = mxc_nand_verify_buf; + + host->clk = clk_get(&pdev->dev, "nfc_clk"); + if (IS_ERR(host->clk)) + goto eclk; + + clk_enable(host->clk); + host->clk_act = 1; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + err = -ENODEV; + goto eres; + } + + host->regs = ioremap(res->start, res->end - res->start + 1); + if (!host->regs) { + err = -EIO; + goto eres; + } + + tmp = readw(host->regs + NFC_CONFIG1); + tmp |= NFC_INT_MSK; + writew(tmp, host->regs + NFC_CONFIG1); + + init_waitqueue_head(&host->irq_waitq); + + host->irq = platform_get_irq(pdev, 0); + + err = request_irq(host->irq, mxc_nfc_irq, 0, "mxc_nd", host); + if (err) + goto eirq; + + if (pdata->hw_ecc) { + this->ecc.calculate = mxc_nand_calculate_ecc; + this->ecc.hwctl = mxc_nand_enable_hwecc; + this->ecc.correct = mxc_nand_correct_data; + this->ecc.mode = NAND_ECC_HW; + this->ecc.size = 512; + this->ecc.bytes = 3; + this->ecc.layout = &nand_hw_eccoob_8; + tmp = readw(host->regs + NFC_CONFIG1); + tmp |= NFC_ECC_EN; + writew(tmp, host->regs + NFC_CONFIG1); + } else { + this->ecc.size = 512; + this->ecc.bytes = 3; + this->ecc.layout = &nand_hw_eccoob_8; + this->ecc.mode = NAND_ECC_SOFT; + tmp = readw(host->regs + NFC_CONFIG1); + tmp &= ~NFC_ECC_EN; + writew(tmp, host->regs + NFC_CONFIG1); + } + + /* Reset NAND */ + this->cmdfunc(mtd, NAND_CMD_RESET, -1, -1); + + /* preset operation */ + /* Unlock the internal RAM Buffer */ + writew(0x2, host->regs + NFC_CONFIG); + + /* Blocks to be unlocked */ + writew(0x0, host->regs + NFC_UNLOCKSTART_BLKADDR); + writew(0x4000, host->regs + NFC_UNLOCKEND_BLKADDR); + + /* Unlock Block Command for given address range */ + writew(0x4, host->regs + NFC_WRPROT); + + /* NAND bus width determines access funtions used by upper layer */ + if (pdata->width == 2) { + this->options |= NAND_BUSWIDTH_16; + this->ecc.layout = &nand_hw_eccoob_16; + } + + host->pagesize_2k = 0; + + /* Scan to find existence of the device */ + if (nand_scan(mtd, 1)) { + DEBUG(MTD_DEBUG_LEVEL0, + "MXC_ND: Unable to find any NAND device.\n"); + err = -ENXIO; + goto escan; + } + + /* Register the partitions */ +#ifdef CONFIG_MTD_PARTITIONS + nr_parts = + parse_mtd_partitions(mtd, part_probes, &host->parts, 0); + if (nr_parts > 0) + add_mtd_partitions(mtd, host->parts, nr_parts); + else +#endif + { + pr_info("Registering %s as whole device\n", mtd->name); + add_mtd_device(mtd); + } + + platform_set_drvdata(pdev, host); + + return 0; + +escan: + free_irq(host->irq, NULL); +eirq: + iounmap(host->regs); +eres: + clk_put(host->clk); +eclk: + kfree(host); + + return err; +} + +static int __devexit mxcnd_remove(struct platform_device *pdev) +{ + struct mxc_nand_host *host = platform_get_drvdata(pdev); + + clk_put(host->clk); + + platform_set_drvdata(pdev, NULL); + + nand_release(&host->mtd); + free_irq(host->irq, NULL); + iounmap(host->regs); + kfree(host); + + return 0; +} + +#ifdef CONFIG_PM +static int mxcnd_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct mtd_info *info = platform_get_drvdata(pdev); + int ret = 0; + + DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND suspend\n"); + if (info) + ret = info->suspend(info); + + /* Disable the NFC clock */ + clk_disable(nfc_clk); /* FIXME */ + + return ret; +} + +static int mxcnd_resume(struct platform_device *pdev) +{ + struct mtd_info *info = platform_get_drvdata(pdev); + int ret = 0; + + DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND resume\n"); + /* Enable the NFC clock */ + clk_enable(nfc_clk); /* FIXME */ + + if (info) + info->resume(info); + + return ret; +} + +#else +# define mxcnd_suspend NULL +# define mxcnd_resume NULL +#endif /* CONFIG_PM */ + +static struct platform_driver mxcnd_driver = { + .driver = { + .name = DRIVER_NAME, + }, + .remove = __exit_p(mxcnd_remove), + .suspend = mxcnd_suspend, + .resume = mxcnd_resume, +}; + +static int __init mxc_nd_init(void) +{ + /* Register the device driver structure. */ + pr_info("MXC MTD nand Driver\n"); + if (platform_driver_probe(&mxcnd_driver, mxcnd_probe) != 0) { + printk(KERN_ERR "Driver register failed for mxcnd_driver\n"); + return -ENODEV; + } + return 0; +} + +static void __exit mxc_nd_cleanup(void) +{ + /* Unregister the device structure */ + platform_driver_unregister(&mxcnd_driver); +} + +module_init(mxc_nd_init); +module_exit(mxc_nd_cleanup); + +MODULE_AUTHOR("Freescale Semiconductor, Inc."); +MODULE_DESCRIPTION("MXC NAND MTD driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index d1129bae6c27..0a9c9cd33f96 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -801,9 +801,9 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip, * nand_read_subpage - [REPLACABLE] software ecc based sub-page read function * @mtd: mtd info structure * @chip: nand chip info structure - * @dataofs offset of requested data within the page - * @readlen data length - * @buf: buffer to store read data + * @data_offs: offset of requested data within the page + * @readlen: data length + * @bufpoi: buffer to store read data */ static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, uint32_t data_offs, uint32_t readlen, uint8_t *bufpoi) { @@ -2042,7 +2042,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, return -EINVAL; } - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; /* Grab the lock and see if the device is available */ nand_get_device(chip, mtd, FL_ERASING); @@ -2318,6 +2318,12 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, /* Select the device */ chip->select_chip(mtd, 0); + /* + * Reset the chip, required by some chips (e.g. Micron MT29FxGxxxxx) + * after power-up + */ + chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1); + /* Send the command for reading device ID */ chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); @@ -2488,6 +2494,8 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips) /* Check for a chip array */ for (i = 1; i < maxchips; i++) { chip->select_chip(mtd, i); + /* See comment in nand_get_flash_type for reset */ + chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1); /* Send the command for reading device ID */ chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); /* Read manufacturer and device IDs */ diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index 918a806a8471..868147acce2c 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -1,13 +1,18 @@ /* - * This file contains an ECC algorithm from Toshiba that detects and - * corrects 1 bit errors in a 256 byte block of data. + * This file contains an ECC algorithm that detects and corrects 1 bit + * errors in a 256 byte block of data. * * drivers/mtd/nand/nand_ecc.c * - * Copyright (C) 2000-2004 Steven J. Hill (sjhill@realitydiluted.com) - * Toshiba America Electronics Components, Inc. + * Copyright © 2008 Koninklijke Philips Electronics NV. + * Author: Frans Meulenbroeks * - * Copyright (C) 2006 Thomas Gleixner <tglx@linutronix.de> + * Completely replaces the previous ECC implementation which was written by: + * Steven J. Hill (sjhill@realitydiluted.com) + * Thomas Gleixner (tglx@linutronix.de) + * + * Information on how this algorithm works and how it was developed + * can be found in Documentation/mtd/nand_ecc.txt * * This file is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -23,174 +28,475 @@ * with this file; if not, write to the Free Software Foundation, Inc., * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. * - * As a special exception, if other files instantiate templates or use - * macros or inline functions from these files, or you compile these - * files and link them with other works to produce a work based on these - * files, these files do not by themselves cause the resulting work to be - * covered by the GNU General Public License. However the source code for - * these files must still be made available in accordance with section (3) - * of the GNU General Public License. - * - * This exception does not invalidate any other reasons why a work based on - * this file might be covered by the GNU General Public License. */ +/* + * The STANDALONE macro is useful when running the code outside the kernel + * e.g. when running the code in a testbed or a benchmark program. + * When STANDALONE is used, the module related macros are commented out + * as well as the linux include files. + * Instead a private definition of mtd_info is given to satisfy the compiler + * (the code does not use mtd_info, so the code does not care) + */ +#ifndef STANDALONE #include <linux/types.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> #include <linux/mtd/nand_ecc.h> +#include <asm/byteorder.h> +#else +#include <stdint.h> +struct mtd_info; +#define EXPORT_SYMBOL(x) /* x */ + +#define MODULE_LICENSE(x) /* x */ +#define MODULE_AUTHOR(x) /* x */ +#define MODULE_DESCRIPTION(x) /* x */ + +#define printk printf +#define KERN_ERR "" +#endif + +/* + * invparity is a 256 byte table that contains the odd parity + * for each byte. So if the number of bits in a byte is even, + * the array element is 1, and when the number of bits is odd + * the array eleemnt is 0. + */ +static const char invparity[256] = { + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, + 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1 +}; + +/* + * bitsperbyte contains the number of bits per byte + * this is only used for testing and repairing parity + * (a precalculated value slightly improves performance) + */ +static const char bitsperbyte[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; /* - * Pre-calculated 256-way 1 byte column parity + * addressbits is a lookup table to filter out the bits from the xor-ed + * ecc data that identify the faulty location. + * this is only used for repairing parity + * see the comments in nand_correct_data for more details */ -static const u_char nand_ecc_precalc_table[] = { - 0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00, - 0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65, - 0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66, - 0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03, - 0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69, - 0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c, - 0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f, - 0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a, - 0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a, - 0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f, - 0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c, - 0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69, - 0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03, - 0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66, - 0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65, - 0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00 +static const char addressbits[256] = { + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, + 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05, + 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09, + 0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, + 0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, + 0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f }; /** - * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block + * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte + * block * @mtd: MTD block structure - * @dat: raw data - * @ecc_code: buffer for ECC + * @buf: input buffer with raw data + * @code: output buffer with ECC */ -int nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat, - u_char *ecc_code) +int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, + unsigned char *code) { - uint8_t idx, reg1, reg2, reg3, tmp1, tmp2; int i; + const uint32_t *bp = (uint32_t *)buf; + /* 256 or 512 bytes/ecc */ + const uint32_t eccsize_mult = + (((struct nand_chip *)mtd->priv)->ecc.size) >> 8; + uint32_t cur; /* current value in buffer */ + /* rp0..rp15..rp17 are the various accumulated parities (per byte) */ + uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7; + uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16; + uint32_t uninitialized_var(rp17); /* to make compiler happy */ + uint32_t par; /* the cumulative parity for all data */ + uint32_t tmppar; /* the cumulative parity for this iteration; + for rp12, rp14 and rp16 at the end of the + loop */ + + par = 0; + rp4 = 0; + rp6 = 0; + rp8 = 0; + rp10 = 0; + rp12 = 0; + rp14 = 0; + rp16 = 0; + + /* + * The loop is unrolled a number of times; + * This avoids if statements to decide on which rp value to update + * Also we process the data by longwords. + * Note: passing unaligned data might give a performance penalty. + * It is assumed that the buffers are aligned. + * tmppar is the cumulative sum of this iteration. + * needed for calculating rp12, rp14, rp16 and par + * also used as a performance improvement for rp6, rp8 and rp10 + */ + for (i = 0; i < eccsize_mult << 2; i++) { + cur = *bp++; + tmppar = cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= tmppar; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp8 ^= tmppar; - /* Initialize variables */ - reg1 = reg2 = reg3 = 0; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp10 ^= tmppar; - /* Build up column parity */ - for(i = 0; i < 256; i++) { - /* Get CP0 - CP5 from table */ - idx = nand_ecc_precalc_table[*dat++]; - reg1 ^= (idx & 0x3f); + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp8 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp8 ^= cur; - /* All bit XOR = 1 ? */ - if (idx & 0x40) { - reg3 ^= (uint8_t) i; - reg2 ^= ~((uint8_t) i); - } + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp6 ^= cur; + cur = *bp++; + tmppar ^= cur; + rp4 ^= cur; + cur = *bp++; + tmppar ^= cur; + + par ^= tmppar; + if ((i & 0x1) == 0) + rp12 ^= tmppar; + if ((i & 0x2) == 0) + rp14 ^= tmppar; + if (eccsize_mult == 2 && (i & 0x4) == 0) + rp16 ^= tmppar; } - /* Create non-inverted ECC code from line parity */ - tmp1 = (reg3 & 0x80) >> 0; /* B7 -> B7 */ - tmp1 |= (reg2 & 0x80) >> 1; /* B7 -> B6 */ - tmp1 |= (reg3 & 0x40) >> 1; /* B6 -> B5 */ - tmp1 |= (reg2 & 0x40) >> 2; /* B6 -> B4 */ - tmp1 |= (reg3 & 0x20) >> 2; /* B5 -> B3 */ - tmp1 |= (reg2 & 0x20) >> 3; /* B5 -> B2 */ - tmp1 |= (reg3 & 0x10) >> 3; /* B4 -> B1 */ - tmp1 |= (reg2 & 0x10) >> 4; /* B4 -> B0 */ - - tmp2 = (reg3 & 0x08) << 4; /* B3 -> B7 */ - tmp2 |= (reg2 & 0x08) << 3; /* B3 -> B6 */ - tmp2 |= (reg3 & 0x04) << 3; /* B2 -> B5 */ - tmp2 |= (reg2 & 0x04) << 2; /* B2 -> B4 */ - tmp2 |= (reg3 & 0x02) << 2; /* B1 -> B3 */ - tmp2 |= (reg2 & 0x02) << 1; /* B1 -> B2 */ - tmp2 |= (reg3 & 0x01) << 1; /* B0 -> B1 */ - tmp2 |= (reg2 & 0x01) << 0; /* B7 -> B0 */ - - /* Calculate final ECC code */ -#ifdef CONFIG_MTD_NAND_ECC_SMC - ecc_code[0] = ~tmp2; - ecc_code[1] = ~tmp1; + /* + * handle the fact that we use longword operations + * we'll bring rp4..rp14..rp16 back to single byte entities by + * shifting and xoring first fold the upper and lower 16 bits, + * then the upper and lower 8 bits. + */ + rp4 ^= (rp4 >> 16); + rp4 ^= (rp4 >> 8); + rp4 &= 0xff; + rp6 ^= (rp6 >> 16); + rp6 ^= (rp6 >> 8); + rp6 &= 0xff; + rp8 ^= (rp8 >> 16); + rp8 ^= (rp8 >> 8); + rp8 &= 0xff; + rp10 ^= (rp10 >> 16); + rp10 ^= (rp10 >> 8); + rp10 &= 0xff; + rp12 ^= (rp12 >> 16); + rp12 ^= (rp12 >> 8); + rp12 &= 0xff; + rp14 ^= (rp14 >> 16); + rp14 ^= (rp14 >> 8); + rp14 &= 0xff; + if (eccsize_mult == 2) { + rp16 ^= (rp16 >> 16); + rp16 ^= (rp16 >> 8); + rp16 &= 0xff; + } + + /* + * we also need to calculate the row parity for rp0..rp3 + * This is present in par, because par is now + * rp3 rp3 rp2 rp2 in little endian and + * rp2 rp2 rp3 rp3 in big endian + * as well as + * rp1 rp0 rp1 rp0 in little endian and + * rp0 rp1 rp0 rp1 in big endian + * First calculate rp2 and rp3 + */ +#ifdef __BIG_ENDIAN + rp2 = (par >> 16); + rp2 ^= (rp2 >> 8); + rp2 &= 0xff; + rp3 = par & 0xffff; + rp3 ^= (rp3 >> 8); + rp3 &= 0xff; #else - ecc_code[0] = ~tmp1; - ecc_code[1] = ~tmp2; + rp3 = (par >> 16); + rp3 ^= (rp3 >> 8); + rp3 &= 0xff; + rp2 = par & 0xffff; + rp2 ^= (rp2 >> 8); + rp2 &= 0xff; #endif - ecc_code[2] = ((~reg1) << 2) | 0x03; - return 0; -} -EXPORT_SYMBOL(nand_calculate_ecc); + /* reduce par to 16 bits then calculate rp1 and rp0 */ + par ^= (par >> 16); +#ifdef __BIG_ENDIAN + rp0 = (par >> 8) & 0xff; + rp1 = (par & 0xff); +#else + rp1 = (par >> 8) & 0xff; + rp0 = (par & 0xff); +#endif -static inline int countbits(uint32_t byte) -{ - int res = 0; + /* finally reduce par to 8 bits */ + par ^= (par >> 8); + par &= 0xff; - for (;byte; byte >>= 1) - res += byte & 0x01; - return res; + /* + * and calculate rp5..rp15..rp17 + * note that par = rp4 ^ rp5 and due to the commutative property + * of the ^ operator we can say: + * rp5 = (par ^ rp4); + * The & 0xff seems superfluous, but benchmarking learned that + * leaving it out gives slightly worse results. No idea why, probably + * it has to do with the way the pipeline in pentium is organized. + */ + rp5 = (par ^ rp4) & 0xff; + rp7 = (par ^ rp6) & 0xff; + rp9 = (par ^ rp8) & 0xff; + rp11 = (par ^ rp10) & 0xff; + rp13 = (par ^ rp12) & 0xff; + rp15 = (par ^ rp14) & 0xff; + if (eccsize_mult == 2) + rp17 = (par ^ rp16) & 0xff; + + /* + * Finally calculate the ecc bits. + * Again here it might seem that there are performance optimisations + * possible, but benchmarks showed that on the system this is developed + * the code below is the fastest + */ +#ifdef CONFIG_MTD_NAND_ECC_SMC + code[0] = + (invparity[rp7] << 7) | + (invparity[rp6] << 6) | + (invparity[rp5] << 5) | + (invparity[rp4] << 4) | + (invparity[rp3] << 3) | + (invparity[rp2] << 2) | + (invparity[rp1] << 1) | + (invparity[rp0]); + code[1] = + (invparity[rp15] << 7) | + (invparity[rp14] << 6) | + (invparity[rp13] << 5) | + (invparity[rp12] << 4) | + (invparity[rp11] << 3) | + (invparity[rp10] << 2) | + (invparity[rp9] << 1) | + (invparity[rp8]); +#else + code[1] = + (invparity[rp7] << 7) | + (invparity[rp6] << 6) | + (invparity[rp5] << 5) | + (invparity[rp4] << 4) | + (invparity[rp3] << 3) | + (invparity[rp2] << 2) | + (invparity[rp1] << 1) | + (invparity[rp0]); + code[0] = + (invparity[rp15] << 7) | + (invparity[rp14] << 6) | + (invparity[rp13] << 5) | + (invparity[rp12] << 4) | + (invparity[rp11] << 3) | + (invparity[rp10] << 2) | + (invparity[rp9] << 1) | + (invparity[rp8]); +#endif + if (eccsize_mult == 1) + code[2] = + (invparity[par & 0xf0] << 7) | + (invparity[par & 0x0f] << 6) | + (invparity[par & 0xcc] << 5) | + (invparity[par & 0x33] << 4) | + (invparity[par & 0xaa] << 3) | + (invparity[par & 0x55] << 2) | + 3; + else + code[2] = + (invparity[par & 0xf0] << 7) | + (invparity[par & 0x0f] << 6) | + (invparity[par & 0xcc] << 5) | + (invparity[par & 0x33] << 4) | + (invparity[par & 0xaa] << 3) | + (invparity[par & 0x55] << 2) | + (invparity[rp17] << 1) | + (invparity[rp16] << 0); + return 0; } +EXPORT_SYMBOL(nand_calculate_ecc); /** * nand_correct_data - [NAND Interface] Detect and correct bit error(s) * @mtd: MTD block structure - * @dat: raw data read from the chip + * @buf: raw data read from the chip * @read_ecc: ECC from the chip * @calc_ecc: the ECC calculated from raw data * - * Detect and correct a 1 bit error for 256 byte block + * Detect and correct a 1 bit error for 256/512 byte block */ -int nand_correct_data(struct mtd_info *mtd, u_char *dat, - u_char *read_ecc, u_char *calc_ecc) +int nand_correct_data(struct mtd_info *mtd, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) { - uint8_t s0, s1, s2; + unsigned char b0, b1, b2; + unsigned char byte_addr, bit_addr; + /* 256 or 512 bytes/ecc */ + const uint32_t eccsize_mult = + (((struct nand_chip *)mtd->priv)->ecc.size) >> 8; + /* + * b0 to b2 indicate which bit is faulty (if any) + * we might need the xor result more than once, + * so keep them in a local var + */ #ifdef CONFIG_MTD_NAND_ECC_SMC - s0 = calc_ecc[0] ^ read_ecc[0]; - s1 = calc_ecc[1] ^ read_ecc[1]; - s2 = calc_ecc[2] ^ read_ecc[2]; + b0 = read_ecc[0] ^ calc_ecc[0]; + b1 = read_ecc[1] ^ calc_ecc[1]; #else - s1 = calc_ecc[0] ^ read_ecc[0]; - s0 = calc_ecc[1] ^ read_ecc[1]; - s2 = calc_ecc[2] ^ read_ecc[2]; + b0 = read_ecc[1] ^ calc_ecc[1]; + b1 = read_ecc[0] ^ calc_ecc[0]; #endif - if ((s0 | s1 | s2) == 0) - return 0; - - /* Check for a single bit error */ - if( ((s0 ^ (s0 >> 1)) & 0x55) == 0x55 && - ((s1 ^ (s1 >> 1)) & 0x55) == 0x55 && - ((s2 ^ (s2 >> 1)) & 0x54) == 0x54) { + b2 = read_ecc[2] ^ calc_ecc[2]; - uint32_t byteoffs, bitnum; + /* check if there are any bitfaults */ - byteoffs = (s1 << 0) & 0x80; - byteoffs |= (s1 << 1) & 0x40; - byteoffs |= (s1 << 2) & 0x20; - byteoffs |= (s1 << 3) & 0x10; + /* repeated if statements are slightly more efficient than switch ... */ + /* ordered in order of likelihood */ - byteoffs |= (s0 >> 4) & 0x08; - byteoffs |= (s0 >> 3) & 0x04; - byteoffs |= (s0 >> 2) & 0x02; - byteoffs |= (s0 >> 1) & 0x01; - - bitnum = (s2 >> 5) & 0x04; - bitnum |= (s2 >> 4) & 0x02; - bitnum |= (s2 >> 3) & 0x01; - - dat[byteoffs] ^= (1 << bitnum); + if ((b0 | b1 | b2) == 0) + return 0; /* no error */ + if ((((b0 ^ (b0 >> 1)) & 0x55) == 0x55) && + (((b1 ^ (b1 >> 1)) & 0x55) == 0x55) && + ((eccsize_mult == 1 && ((b2 ^ (b2 >> 1)) & 0x54) == 0x54) || + (eccsize_mult == 2 && ((b2 ^ (b2 >> 1)) & 0x55) == 0x55))) { + /* single bit error */ + /* + * rp17/rp15/13/11/9/7/5/3/1 indicate which byte is the faulty + * byte, cp 5/3/1 indicate the faulty bit. + * A lookup table (called addressbits) is used to filter + * the bits from the byte they are in. + * A marginal optimisation is possible by having three + * different lookup tables. + * One as we have now (for b0), one for b2 + * (that would avoid the >> 1), and one for b1 (with all values + * << 4). However it was felt that introducing two more tables + * hardly justify the gain. + * + * The b2 shift is there to get rid of the lowest two bits. + * We could also do addressbits[b2] >> 1 but for the + * performace it does not make any difference + */ + if (eccsize_mult == 1) + byte_addr = (addressbits[b1] << 4) + addressbits[b0]; + else + byte_addr = (addressbits[b2 & 0x3] << 8) + + (addressbits[b1] << 4) + addressbits[b0]; + bit_addr = addressbits[b2 >> 2]; + /* flip the bit */ + buf[byte_addr] ^= (1 << bit_addr); return 1; - } - if(countbits(s0 | ((uint32_t)s1 << 8) | ((uint32_t)s2 <<16)) == 1) - return 1; + } + /* count nr of bits; use table lookup, faster than calculating it */ + if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1) + return 1; /* error in ecc data; no action needed */ - return -EBADMSG; + printk(KERN_ERR "uncorrectable error : "); + return -1; } EXPORT_SYMBOL(nand_correct_data); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Steven J. Hill <sjhill@realitydiluted.com>"); +MODULE_AUTHOR("Frans Meulenbroeks <fransmeulenbroeks@gmail.com>"); MODULE_DESCRIPTION("Generic NAND ECC support"); diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index 556e8131ecdc..ae7c57781a68 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -38,7 +38,6 @@ #include <linux/delay.h> #include <linux/list.h> #include <linux/random.h> -#include <asm/div64.h> /* Default simulator parameters values */ #if !defined(CONFIG_NANDSIM_FIRST_ID_BYTE) || \ diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index a64ad15b8fdd..c0fa9c9edf08 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -115,55 +115,11 @@ enum { STATE_PIO_WRITING, }; -struct pxa3xx_nand_timing { - unsigned int tCH; /* Enable signal hold time */ - unsigned int tCS; /* Enable signal setup time */ - unsigned int tWH; /* ND_nWE high duration */ - unsigned int tWP; /* ND_nWE pulse time */ - unsigned int tRH; /* ND_nRE high duration */ - unsigned int tRP; /* ND_nRE pulse width */ - unsigned int tR; /* ND_nWE high to ND_nRE low for read */ - unsigned int tWHR; /* ND_nWE high to ND_nRE low for status read */ - unsigned int tAR; /* ND_ALE low to ND_nRE low delay */ -}; - -struct pxa3xx_nand_cmdset { - uint16_t read1; - uint16_t read2; - uint16_t program; - uint16_t read_status; - uint16_t read_id; - uint16_t erase; - uint16_t reset; - uint16_t lock; - uint16_t unlock; - uint16_t lock_status; -}; - -struct pxa3xx_nand_flash { - struct pxa3xx_nand_timing *timing; /* NAND Flash timing */ - struct pxa3xx_nand_cmdset *cmdset; - - uint32_t page_per_block;/* Pages per block (PG_PER_BLK) */ - uint32_t page_size; /* Page size in bytes (PAGE_SZ) */ - uint32_t flash_width; /* Width of Flash memory (DWIDTH_M) */ - uint32_t dfc_width; /* Width of flash controller(DWIDTH_C) */ - uint32_t num_blocks; /* Number of physical blocks in Flash */ - uint32_t chip_id; - - /* NOTE: these are automatically calculated, do not define */ - size_t oob_size; - size_t read_id_bytes; - - unsigned int col_addr_cycles; - unsigned int row_addr_cycles; -}; - struct pxa3xx_nand_info { struct nand_chip nand_chip; struct platform_device *pdev; - struct pxa3xx_nand_flash *flash_info; + const struct pxa3xx_nand_flash *flash_info; struct clk *clk; void __iomem *mmio_base; @@ -202,12 +158,20 @@ struct pxa3xx_nand_info { uint32_t ndcb0; uint32_t ndcb1; uint32_t ndcb2; + + /* calculated from pxa3xx_nand_flash data */ + size_t oob_size; + size_t read_id_bytes; + + unsigned int col_addr_cycles; + unsigned int row_addr_cycles; }; static int use_dma = 1; module_param(use_dma, bool, 0444); MODULE_PARM_DESC(use_dma, "enable DMA for data transfering to/from NAND HW"); +#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN static struct pxa3xx_nand_cmdset smallpage_cmdset = { .read1 = 0x0000, .read2 = 0x0050, @@ -291,11 +255,35 @@ static struct pxa3xx_nand_flash micron1GbX16 = { .chip_id = 0xb12c, }; +static struct pxa3xx_nand_timing stm2GbX16_timing = { + .tCH = 10, + .tCS = 35, + .tWH = 15, + .tWP = 25, + .tRH = 15, + .tRP = 25, + .tR = 25000, + .tWHR = 60, + .tAR = 10, +}; + +static struct pxa3xx_nand_flash stm2GbX16 = { + .timing = &stm2GbX16_timing, + .page_per_block = 64, + .page_size = 2048, + .flash_width = 16, + .dfc_width = 16, + .num_blocks = 2048, + .chip_id = 0xba20, +}; + static struct pxa3xx_nand_flash *builtin_flash_types[] = { &samsung512MbX16, µn1GbX8, µn1GbX16, + &stm2GbX16, }; +#endif /* CONFIG_MTD_NAND_PXA3xx_BUILTIN */ #define NDTR0_tCH(c) (min((c), 7) << 19) #define NDTR0_tCS(c) (min((c), 7) << 16) @@ -312,7 +300,7 @@ static struct pxa3xx_nand_flash *builtin_flash_types[] = { #define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) + 1) static void pxa3xx_nand_set_timing(struct pxa3xx_nand_info *info, - struct pxa3xx_nand_timing *t) + const struct pxa3xx_nand_timing *t) { unsigned long nand_clk = clk_get_rate(info->clk); uint32_t ndtr0, ndtr1; @@ -354,8 +342,8 @@ static int wait_for_event(struct pxa3xx_nand_info *info, uint32_t event) static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info, uint16_t cmd, int column, int page_addr) { - struct pxa3xx_nand_flash *f = info->flash_info; - struct pxa3xx_nand_cmdset *cmdset = f->cmdset; + const struct pxa3xx_nand_flash *f = info->flash_info; + const struct pxa3xx_nand_cmdset *cmdset = f->cmdset; /* calculate data size */ switch (f->page_size) { @@ -373,14 +361,14 @@ static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info, info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0); info->ndcb1 = 0; info->ndcb2 = 0; - info->ndcb0 |= NDCB0_ADDR_CYC(f->row_addr_cycles + f->col_addr_cycles); + info->ndcb0 |= NDCB0_ADDR_CYC(info->row_addr_cycles + info->col_addr_cycles); - if (f->col_addr_cycles == 2) { + if (info->col_addr_cycles == 2) { /* large block, 2 cycles for column address * row address starts from 3rd cycle */ info->ndcb1 |= (page_addr << 16) | (column & 0xffff); - if (f->row_addr_cycles == 3) + if (info->row_addr_cycles == 3) info->ndcb2 = (page_addr >> 16) & 0xff; } else /* small block, 1 cycles for column address @@ -406,7 +394,7 @@ static int prepare_erase_cmd(struct pxa3xx_nand_info *info, static int prepare_other_cmd(struct pxa3xx_nand_info *info, uint16_t cmd) { - struct pxa3xx_nand_cmdset *cmdset = info->flash_info->cmdset; + const struct pxa3xx_nand_cmdset *cmdset = info->flash_info->cmdset; info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0); info->ndcb1 = 0; @@ -641,8 +629,8 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command, int column, int page_addr) { struct pxa3xx_nand_info *info = mtd->priv; - struct pxa3xx_nand_flash *flash_info = info->flash_info; - struct pxa3xx_nand_cmdset *cmdset = flash_info->cmdset; + const struct pxa3xx_nand_flash *flash_info = info->flash_info; + const struct pxa3xx_nand_cmdset *cmdset = flash_info->cmdset; int ret; info->use_dma = (use_dma) ? 1 : 0; @@ -720,7 +708,7 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command, info->use_dma = 0; /* force PIO read */ info->buf_start = 0; info->buf_count = (command == NAND_CMD_READID) ? - flash_info->read_id_bytes : 1; + info->read_id_bytes : 1; if (prepare_other_cmd(info, (command == NAND_CMD_READID) ? cmdset->read_id : cmdset->read_status)) @@ -861,8 +849,8 @@ static int pxa3xx_nand_ecc_correct(struct mtd_info *mtd, static int __readid(struct pxa3xx_nand_info *info, uint32_t *id) { - struct pxa3xx_nand_flash *f = info->flash_info; - struct pxa3xx_nand_cmdset *cmdset = f->cmdset; + const struct pxa3xx_nand_flash *f = info->flash_info; + const struct pxa3xx_nand_cmdset *cmdset = f->cmdset; uint32_t ndcr; uint8_t id_buff[8]; @@ -891,7 +879,7 @@ fail_timeout: } static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info, - struct pxa3xx_nand_flash *f) + const struct pxa3xx_nand_flash *f) { struct platform_device *pdev = info->pdev; struct pxa3xx_nand_platform_data *pdata = pdev->dev.platform_data; @@ -904,25 +892,25 @@ static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info, return -EINVAL; /* calculate flash information */ - f->oob_size = (f->page_size == 2048) ? 64 : 16; - f->read_id_bytes = (f->page_size == 2048) ? 4 : 2; + info->oob_size = (f->page_size == 2048) ? 64 : 16; + info->read_id_bytes = (f->page_size == 2048) ? 4 : 2; /* calculate addressing information */ - f->col_addr_cycles = (f->page_size == 2048) ? 2 : 1; + info->col_addr_cycles = (f->page_size == 2048) ? 2 : 1; if (f->num_blocks * f->page_per_block > 65536) - f->row_addr_cycles = 3; + info->row_addr_cycles = 3; else - f->row_addr_cycles = 2; + info->row_addr_cycles = 2; ndcr |= (pdata->enable_arbiter) ? NDCR_ND_ARB_EN : 0; - ndcr |= (f->col_addr_cycles == 2) ? NDCR_RA_START : 0; + ndcr |= (info->col_addr_cycles == 2) ? NDCR_RA_START : 0; ndcr |= (f->page_per_block == 64) ? NDCR_PG_PER_BLK : 0; ndcr |= (f->page_size == 2048) ? NDCR_PAGE_SZ : 0; ndcr |= (f->flash_width == 16) ? NDCR_DWIDTH_M : 0; ndcr |= (f->dfc_width == 16) ? NDCR_DWIDTH_C : 0; - ndcr |= NDCR_RD_ID_CNT(f->read_id_bytes); + ndcr |= NDCR_RD_ID_CNT(info->read_id_bytes); ndcr |= NDCR_SPARE_EN; /* enable spare by default */ info->reg_ndcr = ndcr; @@ -932,12 +920,27 @@ static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info, return 0; } -static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info) +static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info, + const struct pxa3xx_nand_platform_data *pdata) { - struct pxa3xx_nand_flash *f; - uint32_t id; + const struct pxa3xx_nand_flash *f; + uint32_t id = -1; int i; + for (i = 0; i<pdata->num_flash; ++i) { + f = pdata->flash + i; + + if (pxa3xx_nand_config_flash(info, f)) + continue; + + if (__readid(info, &id)) + continue; + + if (id == f->chip_id) + return 0; + } + +#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN for (i = 0; i < ARRAY_SIZE(builtin_flash_types); i++) { f = builtin_flash_types[i]; @@ -951,7 +954,11 @@ static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info) if (id == f->chip_id) return 0; } +#endif + dev_warn(&info->pdev->dev, + "failed to detect configured nand flash; found %04x instead of\n", + id); return -ENODEV; } @@ -1014,7 +1021,7 @@ static struct nand_ecclayout hw_largepage_ecclayout = { static void pxa3xx_nand_init_mtd(struct mtd_info *mtd, struct pxa3xx_nand_info *info) { - struct pxa3xx_nand_flash *f = info->flash_info; + const struct pxa3xx_nand_flash *f = info->flash_info; struct nand_chip *this = &info->nand_chip; this->options = (f->flash_width == 16) ? NAND_BUSWIDTH_16: 0; @@ -1135,7 +1142,7 @@ static int pxa3xx_nand_probe(struct platform_device *pdev) goto fail_free_buf; } - ret = pxa3xx_nand_detect_flash(info); + ret = pxa3xx_nand_detect_flash(info, pdata); if (ret) { dev_err(&pdev->dev, "failed to detect flash\n"); ret = -ENODEV; diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c new file mode 100644 index 000000000000..821acb08ff1c --- /dev/null +++ b/drivers/mtd/nand/sh_flctl.c @@ -0,0 +1,878 @@ +/* + * SuperH FLCTL nand controller + * + * Copyright © 2008 Renesas Solutions Corp. + * Copyright © 2008 Atom Create Engineering Co., Ltd. + * + * Based on fsl_elbc_nand.c, Copyright © 2006-2007 Freescale Semiconductor + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/platform_device.h> + +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> +#include <linux/mtd/partitions.h> +#include <linux/mtd/sh_flctl.h> + +static struct nand_ecclayout flctl_4secc_oob_16 = { + .eccbytes = 10, + .eccpos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + .oobfree = { + {.offset = 12, + . length = 4} }, +}; + +static struct nand_ecclayout flctl_4secc_oob_64 = { + .eccbytes = 10, + .eccpos = {48, 49, 50, 51, 52, 53, 54, 55, 56, 57}, + .oobfree = { + {.offset = 60, + . length = 4} }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr flctl_4secc_smallpage = { + .options = NAND_BBT_SCAN2NDPAGE, + .offs = 11, + .len = 1, + .pattern = scan_ff_pattern, +}; + +static struct nand_bbt_descr flctl_4secc_largepage = { + .options = 0, + .offs = 58, + .len = 2, + .pattern = scan_ff_pattern, +}; + +static void empty_fifo(struct sh_flctl *flctl) +{ + writel(0x000c0000, FLINTDMACR(flctl)); /* FIFO Clear */ + writel(0x00000000, FLINTDMACR(flctl)); /* Clear Error flags */ +} + +static void start_translation(struct sh_flctl *flctl) +{ + writeb(TRSTRT, FLTRCR(flctl)); +} + +static void wait_completion(struct sh_flctl *flctl) +{ + uint32_t timeout = LOOP_TIMEOUT_MAX; + + while (timeout--) { + if (readb(FLTRCR(flctl)) & TREND) { + writeb(0x0, FLTRCR(flctl)); + return; + } + udelay(1); + } + + printk(KERN_ERR "wait_completion(): Timeout occured \n"); + writeb(0x0, FLTRCR(flctl)); +} + +static void set_addr(struct mtd_info *mtd, int column, int page_addr) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + uint32_t addr = 0; + + if (column == -1) { + addr = page_addr; /* ERASE1 */ + } else if (page_addr != -1) { + /* SEQIN, READ0, etc.. */ + if (flctl->page_size) { + addr = column & 0x0FFF; + addr |= (page_addr & 0xff) << 16; + addr |= ((page_addr >> 8) & 0xff) << 24; + /* big than 128MB */ + if (flctl->rw_ADRCNT == ADRCNT2_E) { + uint32_t addr2; + addr2 = (page_addr >> 16) & 0xff; + writel(addr2, FLADR2(flctl)); + } + } else { + addr = column; + addr |= (page_addr & 0xff) << 8; + addr |= ((page_addr >> 8) & 0xff) << 16; + addr |= ((page_addr >> 16) & 0xff) << 24; + } + } + writel(addr, FLADR(flctl)); +} + +static void wait_rfifo_ready(struct sh_flctl *flctl) +{ + uint32_t timeout = LOOP_TIMEOUT_MAX; + + while (timeout--) { + uint32_t val; + /* check FIFO */ + val = readl(FLDTCNTR(flctl)) >> 16; + if (val & 0xFF) + return; + udelay(1); + } + printk(KERN_ERR "wait_rfifo_ready(): Timeout occured \n"); +} + +static void wait_wfifo_ready(struct sh_flctl *flctl) +{ + uint32_t len, timeout = LOOP_TIMEOUT_MAX; + + while (timeout--) { + /* check FIFO */ + len = (readl(FLDTCNTR(flctl)) >> 16) & 0xFF; + if (len >= 4) + return; + udelay(1); + } + printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n"); +} + +static int wait_recfifo_ready(struct sh_flctl *flctl) +{ + uint32_t timeout = LOOP_TIMEOUT_MAX; + int checked[4]; + void __iomem *ecc_reg[4]; + int i; + uint32_t data, size; + + memset(checked, 0, sizeof(checked)); + + while (timeout--) { + size = readl(FLDTCNTR(flctl)) >> 24; + if (size & 0xFF) + return 0; /* success */ + + if (readl(FL4ECCCR(flctl)) & _4ECCFA) + return 1; /* can't correct */ + + udelay(1); + if (!(readl(FL4ECCCR(flctl)) & _4ECCEND)) + continue; + + /* start error correction */ + ecc_reg[0] = FL4ECCRESULT0(flctl); + ecc_reg[1] = FL4ECCRESULT1(flctl); + ecc_reg[2] = FL4ECCRESULT2(flctl); + ecc_reg[3] = FL4ECCRESULT3(flctl); + + for (i = 0; i < 3; i++) { + data = readl(ecc_reg[i]); + if (data != INIT_FL4ECCRESULT_VAL && !checked[i]) { + uint8_t org; + int index; + + index = data >> 16; + org = flctl->done_buff[index]; + flctl->done_buff[index] = org ^ (data & 0xFF); + checked[i] = 1; + } + } + + writel(0, FL4ECCCR(flctl)); + } + + printk(KERN_ERR "wait_recfifo_ready(): Timeout occured \n"); + return 1; /* timeout */ +} + +static void wait_wecfifo_ready(struct sh_flctl *flctl) +{ + uint32_t timeout = LOOP_TIMEOUT_MAX; + uint32_t len; + + while (timeout--) { + /* check FLECFIFO */ + len = (readl(FLDTCNTR(flctl)) >> 24) & 0xFF; + if (len >= 4) + return; + udelay(1); + } + printk(KERN_ERR "wait_wecfifo_ready(): Timeout occured \n"); +} + +static void read_datareg(struct sh_flctl *flctl, int offset) +{ + unsigned long data; + unsigned long *buf = (unsigned long *)&flctl->done_buff[offset]; + + wait_completion(flctl); + + data = readl(FLDATAR(flctl)); + *buf = le32_to_cpu(data); +} + +static void read_fiforeg(struct sh_flctl *flctl, int rlen, int offset) +{ + int i, len_4align; + unsigned long *buf = (unsigned long *)&flctl->done_buff[offset]; + void *fifo_addr = (void *)FLDTFIFO(flctl); + + len_4align = (rlen + 3) / 4; + + for (i = 0; i < len_4align; i++) { + wait_rfifo_ready(flctl); + buf[i] = readl(fifo_addr); + buf[i] = be32_to_cpu(buf[i]); + } +} + +static int read_ecfiforeg(struct sh_flctl *flctl, uint8_t *buff) +{ + int i; + unsigned long *ecc_buf = (unsigned long *)buff; + void *fifo_addr = (void *)FLECFIFO(flctl); + + for (i = 0; i < 4; i++) { + if (wait_recfifo_ready(flctl)) + return 1; + ecc_buf[i] = readl(fifo_addr); + ecc_buf[i] = be32_to_cpu(ecc_buf[i]); + } + + return 0; +} + +static void write_fiforeg(struct sh_flctl *flctl, int rlen, int offset) +{ + int i, len_4align; + unsigned long *data = (unsigned long *)&flctl->done_buff[offset]; + void *fifo_addr = (void *)FLDTFIFO(flctl); + + len_4align = (rlen + 3) / 4; + for (i = 0; i < len_4align; i++) { + wait_wfifo_ready(flctl); + writel(cpu_to_be32(data[i]), fifo_addr); + } +} + +static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_val) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + uint32_t flcmncr_val = readl(FLCMNCR(flctl)); + uint32_t flcmdcr_val, addr_len_bytes = 0; + + /* Set SNAND bit if page size is 2048byte */ + if (flctl->page_size) + flcmncr_val |= SNAND_E; + else + flcmncr_val &= ~SNAND_E; + + /* default FLCMDCR val */ + flcmdcr_val = DOCMD1_E | DOADR_E; + + /* Set for FLCMDCR */ + switch (cmd) { + case NAND_CMD_ERASE1: + addr_len_bytes = flctl->erase_ADRCNT; + flcmdcr_val |= DOCMD2_E; + break; + case NAND_CMD_READ0: + case NAND_CMD_READOOB: + addr_len_bytes = flctl->rw_ADRCNT; + flcmdcr_val |= CDSRC_E; + break; + case NAND_CMD_SEQIN: + /* This case is that cmd is READ0 or READ1 or READ00 */ + flcmdcr_val &= ~DOADR_E; /* ONLY execute 1st cmd */ + break; + case NAND_CMD_PAGEPROG: + addr_len_bytes = flctl->rw_ADRCNT; + flcmdcr_val |= DOCMD2_E | CDSRC_E | SELRW; + break; + case NAND_CMD_READID: + flcmncr_val &= ~SNAND_E; + addr_len_bytes = ADRCNT_1; + break; + case NAND_CMD_STATUS: + case NAND_CMD_RESET: + flcmncr_val &= ~SNAND_E; + flcmdcr_val &= ~(DOADR_E | DOSR_E); + break; + default: + break; + } + + /* Set address bytes parameter */ + flcmdcr_val |= addr_len_bytes; + + /* Now actually write */ + writel(flcmncr_val, FLCMNCR(flctl)); + writel(flcmdcr_val, FLCMDCR(flctl)); + writel(flcmcdr_val, FLCMCDR(flctl)); +} + +static int flctl_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, + uint8_t *buf) +{ + int i, eccsize = chip->ecc.size; + int eccbytes = chip->ecc.bytes; + int eccsteps = chip->ecc.steps; + uint8_t *p = buf; + struct sh_flctl *flctl = mtd_to_flctl(mtd); + + for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) + chip->read_buf(mtd, p, eccsize); + + for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) { + if (flctl->hwecc_cant_correct[i]) + mtd->ecc_stats.failed++; + else + mtd->ecc_stats.corrected += 0; + } + + return 0; +} + +static void flctl_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, + const uint8_t *buf) +{ + int i, eccsize = chip->ecc.size; + int eccbytes = chip->ecc.bytes; + int eccsteps = chip->ecc.steps; + const uint8_t *p = buf; + + for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) + chip->write_buf(mtd, p, eccsize); +} + +static void execmd_read_page_sector(struct mtd_info *mtd, int page_addr) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int sector, page_sectors; + + if (flctl->page_size) + page_sectors = 4; + else + page_sectors = 1; + + writel(readl(FLCMNCR(flctl)) | ACM_SACCES_MODE | _4ECCCORRECT, + FLCMNCR(flctl)); + + set_cmd_regs(mtd, NAND_CMD_READ0, + (NAND_CMD_READSTART << 8) | NAND_CMD_READ0); + + for (sector = 0; sector < page_sectors; sector++) { + int ret; + + empty_fifo(flctl); + writel(readl(FLCMDCR(flctl)) | 1, FLCMDCR(flctl)); + writel(page_addr << 2 | sector, FLADR(flctl)); + + start_translation(flctl); + read_fiforeg(flctl, 512, 512 * sector); + + ret = read_ecfiforeg(flctl, + &flctl->done_buff[mtd->writesize + 16 * sector]); + + if (ret) + flctl->hwecc_cant_correct[sector] = 1; + + writel(0x0, FL4ECCCR(flctl)); + wait_completion(flctl); + } + writel(readl(FLCMNCR(flctl)) & ~(ACM_SACCES_MODE | _4ECCCORRECT), + FLCMNCR(flctl)); +} + +static void execmd_read_oob(struct mtd_info *mtd, int page_addr) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + + set_cmd_regs(mtd, NAND_CMD_READ0, + (NAND_CMD_READSTART << 8) | NAND_CMD_READ0); + + empty_fifo(flctl); + if (flctl->page_size) { + int i; + /* In case that the page size is 2k */ + for (i = 0; i < 16 * 3; i++) + flctl->done_buff[i] = 0xFF; + + set_addr(mtd, 3 * 528 + 512, page_addr); + writel(16, FLDTCNTR(flctl)); + + start_translation(flctl); + read_fiforeg(flctl, 16, 16 * 3); + wait_completion(flctl); + } else { + /* In case that the page size is 512b */ + set_addr(mtd, 512, page_addr); + writel(16, FLDTCNTR(flctl)); + + start_translation(flctl); + read_fiforeg(flctl, 16, 0); + wait_completion(flctl); + } +} + +static void execmd_write_page_sector(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int i, page_addr = flctl->seqin_page_addr; + int sector, page_sectors; + + if (flctl->page_size) + page_sectors = 4; + else + page_sectors = 1; + + writel(readl(FLCMNCR(flctl)) | ACM_SACCES_MODE, FLCMNCR(flctl)); + + set_cmd_regs(mtd, NAND_CMD_PAGEPROG, + (NAND_CMD_PAGEPROG << 8) | NAND_CMD_SEQIN); + + for (sector = 0; sector < page_sectors; sector++) { + empty_fifo(flctl); + writel(readl(FLCMDCR(flctl)) | 1, FLCMDCR(flctl)); + writel(page_addr << 2 | sector, FLADR(flctl)); + + start_translation(flctl); + write_fiforeg(flctl, 512, 512 * sector); + + for (i = 0; i < 4; i++) { + wait_wecfifo_ready(flctl); /* wait for write ready */ + writel(0xFFFFFFFF, FLECFIFO(flctl)); + } + wait_completion(flctl); + } + + writel(readl(FLCMNCR(flctl)) & ~ACM_SACCES_MODE, FLCMNCR(flctl)); +} + +static void execmd_write_oob(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int page_addr = flctl->seqin_page_addr; + int sector, page_sectors; + + if (flctl->page_size) { + sector = 3; + page_sectors = 4; + } else { + sector = 0; + page_sectors = 1; + } + + set_cmd_regs(mtd, NAND_CMD_PAGEPROG, + (NAND_CMD_PAGEPROG << 8) | NAND_CMD_SEQIN); + + for (; sector < page_sectors; sector++) { + empty_fifo(flctl); + set_addr(mtd, sector * 528 + 512, page_addr); + writel(16, FLDTCNTR(flctl)); /* set read size */ + + start_translation(flctl); + write_fiforeg(flctl, 16, 16 * sector); + wait_completion(flctl); + } +} + +static void flctl_cmdfunc(struct mtd_info *mtd, unsigned int command, + int column, int page_addr) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + uint32_t read_cmd = 0; + + flctl->read_bytes = 0; + if (command != NAND_CMD_PAGEPROG) + flctl->index = 0; + + switch (command) { + case NAND_CMD_READ1: + case NAND_CMD_READ0: + if (flctl->hwecc) { + /* read page with hwecc */ + execmd_read_page_sector(mtd, page_addr); + break; + } + empty_fifo(flctl); + if (flctl->page_size) + set_cmd_regs(mtd, command, (NAND_CMD_READSTART << 8) + | command); + else + set_cmd_regs(mtd, command, command); + + set_addr(mtd, 0, page_addr); + + flctl->read_bytes = mtd->writesize + mtd->oobsize; + flctl->index += column; + goto read_normal_exit; + + case NAND_CMD_READOOB: + if (flctl->hwecc) { + /* read page with hwecc */ + execmd_read_oob(mtd, page_addr); + break; + } + + empty_fifo(flctl); + if (flctl->page_size) { + set_cmd_regs(mtd, command, (NAND_CMD_READSTART << 8) + | NAND_CMD_READ0); + set_addr(mtd, mtd->writesize, page_addr); + } else { + set_cmd_regs(mtd, command, command); + set_addr(mtd, 0, page_addr); + } + flctl->read_bytes = mtd->oobsize; + goto read_normal_exit; + + case NAND_CMD_READID: + empty_fifo(flctl); + set_cmd_regs(mtd, command, command); + set_addr(mtd, 0, 0); + + flctl->read_bytes = 4; + writel(flctl->read_bytes, FLDTCNTR(flctl)); /* set read size */ + start_translation(flctl); + read_datareg(flctl, 0); /* read and end */ + break; + + case NAND_CMD_ERASE1: + flctl->erase1_page_addr = page_addr; + break; + + case NAND_CMD_ERASE2: + set_cmd_regs(mtd, NAND_CMD_ERASE1, + (command << 8) | NAND_CMD_ERASE1); + set_addr(mtd, -1, flctl->erase1_page_addr); + start_translation(flctl); + wait_completion(flctl); + break; + + case NAND_CMD_SEQIN: + if (!flctl->page_size) { + /* output read command */ + if (column >= mtd->writesize) { + column -= mtd->writesize; + read_cmd = NAND_CMD_READOOB; + } else if (column < 256) { + read_cmd = NAND_CMD_READ0; + } else { + column -= 256; + read_cmd = NAND_CMD_READ1; + } + } + flctl->seqin_column = column; + flctl->seqin_page_addr = page_addr; + flctl->seqin_read_cmd = read_cmd; + break; + + case NAND_CMD_PAGEPROG: + empty_fifo(flctl); + if (!flctl->page_size) { + set_cmd_regs(mtd, NAND_CMD_SEQIN, + flctl->seqin_read_cmd); + set_addr(mtd, -1, -1); + writel(0, FLDTCNTR(flctl)); /* set 0 size */ + start_translation(flctl); + wait_completion(flctl); + } + if (flctl->hwecc) { + /* write page with hwecc */ + if (flctl->seqin_column == mtd->writesize) + execmd_write_oob(mtd); + else if (!flctl->seqin_column) + execmd_write_page_sector(mtd); + else + printk(KERN_ERR "Invalid address !?\n"); + break; + } + set_cmd_regs(mtd, command, (command << 8) | NAND_CMD_SEQIN); + set_addr(mtd, flctl->seqin_column, flctl->seqin_page_addr); + writel(flctl->index, FLDTCNTR(flctl)); /* set write size */ + start_translation(flctl); + write_fiforeg(flctl, flctl->index, 0); + wait_completion(flctl); + break; + + case NAND_CMD_STATUS: + set_cmd_regs(mtd, command, command); + set_addr(mtd, -1, -1); + + flctl->read_bytes = 1; + writel(flctl->read_bytes, FLDTCNTR(flctl)); /* set read size */ + start_translation(flctl); + read_datareg(flctl, 0); /* read and end */ + break; + + case NAND_CMD_RESET: + set_cmd_regs(mtd, command, command); + set_addr(mtd, -1, -1); + + writel(0, FLDTCNTR(flctl)); /* set 0 size */ + start_translation(flctl); + wait_completion(flctl); + break; + + default: + break; + } + return; + +read_normal_exit: + writel(flctl->read_bytes, FLDTCNTR(flctl)); /* set read size */ + start_translation(flctl); + read_fiforeg(flctl, flctl->read_bytes, 0); + wait_completion(flctl); + return; +} + +static void flctl_select_chip(struct mtd_info *mtd, int chipnr) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + uint32_t flcmncr_val = readl(FLCMNCR(flctl)); + + switch (chipnr) { + case -1: + flcmncr_val &= ~CE0_ENABLE; + writel(flcmncr_val, FLCMNCR(flctl)); + break; + case 0: + flcmncr_val |= CE0_ENABLE; + writel(flcmncr_val, FLCMNCR(flctl)); + break; + default: + BUG(); + } +} + +static void flctl_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int i, index = flctl->index; + + for (i = 0; i < len; i++) + flctl->done_buff[index + i] = buf[i]; + flctl->index += len; +} + +static uint8_t flctl_read_byte(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + int index = flctl->index; + uint8_t data; + + data = flctl->done_buff[index]; + flctl->index++; + return data; +} + +static void flctl_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + buf[i] = flctl_read_byte(mtd); +} + +static int flctl_verify_buf(struct mtd_info *mtd, const u_char *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != flctl_read_byte(mtd)) + return -EFAULT; + return 0; +} + +static void flctl_register_init(struct sh_flctl *flctl, unsigned long val) +{ + writel(val, FLCMNCR(flctl)); +} + +static int flctl_chip_init_tail(struct mtd_info *mtd) +{ + struct sh_flctl *flctl = mtd_to_flctl(mtd); + struct nand_chip *chip = &flctl->chip; + + if (mtd->writesize == 512) { + flctl->page_size = 0; + if (chip->chipsize > (32 << 20)) { + /* big than 32MB */ + flctl->rw_ADRCNT = ADRCNT_4; + flctl->erase_ADRCNT = ADRCNT_3; + } else if (chip->chipsize > (2 << 16)) { + /* big than 128KB */ + flctl->rw_ADRCNT = ADRCNT_3; + flctl->erase_ADRCNT = ADRCNT_2; + } else { + flctl->rw_ADRCNT = ADRCNT_2; + flctl->erase_ADRCNT = ADRCNT_1; + } + } else { + flctl->page_size = 1; + if (chip->chipsize > (128 << 20)) { + /* big than 128MB */ + flctl->rw_ADRCNT = ADRCNT2_E; + flctl->erase_ADRCNT = ADRCNT_3; + } else if (chip->chipsize > (8 << 16)) { + /* big than 512KB */ + flctl->rw_ADRCNT = ADRCNT_4; + flctl->erase_ADRCNT = ADRCNT_2; + } else { + flctl->rw_ADRCNT = ADRCNT_3; + flctl->erase_ADRCNT = ADRCNT_1; + } + } + + if (flctl->hwecc) { + if (mtd->writesize == 512) { + chip->ecc.layout = &flctl_4secc_oob_16; + chip->badblock_pattern = &flctl_4secc_smallpage; + } else { + chip->ecc.layout = &flctl_4secc_oob_64; + chip->badblock_pattern = &flctl_4secc_largepage; + } + + chip->ecc.size = 512; + chip->ecc.bytes = 10; + chip->ecc.read_page = flctl_read_page_hwecc; + chip->ecc.write_page = flctl_write_page_hwecc; + chip->ecc.mode = NAND_ECC_HW; + + /* 4 symbols ECC enabled */ + writel(readl(FLCMNCR(flctl)) | _4ECCEN | ECCPOS2 | ECCPOS_02, + FLCMNCR(flctl)); + } else { + chip->ecc.mode = NAND_ECC_SOFT; + } + + return 0; +} + +static int __init flctl_probe(struct platform_device *pdev) +{ + struct resource *res; + struct sh_flctl *flctl; + struct mtd_info *flctl_mtd; + struct nand_chip *nand; + struct sh_flctl_platform_data *pdata; + int ret; + + pdata = pdev->dev.platform_data; + if (pdata == NULL) { + printk(KERN_ERR "sh_flctl platform_data not found.\n"); + return -ENODEV; + } + + flctl = kzalloc(sizeof(struct sh_flctl), GFP_KERNEL); + if (!flctl) { + printk(KERN_ERR "Unable to allocate NAND MTD dev structure.\n"); + return -ENOMEM; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + printk(KERN_ERR "%s: resource not found.\n", __func__); + ret = -ENODEV; + goto err; + } + + flctl->reg = ioremap(res->start, res->end - res->start + 1); + if (flctl->reg == NULL) { + printk(KERN_ERR "%s: ioremap error.\n", __func__); + ret = -ENOMEM; + goto err; + } + + platform_set_drvdata(pdev, flctl); + flctl_mtd = &flctl->mtd; + nand = &flctl->chip; + flctl_mtd->priv = nand; + flctl->hwecc = pdata->has_hwecc; + + flctl_register_init(flctl, pdata->flcmncr_val); + + nand->options = NAND_NO_AUTOINCR; + + /* Set address of hardware control function */ + /* 20 us command delay time */ + nand->chip_delay = 20; + + nand->read_byte = flctl_read_byte; + nand->write_buf = flctl_write_buf; + nand->read_buf = flctl_read_buf; + nand->verify_buf = flctl_verify_buf; + nand->select_chip = flctl_select_chip; + nand->cmdfunc = flctl_cmdfunc; + + ret = nand_scan_ident(flctl_mtd, 1); + if (ret) + goto err; + + ret = flctl_chip_init_tail(flctl_mtd); + if (ret) + goto err; + + ret = nand_scan_tail(flctl_mtd); + if (ret) + goto err; + + add_mtd_partitions(flctl_mtd, pdata->parts, pdata->nr_parts); + + return 0; + +err: + kfree(flctl); + return ret; +} + +static int __exit flctl_remove(struct platform_device *pdev) +{ + struct sh_flctl *flctl = platform_get_drvdata(pdev); + + nand_release(&flctl->mtd); + kfree(flctl); + + return 0; +} + +static struct platform_driver flctl_driver = { + .probe = flctl_probe, + .remove = flctl_remove, + .driver = { + .name = "sh_flctl", + .owner = THIS_MODULE, + }, +}; + +static int __init flctl_nand_init(void) +{ + return platform_driver_register(&flctl_driver); +} + +static void __exit flctl_nand_cleanup(void) +{ + platform_driver_unregister(&flctl_driver); +} + +module_init(flctl_nand_init); +module_exit(flctl_nand_cleanup); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yoshihiro Shimoda"); +MODULE_DESCRIPTION("SuperH FLCTL driver"); +MODULE_ALIAS("platform:sh_flctl"); diff --git a/drivers/mtd/nand/toto.c b/drivers/mtd/nand/toto.c deleted file mode 100644 index bbf492e6830d..000000000000 --- a/drivers/mtd/nand/toto.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * drivers/mtd/nand/toto.c - * - * Copyright (c) 2003 Texas Instruments - * - * Derived from drivers/mtd/autcpu12.c - * - * Copyright (c) 2002 Thomas Gleixner <tgxl@linutronix.de> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Overview: - * This is a device driver for the NAND flash device found on the - * TI fido board. It supports 32MiB and 64MiB cards - */ - -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <linux/mtd/mtd.h> -#include <linux/mtd/nand.h> -#include <linux/mtd/partitions.h> -#include <asm/io.h> -#include <asm/arch/hardware.h> -#include <asm/sizes.h> -#include <asm/arch/toto.h> -#include <asm/arch-omap1510/hardware.h> -#include <asm/arch/gpio.h> - -#define CONFIG_NAND_WORKAROUND 1 - -/* - * MTD structure for TOTO board - */ -static struct mtd_info *toto_mtd = NULL; - -static unsigned long toto_io_base = OMAP_FLASH_1_BASE; - -/* - * Define partitions for flash devices - */ - -static struct mtd_partition partition_info64M[] = { - { .name = "toto kernel partition 1", - .offset = 0, - .size = 2 * SZ_1M }, - { .name = "toto file sys partition 2", - .offset = 2 * SZ_1M, - .size = 14 * SZ_1M }, - { .name = "toto user partition 3", - .offset = 16 * SZ_1M, - .size = 16 * SZ_1M }, - { .name = "toto devboard extra partition 4", - .offset = 32 * SZ_1M, - .size = 32 * SZ_1M }, -}; - -static struct mtd_partition partition_info32M[] = { - { .name = "toto kernel partition 1", - .offset = 0, - .size = 2 * SZ_1M }, - { .name = "toto file sys partition 2", - .offset = 2 * SZ_1M, - .size = 14 * SZ_1M }, - { .name = "toto user partition 3", - .offset = 16 * SZ_1M, - .size = 16 * SZ_1M }, -}; - -#define NUM_PARTITIONS32M 3 -#define NUM_PARTITIONS64M 4 - -/* - * hardware specific access to control-lines - * - * ctrl: - * NAND_NCE: bit 0 -> bit 14 (0x4000) - * NAND_CLE: bit 1 -> bit 12 (0x1000) - * NAND_ALE: bit 2 -> bit 1 (0x0002) - */ -static void toto_hwcontrol(struct mtd_info *mtd, int cmd, - unsigned int ctrl) -{ - struct nand_chip *chip = mtd->priv; - - if (ctrl & NAND_CTRL_CHANGE) { - unsigned long bits; - - /* hopefully enough time for tc make proceding write to clear */ - udelay(1); - - bits = (~ctrl & NAND_NCE) << 14; - bits |= (ctrl & NAND_CLE) << 12; - bits |= (ctrl & NAND_ALE) >> 1; - -#warning Wild guess as gpiosetout() is nowhere defined in the kernel source - tglx - gpiosetout(0x5002, bits); - -#ifdef CONFIG_NAND_WORKAROUND - /* "some" dev boards busted, blue wired to rts2 :( */ - rts2setout(2, (ctrl & NAND_CLE) << 1); -#endif - /* allow time to ensure gpio state to over take memory write */ - udelay(1); - } - - if (cmd != NAND_CMD_NONE) - writeb(cmd, chip->IO_ADDR_W); -} - -/* - * Main initialization routine - */ -static int __init toto_init(void) -{ - struct nand_chip *this; - int err = 0; - - /* Allocate memory for MTD device structure and private data */ - toto_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL); - if (!toto_mtd) { - printk(KERN_WARNING "Unable to allocate toto NAND MTD device structure.\n"); - err = -ENOMEM; - goto out; - } - - /* Get pointer to private data */ - this = (struct nand_chip *)(&toto_mtd[1]); - - /* Initialize structures */ - memset(toto_mtd, 0, sizeof(struct mtd_info)); - memset(this, 0, sizeof(struct nand_chip)); - - /* Link the private data with the MTD structure */ - toto_mtd->priv = this; - toto_mtd->owner = THIS_MODULE; - - /* Set address of NAND IO lines */ - this->IO_ADDR_R = toto_io_base; - this->IO_ADDR_W = toto_io_base; - this->cmd_ctrl = toto_hwcontrol; - this->dev_ready = NULL; - /* 25 us command delay time */ - this->chip_delay = 30; - this->ecc.mode = NAND_ECC_SOFT; - - /* Scan to find existance of the device */ - if (nand_scan(toto_mtd, 1)) { - err = -ENXIO; - goto out_mtd; - } - - /* Register the partitions */ - switch (toto_mtd->size) { - case SZ_64M: - add_mtd_partitions(toto_mtd, partition_info64M, NUM_PARTITIONS64M); - break; - case SZ_32M: - add_mtd_partitions(toto_mtd, partition_info32M, NUM_PARTITIONS32M); - break; - default:{ - printk(KERN_WARNING "Unsupported Nand device\n"); - err = -ENXIO; - goto out_buf; - } - } - - gpioreserve(NAND_MASK); /* claim our gpios */ - archflashwp(0, 0); /* open up flash for writing */ - - goto out; - - out_mtd: - kfree(toto_mtd); - out: - return err; -} - -module_init(toto_init); - -/* - * Clean up routine - */ -static void __exit toto_cleanup(void) -{ - /* Release resources, unregister device */ - nand_release(toto_mtd); - - /* Free the MTD device structure */ - kfree(toto_mtd); - - /* stop flash writes */ - archflashwp(0, 1); - - /* release gpios to system */ - gpiorelease(NAND_MASK); -} - -module_exit(toto_cleanup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Richard Woodruff <r-woodruff2@ti.com>"); -MODULE_DESCRIPTION("Glue layer for NAND flash on toto board"); diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c index 4f80c2fd89af..9e45b3f39c0e 100644 --- a/drivers/mtd/ofpart.c +++ b/drivers/mtd/ofpart.c @@ -20,7 +20,6 @@ #include <linux/mtd/partitions.h> int __devinit of_mtd_parse_partitions(struct device *dev, - struct mtd_info *mtd, struct device_node *node, struct mtd_partition **pparts) { diff --git a/drivers/mtd/onenand/Kconfig b/drivers/mtd/onenand/Kconfig index cb41cbca64f7..79fa79e8f8de 100644 --- a/drivers/mtd/onenand/Kconfig +++ b/drivers/mtd/onenand/Kconfig @@ -27,8 +27,16 @@ config MTD_ONENAND_GENERIC help Support for OneNAND flash via platform device driver. +config MTD_ONENAND_OMAP2 + tristate "OneNAND on OMAP2/OMAP3 support" + depends on MTD_ONENAND && (ARCH_OMAP2 || ARCH_OMAP3) + help + Support for a OneNAND flash device connected to an OMAP2/OMAP3 CPU + via the GPMC memory controller. + config MTD_ONENAND_OTP bool "OneNAND OTP Support" + select HAVE_MTD_OTP help One Block of the NAND Flash Array memory is reserved as a One-Time Programmable Block memory area. diff --git a/drivers/mtd/onenand/Makefile b/drivers/mtd/onenand/Makefile index 4d2eacfd7e11..64b6cc61a520 100644 --- a/drivers/mtd/onenand/Makefile +++ b/drivers/mtd/onenand/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_MTD_ONENAND) += onenand.o # Board specific. obj-$(CONFIG_MTD_ONENAND_GENERIC) += generic.o +obj-$(CONFIG_MTD_ONENAND_OMAP2) += omap2.o # Simulator obj-$(CONFIG_MTD_ONENAND_SIM) += onenand_sim.o diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c new file mode 100644 index 000000000000..8387e05daae2 --- /dev/null +++ b/drivers/mtd/onenand/omap2.c @@ -0,0 +1,802 @@ +/* + * linux/drivers/mtd/onenand/omap2.c + * + * OneNAND driver for OMAP2 / OMAP3 + * + * Copyright © 2005-2006 Nokia Corporation + * + * Author: Jarkko Lavinen <jarkko.lavinen@nokia.com> and Juha Yrjölä + * IRQ and DMA support written by Timo Teras + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +#include <linux/device.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/onenand.h> +#include <linux/mtd/partitions.h> +#include <linux/platform_device.h> +#include <linux/interrupt.h> +#include <linux/delay.h> + +#include <asm/io.h> +#include <asm/mach/flash.h> +#include <asm/arch/gpmc.h> +#include <asm/arch/onenand.h> +#include <asm/arch/gpio.h> +#include <asm/arch/gpmc.h> +#include <asm/arch/pm.h> + +#include <linux/dma-mapping.h> +#include <asm/dma-mapping.h> +#include <asm/arch/dma.h> + +#include <asm/arch/board.h> + +#define DRIVER_NAME "omap2-onenand" + +#define ONENAND_IO_SIZE SZ_128K +#define ONENAND_BUFRAM_SIZE (1024 * 5) + +struct omap2_onenand { + struct platform_device *pdev; + int gpmc_cs; + unsigned long phys_base; + int gpio_irq; + struct mtd_info mtd; + struct mtd_partition *parts; + struct onenand_chip onenand; + struct completion irq_done; + struct completion dma_done; + int dma_channel; + int freq; + int (*setup)(void __iomem *base, int freq); +}; + +static void omap2_onenand_dma_cb(int lch, u16 ch_status, void *data) +{ + struct omap2_onenand *c = data; + + complete(&c->dma_done); +} + +static irqreturn_t omap2_onenand_interrupt(int irq, void *dev_id) +{ + struct omap2_onenand *c = dev_id; + + complete(&c->irq_done); + + return IRQ_HANDLED; +} + +static inline unsigned short read_reg(struct omap2_onenand *c, int reg) +{ + return readw(c->onenand.base + reg); +} + +static inline void write_reg(struct omap2_onenand *c, unsigned short value, + int reg) +{ + writew(value, c->onenand.base + reg); +} + +static void wait_err(char *msg, int state, unsigned int ctrl, unsigned int intr) +{ + printk(KERN_ERR "onenand_wait: %s! state %d ctrl 0x%04x intr 0x%04x\n", + msg, state, ctrl, intr); +} + +static void wait_warn(char *msg, int state, unsigned int ctrl, + unsigned int intr) +{ + printk(KERN_WARNING "onenand_wait: %s! state %d ctrl 0x%04x " + "intr 0x%04x\n", msg, state, ctrl, intr); +} + +static int omap2_onenand_wait(struct mtd_info *mtd, int state) +{ + struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd); + unsigned int intr = 0; + unsigned int ctrl; + unsigned long timeout; + u32 syscfg; + + if (state == FL_RESETING) { + int i; + + for (i = 0; i < 20; i++) { + udelay(1); + intr = read_reg(c, ONENAND_REG_INTERRUPT); + if (intr & ONENAND_INT_MASTER) + break; + } + ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS); + if (ctrl & ONENAND_CTRL_ERROR) { + wait_err("controller error", state, ctrl, intr); + return -EIO; + } + if (!(intr & ONENAND_INT_RESET)) { + wait_err("timeout", state, ctrl, intr); + return -EIO; + } + return 0; + } + + if (state != FL_READING) { + int result; + + /* Turn interrupts on */ + syscfg = read_reg(c, ONENAND_REG_SYS_CFG1); + if (!(syscfg & ONENAND_SYS_CFG1_IOBE)) { + syscfg |= ONENAND_SYS_CFG1_IOBE; + write_reg(c, syscfg, ONENAND_REG_SYS_CFG1); + if (cpu_is_omap34xx()) + /* Add a delay to let GPIO settle */ + syscfg = read_reg(c, ONENAND_REG_SYS_CFG1); + } + + INIT_COMPLETION(c->irq_done); + if (c->gpio_irq) { + result = omap_get_gpio_datain(c->gpio_irq); + if (result == -1) { + ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS); + intr = read_reg(c, ONENAND_REG_INTERRUPT); + wait_err("gpio error", state, ctrl, intr); + return -EIO; + } + } else + result = 0; + if (result == 0) { + int retry_cnt = 0; +retry: + result = wait_for_completion_timeout(&c->irq_done, + msecs_to_jiffies(20)); + if (result == 0) { + /* Timeout after 20ms */ + ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS); + if (ctrl & ONENAND_CTRL_ONGO) { + /* + * The operation seems to be still going + * so give it some more time. + */ + retry_cnt += 1; + if (retry_cnt < 3) + goto retry; + intr = read_reg(c, + ONENAND_REG_INTERRUPT); + wait_err("timeout", state, ctrl, intr); + return -EIO; + } + intr = read_reg(c, ONENAND_REG_INTERRUPT); + if ((intr & ONENAND_INT_MASTER) == 0) + wait_warn("timeout", state, ctrl, intr); + } + } + } else { + int retry_cnt = 0; + + /* Turn interrupts off */ + syscfg = read_reg(c, ONENAND_REG_SYS_CFG1); + syscfg &= ~ONENAND_SYS_CFG1_IOBE; + write_reg(c, syscfg, ONENAND_REG_SYS_CFG1); + + timeout = jiffies + msecs_to_jiffies(20); + while (1) { + if (time_before(jiffies, timeout)) { + intr = read_reg(c, ONENAND_REG_INTERRUPT); + if (intr & ONENAND_INT_MASTER) + break; + } else { + /* Timeout after 20ms */ + ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS); + if (ctrl & ONENAND_CTRL_ONGO) { + /* + * The operation seems to be still going + * so give it some more time. + */ + retry_cnt += 1; + if (retry_cnt < 3) { + timeout = jiffies + + msecs_to_jiffies(20); + continue; + } + } + break; + } + } + } + + intr = read_reg(c, ONENAND_REG_INTERRUPT); + ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS); + + if (intr & ONENAND_INT_READ) { + int ecc = read_reg(c, ONENAND_REG_ECC_STATUS); + + if (ecc) { + unsigned int addr1, addr8; + + addr1 = read_reg(c, ONENAND_REG_START_ADDRESS1); + addr8 = read_reg(c, ONENAND_REG_START_ADDRESS8); + if (ecc & ONENAND_ECC_2BIT_ALL) { + printk(KERN_ERR "onenand_wait: ECC error = " + "0x%04x, addr1 %#x, addr8 %#x\n", + ecc, addr1, addr8); + mtd->ecc_stats.failed++; + return -EBADMSG; + } else if (ecc & ONENAND_ECC_1BIT_ALL) { + printk(KERN_NOTICE "onenand_wait: correctable " + "ECC error = 0x%04x, addr1 %#x, " + "addr8 %#x\n", ecc, addr1, addr8); + mtd->ecc_stats.corrected++; + } + } + } else if (state == FL_READING) { + wait_err("timeout", state, ctrl, intr); + return -EIO; + } + + if (ctrl & ONENAND_CTRL_ERROR) { + wait_err("controller error", state, ctrl, intr); + if (ctrl & ONENAND_CTRL_LOCK) + printk(KERN_ERR "onenand_wait: " + "Device is write protected!!!\n"); + return -EIO; + } + + if (ctrl & 0xFE9F) + wait_warn("unexpected controller status", state, ctrl, intr); + + return 0; +} + +static inline int omap2_onenand_bufferram_offset(struct mtd_info *mtd, int area) +{ + struct onenand_chip *this = mtd->priv; + + if (ONENAND_CURRENT_BUFFERRAM(this)) { + if (area == ONENAND_DATARAM) + return mtd->writesize; + if (area == ONENAND_SPARERAM) + return mtd->oobsize; + } + + return 0; +} + +#if defined(CONFIG_ARCH_OMAP3) || defined(MULTI_OMAP2) + +static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area, + unsigned char *buffer, int offset, + size_t count) +{ + struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd); + struct onenand_chip *this = mtd->priv; + dma_addr_t dma_src, dma_dst; + int bram_offset; + unsigned long timeout; + void *buf = (void *)buffer; + size_t xtra; + volatile unsigned *done; + + bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset; + if (bram_offset & 3 || (size_t)buf & 3 || count < 384) + goto out_copy; + + if (buf >= high_memory) { + struct page *p1; + + if (((size_t)buf & PAGE_MASK) != + ((size_t)(buf + count - 1) & PAGE_MASK)) + goto out_copy; + p1 = vmalloc_to_page(buf); + if (!p1) + goto out_copy; + buf = page_address(p1) + ((size_t)buf & ~PAGE_MASK); + } + + xtra = count & 3; + if (xtra) { + count -= xtra; + memcpy(buf + count, this->base + bram_offset + count, xtra); + } + + dma_src = c->phys_base + bram_offset; + dma_dst = dma_map_single(&c->pdev->dev, buf, count, DMA_FROM_DEVICE); + if (dma_mapping_error(&c->pdev->dev, dma_dst)) { + dev_err(&c->pdev->dev, + "Couldn't DMA map a %d byte buffer\n", + count); + goto out_copy; + } + + omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32, + count >> 2, 1, 0, 0, 0); + omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_src, 0, 0); + omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_dst, 0, 0); + + INIT_COMPLETION(c->dma_done); + omap_start_dma(c->dma_channel); + + timeout = jiffies + msecs_to_jiffies(20); + done = &c->dma_done.done; + while (time_before(jiffies, timeout)) + if (*done) + break; + + dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_FROM_DEVICE); + + if (!*done) { + dev_err(&c->pdev->dev, "timeout waiting for DMA\n"); + goto out_copy; + } + + return 0; + +out_copy: + memcpy(buf, this->base + bram_offset, count); + return 0; +} + +static int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area, + const unsigned char *buffer, + int offset, size_t count) +{ + struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd); + struct onenand_chip *this = mtd->priv; + dma_addr_t dma_src, dma_dst; + int bram_offset; + unsigned long timeout; + void *buf = (void *)buffer; + volatile unsigned *done; + + bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset; + if (bram_offset & 3 || (size_t)buf & 3 || count < 384) + goto out_copy; + + /* panic_write() may be in an interrupt context */ + if (in_interrupt()) + goto out_copy; + + if (buf >= high_memory) { + struct page *p1; + + if (((size_t)buf & PAGE_MASK) != + ((size_t)(buf + count - 1) & PAGE_MASK)) + goto out_copy; + p1 = vmalloc_to_page(buf); + if (!p1) + goto out_copy; + buf = page_address(p1) + ((size_t)buf & ~PAGE_MASK); + } + + dma_src = dma_map_single(&c->pdev->dev, buf, count, DMA_TO_DEVICE); + dma_dst = c->phys_base + bram_offset; + if (dma_mapping_error(&c->pdev->dev, dma_dst)) { + dev_err(&c->pdev->dev, + "Couldn't DMA map a %d byte buffer\n", + count); + return -1; + } + + omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32, + count >> 2, 1, 0, 0, 0); + omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_src, 0, 0); + omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_dst, 0, 0); + + INIT_COMPLETION(c->dma_done); + omap_start_dma(c->dma_channel); + + timeout = jiffies + msecs_to_jiffies(20); + done = &c->dma_done.done; + while (time_before(jiffies, timeout)) + if (*done) + break; + + dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_TO_DEVICE); + + if (!*done) { + dev_err(&c->pdev->dev, "timeout waiting for DMA\n"); + goto out_copy; + } + + return 0; + +out_copy: + memcpy(this->base + bram_offset, buf, count); + return 0; +} + +#else + +int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area, + unsigned char *buffer, int offset, + size_t count); + +int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area, + const unsigned char *buffer, + int offset, size_t count); + +#endif + +#if defined(CONFIG_ARCH_OMAP2) || defined(MULTI_OMAP2) + +static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area, + unsigned char *buffer, int offset, + size_t count) +{ + struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd); + struct onenand_chip *this = mtd->priv; + dma_addr_t dma_src, dma_dst; + int bram_offset; + + bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset; + /* DMA is not used. Revisit PM requirements before enabling it. */ + if (1 || (c->dma_channel < 0) || + ((void *) buffer >= (void *) high_memory) || (bram_offset & 3) || + (((unsigned int) buffer) & 3) || (count < 1024) || (count & 3)) { + memcpy(buffer, (__force void *)(this->base + bram_offset), + count); + return 0; + } + + dma_src = c->phys_base + bram_offset; + dma_dst = dma_map_single(&c->pdev->dev, buffer, count, + DMA_FROM_DEVICE); + if (dma_mapping_error(&c->pdev->dev, dma_dst)) { + dev_err(&c->pdev->dev, + "Couldn't DMA map a %d byte buffer\n", + count); + return -1; + } + + omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32, + count / 4, 1, 0, 0, 0); + omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_src, 0, 0); + omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_dst, 0, 0); + + INIT_COMPLETION(c->dma_done); + omap_start_dma(c->dma_channel); + wait_for_completion(&c->dma_done); + + dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_FROM_DEVICE); + + return 0; +} + +static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area, + const unsigned char *buffer, + int offset, size_t count) +{ + struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd); + struct onenand_chip *this = mtd->priv; + dma_addr_t dma_src, dma_dst; + int bram_offset; + + bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset; + /* DMA is not used. Revisit PM requirements before enabling it. */ + if (1 || (c->dma_channel < 0) || + ((void *) buffer >= (void *) high_memory) || (bram_offset & 3) || + (((unsigned int) buffer) & 3) || (count < 1024) || (count & 3)) { + memcpy((__force void *)(this->base + bram_offset), buffer, + count); + return 0; + } + + dma_src = dma_map_single(&c->pdev->dev, (void *) buffer, count, + DMA_TO_DEVICE); + dma_dst = c->phys_base + bram_offset; + if (dma_mapping_error(&c->pdev->dev, dma_dst)) { + dev_err(&c->pdev->dev, + "Couldn't DMA map a %d byte buffer\n", + count); + return -1; + } + + omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S16, + count / 2, 1, 0, 0, 0); + omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_src, 0, 0); + omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC, + dma_dst, 0, 0); + + INIT_COMPLETION(c->dma_done); + omap_start_dma(c->dma_channel); + wait_for_completion(&c->dma_done); + + dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_TO_DEVICE); + + return 0; +} + +#else + +int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area, + unsigned char *buffer, int offset, + size_t count); + +int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area, + const unsigned char *buffer, + int offset, size_t count); + +#endif + +static struct platform_driver omap2_onenand_driver; + +static int __adjust_timing(struct device *dev, void *data) +{ + int ret = 0; + struct omap2_onenand *c; + + c = dev_get_drvdata(dev); + + BUG_ON(c->setup == NULL); + + /* DMA is not in use so this is all that is needed */ + /* Revisit for OMAP3! */ + ret = c->setup(c->onenand.base, c->freq); + + return ret; +} + +int omap2_onenand_rephase(void) +{ + return driver_for_each_device(&omap2_onenand_driver.driver, NULL, + NULL, __adjust_timing); +} + +static void __devexit omap2_onenand_shutdown(struct platform_device *pdev) +{ + struct omap2_onenand *c = dev_get_drvdata(&pdev->dev); + + /* With certain content in the buffer RAM, the OMAP boot ROM code + * can recognize the flash chip incorrectly. Zero it out before + * soft reset. + */ + memset((__force void *)c->onenand.base, 0, ONENAND_BUFRAM_SIZE); +} + +static int __devinit omap2_onenand_probe(struct platform_device *pdev) +{ + struct omap_onenand_platform_data *pdata; + struct omap2_onenand *c; + int r; + + pdata = pdev->dev.platform_data; + if (pdata == NULL) { + dev_err(&pdev->dev, "platform data missing\n"); + return -ENODEV; + } + + c = kzalloc(sizeof(struct omap2_onenand), GFP_KERNEL); + if (!c) + return -ENOMEM; + + init_completion(&c->irq_done); + init_completion(&c->dma_done); + c->gpmc_cs = pdata->cs; + c->gpio_irq = pdata->gpio_irq; + c->dma_channel = pdata->dma_channel; + if (c->dma_channel < 0) { + /* if -1, don't use DMA */ + c->gpio_irq = 0; + } + + r = gpmc_cs_request(c->gpmc_cs, ONENAND_IO_SIZE, &c->phys_base); + if (r < 0) { + dev_err(&pdev->dev, "Cannot request GPMC CS\n"); + goto err_kfree; + } + + if (request_mem_region(c->phys_base, ONENAND_IO_SIZE, + pdev->dev.driver->name) == NULL) { + dev_err(&pdev->dev, "Cannot reserve memory region at 0x%08lx, " + "size: 0x%x\n", c->phys_base, ONENAND_IO_SIZE); + r = -EBUSY; + goto err_free_cs; + } + c->onenand.base = ioremap(c->phys_base, ONENAND_IO_SIZE); + if (c->onenand.base == NULL) { + r = -ENOMEM; + goto err_release_mem_region; + } + + if (pdata->onenand_setup != NULL) { + r = pdata->onenand_setup(c->onenand.base, c->freq); + if (r < 0) { + dev_err(&pdev->dev, "Onenand platform setup failed: " + "%d\n", r); + goto err_iounmap; + } + c->setup = pdata->onenand_setup; + } + + if (c->gpio_irq) { + if ((r = omap_request_gpio(c->gpio_irq)) < 0) { + dev_err(&pdev->dev, "Failed to request GPIO%d for " + "OneNAND\n", c->gpio_irq); + goto err_iounmap; + } + omap_set_gpio_direction(c->gpio_irq, 1); + + if ((r = request_irq(OMAP_GPIO_IRQ(c->gpio_irq), + omap2_onenand_interrupt, IRQF_TRIGGER_RISING, + pdev->dev.driver->name, c)) < 0) + goto err_release_gpio; + } + + if (c->dma_channel >= 0) { + r = omap_request_dma(0, pdev->dev.driver->name, + omap2_onenand_dma_cb, (void *) c, + &c->dma_channel); + if (r == 0) { + omap_set_dma_write_mode(c->dma_channel, + OMAP_DMA_WRITE_NON_POSTED); + omap_set_dma_src_data_pack(c->dma_channel, 1); + omap_set_dma_src_burst_mode(c->dma_channel, + OMAP_DMA_DATA_BURST_8); + omap_set_dma_dest_data_pack(c->dma_channel, 1); + omap_set_dma_dest_burst_mode(c->dma_channel, + OMAP_DMA_DATA_BURST_8); + } else { + dev_info(&pdev->dev, + "failed to allocate DMA for OneNAND, " + "using PIO instead\n"); + c->dma_channel = -1; + } + } + + dev_info(&pdev->dev, "initializing on CS%d, phys base 0x%08lx, virtual " + "base %p\n", c->gpmc_cs, c->phys_base, + c->onenand.base); + + c->pdev = pdev; + c->mtd.name = pdev->dev.bus_id; + c->mtd.priv = &c->onenand; + c->mtd.owner = THIS_MODULE; + + if (c->dma_channel >= 0) { + struct onenand_chip *this = &c->onenand; + + this->wait = omap2_onenand_wait; + if (cpu_is_omap34xx()) { + this->read_bufferram = omap3_onenand_read_bufferram; + this->write_bufferram = omap3_onenand_write_bufferram; + } else { + this->read_bufferram = omap2_onenand_read_bufferram; + this->write_bufferram = omap2_onenand_write_bufferram; + } + } + + if ((r = onenand_scan(&c->mtd, 1)) < 0) + goto err_release_dma; + + switch ((c->onenand.version_id >> 4) & 0xf) { + case 0: + c->freq = 40; + break; + case 1: + c->freq = 54; + break; + case 2: + c->freq = 66; + break; + case 3: + c->freq = 83; + break; + } + +#ifdef CONFIG_MTD_PARTITIONS + if (pdata->parts != NULL) + r = add_mtd_partitions(&c->mtd, pdata->parts, + pdata->nr_parts); + else +#endif + r = add_mtd_device(&c->mtd); + if (r < 0) + goto err_release_onenand; + + platform_set_drvdata(pdev, c); + + return 0; + +err_release_onenand: + onenand_release(&c->mtd); +err_release_dma: + if (c->dma_channel != -1) + omap_free_dma(c->dma_channel); + if (c->gpio_irq) + free_irq(OMAP_GPIO_IRQ(c->gpio_irq), c); +err_release_gpio: + if (c->gpio_irq) + omap_free_gpio(c->gpio_irq); +err_iounmap: + iounmap(c->onenand.base); +err_release_mem_region: + release_mem_region(c->phys_base, ONENAND_IO_SIZE); +err_free_cs: + gpmc_cs_free(c->gpmc_cs); +err_kfree: + kfree(c); + + return r; +} + +static int __devexit omap2_onenand_remove(struct platform_device *pdev) +{ + struct omap2_onenand *c = dev_get_drvdata(&pdev->dev); + + BUG_ON(c == NULL); + +#ifdef CONFIG_MTD_PARTITIONS + if (c->parts) + del_mtd_partitions(&c->mtd); + else + del_mtd_device(&c->mtd); +#else + del_mtd_device(&c->mtd); +#endif + + onenand_release(&c->mtd); + if (c->dma_channel != -1) + omap_free_dma(c->dma_channel); + omap2_onenand_shutdown(pdev); + platform_set_drvdata(pdev, NULL); + if (c->gpio_irq) { + free_irq(OMAP_GPIO_IRQ(c->gpio_irq), c); + omap_free_gpio(c->gpio_irq); + } + iounmap(c->onenand.base); + release_mem_region(c->phys_base, ONENAND_IO_SIZE); + kfree(c); + + return 0; +} + +static struct platform_driver omap2_onenand_driver = { + .probe = omap2_onenand_probe, + .remove = omap2_onenand_remove, + .shutdown = omap2_onenand_shutdown, + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + }, +}; + +static int __init omap2_onenand_init(void) +{ + printk(KERN_INFO "OneNAND driver initializing\n"); + return platform_driver_register(&omap2_onenand_driver); +} + +static void __exit omap2_onenand_exit(void) +{ + platform_driver_unregister(&omap2_onenand_driver); +} + +module_init(omap2_onenand_init); +module_exit(omap2_onenand_exit); + +MODULE_ALIAS(DRIVER_NAME); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>"); +MODULE_DESCRIPTION("Glue layer for OneNAND flash on OMAP2 / OMAP3"); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 926cf3a4135d..90ed319f26e6 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1794,7 +1794,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) return -EINVAL; } - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; /* Grab the lock and see if the device is available */ onenand_get_device(mtd, FL_ERASING); diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index a5f3d60047d4..33a5d6ed6f18 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -321,8 +321,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: cis_block=%d,erase_size=%d,map_len=%d,n_zones=%d\n", ssfdc->cis_block, ssfdc->erase_size, ssfdc->map_len, - (ssfdc->map_len + MAX_PHYS_BLK_PER_ZONE - 1) / - MAX_PHYS_BLK_PER_ZONE); + DIV_ROUND_UP(ssfdc->map_len, MAX_PHYS_BLK_PER_ZONE)); /* Set geometry */ ssfdc->heads = 16; diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 03c759b4eeb5..b30a0b83d7f1 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -104,12 +104,9 @@ static int vol_cdev_open(struct inode *inode, struct file *file) struct ubi_volume_desc *desc; int vol_id = iminor(inode) - 1, mode, ubi_num; - lock_kernel(); ubi_num = ubi_major2num(imajor(inode)); - if (ubi_num < 0) { - unlock_kernel(); + if (ubi_num < 0) return ubi_num; - } if (file->f_mode & FMODE_WRITE) mode = UBI_READWRITE; @@ -119,7 +116,6 @@ static int vol_cdev_open(struct inode *inode, struct file *file) dbg_gen("open volume %d, mode %d", vol_id, mode); desc = ubi_open_volume(ubi_num, vol_id, mode); - unlock_kernel(); if (IS_ERR(desc)) return PTR_ERR(desc); diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c index 967bb4406df9..4f2daa5bbecf 100644 --- a/drivers/mtd/ubi/scan.c +++ b/drivers/mtd/ubi/scan.c @@ -387,7 +387,7 @@ int ubi_scan_add_used(struct ubi_device *ubi, struct ubi_scan_info *si, pnum, vol_id, lnum, ec, sqnum, bitflips); sv = add_volume(si, vol_id, pnum, vid_hdr); - if (IS_ERR(sv) < 0) + if (IS_ERR(sv)) return PTR_ERR(sv); if (si->max_sqnum < sqnum) diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index 217d0e111b2a..333c8941552f 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -244,8 +244,8 @@ static int vtbl_check(const struct ubi_device *ubi, } if (reserved_pebs > ubi->good_peb_count) { - dbg_err("too large reserved_pebs, good PEBs %d", - ubi->good_peb_count); + dbg_err("too large reserved_pebs %d, good PEBs %d", + reserved_pebs, ubi->good_peb_count); err = 9; goto bad; } diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index fc5f2dbf5323..8b51e10b7783 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -563,7 +563,7 @@ static int __iommu_flush_context(struct intel_iommu *iommu, spin_unlock_irqrestore(&iommu->register_lock, flag); - /* flush context entry will implictly flush write buffer */ + /* flush context entry will implicitly flush write buffer */ return 0; } @@ -656,7 +656,7 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type)) pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", DMA_TLB_IIRG(type), DMA_TLB_IAIG(val)); - /* flush context entry will implictly flush write buffer */ + /* flush iotlb entry will implicitly flush write buffer */ return 0; } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c9884bba22de..dbe9f39f4436 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1358,11 +1358,10 @@ int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name) return 0; err_out: - dev_warn(&pdev->dev, "BAR %d: can't reserve %s region [%#llx-%#llx]\n", + dev_warn(&pdev->dev, "BAR %d: can't reserve %s region %pR\n", bar, pci_resource_flags(pdev, bar) & IORESOURCE_IO ? "I/O" : "mem", - (unsigned long long)pci_resource_start(pdev, bar), - (unsigned long long)pci_resource_end(pdev, bar)); + &pdev->resource[bar]); return -EBUSY; } diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index dd9161a054e1..d3db8b249729 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -304,9 +304,8 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, } else { res->start = l64; res->end = l64 + sz64; - printk(KERN_DEBUG "PCI: %s reg %x 64bit mmio: [%llx, %llx]\n", - pci_name(dev), pos, (unsigned long long)res->start, - (unsigned long long)res->end); + printk(KERN_DEBUG "PCI: %s reg %x 64bit mmio: %pR\n", + pci_name(dev), pos, res); } } else { sz = pci_size(l, sz, mask); @@ -316,9 +315,10 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, res->start = l; res->end = l + sz; - printk(KERN_DEBUG "PCI: %s reg %x %s: [%llx, %llx]\n", pci_name(dev), - pos, (res->flags & IORESOURCE_IO) ? "io port":"32bit mmio", - (unsigned long long)res->start, (unsigned long long)res->end); + printk(KERN_DEBUG "PCI: %s reg %x %s: %pR\n", + pci_name(dev), pos, + (res->flags & IORESOURCE_IO) ? "io port":"32bit mmio", + res); } out: @@ -389,9 +389,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->start = base; if (!res->end) res->end = limit + 0xfff; - printk(KERN_DEBUG "PCI: bridge %s io port: [%llx, %llx]\n", - pci_name(dev), (unsigned long long) res->start, - (unsigned long long) res->end); + printk(KERN_DEBUG "PCI: bridge %s io port: %pR\n", + pci_name(dev), res); } res = child->resource[1]; @@ -403,9 +402,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; res->start = base; res->end = limit + 0xfffff; - printk(KERN_DEBUG "PCI: bridge %s 32bit mmio: [%llx, %llx]\n", - pci_name(dev), (unsigned long long) res->start, - (unsigned long long) res->end); + printk(KERN_DEBUG "PCI: bridge %s 32bit mmio: %pR\n", + pci_name(dev), res); } res = child->resource[2]; @@ -441,9 +439,9 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child) res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH; res->start = base; res->end = limit + 0xfffff; - printk(KERN_DEBUG "PCI: bridge %s %sbit mmio pref: [%llx, %llx]\n", - pci_name(dev), (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64" : "32", - (unsigned long long) res->start, (unsigned long long) res->end); + printk(KERN_DEBUG "PCI: bridge %s %sbit mmio pref: %pR\n", + pci_name(dev), + (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64":"32", res); } } diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c index bd5c0e031398..1f5f6143f35c 100644 --- a/drivers/pci/rom.c +++ b/drivers/pci/rom.c @@ -21,7 +21,7 @@ * between the ROM and other resources, so enabling it may disable access * to MMIO registers or other card memory. */ -static int pci_enable_rom(struct pci_dev *pdev) +int pci_enable_rom(struct pci_dev *pdev) { struct resource *res = pdev->resource + PCI_ROM_RESOURCE; struct pci_bus_region region; @@ -45,7 +45,7 @@ static int pci_enable_rom(struct pci_dev *pdev) * Disable ROM decoding on a PCI device by turning off the last bit in the * ROM BAR. */ -static void pci_disable_rom(struct pci_dev *pdev) +void pci_disable_rom(struct pci_dev *pdev) { u32 rom_addr; pci_read_config_dword(pdev, pdev->rom_base_reg, &rom_addr); @@ -260,3 +260,5 @@ void pci_cleanup_rom(struct pci_dev *pdev) EXPORT_SYMBOL(pci_map_rom); EXPORT_SYMBOL(pci_unmap_rom); +EXPORT_SYMBOL_GPL(pci_enable_rom); +EXPORT_SYMBOL_GPL(pci_disable_rom); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index d5e2106760f8..471a429d7a20 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -356,10 +356,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long order = __ffs(align) - 20; if (order > 11) { dev_warn(&dev->dev, "BAR %d bad alignment %llx: " - "%#016llx-%#016llx\n", i, - (unsigned long long)align, - (unsigned long long)r->start, - (unsigned long long)r->end); + "%pR\n", i, (unsigned long long)align, r); r->flags = 0; continue; } @@ -539,11 +536,9 @@ static void pci_bus_dump_res(struct pci_bus *bus) if (!res) continue; - printk(KERN_INFO "bus: %02x index %x %s: [%llx, %llx]\n", - bus->number, i, - (res->flags & IORESOURCE_IO) ? "io port" : "mmio", - (unsigned long long) res->start, - (unsigned long long) res->end); + printk(KERN_INFO "bus: %02x index %x %s: %pR\n", + bus->number, i, + (res->flags & IORESOURCE_IO) ? "io port" : "mmio", res); } } diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 1a5fc83c71b3..d4b5c690eaa7 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -49,10 +49,8 @@ void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno) pcibios_resource_to_bus(dev, ®ion, res); - dev_dbg(&dev->dev, "BAR %d: got res [%#llx-%#llx] bus [%#llx-%#llx] " - "flags %#lx\n", resno, - (unsigned long long)res->start, - (unsigned long long)res->end, + dev_dbg(&dev->dev, "BAR %d: got res %pR bus [%#llx-%#llx] " + "flags %#lx\n", resno, res, (unsigned long long)region.start, (unsigned long long)region.end, (unsigned long)res->flags); @@ -114,13 +112,11 @@ int pci_claim_resource(struct pci_dev *dev, int resource) err = insert_resource(root, res); if (err) { - dev_err(&dev->dev, "BAR %d: %s of %s [%#llx-%#llx]\n", + dev_err(&dev->dev, "BAR %d: %s of %s %pR\n", resource, root ? "address space collision on" : "no parent found for", - dtype, - (unsigned long long)res->start, - (unsigned long long)res->end); + dtype, res); } return err; @@ -139,9 +135,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno) align = resource_alignment(res); if (!align) { dev_err(&dev->dev, "BAR %d: can't allocate resource (bogus " - "alignment) [%#llx-%#llx] flags %#lx\n", - resno, (unsigned long long)res->start, - (unsigned long long)res->end, res->flags); + "alignment) %pR flags %#lx\n", + resno, res, res->flags); return -EINVAL; } @@ -162,11 +157,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno) } if (ret) { - dev_err(&dev->dev, "BAR %d: can't allocate %s resource " - "[%#llx-%#llx]\n", resno, - res->flags & IORESOURCE_IO ? "I/O" : "mem", - (unsigned long long)res->start, - (unsigned long long)res->end); + dev_err(&dev->dev, "BAR %d: can't allocate %s resource %pR\n", + resno, res->flags & IORESOURCE_IO ? "I/O" : "mem", res); } else { res->flags &= ~IORESOURCE_STARTALIGN; if (resno < PCI_BRIDGE_RESOURCES) @@ -202,11 +194,8 @@ int pci_assign_resource_fixed(struct pci_dev *dev, int resno) } if (ret) { - dev_err(&dev->dev, "BAR %d: can't allocate %s resource " - "[%#llx-%#llx\n]", resno, - res->flags & IORESOURCE_IO ? "I/O" : "mem", - (unsigned long long)res->start, - (unsigned long long)res->end); + dev_err(&dev->dev, "BAR %d: can't allocate %s resource %pR\n", + resno, res->flags & IORESOURCE_IO ? "I/O" : "mem", res); } else if (resno < PCI_BRIDGE_RESOURCES) { pci_update_resource(dev, res, resno); } @@ -237,9 +226,8 @@ void pdev_sort_resources(struct pci_dev *dev, struct resource_list *head) r_align = resource_alignment(r); if (!r_align) { dev_warn(&dev->dev, "BAR %d: bogus alignment " - "[%#llx-%#llx] flags %#lx\n", - i, (unsigned long long)r->start, - (unsigned long long)r->end, r->flags); + "%pR flags %#lx\n", + i, r, r->flags); continue; } for (list = head; ; list = list->next) { @@ -287,9 +275,7 @@ int pci_enable_resources(struct pci_dev *dev, int mask) if (!r->parent) { dev_err(&dev->dev, "device not available because of " - "BAR %d [%#llx-%#llx] collisions\n", i, - (unsigned long long) r->start, - (unsigned long long) r->end); + "BAR %d %pR collisions\n", i, r); return -EINVAL; } diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c index fe2aeb11939b..23ae8460f5c1 100644 --- a/drivers/power/power_supply_sysfs.c +++ b/drivers/power/power_supply_sysfs.c @@ -30,7 +30,7 @@ #define POWER_SUPPLY_ATTR(_name) \ { \ - .attr = { .name = #_name, .mode = 0444, .owner = THIS_MODULE }, \ + .attr = { .name = #_name, .mode = 0444 }, \ .show = power_supply_show_property, \ .store = NULL, \ } diff --git a/drivers/ps3/ps3av.c b/drivers/ps3/ps3av.c index 6f2f90ebb020..06848b254d57 100644 --- a/drivers/ps3/ps3av.c +++ b/drivers/ps3/ps3av.c @@ -915,6 +915,22 @@ int ps3av_video_mute(int mute) EXPORT_SYMBOL_GPL(ps3av_video_mute); +/* mute analog output only */ +int ps3av_audio_mute_analog(int mute) +{ + int i, res; + + for (i = 0; i < ps3av->av_hw_conf.num_of_avmulti; i++) { + res = ps3av_cmd_av_audio_mute(1, + &ps3av->av_port[i + ps3av->av_hw_conf.num_of_hdmi], + mute); + if (res < 0) + return -1; + } + return 0; +} +EXPORT_SYMBOL_GPL(ps3av_audio_mute_analog); + int ps3av_audio_mute(int mute) { return ps3av_set_audio_mute(mute ? PS3AV_CMD_MUTE_ON diff --git a/drivers/ps3/ps3av_cmd.c b/drivers/ps3/ps3av_cmd.c index 7f880c26122f..11eb50318fec 100644 --- a/drivers/ps3/ps3av_cmd.c +++ b/drivers/ps3/ps3av_cmd.c @@ -660,9 +660,10 @@ u32 ps3av_cmd_set_av_audio_param(void *p, u32 port, } /* default cs val */ -static const u8 ps3av_mode_cs_info[] = { +u8 ps3av_mode_cs_info[] = { 0x00, 0x09, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00 }; +EXPORT_SYMBOL_GPL(ps3av_mode_cs_info); #define CS_44 0x00 #define CS_48 0x02 @@ -677,7 +678,7 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport, u32 ch, u32 fs, u32 word_bits, u32 format, u32 source) { - int spdif_through, spdif_bitstream; + int spdif_through; int i; if (!(ch | fs | format | word_bits | source)) { @@ -687,7 +688,6 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport, format = PS3AV_CMD_AUDIO_FORMAT_PCM; source = PS3AV_CMD_AUDIO_SOURCE_SERIAL; } - spdif_through = spdif_bitstream = 0; /* XXX not supported */ /* audio mode */ memset(audio, 0, sizeof(*audio)); @@ -777,16 +777,17 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport, break; } + /* non-audio bit */ + spdif_through = audio->audio_cs_info[0] & 0x02; + /* pass through setting */ if (spdif_through && (avport == PS3AV_CMD_AVPORT_SPDIF_0 || - avport == PS3AV_CMD_AVPORT_SPDIF_1)) { + avport == PS3AV_CMD_AVPORT_SPDIF_1 || + avport == PS3AV_CMD_AVPORT_HDMI_0 || + avport == PS3AV_CMD_AVPORT_HDMI_1)) { audio->audio_word_bits = PS3AV_CMD_AUDIO_WORD_BITS_16; - audio->audio_source = PS3AV_CMD_AUDIO_SOURCE_SPDIF; - if (spdif_bitstream) { - audio->audio_format = PS3AV_CMD_AUDIO_FORMAT_BITSTREAM; - audio->audio_cs_info[0] |= CS_BIT; - } + audio->audio_format = PS3AV_CMD_AUDIO_FORMAT_BITSTREAM; } } diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 37082616482b..b5bf93706913 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -53,21 +53,21 @@ static void at91_rtc_decodetime(unsigned int timereg, unsigned int calreg, } while ((time != at91_sys_read(timereg)) || (date != at91_sys_read(calreg))); - tm->tm_sec = BCD2BIN((time & AT91_RTC_SEC) >> 0); - tm->tm_min = BCD2BIN((time & AT91_RTC_MIN) >> 8); - tm->tm_hour = BCD2BIN((time & AT91_RTC_HOUR) >> 16); + tm->tm_sec = bcd2bin((time & AT91_RTC_SEC) >> 0); + tm->tm_min = bcd2bin((time & AT91_RTC_MIN) >> 8); + tm->tm_hour = bcd2bin((time & AT91_RTC_HOUR) >> 16); /* * The Calendar Alarm register does not have a field for * the year - so these will return an invalid value. When an * alarm is set, at91_alarm_year wille store the current year. */ - tm->tm_year = BCD2BIN(date & AT91_RTC_CENT) * 100; /* century */ - tm->tm_year += BCD2BIN((date & AT91_RTC_YEAR) >> 8); /* year */ + tm->tm_year = bcd2bin(date & AT91_RTC_CENT) * 100; /* century */ + tm->tm_year += bcd2bin((date & AT91_RTC_YEAR) >> 8); /* year */ - tm->tm_wday = BCD2BIN((date & AT91_RTC_DAY) >> 21) - 1; /* day of the week [0-6], Sunday=0 */ - tm->tm_mon = BCD2BIN((date & AT91_RTC_MONTH) >> 16) - 1; - tm->tm_mday = BCD2BIN((date & AT91_RTC_DATE) >> 24); + tm->tm_wday = bcd2bin((date & AT91_RTC_DAY) >> 21) - 1; /* day of the week [0-6], Sunday=0 */ + tm->tm_mon = bcd2bin((date & AT91_RTC_MONTH) >> 16) - 1; + tm->tm_mday = bcd2bin((date & AT91_RTC_DATE) >> 24); } /* @@ -106,16 +106,16 @@ static int at91_rtc_settime(struct device *dev, struct rtc_time *tm) at91_sys_write(AT91_RTC_IDR, AT91_RTC_ACKUPD); at91_sys_write(AT91_RTC_TIMR, - BIN2BCD(tm->tm_sec) << 0 - | BIN2BCD(tm->tm_min) << 8 - | BIN2BCD(tm->tm_hour) << 16); + bin2bcd(tm->tm_sec) << 0 + | bin2bcd(tm->tm_min) << 8 + | bin2bcd(tm->tm_hour) << 16); at91_sys_write(AT91_RTC_CALR, - BIN2BCD((tm->tm_year + 1900) / 100) /* century */ - | BIN2BCD(tm->tm_year % 100) << 8 /* year */ - | BIN2BCD(tm->tm_mon + 1) << 16 /* tm_mon starts at zero */ - | BIN2BCD(tm->tm_wday + 1) << 21 /* day of the week [0-6], Sunday=0 */ - | BIN2BCD(tm->tm_mday) << 24); + bin2bcd((tm->tm_year + 1900) / 100) /* century */ + | bin2bcd(tm->tm_year % 100) << 8 /* year */ + | bin2bcd(tm->tm_mon + 1) << 16 /* tm_mon starts at zero */ + | bin2bcd(tm->tm_wday + 1) << 21 /* day of the week [0-6], Sunday=0 */ + | bin2bcd(tm->tm_mday) << 24); /* Restart Time/Calendar */ cr = at91_sys_read(AT91_RTC_CR); @@ -162,13 +162,13 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) at91_sys_write(AT91_RTC_IDR, AT91_RTC_ALARM); at91_sys_write(AT91_RTC_TIMALR, - BIN2BCD(tm.tm_sec) << 0 - | BIN2BCD(tm.tm_min) << 8 - | BIN2BCD(tm.tm_hour) << 16 + bin2bcd(tm.tm_sec) << 0 + | bin2bcd(tm.tm_min) << 8 + | bin2bcd(tm.tm_hour) << 16 | AT91_RTC_HOUREN | AT91_RTC_MINEN | AT91_RTC_SECEN); at91_sys_write(AT91_RTC_CALALR, - BIN2BCD(tm.tm_mon + 1) << 16 /* tm_mon starts at zero */ - | BIN2BCD(tm.tm_mday) << 24 + bin2bcd(tm.tm_mon + 1) << 16 /* tm_mon starts at zero */ + | bin2bcd(tm.tm_mday) << 24 | AT91_RTC_DATEEN | AT91_RTC_MTHEN); if (alrm->enabled) { diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index 189a018bdf34..d00a274df8fc 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -71,14 +71,14 @@ static int bq4802_read_time(struct device *dev, struct rtc_time *tm) spin_unlock_irqrestore(&p->lock, flags); - BCD_TO_BIN(tm->tm_sec); - BCD_TO_BIN(tm->tm_min); - BCD_TO_BIN(tm->tm_hour); - BCD_TO_BIN(tm->tm_mday); - BCD_TO_BIN(tm->tm_mon); - BCD_TO_BIN(tm->tm_year); - BCD_TO_BIN(tm->tm_wday); - BCD_TO_BIN(century); + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon); + tm->tm_year = bcd2bin(tm->tm_year); + tm->tm_wday = bcd2bin(tm->tm_wday); + century = bcd2bin(century); tm->tm_year += (century * 100); tm->tm_year -= 1900; @@ -106,13 +106,13 @@ static int bq4802_set_time(struct device *dev, struct rtc_time *tm) min = tm->tm_min; sec = tm->tm_sec; - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); - BIN_TO_BCD(century); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); + century = bin2bcd(century); spin_lock_irqsave(&p->lock, flags); diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 963ad0b6a4e9..5549231179a2 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -143,6 +143,43 @@ static inline int hpet_unregister_irq_handler(irq_handler_t handler) /*----------------------------------------------------------------*/ +#ifdef RTC_PORT + +/* Most newer x86 systems have two register banks, the first used + * for RTC and NVRAM and the second only for NVRAM. Caller must + * own rtc_lock ... and we won't worry about access during NMI. + */ +#define can_bank2 true + +static inline unsigned char cmos_read_bank2(unsigned char addr) +{ + outb(addr, RTC_PORT(2)); + return inb(RTC_PORT(3)); +} + +static inline void cmos_write_bank2(unsigned char val, unsigned char addr) +{ + outb(addr, RTC_PORT(2)); + outb(val, RTC_PORT(2)); +} + +#else + +#define can_bank2 false + +static inline unsigned char cmos_read_bank2(unsigned char addr) +{ + return 0; +} + +static inline void cmos_write_bank2(unsigned char val, unsigned char addr) +{ +} + +#endif + +/*----------------------------------------------------------------*/ + static int cmos_read_time(struct device *dev, struct rtc_time *t) { /* REVISIT: if the clock has a "century" register, use @@ -203,26 +240,26 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) /* REVISIT this assumes PC style usage: always BCD */ if (((unsigned)t->time.tm_sec) < 0x60) - t->time.tm_sec = BCD2BIN(t->time.tm_sec); + t->time.tm_sec = bcd2bin(t->time.tm_sec); else t->time.tm_sec = -1; if (((unsigned)t->time.tm_min) < 0x60) - t->time.tm_min = BCD2BIN(t->time.tm_min); + t->time.tm_min = bcd2bin(t->time.tm_min); else t->time.tm_min = -1; if (((unsigned)t->time.tm_hour) < 0x24) - t->time.tm_hour = BCD2BIN(t->time.tm_hour); + t->time.tm_hour = bcd2bin(t->time.tm_hour); else t->time.tm_hour = -1; if (cmos->day_alrm) { if (((unsigned)t->time.tm_mday) <= 0x31) - t->time.tm_mday = BCD2BIN(t->time.tm_mday); + t->time.tm_mday = bcd2bin(t->time.tm_mday); else t->time.tm_mday = -1; if (cmos->mon_alrm) { if (((unsigned)t->time.tm_mon) <= 0x12) - t->time.tm_mon = BCD2BIN(t->time.tm_mon) - 1; + t->time.tm_mon = bcd2bin(t->time.tm_mon) - 1; else t->time.tm_mon = -1; } @@ -294,19 +331,19 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) /* Writing 0xff means "don't care" or "match all". */ mon = t->time.tm_mon + 1; - mon = (mon <= 12) ? BIN2BCD(mon) : 0xff; + mon = (mon <= 12) ? bin2bcd(mon) : 0xff; mday = t->time.tm_mday; - mday = (mday >= 1 && mday <= 31) ? BIN2BCD(mday) : 0xff; + mday = (mday >= 1 && mday <= 31) ? bin2bcd(mday) : 0xff; hrs = t->time.tm_hour; - hrs = (hrs < 24) ? BIN2BCD(hrs) : 0xff; + hrs = (hrs < 24) ? bin2bcd(hrs) : 0xff; min = t->time.tm_min; - min = (min < 60) ? BIN2BCD(min) : 0xff; + min = (min < 60) ? bin2bcd(min) : 0xff; sec = t->time.tm_sec; - sec = (sec < 60) ? BIN2BCD(sec) : 0xff; + sec = (sec < 60) ? bin2bcd(sec) : 0xff; spin_lock_irq(&rtc_lock); @@ -491,12 +528,21 @@ cmos_nvram_read(struct kobject *kobj, struct bin_attribute *attr, if (unlikely(off >= attr->size)) return 0; + if (unlikely(off < 0)) + return -EINVAL; if ((off + count) > attr->size) count = attr->size - off; + off += NVRAM_OFFSET; spin_lock_irq(&rtc_lock); - for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++) - *buf++ = CMOS_READ(off); + for (retval = 0; count; count--, off++, retval++) { + if (off < 128) + *buf++ = CMOS_READ(off); + else if (can_bank2) + *buf++ = cmos_read_bank2(off); + else + break; + } spin_unlock_irq(&rtc_lock); return retval; @@ -512,6 +558,8 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr, cmos = dev_get_drvdata(container_of(kobj, struct device, kobj)); if (unlikely(off >= attr->size)) return -EFBIG; + if (unlikely(off < 0)) + return -EINVAL; if ((off + count) > attr->size) count = attr->size - off; @@ -520,15 +568,20 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr, * here. If userspace is smart enough to know what fields of * NVRAM to update, updating checksums is also part of its job. */ + off += NVRAM_OFFSET; spin_lock_irq(&rtc_lock); - for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++) { + for (retval = 0; count; count--, off++, retval++) { /* don't trash RTC registers */ if (off == cmos->day_alrm || off == cmos->mon_alrm || off == cmos->century) buf++; - else + else if (off < 128) CMOS_WRITE(*buf++, off); + else if (can_bank2) + cmos_write_bank2(*buf++, off); + else + break; } spin_unlock_irq(&rtc_lock); @@ -539,7 +592,6 @@ static struct bin_attribute nvram = { .attr = { .name = "nvram", .mode = S_IRUGO | S_IWUSR, - .owner = THIS_MODULE, }, .read = cmos_nvram_read, @@ -631,8 +683,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* Heuristic to deduce NVRAM size ... do what the legacy NVRAM * driver did, but don't reject unknown configs. Old hardware - * won't address 128 bytes, and for now we ignore the way newer - * chips can address 256 bytes (using two more i/o ports). + * won't address 128 bytes. Newer chips have multiple banks, + * though they may not be listed in one I/O resource. */ #if defined(CONFIG_ATARI) address_space = 64; @@ -642,6 +694,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes. address_space = 128; #endif + if (can_bank2 && ports->end > (ports->start + 1)) + address_space = 256; /* For ACPI systems extension info comes from the FADT. On others, * board specific setup provides it as appropriate. Systems where @@ -740,7 +794,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) goto cleanup2; } - pr_info("%s: alarms up to one %s%s%s\n", + pr_info("%s: alarms up to one %s%s, %zd bytes nvram, %s irqs\n", cmos_rtc.rtc->dev.bus_id, is_valid_irq(rtc_irq) ? (cmos_rtc.mon_alrm @@ -749,6 +803,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) ? "month" : "day")) : "no", cmos_rtc.century ? ", y3k" : "", + nvram.size, is_hpet_enabled() ? ", hpet irqs" : ""); return 0; diff --git a/drivers/rtc/rtc-ds1216.c b/drivers/rtc/rtc-ds1216.c index 0b17770b032b..9a234a4ec06d 100644 --- a/drivers/rtc/rtc-ds1216.c +++ b/drivers/rtc/rtc-ds1216.c @@ -86,19 +86,19 @@ static int ds1216_rtc_read_time(struct device *dev, struct rtc_time *tm) ds1216_switch_ds_to_clock(priv->ioaddr); ds1216_read(priv->ioaddr, (u8 *)®s); - tm->tm_sec = BCD2BIN(regs.sec); - tm->tm_min = BCD2BIN(regs.min); + tm->tm_sec = bcd2bin(regs.sec); + tm->tm_min = bcd2bin(regs.min); if (regs.hour & DS1216_HOUR_1224) { /* AM/PM mode */ - tm->tm_hour = BCD2BIN(regs.hour & 0x1f); + tm->tm_hour = bcd2bin(regs.hour & 0x1f); if (regs.hour & DS1216_HOUR_AMPM) tm->tm_hour += 12; } else - tm->tm_hour = BCD2BIN(regs.hour & 0x3f); + tm->tm_hour = bcd2bin(regs.hour & 0x3f); tm->tm_wday = (regs.wday & 7) - 1; - tm->tm_mday = BCD2BIN(regs.mday & 0x3f); - tm->tm_mon = BCD2BIN(regs.month & 0x1f); - tm->tm_year = BCD2BIN(regs.year); + tm->tm_mday = bcd2bin(regs.mday & 0x3f); + tm->tm_mon = bcd2bin(regs.month & 0x1f); + tm->tm_year = bcd2bin(regs.year); if (tm->tm_year < 70) tm->tm_year += 100; return 0; @@ -114,19 +114,19 @@ static int ds1216_rtc_set_time(struct device *dev, struct rtc_time *tm) ds1216_read(priv->ioaddr, (u8 *)®s); regs.tsec = 0; /* clear 0.1 and 0.01 seconds */ - regs.sec = BIN2BCD(tm->tm_sec); - regs.min = BIN2BCD(tm->tm_min); + regs.sec = bin2bcd(tm->tm_sec); + regs.min = bin2bcd(tm->tm_min); regs.hour &= DS1216_HOUR_1224; if (regs.hour && tm->tm_hour > 12) { regs.hour |= DS1216_HOUR_AMPM; tm->tm_hour -= 12; } - regs.hour |= BIN2BCD(tm->tm_hour); + regs.hour |= bin2bcd(tm->tm_hour); regs.wday &= ~7; regs.wday |= tm->tm_wday; - regs.mday = BIN2BCD(tm->tm_mday); - regs.month = BIN2BCD(tm->tm_mon); - regs.year = BIN2BCD(tm->tm_year % 100); + regs.mday = bin2bcd(tm->tm_mday); + regs.month = bin2bcd(tm->tm_mon); + regs.year = bin2bcd(tm->tm_year % 100); ds1216_switch_ds_to_clock(priv->ioaddr); ds1216_write(priv->ioaddr, (u8 *)®s); diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c index 8f4e96bb229a..184556620778 100644 --- a/drivers/rtc/rtc-ds1302.c +++ b/drivers/rtc/rtc-ds1302.c @@ -107,13 +107,13 @@ static int ds1302_rtc_read_time(struct device *dev, struct rtc_time *tm) spin_lock_irq(&rtc->lock); - tm->tm_sec = BCD2BIN(ds1302_readbyte(RTC_ADDR_SEC)); - tm->tm_min = BCD2BIN(ds1302_readbyte(RTC_ADDR_MIN)); - tm->tm_hour = BCD2BIN(ds1302_readbyte(RTC_ADDR_HOUR)); - tm->tm_wday = BCD2BIN(ds1302_readbyte(RTC_ADDR_DAY)); - tm->tm_mday = BCD2BIN(ds1302_readbyte(RTC_ADDR_DATE)); - tm->tm_mon = BCD2BIN(ds1302_readbyte(RTC_ADDR_MON)) - 1; - tm->tm_year = BCD2BIN(ds1302_readbyte(RTC_ADDR_YEAR)); + tm->tm_sec = bcd2bin(ds1302_readbyte(RTC_ADDR_SEC)); + tm->tm_min = bcd2bin(ds1302_readbyte(RTC_ADDR_MIN)); + tm->tm_hour = bcd2bin(ds1302_readbyte(RTC_ADDR_HOUR)); + tm->tm_wday = bcd2bin(ds1302_readbyte(RTC_ADDR_DAY)); + tm->tm_mday = bcd2bin(ds1302_readbyte(RTC_ADDR_DATE)); + tm->tm_mon = bcd2bin(ds1302_readbyte(RTC_ADDR_MON)) - 1; + tm->tm_year = bcd2bin(ds1302_readbyte(RTC_ADDR_YEAR)); if (tm->tm_year < 70) tm->tm_year += 100; @@ -141,13 +141,13 @@ static int ds1302_rtc_set_time(struct device *dev, struct rtc_time *tm) /* Stop RTC */ ds1302_writebyte(RTC_ADDR_SEC, ds1302_readbyte(RTC_ADDR_SEC) | 0x80); - ds1302_writebyte(RTC_ADDR_SEC, BIN2BCD(tm->tm_sec)); - ds1302_writebyte(RTC_ADDR_MIN, BIN2BCD(tm->tm_min)); - ds1302_writebyte(RTC_ADDR_HOUR, BIN2BCD(tm->tm_hour)); - ds1302_writebyte(RTC_ADDR_DAY, BIN2BCD(tm->tm_wday)); - ds1302_writebyte(RTC_ADDR_DATE, BIN2BCD(tm->tm_mday)); - ds1302_writebyte(RTC_ADDR_MON, BIN2BCD(tm->tm_mon + 1)); - ds1302_writebyte(RTC_ADDR_YEAR, BIN2BCD(tm->tm_year % 100)); + ds1302_writebyte(RTC_ADDR_SEC, bin2bcd(tm->tm_sec)); + ds1302_writebyte(RTC_ADDR_MIN, bin2bcd(tm->tm_min)); + ds1302_writebyte(RTC_ADDR_HOUR, bin2bcd(tm->tm_hour)); + ds1302_writebyte(RTC_ADDR_DAY, bin2bcd(tm->tm_wday)); + ds1302_writebyte(RTC_ADDR_DATE, bin2bcd(tm->tm_mday)); + ds1302_writebyte(RTC_ADDR_MON, bin2bcd(tm->tm_mon + 1)); + ds1302_writebyte(RTC_ADDR_YEAR, bin2bcd(tm->tm_year % 100)); /* Start RTC */ ds1302_writebyte(RTC_ADDR_SEC, ds1302_readbyte(RTC_ADDR_SEC) & ~0x80); diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index b91d02a3ace9..fc372df6534b 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -114,10 +114,10 @@ static unsigned bcd2hour(u8 bcd) hour = 12; bcd &= ~DS1305_HR_PM; } - hour += BCD2BIN(bcd); + hour += bcd2bin(bcd); return hour - 1; } - return BCD2BIN(bcd); + return bcd2bin(bcd); } static u8 hour2bcd(bool hr12, int hour) @@ -125,11 +125,11 @@ static u8 hour2bcd(bool hr12, int hour) if (hr12) { hour++; if (hour <= 12) - return DS1305_HR_12 | BIN2BCD(hour); + return DS1305_HR_12 | bin2bcd(hour); hour -= 12; - return DS1305_HR_12 | DS1305_HR_PM | BIN2BCD(hour); + return DS1305_HR_12 | DS1305_HR_PM | bin2bcd(hour); } - return BIN2BCD(hour); + return bin2bcd(hour); } /*----------------------------------------------------------------------*/ @@ -206,13 +206,13 @@ static int ds1305_get_time(struct device *dev, struct rtc_time *time) buf[4], buf[5], buf[6]); /* Decode the registers */ - time->tm_sec = BCD2BIN(buf[DS1305_SEC]); - time->tm_min = BCD2BIN(buf[DS1305_MIN]); + time->tm_sec = bcd2bin(buf[DS1305_SEC]); + time->tm_min = bcd2bin(buf[DS1305_MIN]); time->tm_hour = bcd2hour(buf[DS1305_HOUR]); time->tm_wday = buf[DS1305_WDAY] - 1; - time->tm_mday = BCD2BIN(buf[DS1305_MDAY]); - time->tm_mon = BCD2BIN(buf[DS1305_MON]) - 1; - time->tm_year = BCD2BIN(buf[DS1305_YEAR]) + 100; + time->tm_mday = bcd2bin(buf[DS1305_MDAY]); + time->tm_mon = bcd2bin(buf[DS1305_MON]) - 1; + time->tm_year = bcd2bin(buf[DS1305_YEAR]) + 100; dev_vdbg(dev, "%s secs=%d, mins=%d, " "hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n", @@ -239,13 +239,13 @@ static int ds1305_set_time(struct device *dev, struct rtc_time *time) /* Write registers starting at the first time/date address. */ *bp++ = DS1305_WRITE | DS1305_SEC; - *bp++ = BIN2BCD(time->tm_sec); - *bp++ = BIN2BCD(time->tm_min); + *bp++ = bin2bcd(time->tm_sec); + *bp++ = bin2bcd(time->tm_min); *bp++ = hour2bcd(ds1305->hr12, time->tm_hour); *bp++ = (time->tm_wday < 7) ? (time->tm_wday + 1) : 1; - *bp++ = BIN2BCD(time->tm_mday); - *bp++ = BIN2BCD(time->tm_mon + 1); - *bp++ = BIN2BCD(time->tm_year - 100); + *bp++ = bin2bcd(time->tm_mday); + *bp++ = bin2bcd(time->tm_mon + 1); + *bp++ = bin2bcd(time->tm_year - 100); dev_dbg(dev, "%s: %02x %02x %02x, %02x %02x %02x %02x\n", "write", buf[1], buf[2], buf[3], @@ -329,8 +329,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm) * fill in the rest ... and also handle rollover to tomorrow when * that's needed. */ - alm->time.tm_sec = BCD2BIN(buf[DS1305_SEC]); - alm->time.tm_min = BCD2BIN(buf[DS1305_MIN]); + alm->time.tm_sec = bcd2bin(buf[DS1305_SEC]); + alm->time.tm_min = bcd2bin(buf[DS1305_MIN]); alm->time.tm_hour = bcd2hour(buf[DS1305_HOUR]); alm->time.tm_mday = -1; alm->time.tm_mon = -1; @@ -387,8 +387,8 @@ static int ds1305_set_alarm(struct device *dev, struct rtc_wkalrm *alm) /* write alarm */ buf[0] = DS1305_WRITE | DS1305_ALM0(DS1305_SEC); - buf[1 + DS1305_SEC] = BIN2BCD(alm->time.tm_sec); - buf[1 + DS1305_MIN] = BIN2BCD(alm->time.tm_min); + buf[1 + DS1305_SEC] = bin2bcd(alm->time.tm_sec); + buf[1 + DS1305_MIN] = bin2bcd(alm->time.tm_min); buf[1 + DS1305_HOUR] = hour2bcd(ds1305->hr12, alm->time.tm_hour); buf[1 + DS1305_WDAY] = DS1305_ALM_DISABLE; @@ -606,7 +606,6 @@ ds1305_nvram_write(struct kobject *kobj, struct bin_attribute *attr, static struct bin_attribute nvram = { .attr.name = "nvram", .attr.mode = S_IRUGO | S_IWUSR, - .attr.owner = THIS_MODULE, .read = ds1305_nvram_read, .write = ds1305_nvram_write, .size = DS1305_NVRAM_LEN, diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 4fcf0734a6ef..162330b9d1dc 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -222,17 +222,17 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t) ds1307->regs[4], ds1307->regs[5], ds1307->regs[6]); - t->tm_sec = BCD2BIN(ds1307->regs[DS1307_REG_SECS] & 0x7f); - t->tm_min = BCD2BIN(ds1307->regs[DS1307_REG_MIN] & 0x7f); + t->tm_sec = bcd2bin(ds1307->regs[DS1307_REG_SECS] & 0x7f); + t->tm_min = bcd2bin(ds1307->regs[DS1307_REG_MIN] & 0x7f); tmp = ds1307->regs[DS1307_REG_HOUR] & 0x3f; - t->tm_hour = BCD2BIN(tmp); - t->tm_wday = BCD2BIN(ds1307->regs[DS1307_REG_WDAY] & 0x07) - 1; - t->tm_mday = BCD2BIN(ds1307->regs[DS1307_REG_MDAY] & 0x3f); + t->tm_hour = bcd2bin(tmp); + t->tm_wday = bcd2bin(ds1307->regs[DS1307_REG_WDAY] & 0x07) - 1; + t->tm_mday = bcd2bin(ds1307->regs[DS1307_REG_MDAY] & 0x3f); tmp = ds1307->regs[DS1307_REG_MONTH] & 0x1f; - t->tm_mon = BCD2BIN(tmp) - 1; + t->tm_mon = bcd2bin(tmp) - 1; /* assume 20YY not 19YY, and ignore DS1337_BIT_CENTURY */ - t->tm_year = BCD2BIN(ds1307->regs[DS1307_REG_YEAR]) + 100; + t->tm_year = bcd2bin(ds1307->regs[DS1307_REG_YEAR]) + 100; dev_dbg(dev, "%s secs=%d, mins=%d, " "hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n", @@ -258,16 +258,16 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) t->tm_mon, t->tm_year, t->tm_wday); *buf++ = 0; /* first register addr */ - buf[DS1307_REG_SECS] = BIN2BCD(t->tm_sec); - buf[DS1307_REG_MIN] = BIN2BCD(t->tm_min); - buf[DS1307_REG_HOUR] = BIN2BCD(t->tm_hour); - buf[DS1307_REG_WDAY] = BIN2BCD(t->tm_wday + 1); - buf[DS1307_REG_MDAY] = BIN2BCD(t->tm_mday); - buf[DS1307_REG_MONTH] = BIN2BCD(t->tm_mon + 1); + buf[DS1307_REG_SECS] = bin2bcd(t->tm_sec); + buf[DS1307_REG_MIN] = bin2bcd(t->tm_min); + buf[DS1307_REG_HOUR] = bin2bcd(t->tm_hour); + buf[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1); + buf[DS1307_REG_MDAY] = bin2bcd(t->tm_mday); + buf[DS1307_REG_MONTH] = bin2bcd(t->tm_mon + 1); /* assume 20YY not 19YY */ tmp = t->tm_year - 100; - buf[DS1307_REG_YEAR] = BIN2BCD(tmp); + buf[DS1307_REG_YEAR] = bin2bcd(tmp); switch (ds1307->type) { case ds_1337: @@ -551,7 +551,6 @@ static struct bin_attribute nvram = { .attr = { .name = "nvram", .mode = S_IRUGO | S_IWUSR, - .owner = THIS_MODULE, }, .read = ds1307_nvram_read, @@ -709,18 +708,18 @@ read_rtc: } tmp = ds1307->regs[DS1307_REG_SECS]; - tmp = BCD2BIN(tmp & 0x7f); + tmp = bcd2bin(tmp & 0x7f); if (tmp > 60) goto exit_bad; - tmp = BCD2BIN(ds1307->regs[DS1307_REG_MIN] & 0x7f); + tmp = bcd2bin(ds1307->regs[DS1307_REG_MIN] & 0x7f); if (tmp > 60) goto exit_bad; - tmp = BCD2BIN(ds1307->regs[DS1307_REG_MDAY] & 0x3f); + tmp = bcd2bin(ds1307->regs[DS1307_REG_MDAY] & 0x3f); if (tmp == 0 || tmp > 31) goto exit_bad; - tmp = BCD2BIN(ds1307->regs[DS1307_REG_MONTH] & 0x1f); + tmp = bcd2bin(ds1307->regs[DS1307_REG_MONTH] & 0x1f); if (tmp == 0 || tmp > 12) goto exit_bad; @@ -739,14 +738,14 @@ read_rtc: /* Be sure we're in 24 hour mode. Multi-master systems * take note... */ - tmp = BCD2BIN(tmp & 0x1f); + tmp = bcd2bin(tmp & 0x1f); if (tmp == 12) tmp = 0; if (ds1307->regs[DS1307_REG_HOUR] & DS1307_BIT_PM) tmp += 12; i2c_smbus_write_byte_data(client, DS1307_REG_HOUR, - BIN2BCD(tmp)); + bin2bcd(tmp)); } ds1307->rtc = rtc_device_register(client->name, &client->dev, diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 86981d34fbb6..25caada78398 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -153,8 +153,8 @@ ds1511_wdog_set(unsigned long deciseconds) /* * set the wdog values in the wdog registers */ - rtc_write(BIN2BCD(deciseconds % 100), DS1511_WD_MSEC); - rtc_write(BIN2BCD(deciseconds / 100), DS1511_WD_SEC); + rtc_write(bin2bcd(deciseconds % 100), DS1511_WD_MSEC); + rtc_write(bin2bcd(deciseconds / 100), DS1511_WD_SEC); /* * set wdog enable and wdog 'steering' bit to issue a reset */ @@ -220,13 +220,13 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) /* * each register is a different number of valid bits */ - sec = BIN2BCD(sec) & 0x7f; - min = BIN2BCD(min) & 0x7f; - hrs = BIN2BCD(hrs) & 0x3f; - day = BIN2BCD(day) & 0x3f; - mon = BIN2BCD(mon) & 0x1f; - yrs = BIN2BCD(yrs) & 0xff; - cen = BIN2BCD(cen) & 0xff; + sec = bin2bcd(sec) & 0x7f; + min = bin2bcd(min) & 0x7f; + hrs = bin2bcd(hrs) & 0x3f; + day = bin2bcd(day) & 0x3f; + mon = bin2bcd(mon) & 0x1f; + yrs = bin2bcd(yrs) & 0xff; + cen = bin2bcd(cen) & 0xff; spin_lock_irqsave(&ds1511_lock, flags); rtc_disable_update(); @@ -264,14 +264,14 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) rtc_enable_update(); spin_unlock_irqrestore(&ds1511_lock, flags); - rtc_tm->tm_sec = BCD2BIN(rtc_tm->tm_sec); - rtc_tm->tm_min = BCD2BIN(rtc_tm->tm_min); - rtc_tm->tm_hour = BCD2BIN(rtc_tm->tm_hour); - rtc_tm->tm_mday = BCD2BIN(rtc_tm->tm_mday); - rtc_tm->tm_wday = BCD2BIN(rtc_tm->tm_wday); - rtc_tm->tm_mon = BCD2BIN(rtc_tm->tm_mon); - rtc_tm->tm_year = BCD2BIN(rtc_tm->tm_year); - century = BCD2BIN(century) * 100; + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_wday = bcd2bin(rtc_tm->tm_wday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); + century = bcd2bin(century) * 100; /* * Account for differences between how the RTC uses the values @@ -304,16 +304,16 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) spin_lock_irqsave(&pdata->rtc->irq_lock, flags); rtc_write(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_mday) & 0x3f, + 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, RTC_ALARM_DATE); rtc_write(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_hour) & 0x3f, + 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, RTC_ALARM_HOUR); rtc_write(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_min) & 0x7f, + 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, RTC_ALARM_MIN); rtc_write(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_sec) & 0x7f, + 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, RTC_ALARM_SEC); rtc_write(rtc_read(RTC_CMD) | (pdata->irqen ? RTC_TIE : 0), RTC_CMD); rtc_read(RTC_CMD1); /* clear interrupts */ @@ -481,7 +481,6 @@ static struct bin_attribute ds1511_nvram_attr = { .attr = { .name = "nvram", .mode = S_IRUGO | S_IWUGO, - .owner = THIS_MODULE, }, .size = DS1511_RAM_MAX, .read = ds1511_nvram_read, diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 4ef59285b489..b9475cd20210 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -78,17 +78,17 @@ static int ds1553_rtc_set_time(struct device *dev, struct rtc_time *tm) void __iomem *ioaddr = pdata->ioaddr; u8 century; - century = BIN2BCD((tm->tm_year + 1900) / 100); + century = bin2bcd((tm->tm_year + 1900) / 100); writeb(RTC_WRITE, pdata->ioaddr + RTC_CONTROL); - writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR); - writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH); - writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); - writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE); - writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS); - writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES); - writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); + writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR); + writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH); + writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); + writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE); + writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS); + writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES); + writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); /* RTC_CENTURY and RTC_CONTROL share same register */ writeb(RTC_WRITE | (century & RTC_CENTURY_MASK), ioaddr + RTC_CENTURY); @@ -118,14 +118,14 @@ static int ds1553_rtc_read_time(struct device *dev, struct rtc_time *tm) year = readb(ioaddr + RTC_YEAR); century = readb(ioaddr + RTC_CENTURY) & RTC_CENTURY_MASK; writeb(0, ioaddr + RTC_CONTROL); - tm->tm_sec = BCD2BIN(second); - tm->tm_min = BCD2BIN(minute); - tm->tm_hour = BCD2BIN(hour); - tm->tm_mday = BCD2BIN(day); - tm->tm_wday = BCD2BIN(week); - tm->tm_mon = BCD2BIN(month) - 1; + tm->tm_sec = bcd2bin(second); + tm->tm_min = bcd2bin(minute); + tm->tm_hour = bcd2bin(hour); + tm->tm_mday = bcd2bin(day); + tm->tm_wday = bcd2bin(week); + tm->tm_mon = bcd2bin(month) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900; + tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; if (rtc_valid_tm(tm) < 0) { dev_err(dev, "retrieved date/time is not valid.\n"); @@ -141,16 +141,16 @@ static void ds1553_rtc_update_alarm(struct rtc_plat_data *pdata) spin_lock_irqsave(&pdata->rtc->irq_lock, flags); writeb(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_mday), + 0x80 : bin2bcd(pdata->alrm_mday), ioaddr + RTC_DATE_ALARM); writeb(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_hour), + 0x80 : bin2bcd(pdata->alrm_hour), ioaddr + RTC_HOURS_ALARM); writeb(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_min), + 0x80 : bin2bcd(pdata->alrm_min), ioaddr + RTC_MINUTES_ALARM); writeb(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_sec), + 0x80 : bin2bcd(pdata->alrm_sec), ioaddr + RTC_SECONDS_ALARM); writeb(pdata->irqen ? RTC_INTS_AE : 0, ioaddr + RTC_INTERRUPTS); readb(ioaddr + RTC_FLAGS); /* clear interrupts */ diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 24d35ede2dbf..8bc8501bffc8 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -66,17 +66,17 @@ static int ds1742_rtc_set_time(struct device *dev, struct rtc_time *tm) void __iomem *ioaddr = pdata->ioaddr_rtc; u8 century; - century = BIN2BCD((tm->tm_year + 1900) / 100); + century = bin2bcd((tm->tm_year + 1900) / 100); writeb(RTC_WRITE, ioaddr + RTC_CONTROL); - writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR); - writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH); - writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); - writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE); - writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS); - writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES); - writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); + writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR); + writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH); + writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); + writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE); + writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS); + writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES); + writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); /* RTC_CENTURY and RTC_CONTROL share same register */ writeb(RTC_WRITE | (century & RTC_CENTURY_MASK), ioaddr + RTC_CENTURY); @@ -106,14 +106,14 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm) year = readb(ioaddr + RTC_YEAR); century = readb(ioaddr + RTC_CENTURY) & RTC_CENTURY_MASK; writeb(0, ioaddr + RTC_CONTROL); - tm->tm_sec = BCD2BIN(second); - tm->tm_min = BCD2BIN(minute); - tm->tm_hour = BCD2BIN(hour); - tm->tm_mday = BCD2BIN(day); - tm->tm_wday = BCD2BIN(week); - tm->tm_mon = BCD2BIN(month) - 1; + tm->tm_sec = bcd2bin(second); + tm->tm_min = bcd2bin(minute); + tm->tm_hour = bcd2bin(hour); + tm->tm_mday = bcd2bin(day); + tm->tm_wday = bcd2bin(week); + tm->tm_mon = bcd2bin(month) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900; + tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; if (rtc_valid_tm(tm) < 0) { dev_err(dev, "retrieved date/time is not valid.\n"); diff --git a/drivers/rtc/rtc-fm3130.c b/drivers/rtc/rtc-fm3130.c index abfdfcbaa059..3a7be11cc6b9 100644 --- a/drivers/rtc/rtc-fm3130.c +++ b/drivers/rtc/rtc-fm3130.c @@ -131,17 +131,17 @@ static int fm3130_get_time(struct device *dev, struct rtc_time *t) fm3130->regs[0xc], fm3130->regs[0xd], fm3130->regs[0xe]); - t->tm_sec = BCD2BIN(fm3130->regs[FM3130_RTC_SECONDS] & 0x7f); - t->tm_min = BCD2BIN(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f); + t->tm_sec = bcd2bin(fm3130->regs[FM3130_RTC_SECONDS] & 0x7f); + t->tm_min = bcd2bin(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f); tmp = fm3130->regs[FM3130_RTC_HOURS] & 0x3f; - t->tm_hour = BCD2BIN(tmp); - t->tm_wday = BCD2BIN(fm3130->regs[FM3130_RTC_DAY] & 0x07) - 1; - t->tm_mday = BCD2BIN(fm3130->regs[FM3130_RTC_DATE] & 0x3f); + t->tm_hour = bcd2bin(tmp); + t->tm_wday = bcd2bin(fm3130->regs[FM3130_RTC_DAY] & 0x07) - 1; + t->tm_mday = bcd2bin(fm3130->regs[FM3130_RTC_DATE] & 0x3f); tmp = fm3130->regs[FM3130_RTC_MONTHS] & 0x1f; - t->tm_mon = BCD2BIN(tmp) - 1; + t->tm_mon = bcd2bin(tmp) - 1; /* assume 20YY not 19YY, and ignore CF bit */ - t->tm_year = BCD2BIN(fm3130->regs[FM3130_RTC_YEARS]) + 100; + t->tm_year = bcd2bin(fm3130->regs[FM3130_RTC_YEARS]) + 100; dev_dbg(dev, "%s secs=%d, mins=%d, " "hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n", @@ -167,16 +167,16 @@ static int fm3130_set_time(struct device *dev, struct rtc_time *t) t->tm_mon, t->tm_year, t->tm_wday); /* first register addr */ - buf[FM3130_RTC_SECONDS] = BIN2BCD(t->tm_sec); - buf[FM3130_RTC_MINUTES] = BIN2BCD(t->tm_min); - buf[FM3130_RTC_HOURS] = BIN2BCD(t->tm_hour); - buf[FM3130_RTC_DAY] = BIN2BCD(t->tm_wday + 1); - buf[FM3130_RTC_DATE] = BIN2BCD(t->tm_mday); - buf[FM3130_RTC_MONTHS] = BIN2BCD(t->tm_mon + 1); + buf[FM3130_RTC_SECONDS] = bin2bcd(t->tm_sec); + buf[FM3130_RTC_MINUTES] = bin2bcd(t->tm_min); + buf[FM3130_RTC_HOURS] = bin2bcd(t->tm_hour); + buf[FM3130_RTC_DAY] = bin2bcd(t->tm_wday + 1); + buf[FM3130_RTC_DATE] = bin2bcd(t->tm_mday); + buf[FM3130_RTC_MONTHS] = bin2bcd(t->tm_mon + 1); /* assume 20YY not 19YY */ tmp = t->tm_year - 100; - buf[FM3130_RTC_YEARS] = BIN2BCD(tmp); + buf[FM3130_RTC_YEARS] = bin2bcd(tmp); dev_dbg(dev, "%s: %02x %02x %02x %02x %02x %02x %02x" "%02x %02x %02x %02x %02x %02x %02x %02x\n", @@ -222,11 +222,11 @@ static int fm3130_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) fm3130->regs[FM3130_ALARM_MONTHS]); - tm->tm_sec = BCD2BIN(fm3130->regs[FM3130_ALARM_SECONDS] & 0x7F); - tm->tm_min = BCD2BIN(fm3130->regs[FM3130_ALARM_MINUTES] & 0x7F); - tm->tm_hour = BCD2BIN(fm3130->regs[FM3130_ALARM_HOURS] & 0x3F); - tm->tm_mday = BCD2BIN(fm3130->regs[FM3130_ALARM_DATE] & 0x3F); - tm->tm_mon = BCD2BIN(fm3130->regs[FM3130_ALARM_MONTHS] & 0x1F); + tm->tm_sec = bcd2bin(fm3130->regs[FM3130_ALARM_SECONDS] & 0x7F); + tm->tm_min = bcd2bin(fm3130->regs[FM3130_ALARM_MINUTES] & 0x7F); + tm->tm_hour = bcd2bin(fm3130->regs[FM3130_ALARM_HOURS] & 0x3F); + tm->tm_mday = bcd2bin(fm3130->regs[FM3130_ALARM_DATE] & 0x3F); + tm->tm_mon = bcd2bin(fm3130->regs[FM3130_ALARM_MONTHS] & 0x1F); if (tm->tm_mon > 0) tm->tm_mon -= 1; /* RTC is 1-12, tm_mon is 0-11 */ dev_dbg(dev, "%s secs=%d, mins=%d, " @@ -252,23 +252,23 @@ static int fm3130_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (tm->tm_sec != -1) fm3130->regs[FM3130_ALARM_SECONDS] = - BIN2BCD(tm->tm_sec) | 0x80; + bin2bcd(tm->tm_sec) | 0x80; if (tm->tm_min != -1) fm3130->regs[FM3130_ALARM_MINUTES] = - BIN2BCD(tm->tm_min) | 0x80; + bin2bcd(tm->tm_min) | 0x80; if (tm->tm_hour != -1) fm3130->regs[FM3130_ALARM_HOURS] = - BIN2BCD(tm->tm_hour) | 0x80; + bin2bcd(tm->tm_hour) | 0x80; if (tm->tm_mday != -1) fm3130->regs[FM3130_ALARM_DATE] = - BIN2BCD(tm->tm_mday) | 0x80; + bin2bcd(tm->tm_mday) | 0x80; if (tm->tm_mon != -1) fm3130->regs[FM3130_ALARM_MONTHS] = - BIN2BCD(tm->tm_mon + 1) | 0x80; + bin2bcd(tm->tm_mon + 1) | 0x80; dev_dbg(dev, "alarm write %02x %02x %02x %02x %02x\n", fm3130->regs[FM3130_ALARM_SECONDS], @@ -414,18 +414,18 @@ static int __devinit fm3130_probe(struct i2c_client *client, /* TODO */ /* TODO need to sanity check alarm */ tmp = fm3130->regs[FM3130_RTC_SECONDS]; - tmp = BCD2BIN(tmp & 0x7f); + tmp = bcd2bin(tmp & 0x7f); if (tmp > 60) goto exit_bad; - tmp = BCD2BIN(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f); + tmp = bcd2bin(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f); if (tmp > 60) goto exit_bad; - tmp = BCD2BIN(fm3130->regs[FM3130_RTC_DATE] & 0x3f); + tmp = bcd2bin(fm3130->regs[FM3130_RTC_DATE] & 0x3f); if (tmp == 0 || tmp > 31) goto exit_bad; - tmp = BCD2BIN(fm3130->regs[FM3130_RTC_MONTHS] & 0x1f); + tmp = bcd2bin(fm3130->regs[FM3130_RTC_MONTHS] & 0x1f); if (tmp == 0 || tmp > 12) goto exit_bad; diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c index a81adab6e515..2cd77ab8fc66 100644 --- a/drivers/rtc/rtc-isl1208.c +++ b/drivers/rtc/rtc-isl1208.c @@ -259,26 +259,26 @@ isl1208_i2c_read_time(struct i2c_client *client, struct rtc_time *tm) return sr; } - tm->tm_sec = BCD2BIN(regs[ISL1208_REG_SC]); - tm->tm_min = BCD2BIN(regs[ISL1208_REG_MN]); + tm->tm_sec = bcd2bin(regs[ISL1208_REG_SC]); + tm->tm_min = bcd2bin(regs[ISL1208_REG_MN]); /* HR field has a more complex interpretation */ { const u8 _hr = regs[ISL1208_REG_HR]; if (_hr & ISL1208_REG_HR_MIL) /* 24h format */ - tm->tm_hour = BCD2BIN(_hr & 0x3f); + tm->tm_hour = bcd2bin(_hr & 0x3f); else { /* 12h format */ - tm->tm_hour = BCD2BIN(_hr & 0x1f); + tm->tm_hour = bcd2bin(_hr & 0x1f); if (_hr & ISL1208_REG_HR_PM) /* PM flag set */ tm->tm_hour += 12; } } - tm->tm_mday = BCD2BIN(regs[ISL1208_REG_DT]); - tm->tm_mon = BCD2BIN(regs[ISL1208_REG_MO]) - 1; /* rtc starts at 1 */ - tm->tm_year = BCD2BIN(regs[ISL1208_REG_YR]) + 100; - tm->tm_wday = BCD2BIN(regs[ISL1208_REG_DW]); + tm->tm_mday = bcd2bin(regs[ISL1208_REG_DT]); + tm->tm_mon = bcd2bin(regs[ISL1208_REG_MO]) - 1; /* rtc starts at 1 */ + tm->tm_year = bcd2bin(regs[ISL1208_REG_YR]) + 100; + tm->tm_wday = bcd2bin(regs[ISL1208_REG_DW]); return 0; } @@ -305,13 +305,13 @@ isl1208_i2c_read_alarm(struct i2c_client *client, struct rtc_wkalrm *alarm) } /* MSB of each alarm register is an enable bit */ - tm->tm_sec = BCD2BIN(regs[ISL1208_REG_SCA - ISL1208_REG_SCA] & 0x7f); - tm->tm_min = BCD2BIN(regs[ISL1208_REG_MNA - ISL1208_REG_SCA] & 0x7f); - tm->tm_hour = BCD2BIN(regs[ISL1208_REG_HRA - ISL1208_REG_SCA] & 0x3f); - tm->tm_mday = BCD2BIN(regs[ISL1208_REG_DTA - ISL1208_REG_SCA] & 0x3f); + tm->tm_sec = bcd2bin(regs[ISL1208_REG_SCA - ISL1208_REG_SCA] & 0x7f); + tm->tm_min = bcd2bin(regs[ISL1208_REG_MNA - ISL1208_REG_SCA] & 0x7f); + tm->tm_hour = bcd2bin(regs[ISL1208_REG_HRA - ISL1208_REG_SCA] & 0x3f); + tm->tm_mday = bcd2bin(regs[ISL1208_REG_DTA - ISL1208_REG_SCA] & 0x3f); tm->tm_mon = - BCD2BIN(regs[ISL1208_REG_MOA - ISL1208_REG_SCA] & 0x1f) - 1; - tm->tm_wday = BCD2BIN(regs[ISL1208_REG_DWA - ISL1208_REG_SCA] & 0x03); + bcd2bin(regs[ISL1208_REG_MOA - ISL1208_REG_SCA] & 0x1f) - 1; + tm->tm_wday = bcd2bin(regs[ISL1208_REG_DWA - ISL1208_REG_SCA] & 0x03); return 0; } @@ -328,15 +328,15 @@ isl1208_i2c_set_time(struct i2c_client *client, struct rtc_time const *tm) int sr; u8 regs[ISL1208_RTC_SECTION_LEN] = { 0, }; - regs[ISL1208_REG_SC] = BIN2BCD(tm->tm_sec); - regs[ISL1208_REG_MN] = BIN2BCD(tm->tm_min); - regs[ISL1208_REG_HR] = BIN2BCD(tm->tm_hour) | ISL1208_REG_HR_MIL; + regs[ISL1208_REG_SC] = bin2bcd(tm->tm_sec); + regs[ISL1208_REG_MN] = bin2bcd(tm->tm_min); + regs[ISL1208_REG_HR] = bin2bcd(tm->tm_hour) | ISL1208_REG_HR_MIL; - regs[ISL1208_REG_DT] = BIN2BCD(tm->tm_mday); - regs[ISL1208_REG_MO] = BIN2BCD(tm->tm_mon + 1); - regs[ISL1208_REG_YR] = BIN2BCD(tm->tm_year - 100); + regs[ISL1208_REG_DT] = bin2bcd(tm->tm_mday); + regs[ISL1208_REG_MO] = bin2bcd(tm->tm_mon + 1); + regs[ISL1208_REG_YR] = bin2bcd(tm->tm_year - 100); - regs[ISL1208_REG_DW] = BIN2BCD(tm->tm_wday & 7); + regs[ISL1208_REG_DW] = bin2bcd(tm->tm_wday & 7); sr = isl1208_i2c_get_sr(client); if (sr < 0) { diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 470fb2d29545..893f7dece239 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -110,15 +110,15 @@ static int m41t80_get_datetime(struct i2c_client *client, return -EIO; } - tm->tm_sec = BCD2BIN(buf[M41T80_REG_SEC] & 0x7f); - tm->tm_min = BCD2BIN(buf[M41T80_REG_MIN] & 0x7f); - tm->tm_hour = BCD2BIN(buf[M41T80_REG_HOUR] & 0x3f); - tm->tm_mday = BCD2BIN(buf[M41T80_REG_DAY] & 0x3f); + tm->tm_sec = bcd2bin(buf[M41T80_REG_SEC] & 0x7f); + tm->tm_min = bcd2bin(buf[M41T80_REG_MIN] & 0x7f); + tm->tm_hour = bcd2bin(buf[M41T80_REG_HOUR] & 0x3f); + tm->tm_mday = bcd2bin(buf[M41T80_REG_DAY] & 0x3f); tm->tm_wday = buf[M41T80_REG_WDAY] & 0x07; - tm->tm_mon = BCD2BIN(buf[M41T80_REG_MON] & 0x1f) - 1; + tm->tm_mon = bcd2bin(buf[M41T80_REG_MON] & 0x1f) - 1; /* assume 20YY not 19YY, and ignore the Century Bit */ - tm->tm_year = BCD2BIN(buf[M41T80_REG_YEAR]) + 100; + tm->tm_year = bcd2bin(buf[M41T80_REG_YEAR]) + 100; return 0; } @@ -161,19 +161,19 @@ static int m41t80_set_datetime(struct i2c_client *client, struct rtc_time *tm) /* Merge time-data and register flags into buf[0..7] */ buf[M41T80_REG_SSEC] = 0; buf[M41T80_REG_SEC] = - BIN2BCD(tm->tm_sec) | (buf[M41T80_REG_SEC] & ~0x7f); + bin2bcd(tm->tm_sec) | (buf[M41T80_REG_SEC] & ~0x7f); buf[M41T80_REG_MIN] = - BIN2BCD(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f); + bin2bcd(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f); buf[M41T80_REG_HOUR] = - BIN2BCD(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ; + bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ; buf[M41T80_REG_WDAY] = (tm->tm_wday & 0x07) | (buf[M41T80_REG_WDAY] & ~0x07); buf[M41T80_REG_DAY] = - BIN2BCD(tm->tm_mday) | (buf[M41T80_REG_DAY] & ~0x3f); + bin2bcd(tm->tm_mday) | (buf[M41T80_REG_DAY] & ~0x3f); buf[M41T80_REG_MON] = - BIN2BCD(tm->tm_mon + 1) | (buf[M41T80_REG_MON] & ~0x1f); + bin2bcd(tm->tm_mon + 1) | (buf[M41T80_REG_MON] & ~0x1f); /* assume 20YY not 19YY */ - buf[M41T80_REG_YEAR] = BIN2BCD(tm->tm_year % 100); + buf[M41T80_REG_YEAR] = bin2bcd(tm->tm_year % 100); if (i2c_transfer(client->adapter, msgs, 1) != 1) { dev_err(&client->dev, "write error\n"); @@ -288,15 +288,15 @@ static int m41t80_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *t) wbuf[0] = M41T80_REG_ALARM_MON; /* offset into rtc's regs */ reg[M41T80_REG_ALARM_SEC] |= t->time.tm_sec >= 0 ? - BIN2BCD(t->time.tm_sec) : 0x80; + bin2bcd(t->time.tm_sec) : 0x80; reg[M41T80_REG_ALARM_MIN] |= t->time.tm_min >= 0 ? - BIN2BCD(t->time.tm_min) : 0x80; + bin2bcd(t->time.tm_min) : 0x80; reg[M41T80_REG_ALARM_HOUR] |= t->time.tm_hour >= 0 ? - BIN2BCD(t->time.tm_hour) : 0x80; + bin2bcd(t->time.tm_hour) : 0x80; reg[M41T80_REG_ALARM_DAY] |= t->time.tm_mday >= 0 ? - BIN2BCD(t->time.tm_mday) : 0x80; + bin2bcd(t->time.tm_mday) : 0x80; if (t->time.tm_mon >= 0) - reg[M41T80_REG_ALARM_MON] |= BIN2BCD(t->time.tm_mon + 1); + reg[M41T80_REG_ALARM_MON] |= bin2bcd(t->time.tm_mon + 1); else reg[M41T80_REG_ALARM_DAY] |= 0x40; @@ -347,15 +347,15 @@ static int m41t80_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *t) t->time.tm_mday = -1; t->time.tm_mon = -1; if (!(reg[M41T80_REG_ALARM_SEC] & 0x80)) - t->time.tm_sec = BCD2BIN(reg[M41T80_REG_ALARM_SEC] & 0x7f); + t->time.tm_sec = bcd2bin(reg[M41T80_REG_ALARM_SEC] & 0x7f); if (!(reg[M41T80_REG_ALARM_MIN] & 0x80)) - t->time.tm_min = BCD2BIN(reg[M41T80_REG_ALARM_MIN] & 0x7f); + t->time.tm_min = bcd2bin(reg[M41T80_REG_ALARM_MIN] & 0x7f); if (!(reg[M41T80_REG_ALARM_HOUR] & 0x80)) - t->time.tm_hour = BCD2BIN(reg[M41T80_REG_ALARM_HOUR] & 0x3f); + t->time.tm_hour = bcd2bin(reg[M41T80_REG_ALARM_HOUR] & 0x3f); if (!(reg[M41T80_REG_ALARM_DAY] & 0x80)) - t->time.tm_mday = BCD2BIN(reg[M41T80_REG_ALARM_DAY] & 0x3f); + t->time.tm_mday = bcd2bin(reg[M41T80_REG_ALARM_DAY] & 0x3f); if (!(reg[M41T80_REG_ALARM_DAY] & 0x40)) - t->time.tm_mon = BCD2BIN(reg[M41T80_REG_ALARM_MON] & 0x1f) - 1; + t->time.tm_mon = bcd2bin(reg[M41T80_REG_ALARM_MON] & 0x1f) - 1; t->time.tm_year = -1; t->time.tm_wday = -1; t->time.tm_yday = -1; diff --git a/drivers/rtc/rtc-m41t94.c b/drivers/rtc/rtc-m41t94.c index 9b19499c829e..c3a18c58daf6 100644 --- a/drivers/rtc/rtc-m41t94.c +++ b/drivers/rtc/rtc-m41t94.c @@ -41,17 +41,17 @@ static int m41t94_set_time(struct device *dev, struct rtc_time *tm) tm->tm_mon, tm->tm_year, tm->tm_wday); buf[0] = 0x80 | M41T94_REG_SECONDS; /* write time + date */ - buf[M41T94_REG_SECONDS] = BIN2BCD(tm->tm_sec); - buf[M41T94_REG_MINUTES] = BIN2BCD(tm->tm_min); - buf[M41T94_REG_HOURS] = BIN2BCD(tm->tm_hour); - buf[M41T94_REG_WDAY] = BIN2BCD(tm->tm_wday + 1); - buf[M41T94_REG_DAY] = BIN2BCD(tm->tm_mday); - buf[M41T94_REG_MONTH] = BIN2BCD(tm->tm_mon + 1); + buf[M41T94_REG_SECONDS] = bin2bcd(tm->tm_sec); + buf[M41T94_REG_MINUTES] = bin2bcd(tm->tm_min); + buf[M41T94_REG_HOURS] = bin2bcd(tm->tm_hour); + buf[M41T94_REG_WDAY] = bin2bcd(tm->tm_wday + 1); + buf[M41T94_REG_DAY] = bin2bcd(tm->tm_mday); + buf[M41T94_REG_MONTH] = bin2bcd(tm->tm_mon + 1); buf[M41T94_REG_HOURS] |= M41T94_BIT_CEB; if (tm->tm_year >= 100) buf[M41T94_REG_HOURS] |= M41T94_BIT_CB; - buf[M41T94_REG_YEAR] = BIN2BCD(tm->tm_year % 100); + buf[M41T94_REG_YEAR] = bin2bcd(tm->tm_year % 100); return spi_write(spi, buf, 8); } @@ -82,14 +82,14 @@ static int m41t94_read_time(struct device *dev, struct rtc_time *tm) spi_write(spi, buf, 2); } - tm->tm_sec = BCD2BIN(spi_w8r8(spi, M41T94_REG_SECONDS)); - tm->tm_min = BCD2BIN(spi_w8r8(spi, M41T94_REG_MINUTES)); + tm->tm_sec = bcd2bin(spi_w8r8(spi, M41T94_REG_SECONDS)); + tm->tm_min = bcd2bin(spi_w8r8(spi, M41T94_REG_MINUTES)); hour = spi_w8r8(spi, M41T94_REG_HOURS); - tm->tm_hour = BCD2BIN(hour & 0x3f); - tm->tm_wday = BCD2BIN(spi_w8r8(spi, M41T94_REG_WDAY)) - 1; - tm->tm_mday = BCD2BIN(spi_w8r8(spi, M41T94_REG_DAY)); - tm->tm_mon = BCD2BIN(spi_w8r8(spi, M41T94_REG_MONTH)) - 1; - tm->tm_year = BCD2BIN(spi_w8r8(spi, M41T94_REG_YEAR)); + tm->tm_hour = bcd2bin(hour & 0x3f); + tm->tm_wday = bcd2bin(spi_w8r8(spi, M41T94_REG_WDAY)) - 1; + tm->tm_mday = bcd2bin(spi_w8r8(spi, M41T94_REG_DAY)); + tm->tm_mon = bcd2bin(spi_w8r8(spi, M41T94_REG_MONTH)) - 1; + tm->tm_year = bcd2bin(spi_w8r8(spi, M41T94_REG_YEAR)); if ((hour & M41T94_BIT_CB) || !(hour & M41T94_BIT_CEB)) tm->tm_year += 100; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index ce4eff6a8d51..04b63dab6932 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -76,10 +76,10 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm) /* Issue the READ command */ M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL); - tm->tm_year = BCD2BIN(M48T59_READ(M48T59_YEAR)); + tm->tm_year = bcd2bin(M48T59_READ(M48T59_YEAR)); /* tm_mon is 0-11 */ - tm->tm_mon = BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1; - tm->tm_mday = BCD2BIN(M48T59_READ(M48T59_MDAY)); + tm->tm_mon = bcd2bin(M48T59_READ(M48T59_MONTH)) - 1; + tm->tm_mday = bcd2bin(M48T59_READ(M48T59_MDAY)); val = M48T59_READ(M48T59_WDAY); if ((pdata->type == M48T59RTC_TYPE_M48T59) && @@ -88,10 +88,10 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_year += 100; /* one century */ } - tm->tm_wday = BCD2BIN(val & 0x07); - tm->tm_hour = BCD2BIN(M48T59_READ(M48T59_HOUR) & 0x3F); - tm->tm_min = BCD2BIN(M48T59_READ(M48T59_MIN) & 0x7F); - tm->tm_sec = BCD2BIN(M48T59_READ(M48T59_SEC) & 0x7F); + tm->tm_wday = bcd2bin(val & 0x07); + tm->tm_hour = bcd2bin(M48T59_READ(M48T59_HOUR) & 0x3F); + tm->tm_min = bcd2bin(M48T59_READ(M48T59_MIN) & 0x7F); + tm->tm_sec = bcd2bin(M48T59_READ(M48T59_SEC) & 0x7F); /* Clear the READ bit */ M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL); @@ -119,17 +119,17 @@ static int m48t59_rtc_set_time(struct device *dev, struct rtc_time *tm) /* Issue the WRITE command */ M48T59_SET_BITS(M48T59_CNTL_WRITE, M48T59_CNTL); - M48T59_WRITE((BIN2BCD(tm->tm_sec) & 0x7F), M48T59_SEC); - M48T59_WRITE((BIN2BCD(tm->tm_min) & 0x7F), M48T59_MIN); - M48T59_WRITE((BIN2BCD(tm->tm_hour) & 0x3F), M48T59_HOUR); - M48T59_WRITE((BIN2BCD(tm->tm_mday) & 0x3F), M48T59_MDAY); + M48T59_WRITE((bin2bcd(tm->tm_sec) & 0x7F), M48T59_SEC); + M48T59_WRITE((bin2bcd(tm->tm_min) & 0x7F), M48T59_MIN); + M48T59_WRITE((bin2bcd(tm->tm_hour) & 0x3F), M48T59_HOUR); + M48T59_WRITE((bin2bcd(tm->tm_mday) & 0x3F), M48T59_MDAY); /* tm_mon is 0-11 */ - M48T59_WRITE((BIN2BCD(tm->tm_mon + 1) & 0x1F), M48T59_MONTH); - M48T59_WRITE(BIN2BCD(tm->tm_year % 100), M48T59_YEAR); + M48T59_WRITE((bin2bcd(tm->tm_mon + 1) & 0x1F), M48T59_MONTH); + M48T59_WRITE(bin2bcd(tm->tm_year % 100), M48T59_YEAR); if (pdata->type == M48T59RTC_TYPE_M48T59 && (tm->tm_year / 100)) val = (M48T59_WDAY_CEB | M48T59_WDAY_CB); - val |= (BIN2BCD(tm->tm_wday) & 0x07); + val |= (bin2bcd(tm->tm_wday) & 0x07); M48T59_WRITE(val, M48T59_WDAY); /* Clear the WRITE bit */ @@ -158,18 +158,18 @@ static int m48t59_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm) /* Issue the READ command */ M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL); - tm->tm_year = BCD2BIN(M48T59_READ(M48T59_YEAR)); + tm->tm_year = bcd2bin(M48T59_READ(M48T59_YEAR)); /* tm_mon is 0-11 */ - tm->tm_mon = BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1; + tm->tm_mon = bcd2bin(M48T59_READ(M48T59_MONTH)) - 1; val = M48T59_READ(M48T59_WDAY); if ((val & M48T59_WDAY_CEB) && (val & M48T59_WDAY_CB)) tm->tm_year += 100; /* one century */ - tm->tm_mday = BCD2BIN(M48T59_READ(M48T59_ALARM_DATE)); - tm->tm_hour = BCD2BIN(M48T59_READ(M48T59_ALARM_HOUR)); - tm->tm_min = BCD2BIN(M48T59_READ(M48T59_ALARM_MIN)); - tm->tm_sec = BCD2BIN(M48T59_READ(M48T59_ALARM_SEC)); + tm->tm_mday = bcd2bin(M48T59_READ(M48T59_ALARM_DATE)); + tm->tm_hour = bcd2bin(M48T59_READ(M48T59_ALARM_HOUR)); + tm->tm_min = bcd2bin(M48T59_READ(M48T59_ALARM_MIN)); + tm->tm_sec = bcd2bin(M48T59_READ(M48T59_ALARM_SEC)); /* Clear the READ bit */ M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL); @@ -201,18 +201,18 @@ static int m48t59_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) * 0xff means "always match" */ mday = tm->tm_mday; - mday = (mday >= 1 && mday <= 31) ? BIN2BCD(mday) : 0xff; + mday = (mday >= 1 && mday <= 31) ? bin2bcd(mday) : 0xff; if (mday == 0xff) mday = M48T59_READ(M48T59_MDAY); hour = tm->tm_hour; - hour = (hour < 24) ? BIN2BCD(hour) : 0x00; + hour = (hour < 24) ? bin2bcd(hour) : 0x00; min = tm->tm_min; - min = (min < 60) ? BIN2BCD(min) : 0x00; + min = (min < 60) ? bin2bcd(min) : 0x00; sec = tm->tm_sec; - sec = (sec < 60) ? BIN2BCD(sec) : 0x00; + sec = (sec < 60) ? bin2bcd(sec) : 0x00; spin_lock_irqsave(&m48t59->lock, flags); /* Issue the WRITE command */ @@ -360,7 +360,6 @@ static struct bin_attribute m48t59_nvram_attr = { .attr = { .name = "nvram", .mode = S_IRUGO | S_IWUSR, - .owner = THIS_MODULE, }, .read = m48t59_nvram_read, .write = m48t59_nvram_write, diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 3f7f99a5d96a..7c045cffa9ff 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -62,14 +62,14 @@ static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_wday = ops->readbyte(M48T86_REG_DOW); } else { /* bcd mode */ - tm->tm_sec = BCD2BIN(ops->readbyte(M48T86_REG_SEC)); - tm->tm_min = BCD2BIN(ops->readbyte(M48T86_REG_MIN)); - tm->tm_hour = BCD2BIN(ops->readbyte(M48T86_REG_HOUR) & 0x3F); - tm->tm_mday = BCD2BIN(ops->readbyte(M48T86_REG_DOM)); + tm->tm_sec = bcd2bin(ops->readbyte(M48T86_REG_SEC)); + tm->tm_min = bcd2bin(ops->readbyte(M48T86_REG_MIN)); + tm->tm_hour = bcd2bin(ops->readbyte(M48T86_REG_HOUR) & 0x3F); + tm->tm_mday = bcd2bin(ops->readbyte(M48T86_REG_DOM)); /* tm_mon is 0-11 */ - tm->tm_mon = BCD2BIN(ops->readbyte(M48T86_REG_MONTH)) - 1; - tm->tm_year = BCD2BIN(ops->readbyte(M48T86_REG_YEAR)) + 100; - tm->tm_wday = BCD2BIN(ops->readbyte(M48T86_REG_DOW)); + tm->tm_mon = bcd2bin(ops->readbyte(M48T86_REG_MONTH)) - 1; + tm->tm_year = bcd2bin(ops->readbyte(M48T86_REG_YEAR)) + 100; + tm->tm_wday = bcd2bin(ops->readbyte(M48T86_REG_DOW)); } /* correct the hour if the clock is in 12h mode */ @@ -103,13 +103,13 @@ static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm) ops->writebyte(tm->tm_wday, M48T86_REG_DOW); } else { /* bcd mode */ - ops->writebyte(BIN2BCD(tm->tm_sec), M48T86_REG_SEC); - ops->writebyte(BIN2BCD(tm->tm_min), M48T86_REG_MIN); - ops->writebyte(BIN2BCD(tm->tm_hour), M48T86_REG_HOUR); - ops->writebyte(BIN2BCD(tm->tm_mday), M48T86_REG_DOM); - ops->writebyte(BIN2BCD(tm->tm_mon + 1), M48T86_REG_MONTH); - ops->writebyte(BIN2BCD(tm->tm_year % 100), M48T86_REG_YEAR); - ops->writebyte(BIN2BCD(tm->tm_wday), M48T86_REG_DOW); + ops->writebyte(bin2bcd(tm->tm_sec), M48T86_REG_SEC); + ops->writebyte(bin2bcd(tm->tm_min), M48T86_REG_MIN); + ops->writebyte(bin2bcd(tm->tm_hour), M48T86_REG_HOUR); + ops->writebyte(bin2bcd(tm->tm_mday), M48T86_REG_DOM); + ops->writebyte(bin2bcd(tm->tm_mon + 1), M48T86_REG_MONTH); + ops->writebyte(bin2bcd(tm->tm_year % 100), M48T86_REG_YEAR); + ops->writebyte(bin2bcd(tm->tm_wday), M48T86_REG_DOW); } /* update ended */ diff --git a/drivers/rtc/rtc-max6900.c b/drivers/rtc/rtc-max6900.c index 12c9cd25cad8..80782798763f 100644 --- a/drivers/rtc/rtc-max6900.c +++ b/drivers/rtc/rtc-max6900.c @@ -150,14 +150,14 @@ static int max6900_i2c_read_time(struct i2c_client *client, struct rtc_time *tm) if (rc < 0) return rc; - tm->tm_sec = BCD2BIN(regs[MAX6900_REG_SC]); - tm->tm_min = BCD2BIN(regs[MAX6900_REG_MN]); - tm->tm_hour = BCD2BIN(regs[MAX6900_REG_HR] & 0x3f); - tm->tm_mday = BCD2BIN(regs[MAX6900_REG_DT]); - tm->tm_mon = BCD2BIN(regs[MAX6900_REG_MO]) - 1; - tm->tm_year = BCD2BIN(regs[MAX6900_REG_YR]) + - BCD2BIN(regs[MAX6900_REG_CENTURY]) * 100 - 1900; - tm->tm_wday = BCD2BIN(regs[MAX6900_REG_DW]); + tm->tm_sec = bcd2bin(regs[MAX6900_REG_SC]); + tm->tm_min = bcd2bin(regs[MAX6900_REG_MN]); + tm->tm_hour = bcd2bin(regs[MAX6900_REG_HR] & 0x3f); + tm->tm_mday = bcd2bin(regs[MAX6900_REG_DT]); + tm->tm_mon = bcd2bin(regs[MAX6900_REG_MO]) - 1; + tm->tm_year = bcd2bin(regs[MAX6900_REG_YR]) + + bcd2bin(regs[MAX6900_REG_CENTURY]) * 100 - 1900; + tm->tm_wday = bcd2bin(regs[MAX6900_REG_DW]); return 0; } @@ -184,14 +184,14 @@ max6900_i2c_set_time(struct i2c_client *client, struct rtc_time const *tm) if (rc < 0) return rc; - regs[MAX6900_REG_SC] = BIN2BCD(tm->tm_sec); - regs[MAX6900_REG_MN] = BIN2BCD(tm->tm_min); - regs[MAX6900_REG_HR] = BIN2BCD(tm->tm_hour); - regs[MAX6900_REG_DT] = BIN2BCD(tm->tm_mday); - regs[MAX6900_REG_MO] = BIN2BCD(tm->tm_mon + 1); - regs[MAX6900_REG_DW] = BIN2BCD(tm->tm_wday); - regs[MAX6900_REG_YR] = BIN2BCD(tm->tm_year % 100); - regs[MAX6900_REG_CENTURY] = BIN2BCD((tm->tm_year + 1900) / 100); + regs[MAX6900_REG_SC] = bin2bcd(tm->tm_sec); + regs[MAX6900_REG_MN] = bin2bcd(tm->tm_min); + regs[MAX6900_REG_HR] = bin2bcd(tm->tm_hour); + regs[MAX6900_REG_DT] = bin2bcd(tm->tm_mday); + regs[MAX6900_REG_MO] = bin2bcd(tm->tm_mon + 1); + regs[MAX6900_REG_DW] = bin2bcd(tm->tm_wday); + regs[MAX6900_REG_YR] = bin2bcd(tm->tm_year % 100); + regs[MAX6900_REG_CENTURY] = bin2bcd((tm->tm_year + 1900) / 100); /* set write protect */ regs[MAX6900_REG_CT] = MAX6900_REG_CT_WP; diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 78b2551fb19d..2f6507df7b49 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -124,15 +124,15 @@ static int max6902_get_datetime(struct device *dev, struct rtc_time *dt) /* The chip sends data in this order: * Seconds, Minutes, Hours, Date, Month, Day, Year */ - dt->tm_sec = BCD2BIN(chip->buf[1]); - dt->tm_min = BCD2BIN(chip->buf[2]); - dt->tm_hour = BCD2BIN(chip->buf[3]); - dt->tm_mday = BCD2BIN(chip->buf[4]); - dt->tm_mon = BCD2BIN(chip->buf[5]) - 1; - dt->tm_wday = BCD2BIN(chip->buf[6]); - dt->tm_year = BCD2BIN(chip->buf[7]); + dt->tm_sec = bcd2bin(chip->buf[1]); + dt->tm_min = bcd2bin(chip->buf[2]); + dt->tm_hour = bcd2bin(chip->buf[3]); + dt->tm_mday = bcd2bin(chip->buf[4]); + dt->tm_mon = bcd2bin(chip->buf[5]) - 1; + dt->tm_wday = bcd2bin(chip->buf[6]); + dt->tm_year = bcd2bin(chip->buf[7]); - century = BCD2BIN(tmp) * 100; + century = bcd2bin(tmp) * 100; dt->tm_year += century; dt->tm_year -= 1900; @@ -168,15 +168,15 @@ static int max6902_set_datetime(struct device *dev, struct rtc_time *dt) /* Remove write protection */ max6902_set_reg(dev, 0xF, 0); - max6902_set_reg(dev, 0x01, BIN2BCD(dt->tm_sec)); - max6902_set_reg(dev, 0x03, BIN2BCD(dt->tm_min)); - max6902_set_reg(dev, 0x05, BIN2BCD(dt->tm_hour)); + max6902_set_reg(dev, 0x01, bin2bcd(dt->tm_sec)); + max6902_set_reg(dev, 0x03, bin2bcd(dt->tm_min)); + max6902_set_reg(dev, 0x05, bin2bcd(dt->tm_hour)); - max6902_set_reg(dev, 0x07, BIN2BCD(dt->tm_mday)); - max6902_set_reg(dev, 0x09, BIN2BCD(dt->tm_mon+1)); - max6902_set_reg(dev, 0x0B, BIN2BCD(dt->tm_wday)); - max6902_set_reg(dev, 0x0D, BIN2BCD(dt->tm_year%100)); - max6902_set_reg(dev, 0x13, BIN2BCD(dt->tm_year/100)); + max6902_set_reg(dev, 0x07, bin2bcd(dt->tm_mday)); + max6902_set_reg(dev, 0x09, bin2bcd(dt->tm_mon+1)); + max6902_set_reg(dev, 0x0B, bin2bcd(dt->tm_wday)); + max6902_set_reg(dev, 0x0D, bin2bcd(dt->tm_year%100)); + max6902_set_reg(dev, 0x13, bin2bcd(dt->tm_year/100)); /* Compulab used a delay here. However, the datasheet * does not mention a delay being required anywhere... */ diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 8876605d4d4b..2cbeb0794f14 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -186,30 +186,30 @@ static int tm2bcd(struct rtc_time *tm) if (rtc_valid_tm(tm) != 0) return -EINVAL; - tm->tm_sec = BIN2BCD(tm->tm_sec); - tm->tm_min = BIN2BCD(tm->tm_min); - tm->tm_hour = BIN2BCD(tm->tm_hour); - tm->tm_mday = BIN2BCD(tm->tm_mday); + tm->tm_sec = bin2bcd(tm->tm_sec); + tm->tm_min = bin2bcd(tm->tm_min); + tm->tm_hour = bin2bcd(tm->tm_hour); + tm->tm_mday = bin2bcd(tm->tm_mday); - tm->tm_mon = BIN2BCD(tm->tm_mon + 1); + tm->tm_mon = bin2bcd(tm->tm_mon + 1); /* epoch == 1900 */ if (tm->tm_year < 100 || tm->tm_year > 199) return -EINVAL; - tm->tm_year = BIN2BCD(tm->tm_year - 100); + tm->tm_year = bin2bcd(tm->tm_year - 100); return 0; } static void bcd2tm(struct rtc_time *tm) { - tm->tm_sec = BCD2BIN(tm->tm_sec); - tm->tm_min = BCD2BIN(tm->tm_min); - tm->tm_hour = BCD2BIN(tm->tm_hour); - tm->tm_mday = BCD2BIN(tm->tm_mday); - tm->tm_mon = BCD2BIN(tm->tm_mon) - 1; + tm->tm_sec = bcd2bin(tm->tm_sec); + tm->tm_min = bcd2bin(tm->tm_min); + tm->tm_hour = bcd2bin(tm->tm_hour); + tm->tm_mday = bcd2bin(tm->tm_mday); + tm->tm_mon = bcd2bin(tm->tm_mon) - 1; /* epoch == 1900 */ - tm->tm_year = BCD2BIN(tm->tm_year) + 100; + tm->tm_year = bcd2bin(tm->tm_year) + 100; } diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index a829f20ad6d6..b725913ccbe8 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -97,13 +97,13 @@ static int pcf8563_get_datetime(struct i2c_client *client, struct rtc_time *tm) buf[8]); - tm->tm_sec = BCD2BIN(buf[PCF8563_REG_SC] & 0x7F); - tm->tm_min = BCD2BIN(buf[PCF8563_REG_MN] & 0x7F); - tm->tm_hour = BCD2BIN(buf[PCF8563_REG_HR] & 0x3F); /* rtc hr 0-23 */ - tm->tm_mday = BCD2BIN(buf[PCF8563_REG_DM] & 0x3F); + tm->tm_sec = bcd2bin(buf[PCF8563_REG_SC] & 0x7F); + tm->tm_min = bcd2bin(buf[PCF8563_REG_MN] & 0x7F); + tm->tm_hour = bcd2bin(buf[PCF8563_REG_HR] & 0x3F); /* rtc hr 0-23 */ + tm->tm_mday = bcd2bin(buf[PCF8563_REG_DM] & 0x3F); tm->tm_wday = buf[PCF8563_REG_DW] & 0x07; - tm->tm_mon = BCD2BIN(buf[PCF8563_REG_MO] & 0x1F) - 1; /* rtc mn 1-12 */ - tm->tm_year = BCD2BIN(buf[PCF8563_REG_YR]); + tm->tm_mon = bcd2bin(buf[PCF8563_REG_MO] & 0x1F) - 1; /* rtc mn 1-12 */ + tm->tm_year = bcd2bin(buf[PCF8563_REG_YR]); if (tm->tm_year < 70) tm->tm_year += 100; /* assume we are in 1970...2069 */ /* detect the polarity heuristically. see note above. */ @@ -138,17 +138,17 @@ static int pcf8563_set_datetime(struct i2c_client *client, struct rtc_time *tm) tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); /* hours, minutes and seconds */ - buf[PCF8563_REG_SC] = BIN2BCD(tm->tm_sec); - buf[PCF8563_REG_MN] = BIN2BCD(tm->tm_min); - buf[PCF8563_REG_HR] = BIN2BCD(tm->tm_hour); + buf[PCF8563_REG_SC] = bin2bcd(tm->tm_sec); + buf[PCF8563_REG_MN] = bin2bcd(tm->tm_min); + buf[PCF8563_REG_HR] = bin2bcd(tm->tm_hour); - buf[PCF8563_REG_DM] = BIN2BCD(tm->tm_mday); + buf[PCF8563_REG_DM] = bin2bcd(tm->tm_mday); /* month, 1 - 12 */ - buf[PCF8563_REG_MO] = BIN2BCD(tm->tm_mon + 1); + buf[PCF8563_REG_MO] = bin2bcd(tm->tm_mon + 1); /* year and century */ - buf[PCF8563_REG_YR] = BIN2BCD(tm->tm_year % 100); + buf[PCF8563_REG_YR] = bin2bcd(tm->tm_year % 100); if (pcf8563->c_polarity ? (tm->tm_year >= 100) : (tm->tm_year < 100)) buf[PCF8563_REG_MO] |= PCF8563_MO_C; diff --git a/drivers/rtc/rtc-pcf8583.c b/drivers/rtc/rtc-pcf8583.c index d388c662bf4b..7d33cda3f8f6 100644 --- a/drivers/rtc/rtc-pcf8583.c +++ b/drivers/rtc/rtc-pcf8583.c @@ -76,11 +76,11 @@ static int pcf8583_get_datetime(struct i2c_client *client, struct rtc_time *dt) buf[4] &= 0x3f; buf[5] &= 0x1f; - dt->tm_sec = BCD2BIN(buf[1]); - dt->tm_min = BCD2BIN(buf[2]); - dt->tm_hour = BCD2BIN(buf[3]); - dt->tm_mday = BCD2BIN(buf[4]); - dt->tm_mon = BCD2BIN(buf[5]) - 1; + dt->tm_sec = bcd2bin(buf[1]); + dt->tm_min = bcd2bin(buf[2]); + dt->tm_hour = bcd2bin(buf[3]); + dt->tm_mday = bcd2bin(buf[4]); + dt->tm_mon = bcd2bin(buf[5]) - 1; } return ret == 2 ? 0 : -EIO; @@ -94,14 +94,14 @@ static int pcf8583_set_datetime(struct i2c_client *client, struct rtc_time *dt, buf[0] = 0; buf[1] = get_ctrl(client) | 0x80; buf[2] = 0; - buf[3] = BIN2BCD(dt->tm_sec); - buf[4] = BIN2BCD(dt->tm_min); - buf[5] = BIN2BCD(dt->tm_hour); + buf[3] = bin2bcd(dt->tm_sec); + buf[4] = bin2bcd(dt->tm_min); + buf[5] = bin2bcd(dt->tm_hour); if (datetoo) { len = 8; - buf[6] = BIN2BCD(dt->tm_mday) | (dt->tm_year << 6); - buf[7] = BIN2BCD(dt->tm_mon + 1) | (dt->tm_wday << 5); + buf[6] = bin2bcd(dt->tm_mday) | (dt->tm_year << 6); + buf[7] = bin2bcd(dt->tm_mon + 1) | (dt->tm_wday << 5); } ret = i2c_master_send(client, (char *)buf, len); diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c index 395985b339c9..42028f233bef 100644 --- a/drivers/rtc/rtc-r9701.c +++ b/drivers/rtc/rtc-r9701.c @@ -80,13 +80,13 @@ static int r9701_get_datetime(struct device *dev, struct rtc_time *dt) memset(dt, 0, sizeof(*dt)); - dt->tm_sec = BCD2BIN(buf[0]); /* RSECCNT */ - dt->tm_min = BCD2BIN(buf[1]); /* RMINCNT */ - dt->tm_hour = BCD2BIN(buf[2]); /* RHRCNT */ + dt->tm_sec = bcd2bin(buf[0]); /* RSECCNT */ + dt->tm_min = bcd2bin(buf[1]); /* RMINCNT */ + dt->tm_hour = bcd2bin(buf[2]); /* RHRCNT */ - dt->tm_mday = BCD2BIN(buf[3]); /* RDAYCNT */ - dt->tm_mon = BCD2BIN(buf[4]) - 1; /* RMONCNT */ - dt->tm_year = BCD2BIN(buf[5]) + 100; /* RYRCNT */ + dt->tm_mday = bcd2bin(buf[3]); /* RDAYCNT */ + dt->tm_mon = bcd2bin(buf[4]) - 1; /* RMONCNT */ + dt->tm_year = bcd2bin(buf[5]) + 100; /* RYRCNT */ /* the rtc device may contain illegal values on power up * according to the data sheet. make sure they are valid. @@ -103,12 +103,12 @@ static int r9701_set_datetime(struct device *dev, struct rtc_time *dt) if (year >= 2100 || year < 2000) return -EINVAL; - ret = write_reg(dev, RHRCNT, BIN2BCD(dt->tm_hour)); - ret = ret ? ret : write_reg(dev, RMINCNT, BIN2BCD(dt->tm_min)); - ret = ret ? ret : write_reg(dev, RSECCNT, BIN2BCD(dt->tm_sec)); - ret = ret ? ret : write_reg(dev, RDAYCNT, BIN2BCD(dt->tm_mday)); - ret = ret ? ret : write_reg(dev, RMONCNT, BIN2BCD(dt->tm_mon + 1)); - ret = ret ? ret : write_reg(dev, RYRCNT, BIN2BCD(dt->tm_year - 100)); + ret = write_reg(dev, RHRCNT, bin2bcd(dt->tm_hour)); + ret = ret ? ret : write_reg(dev, RMINCNT, bin2bcd(dt->tm_min)); + ret = ret ? ret : write_reg(dev, RSECCNT, bin2bcd(dt->tm_sec)); + ret = ret ? ret : write_reg(dev, RDAYCNT, bin2bcd(dt->tm_mday)); + ret = ret ? ret : write_reg(dev, RMONCNT, bin2bcd(dt->tm_mon + 1)); + ret = ret ? ret : write_reg(dev, RYRCNT, bin2bcd(dt->tm_year - 100)); ret = ret ? ret : write_reg(dev, RWKCNT, 1 << dt->tm_wday); return ret; diff --git a/drivers/rtc/rtc-rs5c313.c b/drivers/rtc/rtc-rs5c313.c index 1c14d4497c4d..e6ea3f5ee1eb 100644 --- a/drivers/rtc/rtc-rs5c313.c +++ b/drivers/rtc/rtc-rs5c313.c @@ -235,33 +235,33 @@ static int rs5c313_rtc_read_time(struct device *dev, struct rtc_time *tm) data = rs5c313_read_reg(RS5C313_ADDR_SEC); data |= (rs5c313_read_reg(RS5C313_ADDR_SEC10) << 4); - tm->tm_sec = BCD2BIN(data); + tm->tm_sec = bcd2bin(data); data = rs5c313_read_reg(RS5C313_ADDR_MIN); data |= (rs5c313_read_reg(RS5C313_ADDR_MIN10) << 4); - tm->tm_min = BCD2BIN(data); + tm->tm_min = bcd2bin(data); data = rs5c313_read_reg(RS5C313_ADDR_HOUR); data |= (rs5c313_read_reg(RS5C313_ADDR_HOUR10) << 4); - tm->tm_hour = BCD2BIN(data); + tm->tm_hour = bcd2bin(data); data = rs5c313_read_reg(RS5C313_ADDR_DAY); data |= (rs5c313_read_reg(RS5C313_ADDR_DAY10) << 4); - tm->tm_mday = BCD2BIN(data); + tm->tm_mday = bcd2bin(data); data = rs5c313_read_reg(RS5C313_ADDR_MON); data |= (rs5c313_read_reg(RS5C313_ADDR_MON10) << 4); - tm->tm_mon = BCD2BIN(data) - 1; + tm->tm_mon = bcd2bin(data) - 1; data = rs5c313_read_reg(RS5C313_ADDR_YEAR); data |= (rs5c313_read_reg(RS5C313_ADDR_YEAR10) << 4); - tm->tm_year = BCD2BIN(data); + tm->tm_year = bcd2bin(data); if (tm->tm_year < 70) tm->tm_year += 100; data = rs5c313_read_reg(RS5C313_ADDR_WEEK); - tm->tm_wday = BCD2BIN(data); + tm->tm_wday = bcd2bin(data); RS5C313_CEDISABLE; ndelay(700); /* CE:L */ @@ -294,31 +294,31 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm) } } - data = BIN2BCD(tm->tm_sec); + data = bin2bcd(tm->tm_sec); rs5c313_write_reg(RS5C313_ADDR_SEC, data); rs5c313_write_reg(RS5C313_ADDR_SEC10, (data >> 4)); - data = BIN2BCD(tm->tm_min); + data = bin2bcd(tm->tm_min); rs5c313_write_reg(RS5C313_ADDR_MIN, data ); rs5c313_write_reg(RS5C313_ADDR_MIN10, (data >> 4)); - data = BIN2BCD(tm->tm_hour); + data = bin2bcd(tm->tm_hour); rs5c313_write_reg(RS5C313_ADDR_HOUR, data); rs5c313_write_reg(RS5C313_ADDR_HOUR10, (data >> 4)); - data = BIN2BCD(tm->tm_mday); + data = bin2bcd(tm->tm_mday); rs5c313_write_reg(RS5C313_ADDR_DAY, data); rs5c313_write_reg(RS5C313_ADDR_DAY10, (data>> 4)); - data = BIN2BCD(tm->tm_mon + 1); + data = bin2bcd(tm->tm_mon + 1); rs5c313_write_reg(RS5C313_ADDR_MON, data); rs5c313_write_reg(RS5C313_ADDR_MON10, (data >> 4)); - data = BIN2BCD(tm->tm_year % 100); + data = bin2bcd(tm->tm_year % 100); rs5c313_write_reg(RS5C313_ADDR_YEAR, data); rs5c313_write_reg(RS5C313_ADDR_YEAR10, (data >> 4)); - data = BIN2BCD(tm->tm_wday); + data = bin2bcd(tm->tm_wday); rs5c313_write_reg(RS5C313_ADDR_WEEK, data); RS5C313_CEDISABLE; /* CE:H */ diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 839462659afa..dd1e2bc7a472 100644 --- a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -74,20 +74,20 @@ rs5c348_rtc_set_time(struct device *dev, struct rtc_time *tm) txbuf[3] = 0; /* dummy */ txbuf[4] = RS5C348_CMD_MW(RS5C348_REG_SECS); /* cmd, sec, ... */ txp = &txbuf[5]; - txp[RS5C348_REG_SECS] = BIN2BCD(tm->tm_sec); - txp[RS5C348_REG_MINS] = BIN2BCD(tm->tm_min); + txp[RS5C348_REG_SECS] = bin2bcd(tm->tm_sec); + txp[RS5C348_REG_MINS] = bin2bcd(tm->tm_min); if (pdata->rtc_24h) { - txp[RS5C348_REG_HOURS] = BIN2BCD(tm->tm_hour); + txp[RS5C348_REG_HOURS] = bin2bcd(tm->tm_hour); } else { /* hour 0 is AM12, noon is PM12 */ - txp[RS5C348_REG_HOURS] = BIN2BCD((tm->tm_hour + 11) % 12 + 1) | + txp[RS5C348_REG_HOURS] = bin2bcd((tm->tm_hour + 11) % 12 + 1) | (tm->tm_hour >= 12 ? RS5C348_BIT_PM : 0); } - txp[RS5C348_REG_WDAY] = BIN2BCD(tm->tm_wday); - txp[RS5C348_REG_DAY] = BIN2BCD(tm->tm_mday); - txp[RS5C348_REG_MONTH] = BIN2BCD(tm->tm_mon + 1) | + txp[RS5C348_REG_WDAY] = bin2bcd(tm->tm_wday); + txp[RS5C348_REG_DAY] = bin2bcd(tm->tm_mday); + txp[RS5C348_REG_MONTH] = bin2bcd(tm->tm_mon + 1) | (tm->tm_year >= 100 ? RS5C348_BIT_Y2K : 0); - txp[RS5C348_REG_YEAR] = BIN2BCD(tm->tm_year % 100); + txp[RS5C348_REG_YEAR] = bin2bcd(tm->tm_year % 100); /* write in one transfer to avoid data inconsistency */ ret = spi_write_then_read(spi, txbuf, sizeof(txbuf), NULL, 0); udelay(62); /* Tcsr 62us */ @@ -116,20 +116,20 @@ rs5c348_rtc_read_time(struct device *dev, struct rtc_time *tm) if (ret < 0) return ret; - tm->tm_sec = BCD2BIN(rxbuf[RS5C348_REG_SECS] & RS5C348_SECS_MASK); - tm->tm_min = BCD2BIN(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK); - tm->tm_hour = BCD2BIN(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK); + tm->tm_sec = bcd2bin(rxbuf[RS5C348_REG_SECS] & RS5C348_SECS_MASK); + tm->tm_min = bcd2bin(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK); + tm->tm_hour = bcd2bin(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK); if (!pdata->rtc_24h) { tm->tm_hour %= 12; if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM) tm->tm_hour += 12; } - tm->tm_wday = BCD2BIN(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK); - tm->tm_mday = BCD2BIN(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK); + tm->tm_wday = bcd2bin(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK); + tm->tm_mday = bcd2bin(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK); tm->tm_mon = - BCD2BIN(rxbuf[RS5C348_REG_MONTH] & RS5C348_MONTH_MASK) - 1; + bcd2bin(rxbuf[RS5C348_REG_MONTH] & RS5C348_MONTH_MASK) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(rxbuf[RS5C348_REG_YEAR]) + + tm->tm_year = bcd2bin(rxbuf[RS5C348_REG_YEAR]) + ((rxbuf[RS5C348_REG_MONTH] & RS5C348_BIT_Y2K) ? 100 : 0); if (rtc_valid_tm(tm) < 0) { diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 8b561958fb1e..2f2c68d476da 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -148,9 +148,9 @@ static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg) unsigned hour; if (rs5c->time24) - return BCD2BIN(reg & 0x3f); + return bcd2bin(reg & 0x3f); - hour = BCD2BIN(reg & 0x1f); + hour = bcd2bin(reg & 0x1f); if (hour == 12) hour = 0; if (reg & 0x20) @@ -161,15 +161,15 @@ static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg) static unsigned rs5c_hr2reg(struct rs5c372 *rs5c, unsigned hour) { if (rs5c->time24) - return BIN2BCD(hour); + return bin2bcd(hour); if (hour > 12) - return 0x20 | BIN2BCD(hour - 12); + return 0x20 | bin2bcd(hour - 12); if (hour == 12) - return 0x20 | BIN2BCD(12); + return 0x20 | bin2bcd(12); if (hour == 0) - return BIN2BCD(12); - return BIN2BCD(hour); + return bin2bcd(12); + return bin2bcd(hour); } static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm) @@ -180,18 +180,18 @@ static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm) if (status < 0) return status; - tm->tm_sec = BCD2BIN(rs5c->regs[RS5C372_REG_SECS] & 0x7f); - tm->tm_min = BCD2BIN(rs5c->regs[RS5C372_REG_MINS] & 0x7f); + tm->tm_sec = bcd2bin(rs5c->regs[RS5C372_REG_SECS] & 0x7f); + tm->tm_min = bcd2bin(rs5c->regs[RS5C372_REG_MINS] & 0x7f); tm->tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C372_REG_HOURS]); - tm->tm_wday = BCD2BIN(rs5c->regs[RS5C372_REG_WDAY] & 0x07); - tm->tm_mday = BCD2BIN(rs5c->regs[RS5C372_REG_DAY] & 0x3f); + tm->tm_wday = bcd2bin(rs5c->regs[RS5C372_REG_WDAY] & 0x07); + tm->tm_mday = bcd2bin(rs5c->regs[RS5C372_REG_DAY] & 0x3f); /* tm->tm_mon is zero-based */ - tm->tm_mon = BCD2BIN(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1; + tm->tm_mon = bcd2bin(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(rs5c->regs[RS5C372_REG_YEAR]) + 100; + tm->tm_year = bcd2bin(rs5c->regs[RS5C372_REG_YEAR]) + 100; dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, " "mday=%d, mon=%d, year=%d, wday=%d\n", @@ -216,13 +216,13 @@ static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm) tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); addr = RS5C_ADDR(RS5C372_REG_SECS); - buf[0] = BIN2BCD(tm->tm_sec); - buf[1] = BIN2BCD(tm->tm_min); + buf[0] = bin2bcd(tm->tm_sec); + buf[1] = bin2bcd(tm->tm_min); buf[2] = rs5c_hr2reg(rs5c, tm->tm_hour); - buf[3] = BIN2BCD(tm->tm_wday); - buf[4] = BIN2BCD(tm->tm_mday); - buf[5] = BIN2BCD(tm->tm_mon + 1); - buf[6] = BIN2BCD(tm->tm_year - 100); + buf[3] = bin2bcd(tm->tm_wday); + buf[4] = bin2bcd(tm->tm_mday); + buf[5] = bin2bcd(tm->tm_mon + 1); + buf[6] = bin2bcd(tm->tm_year - 100); if (i2c_smbus_write_i2c_block_data(client, addr, sizeof(buf), buf) < 0) { dev_err(&client->dev, "%s: write error\n", __func__); @@ -367,7 +367,7 @@ static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t) /* report alarm time */ t->time.tm_sec = 0; - t->time.tm_min = BCD2BIN(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f); + t->time.tm_min = bcd2bin(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f); t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]); t->time.tm_mday = -1; t->time.tm_mon = -1; @@ -413,7 +413,7 @@ static int rs5c_set_alarm(struct device *dev, struct rtc_wkalrm *t) } /* set alarm */ - buf[0] = BIN2BCD(t->time.tm_min); + buf[0] = bin2bcd(t->time.tm_min); buf[1] = rs5c_hr2reg(rs5c, t->time.tm_hour); buf[2] = 0x7f; /* any/all days */ diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c index a6fa1f2f2ca6..def4d396d0b0 100644 --- a/drivers/rtc/rtc-s35390a.c +++ b/drivers/rtc/rtc-s35390a.c @@ -104,12 +104,12 @@ static int s35390a_disable_test_mode(struct s35390a *s35390a) static char s35390a_hr2reg(struct s35390a *s35390a, int hour) { if (s35390a->twentyfourhour) - return BIN2BCD(hour); + return bin2bcd(hour); if (hour < 12) - return BIN2BCD(hour); + return bin2bcd(hour); - return 0x40 | BIN2BCD(hour - 12); + return 0x40 | bin2bcd(hour - 12); } static int s35390a_reg2hr(struct s35390a *s35390a, char reg) @@ -117,9 +117,9 @@ static int s35390a_reg2hr(struct s35390a *s35390a, char reg) unsigned hour; if (s35390a->twentyfourhour) - return BCD2BIN(reg & 0x3f); + return bcd2bin(reg & 0x3f); - hour = BCD2BIN(reg & 0x3f); + hour = bcd2bin(reg & 0x3f); if (reg & 0x40) hour += 12; @@ -137,13 +137,13 @@ static int s35390a_set_datetime(struct i2c_client *client, struct rtc_time *tm) tm->tm_min, tm->tm_hour, tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); - buf[S35390A_BYTE_YEAR] = BIN2BCD(tm->tm_year - 100); - buf[S35390A_BYTE_MONTH] = BIN2BCD(tm->tm_mon + 1); - buf[S35390A_BYTE_DAY] = BIN2BCD(tm->tm_mday); - buf[S35390A_BYTE_WDAY] = BIN2BCD(tm->tm_wday); + buf[S35390A_BYTE_YEAR] = bin2bcd(tm->tm_year - 100); + buf[S35390A_BYTE_MONTH] = bin2bcd(tm->tm_mon + 1); + buf[S35390A_BYTE_DAY] = bin2bcd(tm->tm_mday); + buf[S35390A_BYTE_WDAY] = bin2bcd(tm->tm_wday); buf[S35390A_BYTE_HOURS] = s35390a_hr2reg(s35390a, tm->tm_hour); - buf[S35390A_BYTE_MINS] = BIN2BCD(tm->tm_min); - buf[S35390A_BYTE_SECS] = BIN2BCD(tm->tm_sec); + buf[S35390A_BYTE_MINS] = bin2bcd(tm->tm_min); + buf[S35390A_BYTE_SECS] = bin2bcd(tm->tm_sec); /* This chip expects the bits of each byte to be in reverse order */ for (i = 0; i < 7; ++i) @@ -168,13 +168,13 @@ static int s35390a_get_datetime(struct i2c_client *client, struct rtc_time *tm) for (i = 0; i < 7; ++i) buf[i] = bitrev8(buf[i]); - tm->tm_sec = BCD2BIN(buf[S35390A_BYTE_SECS]); - tm->tm_min = BCD2BIN(buf[S35390A_BYTE_MINS]); + tm->tm_sec = bcd2bin(buf[S35390A_BYTE_SECS]); + tm->tm_min = bcd2bin(buf[S35390A_BYTE_MINS]); tm->tm_hour = s35390a_reg2hr(s35390a, buf[S35390A_BYTE_HOURS]); - tm->tm_wday = BCD2BIN(buf[S35390A_BYTE_WDAY]); - tm->tm_mday = BCD2BIN(buf[S35390A_BYTE_DAY]); - tm->tm_mon = BCD2BIN(buf[S35390A_BYTE_MONTH]) - 1; - tm->tm_year = BCD2BIN(buf[S35390A_BYTE_YEAR]) + 100; + tm->tm_wday = bcd2bin(buf[S35390A_BYTE_WDAY]); + tm->tm_mday = bcd2bin(buf[S35390A_BYTE_DAY]); + tm->tm_mon = bcd2bin(buf[S35390A_BYTE_MONTH]) - 1; + tm->tm_year = bcd2bin(buf[S35390A_BYTE_YEAR]) + 100; dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, mday=%d, " "mon=%d, year=%d, wday=%d\n", __func__, tm->tm_sec, diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index e7d19b6c265a..910bc704939c 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -134,12 +134,12 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm) rtc_tm->tm_year, rtc_tm->tm_mon, rtc_tm->tm_mday, rtc_tm->tm_hour, rtc_tm->tm_min, rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_sec); - BCD_TO_BIN(rtc_tm->tm_min); - BCD_TO_BIN(rtc_tm->tm_hour); - BCD_TO_BIN(rtc_tm->tm_mday); - BCD_TO_BIN(rtc_tm->tm_mon); - BCD_TO_BIN(rtc_tm->tm_year); + rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec); + rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min); + rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour); + rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday); + rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon); + rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year); rtc_tm->tm_year += 100; rtc_tm->tm_mon -= 1; @@ -163,12 +163,12 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm) return -EINVAL; } - writeb(BIN2BCD(tm->tm_sec), base + S3C2410_RTCSEC); - writeb(BIN2BCD(tm->tm_min), base + S3C2410_RTCMIN); - writeb(BIN2BCD(tm->tm_hour), base + S3C2410_RTCHOUR); - writeb(BIN2BCD(tm->tm_mday), base + S3C2410_RTCDATE); - writeb(BIN2BCD(tm->tm_mon + 1), base + S3C2410_RTCMON); - writeb(BIN2BCD(year), base + S3C2410_RTCYEAR); + writeb(bin2bcd(tm->tm_sec), base + S3C2410_RTCSEC); + writeb(bin2bcd(tm->tm_min), base + S3C2410_RTCMIN); + writeb(bin2bcd(tm->tm_hour), base + S3C2410_RTCHOUR); + writeb(bin2bcd(tm->tm_mday), base + S3C2410_RTCDATE); + writeb(bin2bcd(tm->tm_mon + 1), base + S3C2410_RTCMON); + writeb(bin2bcd(year), base + S3C2410_RTCYEAR); return 0; } @@ -199,34 +199,34 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm) /* decode the alarm enable field */ if (alm_en & S3C2410_RTCALM_SECEN) - BCD_TO_BIN(alm_tm->tm_sec); + alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec); else alm_tm->tm_sec = 0xff; if (alm_en & S3C2410_RTCALM_MINEN) - BCD_TO_BIN(alm_tm->tm_min); + alm_tm->tm_min = bcd2bin(alm_tm->tm_min); else alm_tm->tm_min = 0xff; if (alm_en & S3C2410_RTCALM_HOUREN) - BCD_TO_BIN(alm_tm->tm_hour); + alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour); else alm_tm->tm_hour = 0xff; if (alm_en & S3C2410_RTCALM_DAYEN) - BCD_TO_BIN(alm_tm->tm_mday); + alm_tm->tm_mday = bcd2bin(alm_tm->tm_mday); else alm_tm->tm_mday = 0xff; if (alm_en & S3C2410_RTCALM_MONEN) { - BCD_TO_BIN(alm_tm->tm_mon); + alm_tm->tm_mon = bcd2bin(alm_tm->tm_mon); alm_tm->tm_mon -= 1; } else { alm_tm->tm_mon = 0xff; } if (alm_en & S3C2410_RTCALM_YEAREN) - BCD_TO_BIN(alm_tm->tm_year); + alm_tm->tm_year = bcd2bin(alm_tm->tm_year); else alm_tm->tm_year = 0xffff; @@ -250,17 +250,17 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) if (tm->tm_sec < 60 && tm->tm_sec >= 0) { alrm_en |= S3C2410_RTCALM_SECEN; - writeb(BIN2BCD(tm->tm_sec), base + S3C2410_ALMSEC); + writeb(bin2bcd(tm->tm_sec), base + S3C2410_ALMSEC); } if (tm->tm_min < 60 && tm->tm_min >= 0) { alrm_en |= S3C2410_RTCALM_MINEN; - writeb(BIN2BCD(tm->tm_min), base + S3C2410_ALMMIN); + writeb(bin2bcd(tm->tm_min), base + S3C2410_ALMMIN); } if (tm->tm_hour < 24 && tm->tm_hour >= 0) { alrm_en |= S3C2410_RTCALM_HOUREN; - writeb(BIN2BCD(tm->tm_hour), base + S3C2410_ALMHOUR); + writeb(bin2bcd(tm->tm_hour), base + S3C2410_ALMHOUR); } pr_debug("setting S3C2410_RTCALM to %08x\n", alrm_en); diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 3f393c82e32c..aaf9d6a337cc 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -324,23 +324,23 @@ static int sh_rtc_read_time(struct device *dev, struct rtc_time *tm) sec128 = readb(rtc->regbase + R64CNT); - tm->tm_sec = BCD2BIN(readb(rtc->regbase + RSECCNT)); - tm->tm_min = BCD2BIN(readb(rtc->regbase + RMINCNT)); - tm->tm_hour = BCD2BIN(readb(rtc->regbase + RHRCNT)); - tm->tm_wday = BCD2BIN(readb(rtc->regbase + RWKCNT)); - tm->tm_mday = BCD2BIN(readb(rtc->regbase + RDAYCNT)); - tm->tm_mon = BCD2BIN(readb(rtc->regbase + RMONCNT)) - 1; + tm->tm_sec = bcd2bin(readb(rtc->regbase + RSECCNT)); + tm->tm_min = bcd2bin(readb(rtc->regbase + RMINCNT)); + tm->tm_hour = bcd2bin(readb(rtc->regbase + RHRCNT)); + tm->tm_wday = bcd2bin(readb(rtc->regbase + RWKCNT)); + tm->tm_mday = bcd2bin(readb(rtc->regbase + RDAYCNT)); + tm->tm_mon = bcd2bin(readb(rtc->regbase + RMONCNT)) - 1; if (rtc->capabilities & RTC_CAP_4_DIGIT_YEAR) { yr = readw(rtc->regbase + RYRCNT); - yr100 = BCD2BIN(yr >> 8); + yr100 = bcd2bin(yr >> 8); yr &= 0xff; } else { yr = readb(rtc->regbase + RYRCNT); - yr100 = BCD2BIN((yr == 0x99) ? 0x19 : 0x20); + yr100 = bcd2bin((yr == 0x99) ? 0x19 : 0x20); } - tm->tm_year = (yr100 * 100 + BCD2BIN(yr)) - 1900; + tm->tm_year = (yr100 * 100 + bcd2bin(yr)) - 1900; sec2 = readb(rtc->regbase + R64CNT); cf_bit = readb(rtc->regbase + RCR1) & RCR1_CF; @@ -382,20 +382,20 @@ static int sh_rtc_set_time(struct device *dev, struct rtc_time *tm) tmp &= ~RCR2_START; writeb(tmp, rtc->regbase + RCR2); - writeb(BIN2BCD(tm->tm_sec), rtc->regbase + RSECCNT); - writeb(BIN2BCD(tm->tm_min), rtc->regbase + RMINCNT); - writeb(BIN2BCD(tm->tm_hour), rtc->regbase + RHRCNT); - writeb(BIN2BCD(tm->tm_wday), rtc->regbase + RWKCNT); - writeb(BIN2BCD(tm->tm_mday), rtc->regbase + RDAYCNT); - writeb(BIN2BCD(tm->tm_mon + 1), rtc->regbase + RMONCNT); + writeb(bin2bcd(tm->tm_sec), rtc->regbase + RSECCNT); + writeb(bin2bcd(tm->tm_min), rtc->regbase + RMINCNT); + writeb(bin2bcd(tm->tm_hour), rtc->regbase + RHRCNT); + writeb(bin2bcd(tm->tm_wday), rtc->regbase + RWKCNT); + writeb(bin2bcd(tm->tm_mday), rtc->regbase + RDAYCNT); + writeb(bin2bcd(tm->tm_mon + 1), rtc->regbase + RMONCNT); if (rtc->capabilities & RTC_CAP_4_DIGIT_YEAR) { - year = (BIN2BCD((tm->tm_year + 1900) / 100) << 8) | - BIN2BCD(tm->tm_year % 100); + year = (bin2bcd((tm->tm_year + 1900) / 100) << 8) | + bin2bcd(tm->tm_year % 100); writew(year, rtc->regbase + RYRCNT); } else { year = tm->tm_year % 100; - writeb(BIN2BCD(year), rtc->regbase + RYRCNT); + writeb(bin2bcd(year), rtc->regbase + RYRCNT); } /* Start RTC */ @@ -417,7 +417,7 @@ static inline int sh_rtc_read_alarm_value(struct sh_rtc *rtc, int reg_off) byte = readb(rtc->regbase + reg_off); if (byte & AR_ENB) { byte &= ~AR_ENB; /* strip the enable bit */ - value = BCD2BIN(byte); + value = bcd2bin(byte); } return value; @@ -455,7 +455,7 @@ static inline void sh_rtc_write_alarm_value(struct sh_rtc *rtc, if (value < 0) writeb(0, rtc->regbase + reg_off); else - writeb(BIN2BCD(value) | AR_ENB, rtc->regbase + reg_off); + writeb(bin2bcd(value) | AR_ENB, rtc->regbase + reg_off); } static int sh_rtc_check_alarm(struct rtc_time *tm) diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 9a7e920315fa..f4cd46e15af9 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -82,14 +82,14 @@ static int stk17ta8_rtc_set_time(struct device *dev, struct rtc_time *tm) flags = readb(pdata->ioaddr + RTC_FLAGS); writeb(flags | RTC_WRITE, pdata->ioaddr + RTC_FLAGS); - writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR); - writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH); - writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); - writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE); - writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS); - writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES); - writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); - writeb(BIN2BCD((tm->tm_year + 1900) / 100), ioaddr + RTC_CENTURY); + writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR); + writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH); + writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY); + writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE); + writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS); + writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES); + writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS); + writeb(bin2bcd((tm->tm_year + 1900) / 100), ioaddr + RTC_CENTURY); writeb(flags & ~RTC_WRITE, pdata->ioaddr + RTC_FLAGS); return 0; @@ -120,14 +120,14 @@ static int stk17ta8_rtc_read_time(struct device *dev, struct rtc_time *tm) year = readb(ioaddr + RTC_YEAR); century = readb(ioaddr + RTC_CENTURY); writeb(flags & ~RTC_READ, ioaddr + RTC_FLAGS); - tm->tm_sec = BCD2BIN(second); - tm->tm_min = BCD2BIN(minute); - tm->tm_hour = BCD2BIN(hour); - tm->tm_mday = BCD2BIN(day); - tm->tm_wday = BCD2BIN(week); - tm->tm_mon = BCD2BIN(month) - 1; + tm->tm_sec = bcd2bin(second); + tm->tm_min = bcd2bin(minute); + tm->tm_hour = bcd2bin(hour); + tm->tm_mday = bcd2bin(day); + tm->tm_wday = bcd2bin(week); + tm->tm_mon = bcd2bin(month) - 1; /* year is 1900 + tm->tm_year */ - tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900; + tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; if (rtc_valid_tm(tm) < 0) { dev_err(dev, "retrieved date/time is not valid.\n"); @@ -148,16 +148,16 @@ static void stk17ta8_rtc_update_alarm(struct rtc_plat_data *pdata) writeb(flags | RTC_WRITE, ioaddr + RTC_FLAGS); writeb(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_mday), + 0x80 : bin2bcd(pdata->alrm_mday), ioaddr + RTC_DATE_ALARM); writeb(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_hour), + 0x80 : bin2bcd(pdata->alrm_hour), ioaddr + RTC_HOURS_ALARM); writeb(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_min), + 0x80 : bin2bcd(pdata->alrm_min), ioaddr + RTC_MINUTES_ALARM); writeb(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : BIN2BCD(pdata->alrm_sec), + 0x80 : bin2bcd(pdata->alrm_sec), ioaddr + RTC_SECONDS_ALARM); writeb(pdata->irqen ? RTC_INTS_AIE : 0, ioaddr + RTC_INTERRUPTS); readb(ioaddr + RTC_FLAGS); /* clear interrupts */ @@ -280,7 +280,6 @@ static struct bin_attribute stk17ta8_nvram_attr = { .attr = { .name = "nvram", .mode = S_IRUGO | S_IWUSR, - .owner = THIS_MODULE, }, .size = RTC_OFFSET, .read = stk17ta8_nvram_read, diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index 10025d840268..14d4f036a768 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -92,19 +92,19 @@ static int v3020_read_time(struct device *dev, struct rtc_time *dt) /* ...and then read constant values. */ tmp = v3020_get_reg(chip, V3020_SECONDS); - dt->tm_sec = BCD2BIN(tmp); + dt->tm_sec = bcd2bin(tmp); tmp = v3020_get_reg(chip, V3020_MINUTES); - dt->tm_min = BCD2BIN(tmp); + dt->tm_min = bcd2bin(tmp); tmp = v3020_get_reg(chip, V3020_HOURS); - dt->tm_hour = BCD2BIN(tmp); + dt->tm_hour = bcd2bin(tmp); tmp = v3020_get_reg(chip, V3020_MONTH_DAY); - dt->tm_mday = BCD2BIN(tmp); + dt->tm_mday = bcd2bin(tmp); tmp = v3020_get_reg(chip, V3020_MONTH); - dt->tm_mon = BCD2BIN(tmp) - 1; + dt->tm_mon = bcd2bin(tmp) - 1; tmp = v3020_get_reg(chip, V3020_WEEK_DAY); - dt->tm_wday = BCD2BIN(tmp); + dt->tm_wday = bcd2bin(tmp); tmp = v3020_get_reg(chip, V3020_YEAR); - dt->tm_year = BCD2BIN(tmp)+100; + dt->tm_year = bcd2bin(tmp)+100; #ifdef DEBUG printk("\n%s : Read RTC values\n",__func__); @@ -136,13 +136,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt) #endif /* Write all the values to ram... */ - v3020_set_reg(chip, V3020_SECONDS, BIN2BCD(dt->tm_sec)); - v3020_set_reg(chip, V3020_MINUTES, BIN2BCD(dt->tm_min)); - v3020_set_reg(chip, V3020_HOURS, BIN2BCD(dt->tm_hour)); - v3020_set_reg(chip, V3020_MONTH_DAY, BIN2BCD(dt->tm_mday)); - v3020_set_reg(chip, V3020_MONTH, BIN2BCD(dt->tm_mon + 1)); - v3020_set_reg(chip, V3020_WEEK_DAY, BIN2BCD(dt->tm_wday)); - v3020_set_reg(chip, V3020_YEAR, BIN2BCD(dt->tm_year % 100)); + v3020_set_reg(chip, V3020_SECONDS, bin2bcd(dt->tm_sec)); + v3020_set_reg(chip, V3020_MINUTES, bin2bcd(dt->tm_min)); + v3020_set_reg(chip, V3020_HOURS, bin2bcd(dt->tm_hour)); + v3020_set_reg(chip, V3020_MONTH_DAY, bin2bcd(dt->tm_mday)); + v3020_set_reg(chip, V3020_MONTH, bin2bcd(dt->tm_mon + 1)); + v3020_set_reg(chip, V3020_WEEK_DAY, bin2bcd(dt->tm_wday)); + v3020_set_reg(chip, V3020_YEAR, bin2bcd(dt->tm_year % 100)); /* ...and set the clock. */ v3020_set_reg(chip, V3020_CMD_RAM2CLOCK, 0); diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c index 7dcfba1bbfe1..310c10795e9a 100644 --- a/drivers/rtc/rtc-x1205.c +++ b/drivers/rtc/rtc-x1205.c @@ -118,13 +118,13 @@ static int x1205_get_datetime(struct i2c_client *client, struct rtc_time *tm, for (i = 0; i <= 4; i++) buf[i] &= 0x7F; - tm->tm_sec = BCD2BIN(buf[CCR_SEC]); - tm->tm_min = BCD2BIN(buf[CCR_MIN]); - tm->tm_hour = BCD2BIN(buf[CCR_HOUR] & 0x3F); /* hr is 0-23 */ - tm->tm_mday = BCD2BIN(buf[CCR_MDAY]); - tm->tm_mon = BCD2BIN(buf[CCR_MONTH]) - 1; /* mon is 0-11 */ - tm->tm_year = BCD2BIN(buf[CCR_YEAR]) - + (BCD2BIN(buf[CCR_Y2K]) * 100) - 1900; + tm->tm_sec = bcd2bin(buf[CCR_SEC]); + tm->tm_min = bcd2bin(buf[CCR_MIN]); + tm->tm_hour = bcd2bin(buf[CCR_HOUR] & 0x3F); /* hr is 0-23 */ + tm->tm_mday = bcd2bin(buf[CCR_MDAY]); + tm->tm_mon = bcd2bin(buf[CCR_MONTH]) - 1; /* mon is 0-11 */ + tm->tm_year = bcd2bin(buf[CCR_YEAR]) + + (bcd2bin(buf[CCR_Y2K]) * 100) - 1900; tm->tm_wday = buf[CCR_WDAY]; dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, " @@ -174,11 +174,11 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, __func__, tm->tm_sec, tm->tm_min, tm->tm_hour); - buf[CCR_SEC] = BIN2BCD(tm->tm_sec); - buf[CCR_MIN] = BIN2BCD(tm->tm_min); + buf[CCR_SEC] = bin2bcd(tm->tm_sec); + buf[CCR_MIN] = bin2bcd(tm->tm_min); /* set hour and 24hr bit */ - buf[CCR_HOUR] = BIN2BCD(tm->tm_hour) | X1205_HR_MIL; + buf[CCR_HOUR] = bin2bcd(tm->tm_hour) | X1205_HR_MIL; /* should we also set the date? */ if (datetoo) { @@ -187,15 +187,15 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm, __func__, tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday); - buf[CCR_MDAY] = BIN2BCD(tm->tm_mday); + buf[CCR_MDAY] = bin2bcd(tm->tm_mday); /* month, 1 - 12 */ - buf[CCR_MONTH] = BIN2BCD(tm->tm_mon + 1); + buf[CCR_MONTH] = bin2bcd(tm->tm_mon + 1); /* year, since the rtc epoch*/ - buf[CCR_YEAR] = BIN2BCD(tm->tm_year % 100); + buf[CCR_YEAR] = bin2bcd(tm->tm_year % 100); buf[CCR_WDAY] = tm->tm_wday & 0x07; - buf[CCR_Y2K] = BIN2BCD(tm->tm_year / 100); + buf[CCR_Y2K] = bin2bcd(tm->tm_year / 100); } /* If writing alarm registers, set compare bits on registers 0-4 */ @@ -437,7 +437,7 @@ static int x1205_validate_client(struct i2c_client *client) return -EIO; } - value = BCD2BIN(reg & probe_limits_pattern[i].mask); + value = bcd2bin(reg & probe_limits_pattern[i].mask); if (value > probe_limits_pattern[i].max || value < probe_limits_pattern[i].min) { diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c index 69f8346aa288..5877f29a6005 100644 --- a/drivers/scsi/arcmsr/arcmsr_attr.c +++ b/drivers/scsi/arcmsr/arcmsr_attr.c @@ -189,7 +189,6 @@ static struct bin_attribute arcmsr_sysfs_message_read_attr = { .attr = { .name = "mu_read", .mode = S_IRUSR , - .owner = THIS_MODULE, }, .size = 1032, .read = arcmsr_sysfs_iop_message_read, @@ -199,7 +198,6 @@ static struct bin_attribute arcmsr_sysfs_message_write_attr = { .attr = { .name = "mu_write", .mode = S_IWUSR, - .owner = THIS_MODULE, }, .size = 1032, .write = arcmsr_sysfs_iop_message_write, @@ -209,7 +207,6 @@ static struct bin_attribute arcmsr_sysfs_message_clear_attr = { .attr = { .name = "mu_clear", .mode = S_IWUSR, - .owner = THIS_MODULE, }, .size = 1, .write = arcmsr_sysfs_iop_message_clear, diff --git a/drivers/scsi/sr_vendor.c b/drivers/scsi/sr_vendor.c index 4eb3da996b36..4ad3e017213f 100644 --- a/drivers/scsi/sr_vendor.c +++ b/drivers/scsi/sr_vendor.c @@ -223,9 +223,9 @@ int sr_cd_check(struct cdrom_device_info *cdi) no_multi = 1; break; } - min = BCD2BIN(buffer[15]); - sec = BCD2BIN(buffer[16]); - frame = BCD2BIN(buffer[17]); + min = bcd2bin(buffer[15]); + sec = bcd2bin(buffer[16]); + frame = bcd2bin(buffer[17]); sector = min * CD_SECS * CD_FRAMES + sec * CD_FRAMES + frame; break; } @@ -252,9 +252,9 @@ int sr_cd_check(struct cdrom_device_info *cdi) } if (rc != 0) break; - min = BCD2BIN(buffer[1]); - sec = BCD2BIN(buffer[2]); - frame = BCD2BIN(buffer[3]); + min = bcd2bin(buffer[1]); + sec = bcd2bin(buffer[2]); + frame = bcd2bin(buffer[3]); sector = min * CD_SECS * CD_FRAMES + sec * CD_FRAMES + frame; if (sector) sector -= CD_MSF_OFFSET; diff --git a/drivers/serial/8250_gsc.c b/drivers/serial/8250_gsc.c index 0416ad3bc127..418b4fe9a0a1 100644 --- a/drivers/serial/8250_gsc.c +++ b/drivers/serial/8250_gsc.c @@ -111,7 +111,7 @@ static struct parisc_driver serial_driver = { .probe = serial_init_chip, }; -int __init probe_serial_gsc(void) +static int __init probe_serial_gsc(void) { register_parisc_driver(&lasi_driver); register_parisc_driver(&serial_driver); diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c index 8fcb4c5b9a26..7313c2edcb83 100644 --- a/drivers/serial/serial_txx9.c +++ b/drivers/serial/serial_txx9.c @@ -1039,7 +1039,7 @@ static int __devinit serial_txx9_probe(struct platform_device *dev) ret = serial_txx9_register_port(&port); if (ret < 0) { dev_err(&dev->dev, "unable to register port at index %d " - "(IO%x MEM%llx IRQ%d): %d\n", i, + "(IO%lx MEM%llx IRQ%d): %d\n", i, p->iobase, (unsigned long long)p->mapbase, p->irq, ret); } diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c index b73e3c0056cd..d5276c012f78 100644 --- a/drivers/serial/sn_console.c +++ b/drivers/serial/sn_console.c @@ -61,7 +61,7 @@ #define SN_SAL_BUFFER_SIZE (64 * (1 << 10)) #define SN_SAL_UART_FIFO_DEPTH 16 -#define SN_SAL_UART_FIFO_SPEED_CPS 9600/10 +#define SN_SAL_UART_FIFO_SPEED_CPS (9600/10) /* sn_transmit_chars() calling args */ #define TRANSMIT_BUFFERED 0 diff --git a/drivers/staging/go7007/Kconfig b/drivers/staging/go7007/Kconfig index 57a121c338c4..593fdb767aad 100644 --- a/drivers/staging/go7007/Kconfig +++ b/drivers/staging/go7007/Kconfig @@ -1,10 +1,12 @@ config VIDEO_GO7007 tristate "Go 7007 support" depends on VIDEO_DEV && PCI && I2C && INPUT + depends on SND select VIDEOBUF_DMA_SG select VIDEO_IR select VIDEO_TUNER select VIDEO_TVEEPROM + select SND_PCM select CRC32 default N ---help--- diff --git a/drivers/staging/sxg/Kconfig b/drivers/staging/sxg/Kconfig index 1ae350806600..6e6cf0b9ef99 100644 --- a/drivers/staging/sxg/Kconfig +++ b/drivers/staging/sxg/Kconfig @@ -1,6 +1,7 @@ config SXG tristate "Alacritech SLIC Technology Non-Accelerated 10Gbe support" depends on PCI && NETDEV_10000 + depends on X86 default n help This driver supports the Alacritech SLIC Technology Non-Accelerated diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c index 4d74ba36c3a1..37caf4d69037 100644 --- a/drivers/telephony/phonedev.c +++ b/drivers/telephony/phonedev.c @@ -54,7 +54,6 @@ static int phone_open(struct inode *inode, struct file *file) if (minor >= PHONE_NUM_DEVICES) return -ENODEV; - lock_kernel(); mutex_lock(&phone_lock); p = phone_device[minor]; if (p) @@ -81,7 +80,6 @@ static int phone_open(struct inode *inode, struct file *file) fops_put(old_fops); end: mutex_unlock(&phone_lock); - unlock_kernel(); return err; } diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 217c5118ae9e..cd5f20da738a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1002,101 +1002,132 @@ fb_blank(struct fb_info *info, int blank) return ret; } -static int -fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd, +static long +fb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + struct inode *inode = file->f_path.dentry->d_inode; int fbidx = iminor(inode); - struct fb_info *info = registered_fb[fbidx]; - struct fb_ops *fb = info->fbops; + struct fb_info *info; + struct fb_ops *fb; struct fb_var_screeninfo var; struct fb_fix_screeninfo fix; struct fb_con2fbmap con2fb; struct fb_cmap_user cmap; struct fb_event event; void __user *argp = (void __user *)arg; - int i; - - if (!fb) + long ret = 0; + + info = registered_fb[fbidx]; + mutex_lock(&info->lock); + fb = info->fbops; + + if (!fb) { + mutex_unlock(&info->lock); return -ENODEV; + } switch (cmd) { case FBIOGET_VSCREENINFO: - return copy_to_user(argp, &info->var, + ret = copy_to_user(argp, &info->var, sizeof(var)) ? -EFAULT : 0; + break; case FBIOPUT_VSCREENINFO: - if (copy_from_user(&var, argp, sizeof(var))) - return -EFAULT; + if (copy_from_user(&var, argp, sizeof(var))) { + ret = -EFAULT; + break; + } acquire_console_sem(); info->flags |= FBINFO_MISC_USEREVENT; - i = fb_set_var(info, &var); + ret = fb_set_var(info, &var); info->flags &= ~FBINFO_MISC_USEREVENT; release_console_sem(); - if (i) return i; - if (copy_to_user(argp, &var, sizeof(var))) - return -EFAULT; - return 0; + if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) + ret = -EFAULT; + break; case FBIOGET_FSCREENINFO: - return copy_to_user(argp, &info->fix, + ret = copy_to_user(argp, &info->fix, sizeof(fix)) ? -EFAULT : 0; + break; case FBIOPUTCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) - return -EFAULT; - return (fb_set_user_cmap(&cmap, info)); + ret = -EFAULT; + else + ret = fb_set_user_cmap(&cmap, info); + break; case FBIOGETCMAP: if (copy_from_user(&cmap, argp, sizeof(cmap))) - return -EFAULT; - return fb_cmap_to_user(&info->cmap, &cmap); + ret = -EFAULT; + else + ret = fb_cmap_to_user(&info->cmap, &cmap); + break; case FBIOPAN_DISPLAY: - if (copy_from_user(&var, argp, sizeof(var))) - return -EFAULT; + if (copy_from_user(&var, argp, sizeof(var))) { + ret = -EFAULT; + break; + } acquire_console_sem(); - i = fb_pan_display(info, &var); + ret = fb_pan_display(info, &var); release_console_sem(); - if (i) - return i; - if (copy_to_user(argp, &var, sizeof(var))) - return -EFAULT; - return 0; + if (ret == 0 && copy_to_user(argp, &var, sizeof(var))) + ret = -EFAULT; + break; case FBIO_CURSOR: - return -EINVAL; + ret = -EINVAL; + break; case FBIOGET_CON2FBMAP: if (copy_from_user(&con2fb, argp, sizeof(con2fb))) - return -EFAULT; - if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) - return -EINVAL; - con2fb.framebuffer = -1; - event.info = info; - event.data = &con2fb; - fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event); - return copy_to_user(argp, &con2fb, + ret = -EFAULT; + else if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) + ret = -EINVAL; + else { + con2fb.framebuffer = -1; + event.info = info; + event.data = &con2fb; + fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, + &event); + ret = copy_to_user(argp, &con2fb, sizeof(con2fb)) ? -EFAULT : 0; + } + break; case FBIOPUT_CON2FBMAP: - if (copy_from_user(&con2fb, argp, sizeof(con2fb))) - return - EFAULT; - if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) - return -EINVAL; - if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) - return -EINVAL; - if (!registered_fb[con2fb.framebuffer]) - request_module("fb%d", con2fb.framebuffer); + if (copy_from_user(&con2fb, argp, sizeof(con2fb))) { + ret = -EFAULT; + break; + } + if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) { + ret = -EINVAL; + break; + } + if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) { + ret = -EINVAL; + break; + } if (!registered_fb[con2fb.framebuffer]) - return -EINVAL; + request_module("fb%d", con2fb.framebuffer); + if (!registered_fb[con2fb.framebuffer]) { + ret = -EINVAL; + break; + } event.info = info; event.data = &con2fb; - return fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, + ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP, &event); + break; case FBIOBLANK: acquire_console_sem(); info->flags |= FBINFO_MISC_USEREVENT; - i = fb_blank(info, arg); + ret = fb_blank(info, arg); info->flags &= ~FBINFO_MISC_USEREVENT; release_console_sem(); - return i; + break;; default: if (fb->fb_ioctl == NULL) - return -EINVAL; - return fb->fb_ioctl(info, cmd, arg); + ret = -ENOTTY; + else + ret = fb->fb_ioctl(info, cmd, arg); } + mutex_unlock(&info->lock); + return ret; } #ifdef CONFIG_COMPAT @@ -1150,7 +1181,7 @@ static int fb_getput_cmap(struct inode *inode, struct file *file, put_user(compat_ptr(data), &cmap->transp)) return -EFAULT; - err = fb_ioctl(inode, file, cmd, (unsigned long) cmap); + err = fb_ioctl(file, cmd, (unsigned long) cmap); if (!err) { if (copy_in_user(&cmap32->start, @@ -1204,7 +1235,7 @@ static int fb_get_fscreeninfo(struct inode *inode, struct file *file, old_fs = get_fs(); set_fs(KERNEL_DS); - err = fb_ioctl(inode, file, cmd, (unsigned long) &fix); + err = fb_ioctl(file, cmd, (unsigned long) &fix); set_fs(old_fs); if (!err) @@ -1222,7 +1253,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct fb_ops *fb = info->fbops; long ret = -ENOIOCTLCMD; - lock_kernel(); + mutex_lock(&info->lock); switch(cmd) { case FBIOGET_VSCREENINFO: case FBIOPUT_VSCREENINFO: @@ -1231,7 +1262,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FBIOPUT_CON2FBMAP: arg = (unsigned long) compat_ptr(arg); case FBIOBLANK: - ret = fb_ioctl(inode, file, cmd, arg); + ret = fb_ioctl(file, cmd, arg); break; case FBIOGET_FSCREENINFO: @@ -1248,7 +1279,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ret = fb->fb_compat_ioctl(info, cmd, arg); break; } - unlock_kernel(); + mutex_unlock(&info->lock); return ret; } #endif @@ -1270,13 +1301,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) return -ENODEV; if (fb->fb_mmap) { int res; - lock_kernel(); + mutex_lock(&info->lock); res = fb->fb_mmap(info, vma); - unlock_kernel(); + mutex_unlock(&info->lock); return res; } - lock_kernel(); + mutex_lock(&info->lock); /* frame buffer memory */ start = info->fix.smem_start; @@ -1285,13 +1316,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma) /* memory mapped io */ off -= len; if (info->var.accel_flags) { - unlock_kernel(); + mutex_unlock(&info->lock); return -EINVAL; } start = info->fix.mmio_start; len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.mmio_len); } - unlock_kernel(); + mutex_unlock(&info->lock); start &= PAGE_MASK; if ((vma->vm_end - vma->vm_start + off) > len) return -EINVAL; @@ -1315,13 +1346,13 @@ fb_open(struct inode *inode, struct file *file) if (fbidx >= FB_MAX) return -ENODEV; - lock_kernel(); - if (!(info = registered_fb[fbidx])) + info = registered_fb[fbidx]; + if (!info) request_module("fb%d", fbidx); - if (!(info = registered_fb[fbidx])) { - res = -ENODEV; - goto out; - } + info = registered_fb[fbidx]; + if (!info) + return -ENODEV; + mutex_lock(&info->lock); if (!try_module_get(info->fbops->owner)) { res = -ENODEV; goto out; @@ -1337,7 +1368,7 @@ fb_open(struct inode *inode, struct file *file) fb_deferred_io_open(info, inode, file); #endif out: - unlock_kernel(); + mutex_unlock(&info->lock); return res; } @@ -1346,11 +1377,11 @@ fb_release(struct inode *inode, struct file *file) { struct fb_info * const info = file->private_data; - lock_kernel(); + mutex_lock(&info->lock); if (info->fbops->fb_release) info->fbops->fb_release(info,1); module_put(info->fbops->owner); - unlock_kernel(); + mutex_unlock(&info->lock); return 0; } @@ -1358,7 +1389,7 @@ static const struct file_operations fb_fops = { .owner = THIS_MODULE, .read = fb_read, .write = fb_write, - .ioctl = fb_ioctl, + .unlocked_ioctl = fb_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fb_compat_ioctl, #endif @@ -1429,6 +1460,7 @@ register_framebuffer(struct fb_info *fb_info) if (!registered_fb[i]) break; fb_info->node = i; + mutex_init(&fb_info->lock); fb_info->dev = device_create(fb_class, fb_info->device, MKDEV(FB_MAJOR, i), NULL, "fb%d", i); diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c index ed6b0576208c..1f09d4e4144c 100644 --- a/drivers/w1/slaves/w1_ds2760.c +++ b/drivers/w1/slaves/w1_ds2760.c @@ -80,7 +80,6 @@ static struct bin_attribute w1_ds2760_bin_attr = { .attr = { .name = "w1_slave", .mode = S_IRUGO, - .owner = THIS_MODULE, }, .size = DS2760_DATA_SIZE, .read = w1_ds2760_read_bin, diff --git a/fs/Kconfig b/fs/Kconfig index d0a1174fb516..4eca61c201f0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1168,195 +1168,7 @@ config EFS_FS To compile the EFS file system support as a module, choose M here: the module will be called efs. -config JFFS2_FS - tristate "Journalling Flash File System v2 (JFFS2) support" - select CRC32 - depends on MTD - help - JFFS2 is the second generation of the Journalling Flash File System - for use on diskless embedded devices. It provides improved wear - levelling, compression and support for hard links. You cannot use - this on normal block devices, only on 'MTD' devices. - - Further information on the design and implementation of JFFS2 is - available at <http://sources.redhat.com/jffs2/>. - -config JFFS2_FS_DEBUG - int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" - depends on JFFS2_FS - default "0" - help - This controls the amount of debugging messages produced by the JFFS2 - code. Set it to zero for use in production systems. For evaluation, - testing and debugging, it's advisable to set it to one. This will - enable a few assertions and will print debugging messages at the - KERN_DEBUG loglevel, where they won't normally be visible. Level 2 - is unlikely to be useful - it enables extra debugging in certain - areas which at one point needed debugging, but when the bugs were - located and fixed, the detailed messages were relegated to level 2. - - If reporting bugs, please try to have available a full dump of the - messages at debug level 1 while the misbehaviour was occurring. - -config JFFS2_FS_WRITEBUFFER - bool "JFFS2 write-buffering support" - depends on JFFS2_FS - default y - help - This enables the write-buffering support in JFFS2. - - This functionality is required to support JFFS2 on the following - types of flash devices: - - NAND flash - - NOR flash with transparent ECC - - DataFlash - -config JFFS2_FS_WBUF_VERIFY - bool "Verify JFFS2 write-buffer reads" - depends on JFFS2_FS_WRITEBUFFER - default n - help - This causes JFFS2 to read back every page written through the - write-buffer, and check for errors. - -config JFFS2_SUMMARY - bool "JFFS2 summary support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - This feature makes it possible to use summary information - for faster filesystem mount. - - The summary information can be inserted into a filesystem image - by the utility 'sumtool'. - - If unsure, say 'N'. - -config JFFS2_FS_XATTR - bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - -config JFFS2_FS_POSIX_ACL - bool "JFFS2 POSIX Access Control Lists" - depends on JFFS2_FS_XATTR - default y - select FS_POSIX_ACL - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - To learn more about Access Control Lists, visit the Posix ACLs for - Linux website <http://acl.bestbits.at/>. - - If you don't know what Access Control Lists are, say N - -config JFFS2_FS_SECURITY - bool "JFFS2 Security Labels" - depends on JFFS2_FS_XATTR - default y - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the jffs2 filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. - -config JFFS2_COMPRESSION_OPTIONS - bool "Advanced compression options for JFFS2" - depends on JFFS2_FS - default n - help - Enabling this option allows you to explicitly choose which - compression modules, if any, are enabled in JFFS2. Removing - compressors can mean you cannot read existing file systems, - and enabling experimental compressors can mean that you - write a file system which cannot be read by a standard kernel. - - If unsure, you should _definitely_ say 'N'. - -config JFFS2_ZLIB - bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS - select ZLIB_INFLATE - select ZLIB_DEFLATE - depends on JFFS2_FS - default y - help - Zlib is designed to be a free, general-purpose, legally unencumbered, - lossless data-compression library for use on virtually any computer - hardware and operating system. See <http://www.gzip.org/zlib/> for - further information. - - Say 'Y' if unsure. - -config JFFS2_LZO - bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS - select LZO_COMPRESS - select LZO_DECOMPRESS - depends on JFFS2_FS - default n - help - minilzo-based compression. Generally works better than Zlib. - - This feature was added in July, 2007. Say 'N' if you need - compatibility with older bootloaders or kernels. - -config JFFS2_RTIME - bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default y - help - Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. - -config JFFS2_RUBIN - bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS - depends on JFFS2_FS - default n - help - RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. - -choice - prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS - default JFFS2_CMODE_PRIORITY - depends on JFFS2_FS - help - You can set here the default compression mode of JFFS2 from - the available compression modes. Don't touch if unsure. - -config JFFS2_CMODE_NONE - bool "no compression" - help - Uses no compression. - -config JFFS2_CMODE_PRIORITY - bool "priority" - help - Tries the compressors in a predefined order and chooses the first - successful one. - -config JFFS2_CMODE_SIZE - bool "size (EXPERIMENTAL)" - help - Tries all compressors and chooses the one which has the smallest - result. - -config JFFS2_CMODE_FAVOURLZO - bool "Favour LZO" - help - Tries all compressors and chooses the one which has the smallest - result but gives some preference to LZO (which has faster - decompression) at the expense of size. - -endchoice - +source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" @@ -1913,148 +1725,7 @@ config SMB_NLS_REMOTE smbmount from samba 2.2.0 or later supports this. -config CIFS - tristate "CIFS support (advanced network filesystem, SMBFS successor)" - depends on INET - select NLS - help - This is the client VFS module for the Common Internet File System - (CIFS) protocol which is the successor to the Server Message Block - (SMB) protocol, the native file sharing mechanism for most early - PC operating systems. The CIFS protocol is fully supported by - file servers such as Windows 2000 (including Windows 2003, NT 4 - and Windows XP) as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems). Limited - support for OS/2 and Windows ME and similar servers is provided as - well. - - The cifs module provides an advanced network file system - client for mounting to CIFS compliant servers. It includes - support for DFS (hierarchical name space), secure per-user - session establishment via Kerberos or NTLM or NTLMv2, - safe distributed caching (oplock), optional packet - signing, Unicode and other internationalization improvements. - If you need to mount to Samba or Windows from this machine, say Y. - -config CIFS_STATS - bool "CIFS statistics" - depends on CIFS - help - Enabling this option will cause statistics for each server share - mounted by the cifs client to be displayed in /proc/fs/cifs/Stats - -config CIFS_STATS2 - bool "Extended statistics" - depends on CIFS_STATS - help - Enabling this option will allow more detailed statistics on SMB - request timing to be displayed in /proc/fs/cifs/DebugData and also - allow optional logging of slow responses to dmesg (depending on the - value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). - These additional statistics may have a minor effect on performance - and memory utilization. - - Unless you are a developer or are doing network performance analysis - or tuning, say N. - -config CIFS_WEAK_PW_HASH - bool "Support legacy servers which use weaker LANMAN security" - depends on CIFS - help - Modern CIFS servers including Samba and most Windows versions - (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) - security mechanisms. These hash the password more securely - than the mechanisms used in the older LANMAN version of the - SMB protocol but LANMAN based authentication is needed to - establish sessions with some old SMB servers. - - Enabling this option allows the cifs module to mount to older - LANMAN based servers such as OS/2 and Windows 95, but such - mounts may be less secure than mounts using NTLM or more recent - security mechanisms if you are on a public network. Unless you - have a need to access old SMB servers (and are on a private - network) you probably want to say N. Even if this support - is enabled in the kernel build, LANMAN authentication will not be - used automatically. At runtime LANMAN mounts are disabled but - can be set to required (or optional) either in - /proc/fs/cifs (see fs/cifs/README for more detail) or via an - option on the mount command. This support is disabled by - default in order to reduce the possibility of a downgrade - attack. - - If unsure, say N. - -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - -config CIFS_XATTR - bool "CIFS extended attributes" - depends on CIFS - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). CIFS maps the name of - extended attributes beginning with the user namespace prefix - to SMB/CIFS EAs. EAs are stored on Windows servers without the - user namespace prefix, but their names are seen by Linux cifs clients - prefaced by the user namespace prefix. The system namespace - (used by some filesystems to store ACLs) is not supported at - this time. - - If unsure, say N. - -config CIFS_POSIX - bool "CIFS POSIX Extensions" - depends on CIFS_XATTR - help - Enabling this option will cause the cifs client to attempt to - negotiate a newer dialect with servers, such as Samba 3.0.5 - or later, that optionally can handle more POSIX like (rather - than Windows like) file behavior. It also enables - support for POSIX ACLs (getfacl and setfacl) to servers - (such as Samba 3.10 and later) which can negotiate - CIFS POSIX ACL support. If unsure, say N. - -config CIFS_DEBUG2 - bool "Enable additional CIFS debugging routines" - depends on CIFS - help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. - -config CIFS_EXPERIMENTAL - bool "CIFS Experimental Features (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL - help - Enables cifs features under testing. These features are - experimental and currently include DFS support and directory - change notification ie fcntl(F_DNOTIFY), as well as the upcall - mechanism which will be used for Kerberos session negotiation - and uid remapping. Some of these features also may depend on - setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental - (which is disabled by default). See the file fs/cifs/README - for more details. If unsure, say N. - -config CIFS_DFS_UPCALL - bool "DFS feature support (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which contacts userspace - helper utilities to provide server name resolution (host names to - IP addresses) which is needed for implicit mounts of DFS junction - points. If unsure, say N. +source "fs/cifs/Kconfig" config NCP_FS tristate "NCP file system support (to mount NetWare volumes)" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 801db1341811..ce9fb3fbfae4 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. +config CORE_DUMP_DEFAULT_ELF_HEADERS + bool "Write ELF core dumps with partial segments" + default n + depends on BINFMT_ELF + help + ELF core dump files describe each memory mapping of the crashed + process, and can contain or omit the memory contents of each one. + The contents of an unmodified text mapping are omitted by default. + + For an unmodified text mapping of an ELF object, including just + the first page of the file in a core dump makes it possible to + identify the build ID bits in the file, without paying the i/o + cost and disk space to dump all the text. However, versions of + GDB before 6.7 are confused by ELF core dump files in this format. + + The core dump behavior can be controlled per process using + the /proc/PID/coredump_filter pseudo-file; this setting is + inherited. See Documentation/filesystems/proc.txt for details. + + This config option changes the default setting of coredump_filter + seen at boot time. If unsure, say N. + config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU && (!FRV || BROKEN) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c76afa26edf7..e2159063198a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off) static unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) { +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + /* The vma can be set up to tell us the answer directly. */ if (vma->vm_flags & VM_ALWAYSDUMP) goto whole; + /* Hugetlb memory check */ + if (vma->vm_flags & VM_HUGETLB) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + } + /* Do not dump I/O mapped devices or special mappings */ if (vma->vm_flags & (VM_IO | VM_RESERVED)) return 0; -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - /* By default, dump shared memory if mapped from an anonymous file. */ if (vma->vm_flags & VM_SHARED) { if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ? diff --git a/fs/buffer.c b/fs/buffer.c index ac78d4c19b3b..6569fda5cfed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer); void unlock_buffer(struct buffer_head *bh) { - smp_mb__before_clear_bit(); - clear_buffer_locked(bh); + clear_bit_unlock(BH_Lock, &bh->b_state); smp_mb__after_clear_bit(); wake_up_bit(&bh->b_state, BH_Lock); } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig new file mode 100644 index 000000000000..341a98965bd0 --- /dev/null +++ b/fs/cifs/Kconfig @@ -0,0 +1,142 @@ +config CIFS + tristate "CIFS support (advanced network filesystem, SMBFS successor)" + depends on INET + select NLS + help + This is the client VFS module for the Common Internet File System + (CIFS) protocol which is the successor to the Server Message Block + (SMB) protocol, the native file sharing mechanism for most early + PC operating systems. The CIFS protocol is fully supported by + file servers such as Windows 2000 (including Windows 2003, NT 4 + and Windows XP) as well by Samba (which provides excellent CIFS + server support for Linux and many other operating systems). Limited + support for OS/2 and Windows ME and similar servers is provided as + well. + + The cifs module provides an advanced network file system + client for mounting to CIFS compliant servers. It includes + support for DFS (hierarchical name space), secure per-user + session establishment via Kerberos or NTLM or NTLMv2, + safe distributed caching (oplock), optional packet + signing, Unicode and other internationalization improvements. + If you need to mount to Samba or Windows from this machine, say Y. + +config CIFS_STATS + bool "CIFS statistics" + depends on CIFS + help + Enabling this option will cause statistics for each server share + mounted by the cifs client to be displayed in /proc/fs/cifs/Stats + +config CIFS_STATS2 + bool "Extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol but LANMAN based authentication is needed to + establish sessions with some old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, LANMAN authentication will not be + used automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + +config CIFS_XATTR + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). CIFS maps the name of + extended attributes beginning with the user namespace prefix + to SMB/CIFS EAs. EAs are stored on Windows servers without the + user namespace prefix, but their names are seen by Linux cifs clients + prefaced by the user namespace prefix. The system namespace + (used by some filesystems to store ACLs) is not supported at + this time. + + If unsure, say N. + +config CIFS_POSIX + bool "CIFS POSIX Extensions" + depends on CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to + negotiate a newer dialect with servers, such as Samba 3.0.5 + or later, that optionally can handle more POSIX like (rather + than Windows like) file behavior. It also enables + support for POSIX ACLs (getfacl and setfacl) to servers + (such as Samba 3.10 and later) which can negotiate + CIFS POSIX ACL support. If unsure, say N. + +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + depends on CIFS + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + +config CIFS_EXPERIMENTAL + bool "CIFS Experimental Features (EXPERIMENTAL)" + depends on CIFS && EXPERIMENTAL + help + Enables cifs features under testing. These features are + experimental and currently include DFS support and directory + change notification ie fcntl(F_DNOTIFY), as well as the upcall + mechanism which will be used for Kerberos session negotiation + and uid remapping. Some of these features also may depend on + setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental + (which is disabled by default). See the file fs/cifs/README + for more details. If unsure, say N. + +config CIFS_DFS_UPCALL + bool "DFS feature support (EXPERIMENTAL)" + depends on CIFS_EXPERIMENTAL + depends on KEYS + help + Enables an upcall mechanism for CIFS which contacts userspace + helper utilities to provide server name resolution (host names to + IP addresses) which is needed for implicit mounts of DFS junction + points. If unsure, say N. diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c4a8a0605125..62d8bd8f14c0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { diff --git a/fs/exec.c b/fs/exec.c index a41e7902ed0b..4e834f16d9da 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1386,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt); * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(char *corename, int nr_threads, long signr) +static int format_corename(char *corename, long signr) { const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); @@ -1493,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr) * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ - if (!ispipe && !pid_in_pattern - && (core_uses_pid || nr_threads)) { + if (!ispipe && !pid_in_pattern && core_uses_pid) { rc = snprintf(out_ptr, out_end - out_ptr, ".%d", task_tgid_vnr(current)); if (rc > out_end - out_ptr) @@ -1757,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) * uses lock_kernel() */ lock_kernel(); - ispipe = format_corename(corename, retval, signr); + ispipe = format_corename(corename, signr); unlock_kernel(); /* * Don't bother to check the RLIMIT_CORE value if core_pattern points diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 92fd0338a6eb..f5b57a2ca35a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1547,6 +1547,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1585,7 +1586,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 2eea96ec78ed..4c82531ea0a8 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext3_error (sb, "ext3_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext3_error(sb, __func__, "directory #%lu " + "contains a hole at offset %lld", + inode->i_ino, filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT3_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ebfec4d0148e..f8424ad89971 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1186,6 +1186,13 @@ write_begin_failed: ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 77278e947e94..78fdf3836370 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, - EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + EXT3_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 399a96a6c556..3a260af5544d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext3_show_quota_options(seq, sb); return 0; @@ -754,6 +757,7 @@ enum { Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, @@ -796,6 +800,8 @@ static const match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JFS_BARRIER; else journal->j_flags &= ~JFS_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index fec8f61227ff..0022eec63cda 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, goto done; } + if (inode->i_ino == HFSPLUS_EXT_CNID) + return -EIO; + mutex_lock(&HFSPLUS_I(inode).extents_lock); res = hfsplus_ext_read_extent(inode, ablock); if (!res) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b085d64a2b67..963be644297a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode).rsrc_inode; + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode).opencnt); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ae08c057e751..25719d902c51 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal) printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); err = 0; } @@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -762,6 +765,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) @@ -852,6 +858,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 0540ca27a446..d15cd6e7251e 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + if (jh->b_transaction != NULL) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); @@ -1108,7 +1118,7 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } /** diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig new file mode 100644 index 000000000000..6ae169cd8faa --- /dev/null +++ b/fs/jffs2/Kconfig @@ -0,0 +1,188 @@ +config JFFS2_FS + tristate "Journalling Flash File System v2 (JFFS2) support" + select CRC32 + depends on MTD + help + JFFS2 is the second generation of the Journalling Flash File System + for use on diskless embedded devices. It provides improved wear + levelling, compression and support for hard links. You cannot use + this on normal block devices, only on 'MTD' devices. + + Further information on the design and implementation of JFFS2 is + available at <http://sources.redhat.com/jffs2/>. + +config JFFS2_FS_DEBUG + int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)" + depends on JFFS2_FS + default "0" + help + This controls the amount of debugging messages produced by the JFFS2 + code. Set it to zero for use in production systems. For evaluation, + testing and debugging, it's advisable to set it to one. This will + enable a few assertions and will print debugging messages at the + KERN_DEBUG loglevel, where they won't normally be visible. Level 2 + is unlikely to be useful - it enables extra debugging in certain + areas which at one point needed debugging, but when the bugs were + located and fixed, the detailed messages were relegated to level 2. + + If reporting bugs, please try to have available a full dump of the + messages at debug level 1 while the misbehaviour was occurring. + +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" + depends on JFFS2_FS + default y + help + This enables the write-buffering support in JFFS2. + + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash + +config JFFS2_FS_WBUF_VERIFY + bool "Verify JFFS2 write-buffer reads" + depends on JFFS2_FS_WRITEBUFFER + default n + help + This causes JFFS2 to read back every page written through the + write-buffer, and check for errors. + +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + +config JFFS2_FS_XATTR + bool "JFFS2 XATTR support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + +config JFFS2_FS_POSIX_ACL + bool "JFFS2 POSIX Access Control Lists" + depends on JFFS2_FS_XATTR + default y + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFFS2_FS_SECURITY + bool "JFFS2 Security Labels" + depends on JFFS2_FS_XATTR + default y + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jffs2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFFS2_COMPRESSION_OPTIONS + bool "Advanced compression options for JFFS2" + depends on JFFS2_FS + default n + help + Enabling this option allows you to explicitly choose which + compression modules, if any, are enabled in JFFS2. Removing + compressors can mean you cannot read existing file systems, + and enabling experimental compressors can mean that you + write a file system which cannot be read by a standard kernel. + + If unsure, you should _definitely_ say 'N'. + +config JFFS2_ZLIB + bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS + select ZLIB_INFLATE + select ZLIB_DEFLATE + depends on JFFS2_FS + default y + help + Zlib is designed to be a free, general-purpose, legally unencumbered, + lossless data-compression library for use on virtually any computer + hardware and operating system. See <http://www.gzip.org/zlib/> for + further information. + + Say 'Y' if unsure. + +config JFFS2_LZO + bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS + select LZO_COMPRESS + select LZO_DECOMPRESS + depends on JFFS2_FS + default n + help + minilzo-based compression. Generally works better than Zlib. + + This feature was added in July, 2007. Say 'N' if you need + compatibility with older bootloaders or kernels. + +config JFFS2_RTIME + bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default y + help + Rtime does manage to recompress already-compressed data. Say 'Y' if unsure. + +config JFFS2_RUBIN + bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS + depends on JFFS2_FS + default n + help + RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure. + +choice + prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS + default JFFS2_CMODE_PRIORITY + depends on JFFS2_FS + help + You can set here the default compression mode of JFFS2 from + the available compression modes. Don't touch if unsure. + +config JFFS2_CMODE_NONE + bool "no compression" + help + Uses no compression. + +config JFFS2_CMODE_PRIORITY + bool "priority" + help + Tries the compressors in a predefined order and chooses the first + successful one. + +config JFFS2_CMODE_SIZE + bool "size (EXPERIMENTAL)" + help + Tries all compressors and chooses the one which has the smallest + result. + +config JFFS2_CMODE_FAVOURLZO + bool "Favour LZO" + help + Tries all compressors and chooses the one which has the smallest + result but gives some preference to LZO (which has faster + decompression) at the expense of size. + +endchoice diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 86739ee53b37..f25e70c1b51c 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, } /* jffs2_compress: - * @data: Pointer to uncompressed data - * @cdata: Pointer to returned pointer to buffer for compressed data + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data * @datalen: On entry, holds the amount of data available for compression. * On exit, expected to hold the amount of data actually compressed. * @cdatalen: On entry, holds the amount of space available for compressed diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index cd219ef55254..b1aaae823a52 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char /* FIXME: If you care. We'd need to use frags for the target if it grows much more than this */ if (targetlen > 254) - return -EINVAL; + return -ENAMETOOLONG; ri = jffs2_alloc_raw_inode(); diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index dddb2a6c9e2c..259461b910af 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr->len = c->sector_size; instr->callback = jffs2_erase_callback; instr->priv = (unsigned long)(&instr[1]); - instr->fail_addr = 0xffffffff; + instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; ((struct erase_priv_struct *)instr->priv)->jeb = jeb; ((struct erase_priv_struct *)instr->priv)->c = c; @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 086c43830221..249305d65d5b 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = JFFS2_MAX_NAME_LEN; + buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC; + buf->f_fsid.val[1] = c->mtd->index; spin_lock(&c->erase_completion_lock); avail = c->dirty_size + c->free_size; @@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ - ri->uid = cpu_to_je16(current->fsuid); + ri->uid = cpu_to_je16(current_fsuid()); if (dir_i->i_mode & S_ISGID) { ri->gid = cpu_to_je16(dir_i->i_gid); if (S_ISDIR(mode)) mode |= S_ISGID; } else { - ri->gid = cpu_to_je16(current->fsgid); + ri->gid = cpu_to_je16(current_fsgid()); } /* POSIX ACLs have to be processed now, at least partly. diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index a9bf9603c1ba..0875b60b4bf7 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) jffs2_sum_reset_collected(c->summary); /* reset collected summary */ + /* adjust write buffer offset, else we get a non contiguous write bug */ + if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len) + c->wbuf_ofs = 0xffffffff; + D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset)); return 0; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 0e78b00035e4..d9a721e6db70 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) memset(c->wbuf,0xff,c->wbuf_pagesize); /* adjust write buffer offset, else we get a non contiguous write bug */ - if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize)) - c->wbuf_ofs += c->wbuf_pagesize; - else - c->wbuf_ofs = 0xffffffff; + c->wbuf_ofs += c->wbuf_pagesize; c->wbuf_len = 0; return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2ab70d46ecbc..efdba2e802d7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index d020866d4232..3140a4429af1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 59ea42e1ef03..61b25f4eabe6 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, unsigned long allowed; struct vmalloc_info vmi; long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; /* * display in kilobytes. @@ -154,51 +156,70 @@ static int meminfo_read_proc(char *page, char **start, off_t off, get_vmalloc_info(&vmi); + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + /* * Tagged format, for easy grepping and expansion. */ len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" #endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" + "Quicklists: %8lu kB\n" #endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), K(cached), K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), +#ifdef CONFIG_UNEVICTABLE_LRU + K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), +#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 841368b87a29..cd9ca67f841b 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,9 +32,6 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. */ @@ -647,7 +644,7 @@ static int __init vmcore_init(void) int rc = 0; /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ - if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX)) + if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); if (rc) { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5145cb9125af..76acdbc34611 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b13123424e49..f031d1c925f0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/seq_file.c b/fs/seq_file.c index bd20f7f5a933..eba2eabcd2b8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) { - size_t len = bitmap_scnprintf_len(nr_bits); + if (m->count < m->size) { + int len = bitmap_scnprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } + } + m->count = m->size; + return -1; +} +EXPORT_SYMBOL(seq_bitmap); - if (m->count + len < m->size) { - bitmap_scnprintf(m->buf + m->count, m->size - m->count, - bits, nr_bits); - m->count += len; - return 0; +int seq_bitmap_list(struct seq_file *m, unsigned long *bits, + unsigned int nr_bits) +{ + if (m->count < m->size) { + int len = bitmap_scnlistprintf(m->buf + m->count, + m->size - m->count, bits, nr_bits); + if (m->count + len < m->size) { + m->count += len; + return 0; + } } m->count = m->size; return -1; } +EXPORT_SYMBOL(seq_bitmap_list); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h index 7efe1000f99d..cee97f14af3b 100644 --- a/include/asm-cris/thread_info.h +++ b/include/asm-cris/thread_info.h @@ -88,6 +88,7 @@ struct thread_info { #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 +#define TIF_FREEZE 18 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) @@ -95,6 +96,7 @@ struct thread_info { #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) +#define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h index 71ef3f0b9685..89061c1a67d4 100644 --- a/include/asm-generic/rtc.h +++ b/include/asm-generic/rtc.h @@ -84,12 +84,12 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BCD_TO_BIN(time->tm_sec); - BCD_TO_BIN(time->tm_min); - BCD_TO_BIN(time->tm_hour); - BCD_TO_BIN(time->tm_mday); - BCD_TO_BIN(time->tm_mon); - BCD_TO_BIN(time->tm_year); + time->tm_sec = bcd2bin(time->tm_sec); + time->tm_min = bcd2bin(time->tm_min); + time->tm_hour = bcd2bin(time->tm_hour); + time->tm_mday = bcd2bin(time->tm_mday); + time->tm_mon = bcd2bin(time->tm_mon); + time->tm_year = bcd2bin(time->tm_year); } #ifdef CONFIG_MACH_DECSTATION @@ -159,12 +159,12 @@ static inline int set_rtc_time(struct rtc_time *time) if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(sec); - BIN_TO_BCD(min); - BIN_TO_BCD(hrs); - BIN_TO_BCD(day); - BIN_TO_BCD(mon); - BIN_TO_BCD(yrs); + sec = bin2bcd(sec); + min = bin2bcd(min); + hrs = bin2bcd(hrs); + day = bin2bcd(day); + mon = bin2bcd(mon); + yrs = bin2bcd(yrs); } save_control = CMOS_READ(RTC_CONTROL); diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h index abc002798a2b..af0fda46e94b 100644 --- a/include/asm-m68k/thread_info.h +++ b/include/asm-m68k/thread_info.h @@ -52,5 +52,6 @@ struct thread_info { #define TIF_DELAYED_TRACE 14 /* single step a syscall */ #define TIF_SYSCALL_TRACE 15 /* syscall trace active */ #define TIF_MEMDIE 16 +#define TIF_FREEZE 17 /* thread is freezing for suspend */ #endif /* _ASM_M68K_THREAD_INFO_H */ diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h index 9f812741c355..0407959da489 100644 --- a/include/asm-parisc/thread_info.h +++ b/include/asm-parisc/thread_info.h @@ -58,6 +58,7 @@ struct thread_info { #define TIF_32BIT 4 /* 32 bit binary */ #define TIF_MEMDIE 5 #define TIF_RESTORE_SIGMASK 6 /* restore saved signal mask */ +#define TIF_FREEZE 7 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -65,6 +66,7 @@ struct thread_info { #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h index e07e72846c7a..62274ab9471f 100644 --- a/include/asm-um/thread_info.h +++ b/include/asm-um/thread_info.h @@ -69,6 +69,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 5 #define TIF_SYSCALL_AUDIT 6 #define TIF_RESTORE_SIGMASK 7 +#define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -77,5 +78,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1 << TIF_FREEZE) #endif diff --git a/include/asm-xtensa/thread_info.h b/include/asm-xtensa/thread_info.h index 7e4131dd546c..0f4fe1faf9ba 100644 --- a/include/asm-xtensa/thread_info.h +++ b/include/asm-xtensa/thread_info.h @@ -134,6 +134,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 5 #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +#define TIF_FREEZE 17 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) @@ -142,6 +143,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_IRET (1<<TIF_IRET) #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) +#define _TIF_FREEZE (1<<TIF_FREEZE) #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ diff --git a/include/linux/Kbuild b/include/linux/Kbuild index bf9aca548f14..e531783e5d78 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -183,6 +183,7 @@ unifdef-y += auto_fs.h unifdef-y += auxvec.h unifdef-y += binfmts.h unifdef-y += blktrace_api.h +unifdef-y += byteorder.h unifdef-y += capability.h unifdef-y += capi.h unifdef-y += cciss_ioctl.h @@ -340,6 +341,7 @@ unifdef-y += soundcard.h unifdef-y += stat.h unifdef-y += stddef.h unifdef-y += string.h +unifdef-y += swab.h unifdef-y += synclink.h unifdef-y += sysctl.h unifdef-y += tcp.h diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0a24d5550eb3..bee52abb8a4d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -175,6 +175,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_READ_MAP: Can be mapped for reading * BDI_CAP_WRITE_MAP: Can be mapped for writing * BDI_CAP_EXEC_MAP: Can be mapped for execution + * + * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -184,6 +186,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_WRITE_MAP 0x00000020 #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 +#define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -248,6 +251,11 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) BDI_CAP_NO_WRITEBACK)); } +static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SWAP_BACKED; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); @@ -258,4 +266,9 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping) return bdi_cap_account_dirty(mapping->backing_dev_info); } +static inline bool mapping_cap_swap_backed(struct address_space *mapping) +{ + return bdi_cap_swap_backed(mapping->backing_dev_info); +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/bcd.h b/include/linux/bcd.h index 7ac518e3c152..22ea563ba3eb 100644 --- a/include/linux/bcd.h +++ b/include/linux/bcd.h @@ -1,12 +1,3 @@ -/* Permission is hereby granted to copy, modify and redistribute this code - * in terms of the GNU Library General Public License, Version 2 or later, - * at your option. - */ - -/* macros to translate to/from binary and binary-coded decimal (frequently - * found in RTC chips). - */ - #ifndef _BCD_H #define _BCD_H @@ -15,11 +6,4 @@ unsigned bcd2bin(unsigned char val) __attribute_const__; unsigned char bin2bcd(unsigned val) __attribute_const__; -#define BCD2BIN(val) bcd2bin(val) -#define BIN2BCD(val) bin2bcd(val) - -/* backwards compat */ -#define BCD_TO_BIN(val) ((val)=BCD2BIN(val)) -#define BIN_TO_BCD(val) ((val)=BIN2BCD(val)) - #endif /* _BCD_H */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 89781fd48859..1abfe664c444 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -110,7 +110,6 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits); extern int bitmap_scnprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); -extern int bitmap_scnprintf_len(unsigned int nr_bits); extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index eadaab44015f..3ce64b90118c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -322,7 +322,7 @@ static inline void wait_on_buffer(struct buffer_head *bh) static inline int trylock_buffer(struct buffer_head *bh) { - return likely(!test_and_set_bit(BH_Lock, &bh->b_state)); + return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); } static inline void lock_buffer(struct buffer_head *bh) diff --git a/include/linux/byteorder/Kbuild b/include/linux/byteorder/Kbuild index 1133d5f9d818..fbaa7f9cee32 100644 --- a/include/linux/byteorder/Kbuild +++ b/include/linux/byteorder/Kbuild @@ -1,3 +1,4 @@ unifdef-y += big_endian.h unifdef-y += little_endian.h unifdef-y += swab.h +unifdef-y += swabb.h diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h index 44f95b92393b..1cba3f3efe5f 100644 --- a/include/linux/byteorder/big_endian.h +++ b/include/linux/byteorder/big_endian.h @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/byteorder/swab.h> +#include <linux/byteorder/swabb.h> #define __constant_htonl(x) ((__force __be32)(__u32)(x)) #define __constant_ntohl(x) ((__force __u32)(__be32)(x)) diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h index 4cc170a31762..cedc1b5a289c 100644 --- a/include/linux/byteorder/little_endian.h +++ b/include/linux/byteorder/little_endian.h @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/byteorder/swab.h> +#include <linux/byteorder/swabb.h> #define __constant_htonl(x) ((__force __be32)___constant_swab32((x))) #define __constant_ntohl(x) ___constant_swab32((__force __be32)(x)) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 30934e4bfaab..8b00f6643e93 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -9,12 +9,12 @@ */ #include <linux/sched.h> -#include <linux/kref.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> #include <linux/cgroupstats.h> #include <linux/prio_heap.h> +#include <linux/rwsem.h> #ifdef CONFIG_CGROUPS @@ -137,6 +137,15 @@ struct cgroup { * release_list_lock */ struct list_head release_list; + + /* pids_mutex protects the fields below */ + struct rw_semaphore pids_mutex; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the current tasks_pids array */ + int pids_use_count; + /* Length of the current tasks_pids array */ + int pids_length; }; /* A css_set is a structure holding pointers to a set of @@ -149,7 +158,7 @@ struct cgroup { struct css_set { /* Reference count */ - struct kref ref; + atomic_t refcount; /* * List running through all cgroup groups in the same hash @@ -394,6 +403,9 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); +void cgroup_mm_owner_callbacks(struct task_struct *old, + struct task_struct *new); + #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } @@ -412,15 +424,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats, return -EINVAL; } +static inline void cgroup_mm_owner_callbacks(struct task_struct *old, + struct task_struct *new) {} + #endif /* !CONFIG_CGROUPS */ -#ifdef CONFIG_MM_OWNER -extern void -cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new); -#else /* !CONFIG_MM_OWNER */ -static inline void -cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ -} -#endif /* CONFIG_MM_OWNER */ #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e2877454ec82..9c22396e8b50 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -48,3 +48,9 @@ SUBSYS(devices) #endif /* */ + +#ifdef CONFIG_CGROUP_FREEZER +SUBSYS(freezer) +#endif + +/* */ diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 025e4f575103..0acf3b737e2e 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -8,12 +8,9 @@ #include <linux/proc_fs.h> #define ELFCORE_ADDR_MAX (-1ULL) +#define ELFCORE_ADDR_ERR (-2ULL) -#ifdef CONFIG_PROC_VMCORE extern unsigned long long elfcorehdr_addr; -#else -static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; -#endif extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); @@ -28,10 +25,43 @@ extern struct proc_dir_entry *proc_vmcore; #define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) +/* + * is_kdump_kernel() checks whether this kernel is booting after a panic of + * previous kernel or not. This is determined by checking if previous kernel + * has passed the elf core header address on command line. + * + * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will + * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of + * previous kernel. + */ + static inline int is_kdump_kernel(void) { return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0; } + +/* is_vmcore_usable() checks if the kernel is booting after a panic and + * the vmcore region is usable. + * + * This makes use of the fact that due to alignment -2ULL is not + * a valid pointer, much in the vain of IS_ERR(), except + * dealing directly with an unsigned long long rather than a pointer. + */ + +static inline int is_vmcore_usable(void) +{ + return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0; +} + +/* vmcore_unusable() marks the vmcore as unusable, + * without disturbing the logic of is_kdump_kernel() + */ + +static inline void vmcore_unusable(void) +{ + if (is_kdump_kernel()) + elfcorehdr_addr = ELFCORE_ADDR_ERR; +} #else /* !CONFIG_CRASH_DUMP */ static inline int is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 159d9b476cd7..d14f02918483 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -380,6 +380,8 @@ struct ext3_inode { #define EXT3_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT3_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT3_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT3_MOUNT_DATA_ERR_ABORT 0x400000 /* Abort on file data write + * error in ordered mode */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff --git a/include/linux/fb.h b/include/linux/fb.h index 531ccd5f5960..75a81eaf3430 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -808,6 +808,7 @@ struct fb_tile_ops { struct fb_info { int node; int flags; + struct mutex lock; /* Lock for open/release/ioctl funcs */ struct fb_var_screeninfo var; /* Current var */ struct fb_fix_screeninfo fix; /* Current fix */ struct fb_monspecs monspecs; /* Current Monitor specs */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index deddeedf3257..8f225339eee9 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -6,7 +6,7 @@ #include <linux/sched.h> #include <linux/wait.h> -#ifdef CONFIG_PM_SLEEP +#ifdef CONFIG_FREEZER /* * Check if a process has been frozen */ @@ -39,28 +39,18 @@ static inline void clear_freeze_flag(struct task_struct *p) clear_tsk_thread_flag(p, TIF_FREEZE); } +static inline bool should_send_signal(struct task_struct *p) +{ + return !(p->flags & PF_FREEZER_NOSIG); +} + /* * Wake up a frozen process - * - * task_lock() is taken to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. */ -static inline int thaw_process(struct task_struct *p) -{ - task_lock(p); - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - task_unlock(p); - wake_up_process(p); - return 1; - } - clear_freeze_flag(p); - task_unlock(p); - return 0; -} +extern int __thaw_process(struct task_struct *p); + +/* Takes and releases task alloc lock using task_lock() */ +extern int thaw_process(struct task_struct *p); extern void refrigerator(void); extern int freeze_processes(void); @@ -75,6 +65,15 @@ static inline int try_to_freeze(void) return 0; } +extern bool freeze_task(struct task_struct *p, bool sig_only); +extern void cancel_freezing(struct task_struct *p); + +#ifdef CONFIG_CGROUP_FREEZER +extern int cgroup_frozen(struct task_struct *task); +#else /* !CONFIG_CGROUP_FREEZER */ +static inline int cgroup_frozen(struct task_struct *task) { return 0; } +#endif /* !CONFIG_CGROUP_FREEZER */ + /* * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it * calls wait_for_completion(&vfork) and reset right after it returns from this @@ -166,7 +165,7 @@ static inline void set_freezable_with_signal(void) } while (try_to_freeze()); \ __retval; \ }) -#else /* !CONFIG_PM_SLEEP */ +#else /* !CONFIG_FREEZER */ static inline int frozen(struct task_struct *p) { return 0; } static inline int freezing(struct task_struct *p) { return 0; } static inline void set_freeze_flag(struct task_struct *p) {} @@ -191,6 +190,6 @@ static inline void set_freezable_with_signal(void) {} #define wait_event_freezable_timeout(wq, condition, timeout) \ wait_event_interruptible_timeout(wq, condition, timeout) -#endif /* !CONFIG_PM_SLEEP */ +#endif /* !CONFIG_FREEZER */ #endif /* FREEZER_H_INCLUDED */ diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 7ebbcb1c9ba4..35d4f6342fac 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -816,6 +816,9 @@ struct journal_s #define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */ #define JFS_LOADED 0x010 /* The journal superblock has been loaded */ #define JFS_BARRIER 0x020 /* Use IDE barriers */ +#define JFS_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file + * data write error in ordered + * mode */ /* * Function declarations for the journaling transaction and buffer diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fdf3967e1397..1fbe14d39521 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -27,16 +27,13 @@ struct mm_struct; #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define page_reset_bad_cgroup(page) ((page)->page_cgroup = 0) - -extern struct page_cgroup *page_get_page_cgroup(struct page *page); extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); +extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_move_lists(struct page *page, bool active); extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask); extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -44,7 +41,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); @@ -69,21 +66,11 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority); -extern long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority); -extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority); - -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ -static inline void page_reset_bad_cgroup(struct page *page) -{ -} +extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru); -static inline struct page_cgroup *page_get_page_cgroup(struct page *page) -{ - return NULL; -} +#else /* CONFIG_CGROUP_MEM_RES_CTLR */ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -159,14 +146,9 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, { } -static inline long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - return 0; -} - -static inline long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) +static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, + struct zone *zone, int priority, + enum lru_list lru) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 03aea612d284..3f34005068d4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,7 +7,6 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p, struct list_head *pagelist); extern int putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); @@ -21,8 +20,6 @@ extern int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long flags); #else -static inline int isolate_lru_page(struct page *p, struct list_head *list) - { return -ENOSYS; } static inline int putback_lru_pages(struct list_head *l) { return 0; } static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private) { return -ENOSYS; } diff --git a/include/linux/mm.h b/include/linux/mm.h index c61ba10768ea..ffee2f743418 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -132,6 +132,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) /* + * special vmas that are non-mergable, non-mlock()able + */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) + +/* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ @@ -700,10 +705,10 @@ static inline int page_mapped(struct page *page) extern void show_free_areas(void); #ifdef CONFIG_SHMEM -int shmem_lock(struct file *file, int lock, struct user_struct *user); +extern int shmem_lock(struct file *file, int lock, struct user_struct *user); #else static inline int shmem_lock(struct file *file, int lock, - struct user_struct *user) + struct user_struct *user) { return 0; } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 895bc4e93039..c948350c378e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,40 +1,100 @@ -static inline void -add_page_to_active_list(struct zone *zone, struct page *page) -{ - list_add(&page->lru, &zone->active_list); - __inc_zone_state(zone, NR_ACTIVE); -} +#ifndef LINUX_MM_INLINE_H +#define LINUX_MM_INLINE_H -static inline void -add_page_to_inactive_list(struct zone *zone, struct page *page) +/** + * page_is_file_cache - should the page be on a file LRU or anon LRU? + * @page: the page to test + * + * Returns LRU_FILE if @page is page cache page backed by a regular filesystem, + * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. + * Used by functions that manipulate the LRU lists, to sort a page + * onto the right LRU list. + * + * We would like to get this info without a page flag, but the state + * needs to survive until the page is last deleted from the LRU, which + * could be as far down as __page_cache_release. + */ +static inline int page_is_file_cache(struct page *page) { - list_add(&page->lru, &zone->inactive_list); - __inc_zone_state(zone, NR_INACTIVE); + if (PageSwapBacked(page)) + return 0; + + /* The page is page cache backed by a normal filesystem. */ + return LRU_FILE; } static inline void -del_page_from_active_list(struct zone *zone, struct page *page) +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) { - list_del(&page->lru); - __dec_zone_state(zone, NR_ACTIVE); + list_add(&page->lru, &zone->lru[l].list); + __inc_zone_state(zone, NR_LRU_BASE + l); } static inline void -del_page_from_inactive_list(struct zone *zone, struct page *page) +del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_INACTIVE); + __dec_zone_state(zone, NR_LRU_BASE + l); } static inline void del_page_from_lru(struct zone *zone, struct page *page) { + enum lru_list l = LRU_BASE; + list_del(&page->lru); - if (PageActive(page)) { - __ClearPageActive(page); - __dec_zone_state(zone, NR_ACTIVE); + if (PageUnevictable(page)) { + __ClearPageUnevictable(page); + l = LRU_UNEVICTABLE; } else { - __dec_zone_state(zone, NR_INACTIVE); + if (PageActive(page)) { + __ClearPageActive(page); + l += LRU_ACTIVE; + } + l += page_is_file_cache(page); + } + __dec_zone_state(zone, NR_LRU_BASE + l); +} + +/** + * page_lru - which LRU list should a page be on? + * @page: the page to test + * + * Returns the LRU list a page should be on, as an index + * into the array of LRU lists. + */ +static inline enum lru_list page_lru(struct page *page) +{ + enum lru_list lru = LRU_BASE; + + if (PageUnevictable(page)) + lru = LRU_UNEVICTABLE; + else { + if (PageActive(page)) + lru += LRU_ACTIVE; + lru += page_is_file_cache(page); } + + return lru; } +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @zone: zone to check + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. + */ +static inline int inactive_anon_is_low(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} +#endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9d49fa36bbef..fe825471d5aa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -94,9 +94,6 @@ struct page { void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR - unsigned long page_cgroup; -#endif }; /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 428328a05fa1..35a7b5e19465 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -81,21 +81,31 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, - NR_INACTIVE, - NR_ACTIVE, + NR_LRU_BASE, + NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ + NR_ACTIVE_ANON, /* " " " " " */ + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ +#ifdef CONFIG_UNEVICTABLE_LRU + NR_UNEVICTABLE, /* " " " " " */ + NR_MLOCK, /* mlock()ed pages found and moved off LRU */ +#else + NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ + NR_MLOCK = NR_ACTIVE_FILE, +#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, - /* Second 128 byte cacheline */ NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, + /* Second 128 byte cacheline */ NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ @@ -107,6 +117,55 @@ enum zone_stat_item { #endif NR_VM_ZONE_STAT_ITEMS }; +/* + * We do arithmetic on the LRU lists in various places in the code, + * so it is important to keep the active lists LRU_ACTIVE higher in + * the array than the corresponding inactive lists, and to keep + * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. + * + * This has to be kept in sync with the statistics in zone_stat_item + * above and the descriptions in vmstat_text in mm/vmstat.c + */ +#define LRU_BASE 0 +#define LRU_ACTIVE 1 +#define LRU_FILE 2 + +enum lru_list { + LRU_INACTIVE_ANON = LRU_BASE, + LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, + LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, + LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, +#ifdef CONFIG_UNEVICTABLE_LRU + LRU_UNEVICTABLE, +#else + LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ +#endif + NR_LRU_LISTS +}; + +#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) + +#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++) + +static inline int is_file_lru(enum lru_list l) +{ + return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); +} + +static inline int is_active_lru(enum lru_list l) +{ + return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); +} + +static inline int is_unevictable_lru(enum lru_list l) +{ +#ifdef CONFIG_UNEVICTABLE_LRU + return (l == LRU_UNEVICTABLE); +#else + return 0; +#endif +} + struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ @@ -251,10 +310,22 @@ struct zone { /* Fields commonly accessed by the page reclaim scanner */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long nr_scan_active; - unsigned long nr_scan_inactive; + struct { + struct list_head list; + unsigned long nr_scan; + } lru[NR_LRU_LISTS]; + + /* + * The pageout code in vmscan.c keeps track of how many of the + * mem/swap backed and file backed pages are refeferenced. + * The higher the rotated/scanned ratio, the more valuable + * that cache is. + * + * The anon LRU stats live in [0], file LRU stats in [1] + */ + unsigned long recent_rotated[2]; + unsigned long recent_scanned[2]; + unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ @@ -276,6 +347,12 @@ struct zone { */ int prev_priority; + /* + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this zone's LRU. Maintained by the pageout code. + */ + unsigned int inactive_ratio; + ZONE_PADDING(_pad2_) /* Rarely used or read-mostly fields */ @@ -524,8 +601,11 @@ typedef struct pglist_data { struct zone node_zones[MAX_NR_ZONES]; struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; -#ifdef CONFIG_FLAT_NODE_MEM_MAP +#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + struct page_cgroup *node_page_cgroup; +#endif #endif struct bootmem_data *bdata; #ifdef CONFIG_MEMORY_HOTPLUG @@ -854,6 +934,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #endif struct page; +struct page_cgroup; struct mem_section { /* * This is, logically, a pointer to an array of struct @@ -871,6 +952,14 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + /* + * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use + * section. (see memcontrol.h/page_cgroup.h about this.) + */ + struct page_cgroup *page_cgroup; + unsigned long pad; +#endif }; #ifdef CONFIG_SPARSEMEM_EXTREME diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index d6fb115f5a07..ee5124ec319e 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -12,6 +12,7 @@ #include <linux/mtd/flashchip.h> #include <linux/mtd/map.h> #include <linux/mtd/cfi_endian.h> +#include <linux/mtd/xip.h> #ifdef CONFIG_MTD_CFI_I1 #define cfi_interleave(cfi) 1 @@ -430,7 +431,6 @@ static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t { map_word val; uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, cfi_interleave(cfi), type); - val = cfi_build_cmd(cmd, map, cfi); if (prev_val) @@ -483,6 +483,13 @@ static inline void cfi_udelay(int us) } } +int __xipram cfi_qry_present(struct map_info *map, __u32 base, + struct cfi_private *cfi); +int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map, + struct cfi_private *cfi); +void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map, + struct cfi_private *cfi); + struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size, const char* name); struct cfi_fixup { diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h index 08dd131301c1..d4f38c5fd44e 100644 --- a/include/linux/mtd/flashchip.h +++ b/include/linux/mtd/flashchip.h @@ -73,6 +73,10 @@ struct flchip { int buffer_write_time; int erase_time; + int word_write_time_max; + int buffer_write_time_max; + int erase_time_max; + void *priv; }; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 922636548558..eae26bb6430a 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -25,8 +25,10 @@ #define MTD_ERASE_DONE 0x08 #define MTD_ERASE_FAILED 0x10 +#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff + /* If the erase fails, fail_addr might indicate exactly which block failed. If - fail_addr = 0xffffffff, the failure was not at the device level or was not + fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not specific to any particular block. */ struct erase_info { struct mtd_info *mtd; diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h new file mode 100644 index 000000000000..51534e50f7fc --- /dev/null +++ b/include/linux/mtd/nand-gpio.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_MTD_NAND_GPIO_H +#define __LINUX_MTD_NAND_GPIO_H + +#include <linux/mtd/nand.h> + +struct gpio_nand_platdata { + int gpio_nce; + int gpio_nwp; + int gpio_cle; + int gpio_ale; + int gpio_rdy; + void (*adjust_parts)(struct gpio_nand_platdata *, size_t); + struct mtd_partition *parts; + unsigned int num_parts; + unsigned int options; + int chip_delay; +}; + +#endif diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 81774e5facf4..733d3f3b4eb8 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -248,6 +248,7 @@ struct nand_hw_control { * @read_page_raw: function to read a raw page without ECC * @write_page_raw: function to write a raw page without ECC * @read_page: function to read a page according to the ecc generator requirements + * @read_subpage: function to read parts of the page covered by ECC. * @write_page: function to write a page according to the ecc generator requirements * @read_oob: function to read chip OOB data * @write_oob: function to write chip OOB data diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h index d1b310c92eb4..0c6bbe28f38c 100644 --- a/include/linux/mtd/onenand_regs.h +++ b/include/linux/mtd/onenand_regs.h @@ -152,6 +152,8 @@ #define ONENAND_SYS_CFG1_INT (1 << 6) #define ONENAND_SYS_CFG1_IOBE (1 << 5) #define ONENAND_SYS_CFG1_RDY_CONF (1 << 4) +#define ONENAND_SYS_CFG1_HF (1 << 2) +#define ONENAND_SYS_CFG1_SYNC_WRITE (1 << 1) /* * Controller Status Register F240h (R) diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 5014f7a9f5df..c92b4d439609 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -73,7 +73,6 @@ struct device; struct device_node; int __devinit of_mtd_parse_partitions(struct device *dev, - struct mtd_info *mtd, struct device_node *node, struct mtd_partition **pparts); diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h new file mode 100644 index 000000000000..e77c1cea404d --- /dev/null +++ b/include/linux/mtd/sh_flctl.h @@ -0,0 +1,125 @@ +/* + * SuperH FLCTL nand controller + * + * Copyright © 2008 Renesas Solutions Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __SH_FLCTL_H__ +#define __SH_FLCTL_H__ + +#include <linux/mtd/mtd.h> +#include <linux/mtd/nand.h> +#include <linux/mtd/partitions.h> + +/* FLCTL registers */ +#define FLCMNCR(f) (f->reg + 0x0) +#define FLCMDCR(f) (f->reg + 0x4) +#define FLCMCDR(f) (f->reg + 0x8) +#define FLADR(f) (f->reg + 0xC) +#define FLADR2(f) (f->reg + 0x3C) +#define FLDATAR(f) (f->reg + 0x10) +#define FLDTCNTR(f) (f->reg + 0x14) +#define FLINTDMACR(f) (f->reg + 0x18) +#define FLBSYTMR(f) (f->reg + 0x1C) +#define FLBSYCNT(f) (f->reg + 0x20) +#define FLDTFIFO(f) (f->reg + 0x24) +#define FLECFIFO(f) (f->reg + 0x28) +#define FLTRCR(f) (f->reg + 0x2C) +#define FL4ECCRESULT0(f) (f->reg + 0x80) +#define FL4ECCRESULT1(f) (f->reg + 0x84) +#define FL4ECCRESULT2(f) (f->reg + 0x88) +#define FL4ECCRESULT3(f) (f->reg + 0x8C) +#define FL4ECCCR(f) (f->reg + 0x90) +#define FL4ECCCNT(f) (f->reg + 0x94) +#define FLERRADR(f) (f->reg + 0x98) + +/* FLCMNCR control bits */ +#define ECCPOS2 (0x1 << 25) +#define _4ECCCNTEN (0x1 << 24) +#define _4ECCEN (0x1 << 23) +#define _4ECCCORRECT (0x1 << 22) +#define SNAND_E (0x1 << 18) /* SNAND (0=512 1=2048)*/ +#define QTSEL_E (0x1 << 17) +#define ENDIAN (0x1 << 16) /* 1 = little endian */ +#define FCKSEL_E (0x1 << 15) +#define ECCPOS_00 (0x00 << 12) +#define ECCPOS_01 (0x01 << 12) +#define ECCPOS_02 (0x02 << 12) +#define ACM_SACCES_MODE (0x01 << 10) +#define NANWF_E (0x1 << 9) +#define SE_D (0x1 << 8) /* Spare area disable */ +#define CE1_ENABLE (0x1 << 4) /* Chip Enable 1 */ +#define CE0_ENABLE (0x1 << 3) /* Chip Enable 0 */ +#define TYPESEL_SET (0x1 << 0) + +/* FLCMDCR control bits */ +#define ADRCNT2_E (0x1 << 31) /* 5byte address enable */ +#define ADRMD_E (0x1 << 26) /* Sector address access */ +#define CDSRC_E (0x1 << 25) /* Data buffer selection */ +#define DOSR_E (0x1 << 24) /* Status read check */ +#define SELRW (0x1 << 21) /* 0:read 1:write */ +#define DOADR_E (0x1 << 20) /* Address stage execute */ +#define ADRCNT_1 (0x00 << 18) /* Address data bytes: 1byte */ +#define ADRCNT_2 (0x01 << 18) /* Address data bytes: 2byte */ +#define ADRCNT_3 (0x02 << 18) /* Address data bytes: 3byte */ +#define ADRCNT_4 (0x03 << 18) /* Address data bytes: 4byte */ +#define DOCMD2_E (0x1 << 17) /* 2nd cmd stage execute */ +#define DOCMD1_E (0x1 << 16) /* 1st cmd stage execute */ + +/* FLTRCR control bits */ +#define TRSTRT (0x1 << 0) /* translation start */ +#define TREND (0x1 << 1) /* translation end */ + +/* FL4ECCCR control bits */ +#define _4ECCFA (0x1 << 2) /* 4 symbols correct fault */ +#define _4ECCEND (0x1 << 1) /* 4 symbols end */ +#define _4ECCEXST (0x1 << 0) /* 4 symbols exist */ + +#define INIT_FL4ECCRESULT_VAL 0x03FF03FF +#define LOOP_TIMEOUT_MAX 0x00010000 + +#define mtd_to_flctl(mtd) container_of(mtd, struct sh_flctl, mtd) + +struct sh_flctl { + struct mtd_info mtd; + struct nand_chip chip; + void __iomem *reg; + + uint8_t done_buff[2048 + 64]; /* max size 2048 + 64 */ + int read_bytes; + int index; + int seqin_column; /* column in SEQIN cmd */ + int seqin_page_addr; /* page_addr in SEQIN cmd */ + uint32_t seqin_read_cmd; /* read cmd in SEQIN cmd */ + int erase1_page_addr; /* page_addr in ERASE1 cmd */ + uint32_t erase_ADRCNT; /* bits of FLCMDCR in ERASE1 cmd */ + uint32_t rw_ADRCNT; /* bits of FLCMDCR in READ WRITE cmd */ + + int hwecc_cant_correct[4]; + + unsigned page_size:1; /* NAND page size (0 = 512, 1 = 2048) */ + unsigned hwecc:1; /* Hardware ECC (0 = disabled, 1 = enabled) */ +}; + +struct sh_flctl_platform_data { + struct mtd_partition *parts; + int nr_parts; + unsigned long flcmncr_val; + + unsigned has_hwecc:1; +}; + +#endif /* __SH_FLCTL_H__ */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c74d3e875314..b12f93a3c345 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -93,6 +93,11 @@ enum pageflags { PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ + PG_swapbacked, /* Page is backed by RAM/swap */ +#ifdef CONFIG_UNEVICTABLE_LRU + PG_unevictable, /* Page is "unevictable" */ + PG_mlocked, /* Page is vma mlocked */ +#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR PG_uncached, /* Page has been mapped as uncached */ #endif @@ -161,6 +166,18 @@ static inline int Page##uname(struct page *page) \ #define TESTSCFLAG(uname, lname) \ TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define SETPAGEFLAG_NOOP(uname) \ +static inline void SetPage##uname(struct page *page) { } + +#define CLEARPAGEFLAG_NOOP(uname) \ +static inline void ClearPage##uname(struct page *page) { } + +#define __CLEARPAGEFLAG_NOOP(uname) \ +static inline void __ClearPage##uname(struct page *page) { } + +#define TESTCLEARFLAG_FALSE(uname) \ +static inline int TestClearPage##uname(struct page *page) { return 0; } + struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) @@ -169,6 +186,7 @@ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) + TESTCLEARFLAG(Active, active) __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ @@ -176,6 +194,7 @@ PAGEFLAG(SavePinned, savepinned); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) +PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobPage, slob_page) __PAGEFLAG(SlobFree, slob_free) @@ -211,6 +230,25 @@ PAGEFLAG(SwapCache, swapcache) PAGEFLAG_FALSE(SwapCache) #endif +#ifdef CONFIG_UNEVICTABLE_LRU +PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) + TESTCLEARFLAG(Unevictable, unevictable) + +#define MLOCK_PAGES 1 +PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) + TESTSCFLAG(Mlocked, mlocked) + +#else + +#define MLOCK_PAGES 0 +PAGEFLAG_FALSE(Mlocked) + SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) + +PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) + SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) + __CLEARPAGEFLAG_NOOP(Unevictable) +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR PAGEFLAG(Uncached, uncached) #else @@ -326,15 +364,25 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_UNEVICTABLE_LRU +#define __PG_UNEVICTABLE (1 << PG_unevictable) +#define __PG_MLOCKED (1 << PG_mlocked) +#else +#define __PG_UNEVICTABLE 0 +#define __PG_MLOCKED 0 +#endif + #define PAGE_FLAGS (1 << PG_lru | 1 << PG_private | 1 << PG_locked | \ 1 << PG_buddy | 1 << PG_writeback | \ - 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active) + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ + __PG_UNEVICTABLE | __PG_MLOCKED) /* * Flags checked in bad_page(). Pages on the free list should not have * these flags set. It they are, there is a problem. */ -#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | 1 << PG_reclaim | 1 << PG_dirty) +#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \ + 1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked) /* * Flags checked when a page is freed. Pages being freed should not have @@ -347,7 +395,8 @@ static inline void __ClearPageTail(struct page *page) * Pages being prepped should not have these flags set. It they are, there * is a problem. */ -#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | 1 << PG_reserved | 1 << PG_dirty) +#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \ + 1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked) #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h new file mode 100644 index 000000000000..0fd39f2231ec --- /dev/null +++ b/include/linux/page_cgroup.h @@ -0,0 +1,103 @@ +#ifndef __LINUX_PAGE_CGROUP_H +#define __LINUX_PAGE_CGROUP_H + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#include <linux/bit_spinlock.h> +/* + * Page Cgroup can be considered as an extended mem_map. + * A page_cgroup page is associated with every page descriptor. The + * page_cgroup helps us identify information about the cgroup + * All page cgroups are allocated at boot or memory hotplug event, + * then the page cgroup for pfn always exists. + */ +struct page_cgroup { + unsigned long flags; + struct mem_cgroup *mem_cgroup; + struct page *page; + struct list_head lru; /* per cgroup LRU list */ +}; + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat); +void __init page_cgroup_init(void); +struct page_cgroup *lookup_page_cgroup(struct page *page); + +enum { + /* flags for mem_cgroup */ + PCG_LOCK, /* page cgroup is locked */ + PCG_CACHE, /* charged as cache */ + PCG_USED, /* this object is in use. */ + /* flags for LRU placement */ + PCG_ACTIVE, /* page is active in this cgroup */ + PCG_FILE, /* page is file system backed */ + PCG_UNEVICTABLE, /* page is unevictableable */ +}; + +#define TESTPCGFLAG(uname, lname) \ +static inline int PageCgroup##uname(struct page_cgroup *pc) \ + { return test_bit(PCG_##lname, &pc->flags); } + +#define SETPCGFLAG(uname, lname) \ +static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ + { set_bit(PCG_##lname, &pc->flags); } + +#define CLEARPCGFLAG(uname, lname) \ +static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ + { clear_bit(PCG_##lname, &pc->flags); } + +/* Cache flag is set only once (at allocation) */ +TESTPCGFLAG(Cache, CACHE) + +TESTPCGFLAG(Used, USED) +CLEARPCGFLAG(Used, USED) + +/* LRU management flags (from global-lru definition) */ +TESTPCGFLAG(File, FILE) +SETPCGFLAG(File, FILE) +CLEARPCGFLAG(File, FILE) + +TESTPCGFLAG(Active, ACTIVE) +SETPCGFLAG(Active, ACTIVE) +CLEARPCGFLAG(Active, ACTIVE) + +TESTPCGFLAG(Unevictable, UNEVICTABLE) +SETPCGFLAG(Unevictable, UNEVICTABLE) +CLEARPCGFLAG(Unevictable, UNEVICTABLE) + +static inline int page_cgroup_nid(struct page_cgroup *pc) +{ + return page_to_nid(pc->page); +} + +static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) +{ + return page_zonenum(pc->page); +} + +static inline void lock_page_cgroup(struct page_cgroup *pc) +{ + bit_spin_lock(PCG_LOCK, &pc->flags); +} + +static inline int trylock_page_cgroup(struct page_cgroup *pc) +{ + return bit_spin_trylock(PCG_LOCK, &pc->flags); +} + +static inline void unlock_page_cgroup(struct page_cgroup *pc) +{ + bit_spin_unlock(PCG_LOCK, &pc->flags); +} + +#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +struct page_cgroup; + +static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ +} + +static inline struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + return NULL; +} +#endif +#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5da31c12101c..709742be02f0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -32,6 +32,34 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } +#ifdef CONFIG_UNEVICTABLE_LRU +#define AS_UNEVICTABLE (__GFP_BITS_SHIFT + 2) /* e.g., ramdisk, SHM_LOCK */ + +static inline void mapping_set_unevictable(struct address_space *mapping) +{ + set_bit(AS_UNEVICTABLE, &mapping->flags); +} + +static inline void mapping_clear_unevictable(struct address_space *mapping) +{ + clear_bit(AS_UNEVICTABLE, &mapping->flags); +} + +static inline int mapping_unevictable(struct address_space *mapping) +{ + if (likely(mapping)) + return test_bit(AS_UNEVICTABLE, &mapping->flags); + return !!mapping; +} +#else +static inline void mapping_set_unevictable(struct address_space *mapping) { } +static inline void mapping_clear_unevictable(struct address_space *mapping) { } +static inline int mapping_unevictable(struct address_space *mapping) +{ + return 0; +} +#endif + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; @@ -271,19 +299,19 @@ extern int __lock_page_killable(struct page *page); extern void __lock_page_nosync(struct page *page); extern void unlock_page(struct page *page); -static inline void set_page_locked(struct page *page) +static inline void __set_page_locked(struct page *page) { - set_bit(PG_locked, &page->flags); + __set_bit(PG_locked, &page->flags); } -static inline void clear_page_locked(struct page *page) +static inline void __clear_page_locked(struct page *page) { - clear_bit(PG_locked, &page->flags); + __clear_bit(PG_locked, &page->flags); } static inline int trylock_page(struct page *page) { - return !test_and_set_bit(PG_locked, &page->flags); + return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } /* @@ -410,17 +438,17 @@ extern void __remove_from_page_cache(struct page *page); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: - * the page is new, so we can just run set_page_locked() against it. + * the page is new, so we can just run __set_page_locked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; - set_page_locked(page); + __set_page_locked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) - clear_page_locked(page); + __clear_page_locked(page); return error; } diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 8eb7fa76c1d0..e90a2cb02915 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -23,9 +23,9 @@ struct pagevec { void __pagevec_release(struct pagevec *pvec); void __pagevec_release_nonlru(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec); -void __pagevec_lru_add_active(struct pagevec *pvec); +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); +void pagevec_swap_free(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -81,10 +81,36 @@ static inline void pagevec_free(struct pagevec *pvec) __pagevec_free(pvec); } -static inline void pagevec_lru_add(struct pagevec *pvec) +static inline void __pagevec_lru_add_anon(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); +} + +static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); +} + +static inline void __pagevec_lru_add_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); +} + +static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); +} + +static inline void pagevec_lru_add_file(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_lru_add_file(pvec); +} + +static inline void pagevec_lru_add_anon(struct pagevec *pvec) { if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); + __pagevec_lru_add_anon(pvec); } #endif /* _LINUX_PAGEVEC_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 98dc6243a706..acf8f24037cd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -631,6 +631,8 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_select_bars(struct pci_dev *dev, unsigned long flags); /* ROM control related routines */ +int pci_enable_rom(struct pci_dev *pdev); +void pci_disable_rom(struct pci_dev *pdev); void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size); void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom); size_t pci_get_rom_size(void __iomem *rom, size_t size); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index ea7416c901d1..22641d5d45df 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -94,7 +94,6 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern void ptrace_untrace(struct task_struct *child); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fed6f5e0b411..89f0564b10c8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -39,18 +39,6 @@ struct anon_vma { #ifdef CONFIG_MMU -extern struct kmem_cache *anon_vma_cachep; - -static inline struct anon_vma *anon_vma_alloc(void) -{ - return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); -} - -static inline void anon_vma_free(struct anon_vma *anon_vma) -{ - kmem_cache_free(anon_vma_cachep, anon_vma); -} - static inline void anon_vma_lock(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; @@ -75,6 +63,9 @@ void anon_vma_unlink(struct vm_area_struct *); void anon_vma_link(struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); +extern struct anon_vma *page_lock_anon_vma(struct page *page); +extern void page_unlock_anon_vma(struct anon_vma *anon_vma); + /* * rmap interfaces called when adding or removing pte of page */ @@ -117,6 +108,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * called in munlock()/munmap() path to check for other vmas holding + * the page mlocked. + */ +int try_to_munlock(struct page *); +#else +static inline int try_to_munlock(struct page *page) +{ + return 0; /* a.k.a. SWAP_SUCCESS */ +} +#endif + #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) @@ -140,5 +144,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_SUCCESS 0 #define SWAP_AGAIN 1 #define SWAP_FAIL 2 +#define SWAP_MLOCK 3 #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index c226c7b82946..f52dbd3587a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -403,12 +403,21 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_DUMP_MAPPED_PRIVATE 4 #define MMF_DUMP_MAPPED_SHARED 5 #define MMF_DUMP_ELF_HEADERS 6 +#define MMF_DUMP_HUGETLB_PRIVATE 7 +#define MMF_DUMP_HUGETLB_SHARED 8 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 5 +#define MMF_DUMP_FILTER_BITS 7 #define MMF_DUMP_FILTER_MASK \ (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ + (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS +# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +#else +# define MMF_DUMP_MASK_DEFAULT_ELF 0 +#endif struct sighand_struct { atomic_t count; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index a1783b229ef4..dc50bcc282a8 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -60,6 +60,19 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask) return seq_bitmap(m, mask->bits, MAX_NUMNODES); } +int seq_bitmap_list(struct seq_file *m, unsigned long *bits, + unsigned int nr_bits); + +static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask) +{ + return seq_bitmap_list(m, mask->bits, NR_CPUS); +} + +static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask) +{ + return seq_bitmap_list(m, mask->bits, MAX_NUMNODES); +} + int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); diff --git a/include/linux/swab.h b/include/linux/swab.h index 270d5c208a89..bbed279f3b32 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -47,8 +47,6 @@ static inline __attribute_const__ __u16 ___swab16(__u16 val) { #ifdef __arch_swab16 return __arch_swab16(val); -#elif defined(__arch_swab16p) - return __arch_swab16p(&val); #else return __const_swab16(val); #endif @@ -58,8 +56,6 @@ static inline __attribute_const__ __u32 ___swab32(__u32 val) { #ifdef __arch_swab32 return __arch_swab32(val); -#elif defined(__arch_swab32p) - return __arch_swab32p(&val); #else return __const_swab32(val); #endif @@ -69,8 +65,6 @@ static inline __attribute_const__ __u64 ___swab64(__u64 val) { #ifdef __arch_swab64 return __arch_swab64(val); -#elif defined(__arch_swab64p) - return __arch_swab64p(&val); #elif defined(__SWAB_64_THRU_32__) __u32 h = val >> 32; __u32 l = val & ((1ULL << 32) - 1); @@ -84,8 +78,6 @@ static inline __attribute_const__ __u32 ___swahw32(__u32 val) { #ifdef __arch_swahw32 return __arch_swahw32(val); -#elif defined(__arch_swahw32p) - return __arch_swahw32p(&val); #else return __const_swahw32(val); #endif @@ -95,8 +87,6 @@ static inline __attribute_const__ __u32 ___swahb32(__u32 val) { #ifdef __arch_swahb32 return __arch_swahb32(val); -#elif defined(__arch_swahb32p) - return __arch_swahb32p(&val); #else return __const_swahb32(val); #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index de40f169a4e4..a3af95b2cb6d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -7,6 +7,7 @@ #include <linux/list.h> #include <linux/memcontrol.h> #include <linux/sched.h> +#include <linux/node.h> #include <asm/atomic.h> #include <asm/page.h> @@ -171,8 +172,10 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ -extern void lru_cache_add(struct page *); -extern void lru_cache_add_active(struct page *); +extern void __lru_cache_add(struct page *, enum lru_list lru); +extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_cache_add_active_or_unevictable(struct page *, + struct vm_area_struct *); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); @@ -180,12 +183,38 @@ extern int lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void swap_setup(void); +extern void add_page_to_unevictable_list(struct page *page); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +static inline void lru_cache_add_anon(struct page *page) +{ + __lru_cache_add(page, LRU_INACTIVE_ANON); +} + +static inline void lru_cache_add_active_anon(struct page *page) +{ + __lru_cache_add(page, LRU_ACTIVE_ANON); +} + +static inline void lru_cache_add_file(struct page *page) +{ + __lru_cache_add(page, LRU_INACTIVE_FILE); +} + +static inline void lru_cache_add_active_file(struct page *page) +{ + __lru_cache_add(page, LRU_ACTIVE_FILE); +} + /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask); -extern int __isolate_lru_page(struct page *page, int mode); +extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); @@ -204,6 +233,34 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif +#ifdef CONFIG_UNEVICTABLE_LRU +extern int page_evictable(struct page *page, struct vm_area_struct *vma); +extern void scan_mapping_unevictable_pages(struct address_space *); + +extern unsigned long scan_unevictable_pages; +extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +extern int scan_unevictable_register_node(struct node *node); +extern void scan_unevictable_unregister_node(struct node *node); +#else +static inline int page_evictable(struct page *page, + struct vm_area_struct *vma) +{ + return 1; +} + +static inline void scan_mapping_unevictable_pages(struct address_space *mapping) +{ +} + +static inline int scan_unevictable_register_node(struct node *node) +{ + return 0; +} + +static inline void scan_unevictable_unregister_node(struct node *node) { } +#endif + extern int kswapd_run(int nid); #ifdef CONFIG_MMU @@ -251,6 +308,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern struct swap_info_struct *get_swap_info_struct(unsigned); extern int can_share_swap_page(struct page *); extern int remove_exclusive_swap_page(struct page *); +extern int remove_exclusive_swap_page_ref(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ @@ -339,6 +397,11 @@ static inline int remove_exclusive_swap_page(struct page *p) return 0; } +static inline int remove_exclusive_swap_page_ref(struct page *page) +{ + return 0; +} + static inline swp_entry_t get_swap_page(void) { swp_entry_t entry; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b330e289d71f..9d68fed50f11 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -21,8 +21,9 @@ struct kobject; struct module; /* FIXME - * The *owner field is no longer used, but leave around - * until the tree gets cleaned up fully. + * The *owner field is no longer used. + * x86 tree has been cleaned up. The owner + * attribute is still left for other arches. */ struct attribute { const char *name; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 328eb4022727..4c28c4d564e2 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -2,6 +2,7 @@ #define _LINUX_VMALLOC_H #include <linux/spinlock.h> +#include <linux/init.h> #include <asm/page.h> /* pgprot_t */ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -23,7 +24,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #endif struct vm_struct { - /* keep next,addr,size together to speedup lookups */ struct vm_struct *next; void *addr; unsigned long size; @@ -37,6 +37,19 @@ struct vm_struct { /* * Highlevel APIs for driver use */ +extern void vm_unmap_ram(const void *mem, unsigned int count); +extern void *vm_map_ram(struct page **pages, unsigned int count, + int node, pgprot_t prot); +extern void vm_unmap_aliases(void); + +#ifdef CONFIG_MMU +extern void __init vmalloc_init(void); +#else +static inline void vmalloc_init(void) +{ +} +#endif + extern void *vmalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 58334d439516..9cd3ab0f554d 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,6 +41,16 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ + UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ + UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ + UNEVICTABLE_PGMLOCKED, + UNEVICTABLE_PGMUNLOCKED, + UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ + UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ + UNEVICTABLE_MLOCKFREED, +#endif NR_VM_EVENT_ITEMS }; @@ -159,6 +169,16 @@ static inline unsigned long zone_page_state(struct zone *zone, return x; } +extern unsigned long global_lru_pages(void); + +static inline unsigned long zone_lru_pages(struct zone *zone) +{ + return (zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_FILE)); +} + #ifdef CONFIG_NUMA /* * Determine the per node value of a stat item. This function diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h index 0cb63ed2c1fc..b8093971ccb4 100644 --- a/include/net/netns/x_tables.h +++ b/include/net/netns/x_tables.h @@ -2,9 +2,9 @@ #define __NETNS_X_TABLES_H #include <linux/list.h> -#include <linux/net.h> +#include <linux/netfilter.h> struct netns_xt { - struct list_head tables[NPROTO]; + struct list_head tables[NFPROTO_NUMPROTO]; }; #endif diff --git a/init/Kconfig b/init/Kconfig index 5ceff3249a2d..8828ed0b2051 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -299,6 +299,13 @@ config CGROUP_NS for instance virtual servers and checkpoint/restart jobs. +config CGROUP_FREEZER + bool "control group freezer subsystem" + depends on CGROUPS + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. + config CGROUP_DEVICE bool "Device controller for cgroups" depends on CGROUPS && EXPERIMENTAL diff --git a/init/main.c b/init/main.c index 27f6bf6108e9..4371d11721f6 100644 --- a/init/main.c +++ b/init/main.c @@ -27,6 +27,7 @@ #include <linux/gfp.h> #include <linux/percpu.h> #include <linux/kmod.h> +#include <linux/vmalloc.h> #include <linux/kernel_stat.h> #include <linux/start_kernel.h> #include <linux/security.h> @@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif + vmalloc_init(); vfs_caches_init_early(); cpuset_init_early(); mem_init(); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 96fb36cd9874..68eb857cfdea 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -52,6 +52,14 @@ #define HARD_MSGMAX (131072/sizeof(void*)) #define DFLT_MSGSIZEMAX 8192 /* max message size */ +/* + * Define the ranges various user-specified maximum values can + * be set to. + */ +#define MIN_MSGMAX 1 /* min value for msg_max */ +#define MAX_MSGMAX HARD_MSGMAX /* max value for msg_max */ +#define MIN_MSGSIZEMAX 128 /* min value for msgsize_max */ +#define MAX_MSGSIZEMAX (8192*128) /* max value for msgsize_max */ struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; @@ -134,8 +142,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, info->qsize = 0; info->user = NULL; /* set when all is ok */ memset(&info->attr, 0, sizeof(info->attr)); - info->attr.mq_maxmsg = DFLT_MSGMAX; - info->attr.mq_msgsize = DFLT_MSGSIZEMAX; + info->attr.mq_maxmsg = msg_max; + info->attr.mq_msgsize = msgsize_max; if (attr) { info->attr.mq_maxmsg = attr->mq_maxmsg; info->attr.mq_msgsize = attr->mq_msgsize; @@ -1191,11 +1199,11 @@ static struct file_system_type mqueue_fs_type = { .kill_sb = kill_litter_super, }; -static int msg_max_limit_min = DFLT_MSGMAX; -static int msg_max_limit_max = HARD_MSGMAX; +static int msg_max_limit_min = MIN_MSGMAX; +static int msg_max_limit_max = MAX_MSGMAX; -static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX; -static int msg_maxsize_limit_max = INT_MAX; +static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; +static int msg_maxsize_limit_max = MAX_MSGSIZEMAX; static ctl_table mq_sysctls[] = { { diff --git a/ipc/shm.c b/ipc/shm.c index e77ec698cf40..0add3fa5f547 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -737,6 +737,10 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) case SHM_LOCK: case SHM_UNLOCK: { + struct file *uninitialized_var(shm_file); + + lru_add_drain_all(); /* drain pagevecs to lru lists */ + shp = shm_lock_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 000000000000..a3bb4cb52539 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df7c3e2..066550aa61c5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c6e1c17e6d3..046c1609606b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg) struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg) list_del(&link->cgrp_link_list); kfree(link); } - - write_unlock(&css_set_lock); } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit) { int i; - struct css_set *cg = container_of(k, struct css_set, ref); - + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cg->refcount, -1, 1)) + return; + write_lock(&css_set_lock); + if (!atomic_dec_and_test(&cg->refcount)) { + write_unlock(&css_set_lock); + return; + } unlink_css_set(cg); + write_unlock(&css_set_lock); rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit) kfree(cg); } -static void release_css_set(struct kref *k) -{ - __release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ - __release_css_set(k, 1); -} - /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cg) { - kref_get(&cg->ref); + atomic_inc(&cg->refcount); } static inline void put_css_set(struct css_set *cg) { - kref_put(&cg->ref, release_css_set); + __put_css_set(cg, 0); } static inline void put_css_set_taskexit(struct css_set *cg) { - kref_put(&cg->ref, release_css_set_taskexit); + __put_css_set(cg, 1); } /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set( return NULL; } - kref_init(&res->ref); + atomic_set(&res->refcount, 1); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); INIT_HLIST_NODE(&res->hlist); @@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = { .remount_fs = cgroup_remount, }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); + init_rwsem(&cgrp->pids_mutex); +} static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp) read_lock(&css_set_lock); list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->ref.refcount); + count += atomic_read(&link->cg->refcount); } read_unlock(&css_set_lock); return count; @@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). */ -struct ctr_struct { - char *buf; - int bufsz; -}; /* * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b) return *(pid_t *)a - *(pid_t *)b; } + /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array. */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { - int cnt = 0; - int i; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct cgroup *cgrp = s->private; + int index = 0, pid = *pos; + int *iter; - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; + down_read(&cgrp->pids_mutex); + if (pid) { + int end = cgrp->pids_length; + int i; + while (index < end) { + int mid = (index + end) / 2; + if (cgrp->tasks_pids[mid] == pid) { + index = mid; + break; + } else if (cgrp->tasks_pids[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= cgrp->pids_length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = cgrp->tasks_pids + index; + *pos = *iter; + return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ + struct cgroup *cgrp = s->private; + up_read(&cgrp->pids_mutex); } +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct cgroup *cgrp = s->private; + int *p = v; + int *end = cgrp->tasks_pids + cgrp->pids_length; + + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ + return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { + .start = cgroup_tasks_start, + .stop = cgroup_tasks_stop, + .next = cgroup_tasks_next, + .show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ + down_write(&cgrp->pids_mutex); + BUG_ON(!cgrp->pids_use_count); + if (!--cgrp->pids_use_count) { + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = NULL; + cgrp->pids_length = 0; + } + up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (!(file->f_mode & FMODE_READ)) + return 0; + + release_cgroup_pid_array(cgrp); + return seq_release(inode, file); +} + +static struct file_operations cgroup_tasks_operations = { + .read = seq_read, + .llseek = seq_lseek, + .write = cgroup_file_write, + .release = cgroup_tasks_release, +}; + /* - * Handle an open on 'tasks' file. Prepare a buffer listing the + * Handle an open on 'tasks' file. Prepare an array containing the * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any. */ + static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct ctr_struct *ctr; pid_t *pidarray; int npids; - char c; + int retval; + /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * show up until sometime later on. */ npids = cgroup_task_count(cgrp); - if (npids) { - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - } else { - ctr->buf = NULL; - ctr->bufsz = 0; - } - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + return -ENOMEM; + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; + /* + * Store the array in the cgroup, freeing the old + * array if necessary + */ + down_write(&cgrp->pids_mutex); + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = pidarray; + cgrp->pids_length = npids; + cgrp->pids_use_count++; + up_write(&cgrp->pids_mutex); - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} + file->f_op = &cgroup_tasks_operations; -static int cgroup_tasks_release(struct inode *unused_inode, - struct file *file) -{ - struct ctr_struct *ctr; - - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); + retval = seq_open(file, &cgroup_tasks_seq_operations); + if (retval) { + release_cgroup_pid_array(cgrp); + return retval; } + ((struct seq_file *)file->private_data)->private = cgrp; return 0; } @@ -2210,7 +2268,6 @@ static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, - .read = cgroup_tasks_read, .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, @@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->root = parent->root; @@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) int __init cgroup_init_early(void) { int i; - kref_init(&init_css_set.ref); - kref_get(&init_css_set.ref); + atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index c3dc3aba4c02..daca6209202d 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, u64 count; rcu_read_lock(); - count = atomic_read(¤t->cgroups->ref.refcount); + count = atomic_read(¤t->cgroups->refcount); rcu_read_unlock(); return count; } @@ -90,7 +90,7 @@ static struct cftype files[] = { { .name = "releasable", .read_u64 = releasable_read, - } + }, }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 000000000000..e95056954498 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,379 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> + +enum freezer_state { + CGROUP_THAWED = 0, + CGROUP_FREEZING, + CGROUP_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == CGROUP_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "THAWED", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______THAWED_______/ | + * \__________________________THAWED____________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = CGROUP_THAWED; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval; + + /* Anything frozen can't move or be moved to/from */ + + if (is_task_frozen_enough(task)) + return -EBUSY; + + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == CGROUP_FROZEN) + return -EBUSY; + + retval = 0; + task_lock(task); + freezer = task_freezer(task); + if (freezer->state == CGROUP_FROZEN) + retval = -EBUSY; + task_unlock(task); + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == CGROUP_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> THAWED transitions. */ + if (freezer->state == CGROUP_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void update_freezer_state(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + if (is_task_frozen_enough(task)) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + else if (nfrozen > 0) + freezer->state = CGROUP_FREEZING; + else + freezer->state = CGROUP_THAWED; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == CGROUP_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. */ + update_freezer_state(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = CGROUP_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (is_task_frozen_enough(task)) + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = CGROUP_THAWED; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + update_freezer_state(cgroup, freezer); + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case CGROUP_THAWED: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case CGROUP_FREEZING: + if (goal_state == CGROUP_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == THAWED, so unfreeze */ + case CGROUP_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + goal_state = CGROUP_THAWED; + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + goal_state = CGROUP_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/configs.c b/kernel/configs.c index 4c345210ed8c..abaee684ecbf 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -54,9 +54,6 @@ #ifdef CONFIG_IKCONFIG_PROC -/**************************************************/ -/* globals and useful constants */ - static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) @@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = { .read = ikconfig_read_current, }; -/***************************************************/ -/* ikconfig_init: start up everything we need to */ - static int __init ikconfig_init(void) { struct proc_dir_entry *entry; @@ -89,9 +83,6 @@ static int __init ikconfig_init(void) return 0; } -/***************************************************/ -/* ikconfig_cleanup: clean up our mess */ - static void __exit ikconfig_cleanup(void) { remove_proc_entry("config.gz", NULL); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab7bd6628e0..3e00526f52ec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, { struct cpuset trialcs; int err; - int cpus_nonempty, balance_flag_changed; + int balance_flag_changed; trialcs = *cs; if (turning_on) @@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) return err; - cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(&trialcs)); @@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); - if (cpus_nonempty && balance_flag_changed) + if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); return 0; @@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = { void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); - m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Cpus_allowed_list:\t"); - m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask_list(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); - m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed_list:\t"); - m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask_list(m, &task->mems_allowed); seq_printf(m, "\n"); } diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 000000000000..ba6248b323ef --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,154 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/freezer.h> + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? */ + long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } + save = current->state; + pr_debug("%s entered refrigerator\n", current->comm); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; + schedule(); + } + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and + * either sending a fake signal to it or waking it up, depending on whether + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; + + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; + } else { + wake_up_state(p, TASK_INTERRUPTIBLE); + } + + return true; +} + +void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + clear_freeze_flag(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_and_wake(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/kexec.c b/kernel/kexec.c index aef265325cd3..777ac458ac99 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1371,6 +1371,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(node_online_map); VMCOREINFO_SYMBOL(swapper_pg_dir); VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmlist); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1406,6 +1407,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f8710b..14ec64fe175a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -171,12 +171,11 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - if (k->state != TASK_UNINTERRUPTIBLE) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946aecaf0..ca634019497a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p) return 1; } -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } - - if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else if (sig_only) { - return false; - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - return true; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; + if (cgroup_frozen(p)) + continue; + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -264,4 +152,3 @@ void thaw_processes(void) printk("done.\n"); } -EXPORT_SYMBOL(refrigerator); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a96d56..1e68e4c39e2c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(struct task_struct *child) +static void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ca4bbbe04aa4..59236e8b9daa 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -54,9 +54,9 @@ #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/cpumask.h> #include <linux/rcupreempt_trace.h> +#include <asm/byteorder.h> /* * PREEMPT_RCU data structures. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a0..b3cc73931d1f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -833,6 +833,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/lib/bitmap.c b/lib/bitmap.c index 06fb57c86de0..482df94ea21e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -316,17 +316,6 @@ int bitmap_scnprintf(char *buf, unsigned int buflen, EXPORT_SYMBOL(bitmap_scnprintf); /** - * bitmap_scnprintf_len - return buffer length needed to convert - * bitmap to an ASCII hex string - * @nr_bits: number of bits to be converted - */ -int bitmap_scnprintf_len(unsigned int nr_bits) -{ - unsigned int nr_nibbles = ALIGN(nr_bits, 4) / 4; - return nr_nibbles + ALIGN(nr_nibbles, CHUNKSZ / 4) / (CHUNKSZ / 4) - 1; -} - -/** * __bitmap_parse - convert an ASCII hex string into a bitmap. * @buf: pointer to buffer containing string. * @buflen: buffer size in bytes. If string is smaller than this diff --git a/lib/vsprintf.c b/lib/vsprintf.c index cceecb6a963d..a013bbc23717 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -24,6 +24,7 @@ #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> +#include <linux/ioport.h> #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/div64.h> @@ -550,18 +551,51 @@ static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int #endif } +static char *resource_string(char *buf, char *end, struct resource *res, int field_width, int precision, int flags) +{ +#ifndef IO_RSRC_PRINTK_SIZE +#define IO_RSRC_PRINTK_SIZE 4 +#endif + +#ifndef MEM_RSRC_PRINTK_SIZE +#define MEM_RSRC_PRINTK_SIZE 8 +#endif + + /* room for the actual numbers, the two "0x", -, [, ] and the final zero */ + char sym[4*sizeof(resource_size_t) + 8]; + char *p = sym, *pend = sym + sizeof(sym); + int size = -1; + + if (res->flags & IORESOURCE_IO) + size = IO_RSRC_PRINTK_SIZE; + else if (res->flags & IORESOURCE_MEM) + size = MEM_RSRC_PRINTK_SIZE; + + *p++ = '['; + p = number(p, pend, res->start, 16, size, -1, SPECIAL | SMALL | ZEROPAD); + *p++ = '-'; + p = number(p, pend, res->end, 16, size, -1, SPECIAL | SMALL | ZEROPAD); + *p++ = ']'; + *p = 0; + + return string(buf, end, sym, field_width, precision, flags); +} + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * - * Right now we just handle 'F' (for symbolic Function descriptor pointers) - * and 'S' (for Symbolic direct pointers), but this can easily be - * extended in the future (network address types etc). + * Right now we handle: + * + * - 'F' For symbolic function descriptor pointers + * - 'S' For symbolic direct pointers + * - 'R' For a struct resource pointer, it prints the range of + * addresses (not the name nor the flags) * - * The difference between 'S' and 'F' is that on ia64 and ppc64 function - * pointers are really function descriptors, which contain a pointer the - * real address. + * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 + * function pointers are really function descriptors, which contain a + * pointer to the real address. */ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags) { @@ -571,6 +605,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field /* Fallthrough */ case 'S': return symbol_string(buf, end, ptr, field_width, precision, flags); + case 'R': + return resource_string(buf, end, ptr, field_width, precision, flags); } flags |= SMALL; if (field_width == -1) { @@ -590,6 +626,7 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field * This function follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol * %pF output the name of a function pointer + * %pR output the address range in a struct resource * * The return value is the number of characters which would * be generated for the given input, excluding the trailing diff --git a/mm/Kconfig b/mm/Kconfig index 1a501a4de95c..5b5790f8a816 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -209,5 +209,16 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS +config UNEVICTABLE_LRU + bool "Add LRU list to track non-evictable pages" + default y + depends on MMU + help + Keeps unevictable pages off of the active and inactive pageout + lists, so kswapd will not waste CPU time or have its balancing + algorithms thrown off by scanning these pages. Selecting this + will use one page flag and increase the code size a little, + say Y unless you know what you are doing. + config MMU_NOTIFIER bool diff --git a/mm/Makefile b/mm/Makefile index da4ccf015aea..c06b45a1ff5f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o - +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/filemap.c b/mm/filemap.c index 903bf316912a..ab8553658af3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> +#include <linux/mm_inline.h> /* for page_is_file_cache() */ #include "internal.h" /* @@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } @@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * test_and_set_bit() to lock the page; the second mb is necessary to enforce - * ordering between the clear_bit and the read of the waitqueue (to avoid SMP - * races with a parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). */ void unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!test_and_clear_bit(PG_locked, &page->flags)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); diff --git a/mm/fremap.c b/mm/fremap.c index 7881638e4a12..7d12ca70ef7b 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -21,6 +21,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, spin_unlock(&mapping->i_mmap_lock); } + if (vma->vm_flags & VM_LOCKED) { + /* + * drop PG_Mlocked flag for over-mapped range + */ + unsigned int saved_flags = vma->vm_flags; + munlock_vma_pages_range(vma, start, start + size); + vma->vm_flags = saved_flags; + } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; + if (vma->vm_flags & VM_LOCKED) { + /* + * might be mapping previously unmapped range of file + */ + mlock_vma_pages_range(vma, start, start + size); + } else { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } - make_pages_present(start, start+size); } /* @@ -240,4 +258,3 @@ out: return err; } - diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 38633864a93e..ce8cbb29860b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -262,7 +262,7 @@ struct resv_map { struct list_head regions; }; -struct resv_map *resv_map_alloc(void) +static struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) @@ -274,7 +274,7 @@ struct resv_map *resv_map_alloc(void) return resv_map; } -void resv_map_release(struct kref *ref) +static void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); @@ -289,7 +289,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return 0; + return NULL; } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf) { struct hstate *h = &default_hstate; return sprintf(buf, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %5lu kB\n", + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", h->nr_huge_pages, h->free_huge_pages, h->resv_huge_pages, @@ -1747,10 +1747,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -int unmap_ref_private(struct mm_struct *mm, - struct vm_area_struct *vma, - struct page *page, - unsigned long address) +static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct vm_area_struct *iter_vma; struct address_space *mapping; @@ -2073,6 +2071,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } +static int huge_zeropage_ok(pte_t *ptep, int write, int shared) +{ + if (!ptep || write || shared) + return 0; + else + return huge_pte_none(huge_ptep_get(ptep)); +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, @@ -2082,6 +2088,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); + int zeropage_ok = 0; + int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -2094,8 +2102,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (huge_zeropage_ok(pte, write, shared)) + zeropage_ok = 1; - if (!pte || huge_pte_none(huge_ptep_get(pte)) || + if (!pte || + (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || (write && !pte_write(huge_ptep_get(pte)))) { int ret; @@ -2115,8 +2126,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - get_page(page); - pages[i] = page + pfn_offset; + if (zeropage_ok) + pages[i] = ZERO_PAGE(0); + else + pages[i] = page + pfn_offset; + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 1f43f7416972..e4e728bdf324 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +/* + * in mm/vmscan.c: + */ +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); + +/* + * in mm/page_alloc.c + */ extern void __free_pages_bootmem(struct page *page, unsigned int order); /* @@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +extern long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * unevictable_migrate_page() called only from migrate_page_copy() to + * migrate unevictable flag to new page. + * Note that the old page has been isolated from the LRU lists at this + * point so we don't need to worry about LRU statistics. + */ +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ + if (TestClearPageUnevictable(old)) + SetPageUnevictable(new); +} +#else +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ +} +#endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Called only in fault path via page_evictable() for a new page + * to determine if it's being mapped into a LOCKED vma. + * If so, mark page as mlocked. + */ +static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +{ + VM_BUG_ON(PageLRU(page)); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + return 0; + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + return 1; +} + +/* + * must be called with vma's mmap_sem held for read, and page locked. + */ +extern void mlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. + * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void __clear_page_mlock(struct page *page); +static inline void clear_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) + __clear_page_mlock(page); +} + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag; update statistics. + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + SetPageMlocked(newpage); + __inc_zone_page_state(newpage, NR_MLOCK); + local_irq_restore(flags); + } +} + +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); + local_irq_restore(flags); + } +} + +#else /* CONFIG_UNEVICTABLE_LRU */ +static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +{ + return 0; +} +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } +static inline void free_page_mlock(struct page *page) { } + +#endif /* CONFIG_UNEVICTABLE_LRU */ + /* * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, * so all functions starting at paging_init should be marked __init @@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ +#define GUP_FLAGS_WRITE 0x1 +#define GUP_FLAGS_FORCE 0x2 +#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas); + #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36896f3eb7f5..d4a92b63e98e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -32,11 +32,12 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <linux/mm_inline.h> +#include <linux/page_cgroup.h> #include <asm/uaccess.h> struct cgroup_subsys mem_cgroup_subsys __read_mostly; -static struct kmem_cache *page_cgroup_cache __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 /* @@ -65,11 +66,10 @@ struct mem_cgroup_stat { /* * For accounting under irq disable, no need for increment preempt count. */ -static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, +static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, enum mem_cgroup_stat_index idx, int val) { - int cpu = smp_processor_id(); - stat->cpustat[cpu].count[idx] += val; + stat->count[idx] += val; } static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, @@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, /* * per-zone information in memory controller. */ - -enum mem_cgroup_zstat_index { - MEM_CGROUP_ZSTAT_ACTIVE, - MEM_CGROUP_ZSTAT_INACTIVE, - - NR_MEM_CGROUP_ZSTAT, -}; - struct mem_cgroup_per_zone { /* * spin_lock to protect the per cgroup LRU */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long count[NR_MEM_CGROUP_ZSTAT]; + struct list_head lists[NR_LRU_LISTS]; + unsigned long count[NR_LRU_LISTS]; }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -144,69 +135,52 @@ struct mem_cgroup { }; static struct mem_cgroup init_mem_cgroup; -/* - * We use the lower bit of the page->page_cgroup pointer as a bit spin - * lock. We need to ensure that page->page_cgroup is at least two - * byte aligned (based on comments from Nick Piggin). But since - * bit_spin_lock doesn't actually set that lock bit in a non-debug - * uniprocessor kernel, we should avoid setting it here too. - */ -#define PAGE_CGROUP_LOCK_BIT 0x0 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) -#else -#define PAGE_CGROUP_LOCK 0x0 -#endif - -/* - * A page_cgroup page is associated with every page descriptor. The - * page_cgroup helps us identify information about the cgroup - */ -struct page_cgroup { - struct list_head lru; /* per cgroup LRU list */ - struct page *page; - struct mem_cgroup *mem_cgroup; - int flags; -}; -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ - -static int page_cgroup_nid(struct page_cgroup *pc) -{ - return page_to_nid(pc->page); -} - -static enum zone_type page_cgroup_zid(struct page_cgroup *pc) -{ - return page_zonenum(pc->page); -} - enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + NR_CHARGE_TYPE, +}; + +/* only for here (for easy reading.) */ +#define PCGF_CACHE (1UL << PCG_CACHE) +#define PCGF_USED (1UL << PCG_USED) +#define PCGF_ACTIVE (1UL << PCG_ACTIVE) +#define PCGF_LOCK (1UL << PCG_LOCK) +#define PCGF_FILE (1UL << PCG_FILE) +static const unsigned long +pcg_default_flags[NR_CHARGE_TYPE] = { + PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + 0, /* FORCE */ }; /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, - bool charge) +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + struct page_cgroup *pc, + bool charge) { int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(!irqs_disabled()); - if (flags & PAGE_CGROUP_FLAG_CACHE) - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); + + cpustat = &stat->cpustat[smp_processor_id()]; + if (PageCgroupCache(pc)) + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); else - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); if (charge) - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGIN_COUNT, 1); else - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); } @@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) } static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, - enum mem_cgroup_zstat_index idx) + enum lru_list idx) { int nid, zid; struct mem_cgroup_per_zone *mz; @@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static inline int page_cgroup_locked(struct page *page) -{ - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) -{ - VM_BUG_ON(!page_cgroup_locked(page)); - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); -} - -struct page_cgroup *page_get_page_cgroup(struct page *page) -{ - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); -} - -static void lock_page_cgroup(struct page *page) -{ - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static int try_lock_page_cgroup(struct page *page) -{ - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void unlock_page_cgroup(struct page *page) -{ - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; + } - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= 1; - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - - if (!to) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - list_add(&pc->lru, &mz->inactive_list); - } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - list_add(&pc->lru, &mz->active_list); + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; } - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_add(&pc->lru, &mz->lists[lru]); + + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); } -static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int active = PageCgroupActive(pc); + int file = PageCgroupFile(pc); + int unevictable = PageCgroupUnevictable(pc); + enum lru_list from = unevictable ? LRU_UNEVICTABLE : + (LRU_FILE * !!file + !!active); - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + if (lru == from) + return; - if (active) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->active_list); + MEM_CGROUP_ZSTAT(mz, from) -= 1; + /* + * However this is done under mz->lru_lock, another flags, which + * are not related to LRU, will be modified from out-of-lock. + * We have to use atomic set/clear flags. + */ + if (is_unevictable_lru(lru)) { + ClearPageCgroupActive(pc); + SetPageCgroupUnevictable(pc); } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->inactive_list); + if (is_active_lru(lru)) + SetPageCgroupActive(pc); + else + ClearPageCgroupActive(pc); + ClearPageCgroupUnevictable(pc); } + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_move(&pc->lru, &mz->lists[lru]); } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) @@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) /* * This routine assumes that the appropriate zone's lru lock is already held */ -void mem_cgroup_move_lists(struct page *page, bool active) +void mem_cgroup_move_lists(struct page *page, enum lru_list lru) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; @@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active) * safely get to page_cgroup without it, so just try_lock it: * mem_cgroup_isolate_pages allows for page left on wrong list. */ - if (!try_lock_page_cgroup(page)) + pc = lookup_page_cgroup(page); + if (!trylock_page_cgroup(pc)) return; - - pc = page_get_page_cgroup(page); - if (pc) { + if (pc && PageCgroupUsed(pc)) { mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_move_lists(pc, active); + __mem_cgroup_move_lists(pc, lru); spin_unlock_irqrestore(&mz->lru_lock, flags); } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); } /* @@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) } /* - * This function is called from vmscan.c. In page reclaiming loop. balance - * between active and inactive list is calculated. For memory controller - * page reclaiming, we should use using mem_cgroup's imbalance rather than - * zone's global lru imbalance. - */ -long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - unsigned long active, inactive; - /* active and inactive are the number of pages. 'long' is ok.*/ - active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); - inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); - return (long) (active / (inactive + 1)); -} - -/* * prev_priority control...this will be used in memory reclaim path. */ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) * (see include/linux/mmzone.h) */ -long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) +long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru) { - long nr_active; + long nr_pages; int nid = zone->zone_pgdat->node_id; int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); - return (nr_active >> priority); -} + nr_pages = MEM_CGROUP_ZSTAT(mz, lru); -long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - long nr_inactive; - int nid = zone->zone_pgdat->node_id; - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - - nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); - return (nr_inactive >> priority); + return (nr_pages >> priority); } unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { unsigned long nr_taken = 0; struct page *page; @@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; + int lru = LRU_FILE * !!file + !!active; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - if (active) - src = &mz->active_list; - else - src = &mz->inactive_list; - + src = &mz->lists[lru]; spin_lock(&mz->lru_lock); scan = 0; list_for_each_entry_safe_reverse(pc, tmp, src, lru) { if (scan >= nr_to_scan) break; + if (unlikely(!PageCgroupUsed(pc))) + continue; page = pc->page; if (unlikely(!PageLRU(page))) continue; - if (PageActive(page) && !active) { - __mem_cgroup_move_lists(pc, true); - continue; - } - if (!PageActive(page) && active) { - __mem_cgroup_move_lists(pc, false); + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ + if (PageUnevictable(page) || + (PageActive(page) && !active) || + (!PageActive(page) && active)) { + __mem_cgroup_move_lists(pc, page_lru(page)); continue; } scan++; list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode) == 0) { + if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } @@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, { struct mem_cgroup *mem; struct page_cgroup *pc; - unsigned long flags; unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; + unsigned long flags; - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); - if (unlikely(pc == NULL)) - goto err; - + pc = lookup_page_cgroup(page); + /* can happen at boot */ + if (unlikely(!pc)) + return 0; + prefetchw(pc); /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ + if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!mem)) { rcu_read_unlock(); - kmem_cache_free(page_cgroup_cache, pc); return 0; } /* @@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, css_get(&memcg->css); } - while (res_counter_charge(&mem->res, PAGE_SIZE)) { + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { if (!(gfp_mask & __GFP_WAIT)) goto out; @@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } } - pc->mem_cgroup = mem; - pc->page = page; - /* - * If a page is accounted as a page cache, insert to inactive list. - * If anon, insert to active list. - */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) - pc->flags = PAGE_CGROUP_FLAG_CACHE; - else - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; - lock_page_cgroup(page); - if (unlikely(page_get_page_cgroup(page))) { - unlock_page_cgroup(page); + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); + goto done; } - page_assign_page_cgroup(page, pc); + pc->mem_cgroup = mem; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ + pc->flags = pcg_default_flags[ctype]; mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - unlock_page_cgroup(page); done: return 0; out: css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); -err: return -ENOMEM; } @@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { if (mem_cgroup_subsys.disabled) return 0; - + if (PageCompound(page)) + return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, { if (mem_cgroup_subsys.disabled) return 0; - + if (PageCompound(page)) + return 0; /* * Corner case handling. This is called from add_to_page_cache() * in usual. But some FS (shmem) precharges this page before calling it @@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (pc) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(!pc->mem_cgroup); - unlock_page_cgroup(page); + + pc = lookup_page_cgroup(page); + if (!pc) + return 0; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + unlock_page_cgroup(pc); return 0; } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); } if (unlikely(!mm)) mm = &init_mm; - return mem_cgroup_charge_common(page, mm, gfp_mask, + if (page_is_file_cache(page)) + return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + else + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } /* @@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) /* * Check if our page_cgroup is valid */ - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (unlikely(!pc)) - goto unlock; - - VM_BUG_ON(pc->page != page); + pc = lookup_page_cgroup(page); + if (unlikely(!pc || !PageCgroupUsed(pc))) + return; - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) - || page_mapped(page))) - goto unlock; + lock_page_cgroup(pc); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) + || !PageCgroupUsed(pc)) { + /* This happens at race in zap_pte_range() and do_swap_page()*/ + unlock_page_cgroup(pc); + return; + } + ClearPageCgroupUsed(pc); + mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - - mem = pc->mem_cgroup; res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); return; -unlock: - unlock_page_cgroup(page); } void mem_cgroup_uncharge_page(struct page *page) { + /* early check. */ + if (page_mapped(page)) + return; + if (page->mapping && !PageAnon(page)) + return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_uncharge_cache_page(struct page *page) { VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } @@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) if (mem_cgroup_subsys.disabled) return 0; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (pc) { + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); - if (pc->flags & PAGE_CGROUP_FLAG_CACHE) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + if (PageCgroupCache(pc)) { + if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + } } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); if (mem) { ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, ctype, mem); @@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage) */ if (!newpage->mapping) __mem_cgroup_uncharge_common(newpage, - MEM_CGROUP_CHARGE_TYPE_FORCE); + MEM_CGROUP_CHARGE_TYPE_FORCE); else if (PageAnon(newpage)) mem_cgroup_uncharge_page(newpage); } @@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) #define FORCE_UNCHARGE_BATCH (128) static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, - int active) + enum lru_list lru) { struct page_cgroup *pc; struct page *page; @@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, unsigned long flags; struct list_head *list; - if (active) - list = &mz->active_list; - else - list = &mz->inactive_list; + list = &mz->lists[lru]; spin_lock_irqsave(&mz->lru_lock, flags); while (!list_empty(list)) { pc = list_entry(list->prev, struct page_cgroup, lru); page = pc->page; + if (!PageCgroupUsed(pc)) + break; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); /* @@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, count = FORCE_UNCHARGE_BATCH; cond_resched(); } - } else - cond_resched(); + } else { + spin_lock_irqsave(&mz->lru_lock, flags); + break; + } spin_lock_irqsave(&mz->lru_lock, flags); } spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) while (mem->res.usage > 0) { if (atomic_read(&mem->css.cgroup->count) > 0) goto out; + /* This is for making all *used* pages to be on LRU. */ + lru_add_drain_all(); for_each_node_state(node, N_POSSIBLE) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct mem_cgroup_per_zone *mz; + enum lru_list l; mz = mem_cgroup_zoneinfo(mem, node, zid); - /* drop all page_cgroup in active_list */ - mem_cgroup_force_empty_list(mem, mz, 1); - /* drop all page_cgroup in inactive_list */ - mem_cgroup_force_empty_list(mem, mz, 0); + for_each_lru(l) + mem_cgroup_force_empty_list(mem, mz, l); } + cond_resched(); } ret = 0; out: @@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } /* showing # of active pages */ { - unsigned long active, inactive; - - inactive = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_INACTIVE); - active = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_ACTIVE); - cb->fill(cb, "active", (active) * PAGE_SIZE); - cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; + unsigned long unevictable; + + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + unevictable = mem_cgroup_get_all_zonestat(mem_cont, + LRU_UNEVICTABLE); + + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); + cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); + cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); + cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); + cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + } return 0; } @@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; + enum lru_list l; int zone, tmp = node; /* * This routine is called against possible nodes. @@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - INIT_LIST_HEAD(&mz->active_list); - INIT_LIST_HEAD(&mz->inactive_list); spin_lock_init(&mz->lru_lock); + for_each_lru(l) + INIT_LIST_HEAD(&mz->lists[l]); } return 0; } @@ -1124,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int node; if (unlikely((cont->parent) == NULL)) { + page_cgroup_init(); mem = &init_mem_cgroup; - page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); } else { mem = mem_cgroup_alloc(); if (!mem) diff --git a/mm/memory.c b/mm/memory.c index 1002f473f497..3a6c4a658325 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -64,6 +64,8 @@ #include "internal.h" +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma) return !vma->vm_ops || !vma->vm_ops->fault; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); if (len <= 0) return 0; @@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud_t *pud; pmd_t *pmd; pte_t *pte; - if (write) /* user gate pages are read-only */ + + /* user gate pages are read-only */ + if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) - || !(vm_flags & vma->vm_flags)) + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (len); return i; } + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + EXPORT_SYMBOL(get_user_pages); pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -1296,18 +1323,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte_t *pte; spinlock_t *ptl; - retval = mem_cgroup_charge(page, mm, GFP_KERNEL); - if (retval) - goto out; - retval = -EINVAL; if (PageAnon(page)) - goto out_uncharge; + goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out_uncharge; + goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; @@ -1323,8 +1346,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, return retval; out_unlock: pte_unmap_unlock(pte, ptl); -out_uncharge: - mem_cgroup_uncharge_page(page); out: return retval; } @@ -1858,6 +1879,15 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -1886,11 +1916,13 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lru_cache_add_active(new_page); + SetPageSwapBacked(new_page); + lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may @@ -2288,16 +2320,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); ret = VM_FAULT_OOM; + unlock_page(page); goto out; } - mark_page_accessed(page); - lock_page(page); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - /* * Back out if somebody else already faulted in this pte. */ @@ -2324,7 +2357,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, address); swap_free(entry); - if (vm_swap_full()) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) remove_exclusive_swap_page(page); unlock_page(page); @@ -2382,7 +2415,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2423,6 +2457,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t entry; int anon = 0; + int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; @@ -2463,6 +2498,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -2497,11 +2544,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - goto out; - } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* @@ -2520,11 +2562,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); @@ -2533,11 +2575,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); } else { - mem_cgroup_uncharge_page(page); + if (charged) + mem_cgroup_uncharge_page(page); if (anon) page_cache_release(page); else @@ -2772,19 +2817,9 @@ int make_pages_present(unsigned long addr, unsigned long end) len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); - if (ret < 0) { - /* - SUS require strange return value to mlock - - invalid addr generate to ENOMEM. - - out of memory should generate EAGAIN. - */ - if (ret == -EFAULT) - ret = -ENOMEM; - else if (ret == -ENOMEM) - ret = -EAGAIN; + if (ret < 0) return ret; - } - return ret == len ? 0 : -ENOMEM; + return ret == len ? 0 : -EFAULT; } #if !defined(__HAVE_ARCH_GATE_AREA) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 89fee2dcb039..6837a1014372 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/migrate.h> #include <linux/page-isolation.h> +#include <linux/pfn.h> #include <asm/tlbflush.h> @@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + release_mem_region(pfn << PAGE_SHIFT, + PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; @@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can only deal with pages on * LRU. */ - ret = isolate_lru_page(page, &source); + ret = isolate_lru_page(page); if (!ret) { /* Success */ + list_add_tail(&page->lru, &source); move_pages--; } else { /* Becasue we don't have big zone->lock. we should @@ -849,10 +851,19 @@ failed_removal: return ret; } + +int remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = start_pfn + PFN_DOWN(size); + return offline_pages(start_pfn, end_pfn, 120 * HZ); +} #else int remove_memory(u64 start, u64 size) { return -EINVAL; } -EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ +EXPORT_SYMBOL_GPL(remove_memory); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83369058ec13..36f42573a335 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,6 +93,8 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> +#include "internal.h" + /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ @@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) - isolate_lru_page(page, pagelist); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + } + } } static struct page *new_node_page(struct page *page, unsigned long node, int **x) @@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) if (PageSwapCache(page)) md->swapcache++; - if (PageActive(page)) + if (PageActive(page) || PageUnevictable(page)) md->active++; if (PageWriteback(page)) diff --git a/mm/migrate.c b/mm/migrate.c index 2a80136b23bb..6602941bfab0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,36 +37,6 @@ #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { - ret = 0; - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - -/* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). */ @@ -83,23 +53,9 @@ int migrate_prep(void) return 0; } -static inline void move_to_lru(struct page *page) -{ - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - /* - * Add isolated pages on the list back to the LRU. + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. * * returns the number of pages put back. */ @@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); count++; } return count; @@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, __inc_zone_page_state(newpage, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); - if (!PageSwapCache(newpage)) - mem_cgroup_uncharge_cache_page(page); return 0; } @@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { + int anon; + copy_highpage(newpage, page); if (PageError(page)) @@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); + } else + unevictable_migrate_page(newpage, page); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); page->mapping = NULL; + if (!anon) /* This page was removed from radix-tree. */ + mem_cgroup_uncharge_cache_page(page); + /* * If any waiters have accumulated on the new page then * wake them up. @@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping, * * The new page will have replaced the old page if this function * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page) { @@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page) /* Prepare mapping for the new page.*/ newpage->index = page->index; newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); mapping = page_mapping(page); if (!mapping) @@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - if (page_count(page) == 1) + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto move_newpage; + } charge = mem_cgroup_prepare_migration(page, newpage); if (charge == -ENOMEM) { @@ -730,7 +702,6 @@ rcu_unlock: rcu_read_unlock(); unlock: - unlock_page(page); if (rc != -EAGAIN) { @@ -741,17 +712,19 @@ unlock: * restored. */ list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); } move_newpage: if (!charge) mem_cgroup_end_migration(newpage); + /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ - move_to_lru(newpage); + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, * Move a set of pages as indicated in the pm array. The addr * field must be set to the virtual address of the page to be moved * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. */ -static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, - int migrate_all) +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) { int err; struct page_to_node *pp; @@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, !migrate_all) goto put_and_set; - err = isolate_lru_page(page, &pagelist); + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); put_and_set: /* * Either remove the duplicate refcount from @@ -926,36 +903,118 @@ set_status: pp->status = err; } + err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm); - else - err = -ENOENT; up_read(&mm->mmap_sem); return err; } /* - * Determine the nodes of a list of pages. The addr in the pm array - * must have been set to the virtual address of which we want to determine - * the node number. + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. */ -static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { + struct page_to_node *pm = NULL; + nodemask_t task_nodes; + int err = 0; + int i; + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void __user *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_pm; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out_pm; + + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; + + pm[i].node = node; + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out_pm: + vfree(pm); +out: + return err; +} + +/* + * Determine the nodes of an array of pages and store it in an array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ + unsigned long i; + int err; + down_read(&mm->mmap_sem); - for ( ; pm->node != MAX_NUMNODES; pm++) { + for (i = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; struct vm_area_struct *vma; struct page *page; - int err; err = -EFAULT; - vma = find_vma(mm, pm->addr); + if (get_user(p, pages+i)) + goto out; + addr = (unsigned long) p; + + vma = find_vma(mm, addr); if (!vma) goto set_status; - page = follow_page(vma, pm->addr, 0); + page = follow_page(vma, addr, 0); err = PTR_ERR(page); if (IS_ERR(page)) @@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) err = page_to_nid(page); set_status: - pm->status = err; + put_user(err, status+i); } + err = 0; +out: up_read(&mm->mmap_sem); - return 0; + return err; } /* @@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - int err = 0; - int i; struct task_struct *task; - nodemask_t task_nodes; struct mm_struct *mm; - struct page_to_node *pm = NULL; + int err; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, (current->uid != task->suid) && (current->uid != task->uid) && !capable(CAP_SYS_NICE)) { err = -EPERM; - goto out2; + goto out; } err = security_task_movememory(task); if (err) - goto out2; - - - task_nodes = cpuset_mems_allowed(task); - - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out2; - } - - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; - goto out2; - } - - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. - */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; - - err = -EFAULT; - if (get_user(p, pages + i)) - goto out; - - pm[i].addr = (unsigned long)p; - if (nodes) { - int node; - - if (get_user(node, nodes + i)) - goto out; - - err = -ENODEV; - if (!node_state(node, N_HIGH_MEMORY)) - goto out; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out; + goto out; - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { + err = do_pages_stat(mm, nr_pages, pages, status); } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; - - if (nodes) - err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); - else - err = do_pages_stat(mm, pm); - - if (err >= 0) - /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) - err = -EFAULT; out: - vfree(pm); -out2: mmput(mm); return err; } diff --git a/mm/mlock.c b/mm/mlock.c index 01fbe93eff5c..008ea70b7afa 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,10 +8,18 @@ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/rmap.h> +#include <linux/mmzone.h> +#include <linux/hugetlb.h> + +#include "internal.h" int can_do_mlock(void) { @@ -23,17 +31,381 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * Page not on the LRU yet. Flush all pagevecs and retry. + */ + lru_add_drain_all(); + if (!isolate_lru_page(page)) + putback_lru_page(page); + else if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + } +} + +/** + * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @mlock: 0 indicate munlock, otherwise mlock. + * + * If @mlock == 0, unlock an mlocked range; + * else mlock the range of pages. This takes care of making the pages present , + * too. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held for at least read. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int nr_pages = (end - start) / PAGE_SIZE; + int ret; + int gup_flags = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && + (atomic_read(&mm->mm_users) != 0)); + + /* + * mlock: don't page populate if page has PROT_NONE permission. + * munlock: the pages always do munlock althrough + * its has PROT_NONE permission. + */ + if (!mlock) + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + + if (vma->vm_flags & VM_WRITE) + gup_flags |= GUP_FLAGS_WRITE; + + lru_add_drain_all(); /* push cached pages to LRU */ + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = __get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + gup_flags, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the the pages if/when they get faulted in. + */ + if (ret < 0) + break; + if (ret == 0) { + /* + * We know the vma is there, so the only time + * we cannot get a single page should be an + * error (ret < 0) case. + */ + WARN_ON(1); + break; + } + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + lock_page(page); + /* + * Because we lock page here and migration is blocked + * by the elevated reference, we need only check for + * page truncation (file-cache only). + */ + if (page->mapping) { + if (mlock) + mlock_vma_page(page); + else + munlock_vma_page(page); + } + unlock_page(page); + put_page(page); /* ref from get_user_pages() */ + + /* + * here we assume that get_user_pages() has given us + * a list of virtually contiguous pages. + */ + addr += PAGE_SIZE; /* for next get_user_pages() */ + nr_pages--; + } + ret = 0; + } + + lru_add_drain_all(); /* to update stats */ + + return ret; /* count entire vma as locked_vm */ +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Just make pages present if VM_LOCKED. No-op if unlocking. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + if (mlock && (vma->vm_flags & VM_LOCKED)) + return make_pages_present(start, end); + return 0; +} + +static inline int __mlock_posix_error_return(long retval) +{ + return 0; +} + +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/** + * mlock_vma_pages_range() - mlock pages in specified vma range. + * @vma - the vma containing the specfied address range + * @start - starting address in @vma to mlock + * @end - end address [+1] in @vma to mlock + * + * For mmap()/mremap()/expansion of mlocked vma. + * + * return 0 on success for "normal" vmas. + * + * return number of pages [> 0] to be removed from locked_vm on success + * of "special" vmas. + * + * return negative error if vma spanning @start-@range disappears while + * mmap semaphore is dropped. Unlikely? + */ +long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) { + long error; + downgrade_write(&mm->mmap_sem); + + error = __mlock_vma_pages_range(vma, start, end, 1); + + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + /* non-NULL vma must contain @start, but need to check @end */ + if (!vma || end > vma->vm_end) + return -ENOMEM; + + return 0; /* hide other errors from mmap(), et al */ + } + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* error or pages NOT mlocked */ +} + + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range.' + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. + */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + vma->vm_flags &= ~VM_LOCKED; + __mlock_vma_pages_range(vma, start, end, 0); +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; - int pages; + int nr_pages; int ret = 0; - - if (newflags == vma->vm_flags) { - *prev = vma; - goto out; + int lock = newflags & VM_LOCKED; + + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto success; } - *prev = vma; - if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) @@ -60,24 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, success: /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + + /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, make_pages_present below will bring it back. + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ vma->vm_flags = newflags; - /* - * Keep track of amount of locked VM. - */ - pages = (end - start) >> PAGE_SHIFT; - if (newflags & VM_LOCKED) { - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); + if (lock) { + /* + * mmap_sem is currently held for write. Downgrade the write + * lock to a read lock so that other faults, mmap scans, ... + * while we fault in all pages. + */ + downgrade_write(&mm->mmap_sem); + + ret = __mlock_vma_pages_range(vma, start, end, 1); + + /* + * Need to reacquire mmap sem in write mode, as our callers + * expect this. We have no support for atomically upgrading + * a sem to write, so we need to check for ranges while sem + * is unlocked. + */ + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + *prev = find_vma(mm, start); + /* non-NULL *prev must contain @start, but need to check @end */ + if (!(*prev) || end > (*prev)->vm_end) + ret = -ENOMEM; + else if (ret > 0) { + mm->locked_vm -= ret; + ret = 0; + } else + ret = __mlock_posix_error_return(ret); /* translate if needed */ + } else { + /* + * TODO: for unlocking, pages will already be resident, so + * we don't need to wait for allocations/reclaim/pagein, ... + * However, unlocking a very large region can still take a + * while. Should we downgrade the semaphore for both lock + * AND unlock ? + */ + __mlock_vma_pages_range(vma, start, end, 0); } - mm->locked_vm -= pages; out: + *prev = vma; return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index e7a5a68a9c2e..74f4d158022e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, rb_insert_color(&vma->vm_rb, &mm->mm_rb); } -static inline void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma) { struct file * file; @@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) - static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { @@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, return -EPERM; vm_flags |= VM_LOCKED; } + /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -1139,10 +1138,12 @@ munmap_back: * The VM_SHARED test is necessary because shmem_zero_setup * will create the file object for a shared anonymous map below. */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } /* * Determine the object being mapped and call the appropriate @@ -1224,10 +1225,14 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un * vma is the last one with address > vma->vm_end. Have to extend vma. */ #ifndef CONFIG_IA64 -static inline +static #endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { @@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -static inline int expand_downwards(struct vm_area_struct *vma, +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) - make_pages_present(addr, prev->vm_end); + if (prev->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) + return NULL; /* vma gone! */ + } return prev; } #else @@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) - make_pages_present(addr, start); + if (vma->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(vma, addr, start) < 0) + return NULL; /* vma gone! */ + } return vma; } #endif @@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) long nrpages = vma_pages(vma); mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); @@ -1914,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) vma = prev? prev->vm_next: mm->mmap; /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + + /* * Remove the vma's, and unmap the actual pages */ detach_vmas_to_be_unmapped(mm, vma, prev, end); @@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) goto out; /* @@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } return addr; } @@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + vma = mm->mmap; lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); diff --git a/mm/mremap.c b/mm/mremap.c index 1a7743923c8c..58a2908f42f5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); + mlock_vma_pages_range(new_vma, new_addr + old_len, + new_addr + new_len); } return new_addr; @@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr, vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - make_pages_present(addr + old_len, + mlock_vma_pages_range(vma, addr + old_len, addr + new_len); } ret = addr; diff --git a/mm/nommu.c b/mm/nommu.c index ed75bc962fbe..2696b24f2bb3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -34,6 +34,8 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> +#include "internal.h" + void *high_memory; struct page *mem_map; unsigned long max_mapnr; @@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. * - if 'force' is set, we only require the "MAY" flags. @@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* protect what we can, including chardevs */ if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - !(vm_flags & vma->vm_flags)) + (!ignore && !(vm_flags & vma->vm_flags))) goto finish_or_fault; if (pages) { @@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? : -EFAULT; } + + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b40f6d5f8fe9..2970e35fd03f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) - + zone_page_state(z, NR_INACTIVE) - + zone_page_state(z, NR_ACTIVE); + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) - + global_page_state(NR_INACTIVE) - + global_page_state(NR_ACTIVE); + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9eb9eb928285..d0a240fbb8bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -44,7 +44,7 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> -#include <linux/memcontrol.h> +#include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <asm/tlbflush.h> @@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - void *pc = page_get_page_cgroup(page); - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); - if (pc) { - printk(KERN_EMERG "cgroup:%p\n", pc); - page_reset_bad_cgroup(page); - } + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -454,14 +449,16 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { + free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); + if (PageSwapBacked(page)) + __ClearPageSwapBacked(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need @@ -600,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); @@ -614,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk +#ifdef CONFIG_UNEVICTABLE_LRU + | 1 << PG_mlocked +#endif + ); set_page_private(page, 0); set_page_refcounted(page); @@ -1862,10 +1862,21 @@ void show_free_areas(void) } } - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", - global_page_state(NR_ACTIVE), - global_page_state(NR_INACTIVE), + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_UNEVICTABLE), +#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1888,8 +1899,13 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" - " active:%lukB" - " inactive:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lukB" +#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -1899,8 +1915,13 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + K(zone_page_state(zone, NR_UNEVICTABLE)), +#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -3410,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; + pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; + enum lru_list l; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -3428,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages used for memmap\n", + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING @@ -3439,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, /* Account for reserved pages */ if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages reserved\n", + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } @@ -3465,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; - zone->nr_scan_inactive = 0; + for_each_lru(l) { + INIT_LIST_HEAD(&zone->lru[l].list); + zone->lru[l].nr_scan = 0; + } + zone->recent_rotated[0] = 0; + zone->recent_rotated[1] = 0; + zone->recent_scanned[0] = 0; + zone->recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) @@ -4210,7 +4236,7 @@ void setup_per_zone_pages_min(void) for_each_zone(zone) { u64 tmp; - spin_lock_irqsave(&zone->lru_lock, flags); + spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone->present_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { @@ -4242,13 +4268,53 @@ void setup_per_zone_pages_min(void) zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); setup_zone_migrate_reserve(zone); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } /* update totalreserve_pages */ calculate_totalreserve_pages(); } +/** + * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. + * + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * The inactive_anon ratio is the target ratio of ACTIVE_ANON to + * INACTIVE_ANON pages on this zone's LRU, maintained by the + * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of + * the anonymous pages are kept on the inactive list. + * + * total target max + * memory ratio inactive anon + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +void setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) { + unsigned int gb, ratio; + + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + ratio = int_sqrt(10 * gb); + if (!ratio) + ratio = 1; + + zone->inactive_ratio = ratio; + } +} + /* * Initialise min_free_kbytes. * @@ -4286,6 +4352,7 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 65536; setup_per_zone_pages_min(); setup_per_zone_lowmem_reserve(); + setup_per_zone_inactive_ratio(); return 0; } module_init(init_per_zone_pages_min) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 000000000000..5d86550701f2 --- /dev/null +++ b/mm/page_cgroup.c @@ -0,0 +1,237 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/bit_spinlock.h> +#include <linux/page_cgroup.h> +#include <linux/hash.h> +#include <linux/memory.h> + +static void __meminit +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +{ + pc->flags = 0; + pc->mem_cgroup = NULL; + pc->page = pfn_to_page(pfn); +} +static unsigned long total_usage; + +#if !defined(CONFIG_SPARSEMEM) + + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + pgdat->node_page_cgroup = NULL; +} + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_cgroup *base; + + base = NODE_DATA(page_to_nid(page))->node_page_cgroup; + if (unlikely(!base)) + return NULL; + + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; + return base + offset; +} + +static int __init alloc_node_page_cgroup(int nid) +{ + struct page_cgroup *base, *pc; + unsigned long table_size; + unsigned long start_pfn, nr_pages, index; + + start_pfn = NODE_DATA(nid)->node_start_pfn; + nr_pages = NODE_DATA(nid)->node_spanned_pages; + + table_size = sizeof(struct page_cgroup) * nr_pages; + + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!base) + return -ENOMEM; + for (index = 0; index < nr_pages; index++) { + pc = base + index; + __init_page_cgroup(pc, start_pfn + index); + } + NODE_DATA(nid)->node_page_cgroup = base; + total_usage += table_size; + return 0; +} + +void __init page_cgroup_init(void) +{ + + int nid, fail; + + for_each_online_node(nid) { + fail = alloc_node_page_cgroup(nid); + if (fail) + goto fail; + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you" + " don't want\n"); + return; +fail: + printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); + printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + + return section->page_cgroup + pfn; +} + +int __meminit init_section_page_cgroup(unsigned long pfn) +{ + struct mem_section *section; + struct page_cgroup *base, *pc; + unsigned long table_size; + int nid, index; + + section = __pfn_to_section(pfn); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + + if (!base) { + printk(KERN_ERR "page cgroup allocation failure\n"); + return -ENOMEM; + } + + for (index = 0; index < PAGES_PER_SECTION; index++) { + pc = base + index; + __init_page_cgroup(pc, pfn + index); + } + + section = __pfn_to_section(pfn); + section->page_cgroup = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +void __free_page_cgroup(unsigned long pfn) +{ + struct mem_section *ms; + struct page_cgroup *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_cgroup) + return; + base = ms->page_cgroup + pfn; + ms->page_cgroup = NULL; + if (is_vmalloc_addr(base)) + vfree(base); + else + kfree(base); +} + +int online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + + return -ENOMEM; +} + +int offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + return 0; + +} + +static int page_cgroup_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + ret = notifier_from_errno(ret); + return ret; +} + +#endif + +void __init page_cgroup_init(void) +{ + unsigned long pfn; + int fail = 0; + + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (fail) { + printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + panic("Out of memory"); + } else { + hotplug_memory_notifier(page_cgroup_callback, 0); + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you don't" + " want\n"); +} + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + return; +} + +#endif diff --git a/mm/readahead.c b/mm/readahead.c index 6cbd9a72fde2..bec83c15a78f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } diff --git a/mm/rmap.c b/mm/rmap.c index 0383acfcb068..10993942d6c9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -53,9 +53,47 @@ #include <asm/tlbflush.h> -struct kmem_cache *anon_vma_cachep; +#include "internal.h" -/* This must be called under the mmap_sem. */ +static struct kmem_cache *anon_vma_cachep; + +static inline struct anon_vma *anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); +} + +static inline void anon_vma_free(struct anon_vma *anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +/** + * anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * This makes sure the memory mapping described by 'vma' has + * an 'anon_vma' attached to it, so that we can associate the + * anonymous pages mapped into it with that anon_vma. + * + * The common case will be that we already have one, but if + * if not we either need to find an adjacent mapping that we + * can re-use the anon_vma from (very common when the only + * reason for splitting a vma has been mprotect()), or we + * allocate a new one. + * + * Anon-vma allocations are very subtle, because we may have + * optimistically looked up an anon_vma in page_lock_anon_vma() + * and that may actually touch the spinlock even in the newly + * allocated vma (it depends on RCU to make sure that the + * anon_vma isn't actually destroyed). + * + * As a result, we need to do proper anon_vma locking even + * for the new allocation. At the same time, we do not want + * to do any locking for the common case of already having + * an anon_vma. + * + * This must be called with the mmap_sem held for reading. + */ int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; @@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma) might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated, *locked; + struct anon_vma *allocated; anon_vma = find_mergeable_anon_vma(vma); - if (anon_vma) { - allocated = NULL; - locked = anon_vma; - spin_lock(&locked->lock); - } else { + allocated = NULL; + if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) return -ENOMEM; allocated = anon_vma; - locked = NULL; } + spin_lock(&anon_vma->lock); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); @@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) } spin_unlock(&mm->page_table_lock); - if (locked) - spin_unlock(&locked->lock); + spin_unlock(&anon_vma->lock); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -157,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -177,7 +211,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + + address = vma_address(page, vma); + if (address == -EFAULT) /* out of vma range */ + return 0; + pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); + if (!pte) /* the page is not in this mm */ + return 0; + pte_unmap_unlock(pte, ptl); + + return 1; +} + /* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. @@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page, if (!pte) goto out; + /* + * Don't want to elevate referenced for mlocked page that gets this far, + * in order that it progresses to try_to_unmap and is moved to the + * unevictable list. + */ if (vma->vm_flags & VM_LOCKED) { - referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young_notify(vma, address, pte)) + goto out_unmap; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page, rwsem_is_locked(&mm->mmap_sem)) referenced++; +out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: @@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) - == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; - break; - } referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; @@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) page_clear_dirty(page); set_page_dirty(page); } - - mem_cgroup_uncharge_page(page); + if (PageAnon(page)) + mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); /* @@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young_notify(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; - } + if (!migration) { + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_MLOCK; + goto out_unmap; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); @@ -802,12 +870,17 @@ out: * For very sparsely populated VMAs this is a little inefficient - chances are * there there won't be many ptes located within the scan cluster. In this case * maybe we could scan further - to the end of the pte page, perhaps. + * + * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can + * acquire it without blocking. If vma locked, mlock the pages in the cluster, + * rather than unmapping them. If we encounter the "check_page" that vmscan is + * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. */ #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) -static void try_to_unmap_cluster(unsigned long cursor, - unsigned int *mapcount, struct vm_area_struct *vma) +static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, + struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; + int ret = SWAP_AGAIN; + int locked_vma = 0; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - return; + return ret; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - return; + return ret; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - return; + return ret; + + /* + * MLOCK_PAGES => feature is configured. + * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * keep the sem while scanning the cluster for mlocking pages. + */ + if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + locked_vma = (vma->vm_flags & VM_LOCKED); + if (!locked_vma) + up_read(&vma->vm_mm->mmap_sem); /* don't need it */ + } pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); + if (locked_vma) { + mlock_vma_page(page); /* no-op if already mlocked */ + if (page == check_page) + ret = SWAP_MLOCK; + continue; /* don't unmap */ + } + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; @@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + if (locked_vma) + up_read(&vma->vm_mm->mmap_sem); + return ret; } -static int try_to_unmap_anon(struct page *page, int migration) +/* + * common handling for pages mapped in VM_LOCKED vmas + */ +static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +{ + int mlocked = 0; + + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + mlocked++; /* really mlocked the page */ + } + up_read(&vma->vm_mm->mmap_sem); + } + return mlocked; +} + +/** + * try_to_unmap_anon - unmap or unlock anonymous page using the object-based + * rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * anonymous pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_anon(struct page *page, int unlock, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned int mlocked = 0; int ret = SWAP_AGAIN; + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) + continue; /* must visit all unlocked vmas */ + ret = SWAP_MLOCK; /* saw at least one mlocked vma */ + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } page_unlock_anon_vma(anon_vma); + + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + return ret; } /** - * try_to_unmap_file - unmap file page using the object-based rmap method - * @page: the page to unmap - * @migration: migration flag + * try_to_unmap_file - unmap/unlock file page using the object-based rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * This function is only called from try_to_unmap for object-based pages. + * This function is only called from try_to_unmap/try_to_munlock for + * object-based pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int migration) +static int try_to_unmap_file(struct page *page, int unlock, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; + unsigned int mlocked = 0; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } + if (mlocked) + goto out; + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; /* leave mlocked == 0 */ + goto out; /* no need to look further */ + } + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration) max_nl_size = cursor; } - if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ ret = SWAP_FAIL; goto out; } @@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (!MLOCK_PAGES && !migration && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - try_to_unmap_cluster(cursor, &mapcount, vma); + ret = try_to_unmap_cluster(cursor, &mapcount, + vma, page); + if (ret == SWAP_MLOCK) + mlocked = 2; /* to return below */ cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1002,6 +1192,7 @@ out: * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. */ int try_to_unmap(struct page *page, int migration) { @@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration) BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, migration); + ret = try_to_unmap_anon(page, 0, migration); else - ret = try_to_unmap_file(page, migration); - - if (!page_mapped(page)) + ret = try_to_unmap_file(page, 0, migration); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; } +#ifdef CONFIG_UNEVICTABLE_LRU +/** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked + * + * Called from munlock code. Checks all of the VMAs mapping the page + * to make sure nobody else has this page mlocked. The page will be + * returned with PG_mlocked cleared if no other vmas have it mlocked. + * + * Return values are: + * + * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_MLOCK - page is now mlocked. + */ +int try_to_munlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + + if (PageAnon(page)) + return try_to_unmap_anon(page, 1, 0); + else + return try_to_unmap_file(page, 1, 0); +} +#endif diff --git a/mm/shmem.c b/mm/shmem.c index d87958a5f03e..d38d7e61fcd0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = default_unplug_io_fn, }; @@ -1367,6 +1367,7 @@ repeat: error = -ENOMEM; goto failed; } + SetPageSwapBacked(filepage); /* Precharge page while we can wait, compensate after */ error = mem_cgroup_cache_charge(filepage, current->mm, @@ -1476,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; + out_nomem: spin_unlock(&info->lock); return retval; diff --git a/mm/swap.c b/mm/swap.c index 9e0cb3118079..2152e48a7b8f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,11 +31,12 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include "internal.h" + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs); +static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); /* @@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec) zone = pagezone; spin_lock(&zone->lru_lock); } - if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->inactive_list); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_is_file_cache(page); + list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } } @@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec) void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && - PageLRU(page)) { + !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; @@ -157,12 +159,19 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + SetPageActive(page); - add_page_to_active_list(zone, page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - mem_cgroup_move_lists(page, true); + mem_cgroup_move_lists(page, lru); + + zone->recent_rotated[!!file]++; + zone->recent_scanned[!!file]++; } spin_unlock_irq(&zone->lru_lock); } @@ -176,7 +185,8 @@ void activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { + if (!PageActive(page) && !PageUnevictable(page) && + PageReferenced(page) && PageLRU(page)) { activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { @@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); -/** - * lru_cache_add: add a page to the page lists - * @page: the page to add - */ -void lru_cache_add(struct page *page) +void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - __pagevec_lru_add(pvec); + ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } -void lru_cache_add_active(struct page *page) +/** + * lru_cache_add_lru - add a page to a page list + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. + */ +void lru_cache_add_lru(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + if (PageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); + ClearPageActive(page); + } else if (PageUnevictable(page)) { + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + } - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); + __lru_cache_add(page, lru); +} + +/** + * add_page_to_unevictable_list - add a page to the unevictable list + * @page: the page to be added to the unevictable list + * + * Add page directly to its zone's unevictable list. To avoid races with + * tasks that might be making the page evictable, through eg. munlock, + * munmap or exit, while it's not on the lru, we want to add the page + * while it's locked or otherwise "invisible" to other tasks. This is + * difficult to do when using the pagevec cache, so bypass that. + */ +void add_page_to_unevictable_list(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + SetPageUnevictable(page); + SetPageLRU(page); + add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + spin_unlock_irq(&zone->lru_lock); +} + +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * place @page on active or unevictable LRU list, depending on + * page_evictable(). Note that if the page is not evictable, + * it goes directly back onto it's zone's unevictable list. It does + * NOT use a per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); + else + add_page_to_unevictable_list(page); } /* @@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page) */ static void drain_cpu_pagevecs(int cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; + int lru; - pvec = &per_cpu(lru_add_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -244,7 +299,7 @@ void lru_add_drain(void) put_cpu(); } -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold) if (PageLRU(page)) { struct zone *pagezone = page_zone(page); + if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, @@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec) +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - add_page_to_inactive_list(zone, page); + if (is_active_lru(lru)) + SetPageActive(page); + add_page_to_lru_list(zone, page, lru); } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec) pagevec_reinit(pvec); } -EXPORT_SYMBOL(__pagevec_lru_add); +EXPORT_SYMBOL(____pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) { int i; - struct zone *zone = NULL; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (PagePrivate(page) && trylock_page(page)) { + if (PagePrivate(page)) + try_to_release_page(page, 0); + unlock_page(page); } - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(PageActive(page)); - SetPageActive(page); - add_page_to_active_list(zone, page); } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); } -/* - * Try to drop buffers from the pages in a pagevec +/** + * pagevec_swap_free - try to free swap space from the pages in a pagevec + * @pvec: pagevec with swapcache pages to free the swap space of + * + * The caller needs to hold an extra reference to each page and + * not hold the page lock on the pages. This function uses a + * trylock on the page lock so it may not always free the swap + * space associated with a page. */ -void pagevec_strip(struct pagevec *pvec) +void pagevec_swap_free(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) - try_to_release_page(page, 0); + if (PageSwapCache(page) && trylock_page(page)) { + if (PageSwapCache(page)) + remove_exclusive_swap_page_ref(page); unlock_page(page); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 797c3831cbec..3353c9029cef 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(!PageLocked(page)); BUG_ON(PageSwapCache(page)); BUG_ON(PagePrivate(page)); + BUG_ON(!PageSwapBacked(page)); error = radix_tree_preload(gfp_mask); if (!error) { page_cache_get(page); @@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - set_page_locked(new_page); + __set_page_locked(new_page); + SetPageSwapBacked(new_page); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add_anon(new_page); swap_readpage(NULL, new_page); return new_page; } - clear_page_locked(new_page); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); swap_free(entry); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e330f2998fa..90cb67a5417c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page) * Work out if there are any other processes sharing this * swap cache page. Free it if you can. Return success. */ -int remove_exclusive_swap_page(struct page *page) +static int remove_exclusive_swap_page_count(struct page *page, int count) { int retval; struct swap_info_struct * p; @@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != 2) /* 2: us + cache */ + if (page_count(page) != count) /* us + cache + ptes */ return 0; entry.val = page_private(page); @@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page) if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. */ spin_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == 2) && !PageWriteback(page)) { + if ((page_count(page) == count) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; @@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page) } /* + * Most of the time the page should have two references: one for the + * process and one for the swap cache. + */ +int remove_exclusive_swap_page(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2); +} + +/* + * The pageout code holds an extra reference to the page. That raises + * the reference count to test for to 2 for a page that is only in the + * swap cache plus 1 for each process that maps the page. + */ +int remove_exclusive_swap_page_ref(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); +} + +/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ @@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(!trylock_page(page))) { + if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } diff --git a/mm/truncate.c b/mm/truncate.c index e83e4b114ef1..1229211104f8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -18,6 +18,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include "internal.h" /** @@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); + clear_page_mlock(page); remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ @@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; + clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PageDirty(page)) goto failed; + clear_page_mlock(page); BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bba06c41fc59..712ae47af0bf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -8,6 +8,7 @@ * Numa awareness, Christoph Lameter, SGI, June 2005 */ +#include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> @@ -18,16 +19,17 @@ #include <linux/debugobjects.h> #include <linux/vmalloc.h> #include <linux/kallsyms.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/radix-tree.h> +#include <linux/rcupdate.h> +#include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); +/*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) } while (pte++, addr += PAGE_SIZE, addr != end); } -static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end) +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_kernel_range(unsigned long addr, unsigned long size) +static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; - unsigned long start = addr; - unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); -} - -static void unmap_vm_area(struct vm_struct *area) -{ - unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pte_t *pte; + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { - struct page *page = **pages; - WARN_ON(!pte_none(*pte)); - if (!page) + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) return -ENOMEM; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*pages)++; + (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); return 0; } -static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pmd_t *pmd; unsigned long next; @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; @@ -141,44 +140,49 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; - int err; + int err = 0; + int nr = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap((unsigned long) area->addr, end); - return err; + flush_cache_vmap(addr, end); + + if (unlikely(err)) + return err; + return nr; } -EXPORT_SYMBOL_GPL(map_vm_area); /* - * Map a vmalloc()-space virtual address to the physical page. + * Walk a vmap address to the struct page it maps. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for @@ -188,10 +192,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) !is_module_address(addr)); if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); + pmd_t *pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { + pte_t *ptep, pte; + ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) @@ -213,13 +219,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct * -__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + +/*** Global kva allocator ***/ + +#define VM_LAZY_FREE 0x01 +#define VM_LAZY_FREEING 0x02 +#define VM_VM_AREA 0x04 + +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + void *private; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(vmap_area_lock); +static struct rb_root vmap_area_root = RB_ROOT; +static LIST_HEAD(vmap_area_list); + +static struct vmap_area *__find_vmap_area(unsigned long addr) { - struct vm_struct **p, *tmp, *area; - unsigned long align = 1; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr > va->va_start) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +static void __insert_vmap_area(struct vmap_area *va) +{ + struct rb_node **p = &vmap_area_root.rb_node; + struct rb_node *parent = NULL; + struct rb_node *tmp; + + while (*p) { + struct vmap_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (va->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&va->rb_node, parent, p); + rb_insert_color(&va->rb_node, &vmap_area_root); + + /* address-sort this list so it is usable like the vmlist */ + tmp = rb_prev(&va->rb_node); + if (tmp) { + struct vmap_area *prev; + prev = rb_entry(tmp, struct vmap_area, rb_node); + list_add_rcu(&va->list, &prev->list); + } else + list_add_rcu(&va->list, &vmap_area_list); +} + +static void purge_vmap_area_lazy(void); + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + struct rb_node *n; + unsigned long addr; + int purged = 0; + + BUG_ON(size & ~PAGE_MASK); + + addr = ALIGN(vstart, align); + + va = kmalloc_node(sizeof(struct vmap_area), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + +retry: + spin_lock(&vmap_area_lock); + /* XXX: could have a last_hole cache */ + n = vmap_area_root.rb_node; + if (n) { + struct vmap_area *first = NULL; + + do { + struct vmap_area *tmp; + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + + while (addr + size >= first->va_start && addr + size <= vend) { + addr = ALIGN(first->va_end + PAGE_SIZE, align); + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + } +found: + if (addr + size > vend) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING "vmap allocation failed: " + "use vmalloc=<size> to increase size.\n"); + return ERR_PTR(-EBUSY); + } + + BUG_ON(addr & (align-1)); + + va->va_start = addr; + va->va_end = addr + size; + va->flags = 0; + __insert_vmap_area(va); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void rcu_free_va(struct rcu_head *head) +{ + struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + + kfree(va); +} + +static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + call_rcu(&va->rcu_head, rcu_free_va); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). + * Returns with *start = min(*start, lowest purged address) + * *end = max(*end, highest purged address) + */ +static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + int sync, int force_flush) +{ + static DEFINE_SPINLOCK(purge_lock); + LIST_HEAD(valist); + struct vmap_area *va; + int nr = 0; + + /* + * If sync is 0 but force_flush is 1, we'll go sync anyway but callers + * should not expect such behaviour. This just simplifies locking for + * the case that isn't actually used at the moment anyway. + */ + if (!sync && !force_flush) { + if (!spin_trylock(&purge_lock)) + return; + } else + spin_lock(&purge_lock); + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + unmap_vmap_area(va); + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + + if (nr) { + BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + atomic_sub(nr, &vmap_lazy_nr); + } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); + + if (nr) { + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + } + spin_unlock(&purge_lock); +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + purge_vmap_area_lazy(); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / NR_CPUS / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; + struct list_head dirty; + unsigned int nr_dirty; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + struct vmap_block_queue *vbq; + unsigned long free, dirty; + DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); + DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); + union { + struct { + struct list_head free_list; + struct list_head dirty_list; + }; + struct rcu_head rcu_head; + }; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (unlikely(IS_ERR(va))) { + kfree(vb); + return ERR_PTR(PTR_ERR(va)); + } + + err = radix_tree_preload(gfp_mask); + if (unlikely(err)) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + spin_lock_init(&vb->lock); + vb->va = va; + vb->free = VMAP_BBMAP_BITS; + vb->dirty = 0; + bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + INIT_LIST_HEAD(&vb->free_list); + INIT_LIST_HEAD(&vb->dirty_list); + + vb_idx = addr_to_vb_idx(va->va_start); + spin_lock(&vmap_block_tree_lock); + err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(err); + radix_tree_preload_end(); + + vbq = &get_cpu_var(vmap_block_queue); + vb->vbq = vbq; + spin_lock(&vbq->lock); + list_add(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + put_cpu_var(vmap_cpu_blocks); + + return vb; +} + +static void rcu_free_vb(struct rcu_head *head) +{ + struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); + + kfree(vb); +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + unsigned long vb_idx; + + spin_lock(&vb->vbq->lock); + if (!list_empty(&vb->free_list)) + list_del(&vb->free_list); + if (!list_empty(&vb->dirty_list)) + list_del(&vb->dirty_list); + spin_unlock(&vb->vbq->lock); + + vb_idx = addr_to_vb_idx(vb->va->va_start); + spin_lock(&vmap_block_tree_lock); + tmp = radix_tree_delete(&vmap_block_tree, vb_idx); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(tmp != vb); + + free_unmap_vmap_area(vb->va); + call_rcu(&vb->rcu_head, rcu_free_vb); +} + +static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + unsigned long addr = 0; + unsigned int order; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + +again: + rcu_read_lock(); + vbq = &get_cpu_var(vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = bitmap_find_free_region(vb->alloc_map, + VMAP_BBMAP_BITS, order); + + if (i >= 0) { + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_init(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; + } + spin_unlock(&vb->lock); + } + put_cpu_var(vmap_cpu_blocks); + rcu_read_unlock(); + + if (!addr) { + vb = new_vmap_block(gfp_mask); + if (IS_ERR(vb)) + return vb; + goto again; + } + + return (void *)addr; +} + +static void vb_free(const void *addr, unsigned long size) +{ + unsigned long offset; + unsigned long vb_idx; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + + offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + + vb_idx = addr_to_vb_idx((unsigned long)addr); + rcu_read_lock(); + vb = radix_tree_lookup(&vmap_block_tree, vb_idx); + rcu_read_unlock(); + BUG_ON(!vb); + + spin_lock(&vb->lock); + bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + if (!vb->dirty) { + spin_lock(&vb->vbq->lock); + list_add(&vb->dirty_list, &vb->vbq->dirty); + spin_unlock(&vb->vbq->lock); + } + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free || !list_empty(&vb->free_list)); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); + while (i < VMAP_BBMAP_BITS) { + unsigned long s, e; + int j; + j = find_next_zero_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + + s = vb->va->va_start + (i << PAGE_SHIFT); + e = vb->va->va_start + (j << PAGE_SHIFT); + vunmap_page_range(s, e); + flush = 1; + + if (s < start) + start = s; + if (e > end) + end = e; + + i = j; + i = find_next_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * @returns: a pointer to the address that has been mapped, or NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +void __init vmalloc_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + INIT_LIST_HEAD(&vbq->dirty); + vbq->nr_dirty = 0; + } +} + +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + err = vmap_page_range(addr, end, prot, *pages); + if (err > 0) { + *pages += err; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +/*** Old vmalloc interfaces ***/ +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long flags, unsigned long start, unsigned long end, + int node, gfp_t gfp_mask, void *caller) +{ + static struct vmap_area *va; + struct vm_struct *area; + struct vm_struct *tmp, **p; + unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -232,13 +976,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, align = 1ul << bit; } - addr = ALIGN(start, align); + size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); - if (unlikely(!area)) return NULL; @@ -247,48 +990,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, */ size += PAGE_SIZE; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { - if ((unsigned long)tmp->addr < addr) { - if((unsigned long)tmp->addr + tmp->size >= addr) - addr = ALIGN(tmp->size + - (unsigned long)tmp->addr, align); - continue; - } - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long)tmp->addr) - goto found; - addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); - if (addr > end - size) - goto out; + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; } - if ((size + addr) < addr) - goto out; - if (addr > end - size) - goto out; - -found: - area->next = *p; - *p = area; area->flags = flags; - area->addr = (void *)addr; + area->addr = (void *)va->va_start; area->size = size; area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; area->caller = caller; + va->private = area; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= area->addr) + break; + } + area->next = *p; + *p = area; write_unlock(&vmlist_lock); return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - if (printk_ratelimit()) - printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); - return NULL; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, @@ -328,39 +1055,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, gfp_mask, __builtin_return_address(0)); } -/* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(const void *addr) +static struct vm_struct *find_vm_area(const void *addr) { - struct vm_struct *tmp; + struct vmap_area *va; - for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { - if (tmp->addr == addr) - break; - } - - return tmp; -} - -/* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(const void *addr) -{ - struct vm_struct **p, *tmp; + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->private; - for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { - if (tmp->addr == addr) - goto found; - } return NULL; - -found: - unmap_vm_area(tmp); - *p = tmp->next; - - /* - * Remove the guard page. - */ - tmp->size -= PAGE_SIZE; - return tmp; } /** @@ -373,11 +1076,24 @@ found: */ struct vm_struct *remove_vm_area(const void *addr) { - struct vm_struct *v; - write_lock(&vmlist_lock); - v = __remove_vm_area(addr); - write_unlock(&vmlist_lock); - return v; + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->private; + struct vm_struct *tmp, **p; + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + + return vm; + } + return NULL; } static void __vunmap(const void *addr, int deallocate_pages) @@ -487,6 +1203,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { @@ -613,10 +1331,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -696,10 +1412,8 @@ void *vmalloc_32_user(unsigned long size) ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -800,26 +1514,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, struct vm_struct *area; unsigned long uaddr = vma->vm_start; unsigned long usize = vma->vm_end - vma->vm_start; - int ret; if ((PAGE_SIZE-1) & (unsigned long)addr) return -EINVAL; - read_lock(&vmlist_lock); - area = __find_vm_area(addr); + area = find_vm_area(addr); if (!area) - goto out_einval_locked; + return -EINVAL; if (!(area->flags & VM_USERMAP)) - goto out_einval_locked; + return -EINVAL; if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) - goto out_einval_locked; - read_unlock(&vmlist_lock); + return -EINVAL; addr += pgoff << PAGE_SHIFT; do { struct page *page = vmalloc_to_page(addr); + int ret; + ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; @@ -832,11 +1545,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, /* Prevent "things" like memory migration? VM_flags need a cleanup... */ vma->vm_flags |= VM_RESERVED; - return ret; - -out_einval_locked: - read_unlock(&vmlist_lock); - return -EINVAL; + return 0; } EXPORT_SYMBOL(remap_vmalloc_range); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ff1a58e7c10..3b5860294bb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/delayacct.h> +#include <linux/sysctl.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -78,7 +79,7 @@ struct scan_control { unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page) return 0; } +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +#ifdef CONFIG_UNEVICTABLE_LRU +void putback_lru_page(struct page *page) +{ + int lru; + int active = !!TestClearPageActive(page); + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON(PageLRU(page)); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page, NULL)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + lru = active + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. + */ + lru = LRU_UNEVICTABLE; + add_page_to_unevictable_list(page); + } + mem_cgroup_move_lists(page, lru); + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && lru != LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && lru == LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +void putback_lru_page(struct page *page) +{ + int lru; + VM_BUG_ON(PageLRU(page)); + + lru = !!TestClearPageActive(page) + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + mem_cgroup_move_lists(page, lru); + put_page(page); +} +#endif /* CONFIG_UNEVICTABLE_LRU */ + + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; + if (unlikely(!page_evictable(page, NULL))) + goto cull_mlocked; + if (!sc->may_swap && page_mapped(page)) goto keep_locked; @@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) + if (PageAnon(page) && !PageSwapCache(page)) { + switch (try_to_munlock(page)) { + case SWAP_FAIL: /* shouldn't happen */ + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* fall thru'; add to swap cache */ + } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; - unlock_page(page); + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); free_it: nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) { @@ -646,14 +748,23 @@ free_it: } continue; +cull_mlocked: + unlock_page(page); + putback_lru_page(page); + continue; + activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + remove_exclusive_swap_page_ref(page); + VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) @@ -677,7 +788,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode, int file) { int ret = -EINVAL; @@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; + if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + return ret; + + /* + * When this function is being called for lumpy reclaim, we + * initially look into all LRU pages, active, inactive and + * unevictable; only give shrink_page_list evictable pages. + */ + if (PageUnevictable(page)) + return ret; + ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* @@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode) * @scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; unsigned long scan; @@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); nr_taken++; @@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode)) { + switch (__isolate_lru_page(cursor_page, mode, file)) { case 0: list_move(&cursor_page->lru, dst); nr_taken++; @@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* else it is being freed elsewhere */ list_move(&cursor_page->lru, src); default: - break; + break; /* ! on LRU or wrong list */ } } } @@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { + int lru = LRU_BASE; if (active) - return isolate_lru_pages(nr, &z->active_list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->inactive_list, dst, - scanned, order, mode); + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, + mode, !!file); } /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. */ -static unsigned long clear_active_flags(struct list_head *page_list) +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) { int nr_active = 0; + int lru; struct page *page; - list_for_each_entry(page, page_list, lru) + list_for_each_entry(page, page_list, lru) { + lru = page_is_file_cache(page); if (PageActive(page)) { + lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } + count[lru]++; + } return nr_active; } +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamentnal difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = page_lru(page); + ret = 0; + ClearPageLRU(page); + + del_page_from_lru_list(zone, page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) + struct zone *zone, struct scan_control *sc, + int priority, int file) { LIST_HEAD(page_list); struct pagevec pvec; @@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scan; unsigned long nr_freed; unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = ISOLATE_INACTIVE; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + mode = ISOLATE_BOTH; + else if (sc->order && priority < DEF_PRIORITY - 2) + mode = ISOLATE_BOTH; nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; + zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + } spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * The attempt at page out may have made some * of the pages active, mark them inactive again. */ - nr_active = clear_active_flags(&page_list); + nr_active = clear_active_flags(&page_list, count); count_vm_events(PGDEACTIVATE, nr_active); nr_freed += shrink_page_list(&page_list, sc, @@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { + int lru; page = lru_to_page(&page_list); VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + mem_cgroup_move_lists(page, lru); + if (PageActive(page) && scan_global_lru(sc)) { + int file = !!page_is_file_cache(page); + zone->recent_rotated[file]++; + } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) static inline int zone_is_near_oom(struct zone *zone) { - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; -} - -/* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. - */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) -{ - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; - - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); - - distress = 100 >> min(prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); } /* @@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) + struct scan_control *sc, int priority, int file) { unsigned long pgmoved; int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ - LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ - LIST_HEAD(l_active); /* Pages to go onto the active_list */ + LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + enum lru_list lru; lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); + sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) + if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; + zone->recent_scanned[!!file] += pgmoved; + } - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); + pgmoved = 0; while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { - list_add(&page->lru, &l_active); - continue; - } + + if (unlikely(!page_evictable(page, NULL))) { + putback_lru_page(page); + continue; } + + /* page_referenced clears PageReferenced */ + if (page_mapping_inuse(page) && + page_referenced(page, 0, sc->mem_cgroup)) + pgmoved++; + list_add(&page->lru, &l_inactive); } + /* + * Count referenced pages from currently used mappings as + * rotated, even though they are moved to the inactive list. + * This helps balance scan pressure between file and anonymous + * pages in get_scan_ratio. + */ + zone->recent_rotated[!!file] += pgmoved; + + /* + * Move the pages to the [file or anon] inactive list. + */ pagevec_init(&pvec, 1); + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, VM_BUG_ON(!PageActive(page)); ClearPageActive(page); - list_move(&page->lru, &zone->inactive_list); - mem_cgroup_move_lists(page, false); + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_move_lists(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); pagevec_strip(&pvec); spin_lock_irq(&zone->lru_lock); } - - pgmoved = 0; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->active_list); - mem_cgroup_move_lists(page, true); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - pgmoved = 0; - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); pagevec_release(&pvec); } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct zone *zone, struct scan_control *sc, int priority) +{ + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + + if (lru == LRU_ACTIVE_ANON && + (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * percent[0] specifies how much pressure to put on ram/swap backed + * memory, while percent[1] determines pressure on the file LRUs. + */ +static void get_scan_ratio(struct zone *zone, struct scan_control *sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + + /* If we have no swap space, do not bother scanning anon pages. */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + return; + } + + /* If we have very few page cache pages, force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + if (unlikely(zone->recent_scanned[0] > anon / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[0] /= 2; + zone->recent_rotated[0] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned[1] > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[1] /= 2; + zone->recent_rotated[1] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * anon recent_rotated[0] + * %anon = 100 * ----------- / ----------------- * IO cost + * anon + file rotate_sum + */ + ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); + ap /= zone->recent_rotated[0] + 1; + + fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); + fp /= zone->recent_rotated[1] + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; +} + + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static unsigned long shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ + enum lru_list l; - if (scan_global_lru(sc)) { - /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. - */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. - */ - nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, - zone, priority); - - nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, - zone, priority); - } + get_scan_ratio(zone, sc, percent); + for_each_evictable_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; - while (nr_active || nr_inactive) { - if (nr_active) { - nr_to_scan = min(nr_active, - (unsigned long)sc->swap_cluster_max); - nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc, priority); + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + zone->lru[l].nr_scan += scan; + nr[l] = zone->lru[l].nr_scan; + if (nr[l] >= sc->swap_cluster_max) + zone->lru[l].nr_scan = 0; + else + nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Don't modify zone reclaim related data. + */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); } + } - if (nr_inactive) { - nr_to_scan = min(nr_inactive, + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + for_each_evictable_lru(l) { + if (nr[l]) { + nr_to_scan = min(nr[l], (unsigned long)sc->swap_cluster_max); - nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, - sc); + nr[l] -= nr_to_scan; + + nr_reclaimed += shrink_list(l, nr_to_scan, + zone, sc, priority); + } } } + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + else if (!scan_global_lru(sc)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + throttle_vm_writeout(sc->gfp_mask); return nr_reclaimed; } @@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, return nr_reclaimed; } - + /* * This is the main entry point to direct page reclaim. * @@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } } @@ -1555,6 +1759,14 @@ loop_again: priority != DEF_PRIORITY) continue; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. + */ + if (inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, + &sc, priority, 0); + if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; @@ -1567,8 +1779,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } /* @@ -1612,8 +1823,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) + (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1667,7 +1877,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages @@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_to_scan, ret = 0; + enum lru_list l; for_each_zone(zone) { @@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; - /* For pass = 0 we don't shrink the active list */ - if (pass > 0) { - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; - if (zone->nr_scan_active >= nr_pages || pass > 3) { - zone->nr_scan_active = 0; + for_each_evictable_lru(l) { + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && + (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + continue; + + zone->lru[l].nr_scan += + (zone_page_state(zone, NR_LRU_BASE + l) + >> prio) + 1; + if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_scan = 0; nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_ACTIVE)); - shrink_active_list(nr_to_scan, zone, sc, prio); + zone_page_state(zone, + NR_LRU_BASE + l)); + ret += shrink_list(l, nr_to_scan, zone, + sc, prio); + if (ret >= nr_pages) + return ret; } } - - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; - if (zone->nr_scan_inactive >= nr_pages || pass > 3) { - zone->nr_scan_inactive = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_INACTIVE)); - ret += shrink_inactive_list(nr_to_scan, zone, sc); - if (ret >= nr_pages) - return ret; - } } return ret; } -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); -} - /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. @@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = count_lru_pages(); + lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); + global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2128,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return ret; } #endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * page_evictable - test whether a page is evictable + * @page: the page to test + * @vma: the VMA in which the page is or will be mapped, may be NULL + * + * Test whether page is evictable--i.e., should be placed on active/inactive + * lists vs unevictable list. The vma argument is !NULL when called from the + * fault path to determine how to instantate a new page. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +int page_evictable(struct page *page, struct vm_area_struct *vma) +{ + + if (mapping_unevictable(page_mapping(page))) + return 0; + + if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + return 0; + + return 1; +} + +static void show_page_path(struct page *page) +{ + char buf[256]; + if (page_is_file_cache(page)) { + struct address_space *mapping = page->mapping; + struct dentry *dentry; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + spin_lock(&mapping->i_mmap_lock); + dentry = d_find_alias(mapping->host); + printk(KERN_INFO "rescued: %s %lu\n", + dentry_path(dentry, buf, 256), pgoff); + spin_unlock(&mapping->i_mmap_lock); + } else { +#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + printk(KERN_INFO "rescued: anon %s\n", + vma->vm_mm->owner->comm); + break; + } + page_unlock_anon_vma(anon_vma); +#endif + } +} + + +/** + * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list + * @page: page to check evictability and move to appropriate lru list + * @zone: zone page is in + * + * Checks a page for evictability and moves the page to the appropriate + * zone lru list. + * + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageUnevictable set. + */ +static void check_move_unevictable_page(struct page *page, struct zone *zone) +{ + VM_BUG_ON(PageActive(page)); + +retry: + ClearPageUnevictable(page); + if (page_evictable(page, NULL)) { + enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + + show_page_path(page); + + __dec_zone_state(zone, NR_UNEVICTABLE); + list_move(&page->lru, &zone->lru[l].list); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + __count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + /* + * rotate unevictable list + */ + SetPageUnevictable(page); + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + if (page_evictable(page, NULL)) + goto retry; + } +} + +/** + * scan_mapping_unevictable_pages - scan an address space for evictable pages + * @mapping: struct address_space to scan for evictable pages + * + * Scan all pages in mapping. Check unevictable pages for + * evictability and move them to the appropriate zone lru list. + */ +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + int pg_scanned = 0; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + pg_scanned++; + if (page_index > next) + next = page_index; + next++; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageUnevictable(page)) + check_move_unevictable_page(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); + } + +} + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. + */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } + spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; + } +} + + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +void scan_all_zones_unevictable_pages(void) +{ + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + + scan_unevictable_pages = 0; + return 0; +} + +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } + return 1; +} + + +static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index d7826af2fb07..9343227c5c60 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -619,8 +619,14 @@ const struct seq_operations pagetypeinfo_op = { static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_inactive", - "nr_active", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", +#ifdef CONFIG_UNEVICTABLE_LRU + "nr_unevictable", + "nr_mlock", +#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -675,6 +681,16 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_UNEVICTABLE_LRU + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + "unevictable_pgs_mlockfreed", +#endif #endif }; @@ -688,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -696,7 +712,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->pages_low, zone->pages_high, zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, + zone->lru[LRU_ACTIVE_ANON].nr_scan, + zone->lru[LRU_INACTIVE_ANON].nr_scan, + zone->lru[LRU_ACTIVE_FILE].nr_scan, + zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); @@ -733,10 +752,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n inactive_ratio: %u", zone_is_all_unreclaimable(zone), zone->prev_priority, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->inactive_ratio); seq_putc(m, '\n'); } diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index a4abed5b4c44..fa5cda4e552a 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -719,7 +719,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb, return NF_ACCEPT; } *d = (struct net_device *)in; - NF_HOOK(NF_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in, + NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in, (struct net_device *)out, br_nf_forward_finish); return NF_STOLEN; diff --git a/net/core/dev.c b/net/core/dev.c index 868ec0ba8b77..b8a4fd0806af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -924,10 +924,10 @@ int dev_change_name(struct net_device *dev, const char *newname) strlcpy(dev->name, newname, IFNAMSIZ); rollback: - err = device_rename(&dev->dev, dev->name); - if (err) { + ret = device_rename(&dev->dev, dev->name); + if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - return err; + return ret; } write_lock_bh(&dev_base_lock); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 11062780bb02..d4ce1224e008 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -259,7 +259,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) fl.fl6_flowlabel = 0; fl.oif = ireq6->iif; fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; + fl.fl_ip_sport = inet_rsk(req)->loc_port; security_req_classify_flow(req, &fl); opt = np->opt; @@ -558,7 +558,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); fl.oif = sk->sk_bound_dev_if; fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; + fl.fl_ip_sport = inet_rsk(req)->loc_port; security_sk_classify_flow(sk, &fl); if (ip6_dst_lookup(sk, &dst, &fl)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2d1b8c..e6bf99e3e41a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -309,6 +309,7 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) struct dccp_request_sock *dreq = dccp_rsk(req); inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->loc_port = dccp_hdr(skb)->dccph_dport; inet_rsk(req)->acked = 0; req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; diff --git a/net/dccp/output.c b/net/dccp/output.c index d06945c7d3df..809d803d5006 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -347,7 +347,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_sport = inet_sk(sk)->sport; + dh->dccph_sport = inet_rsk(req)->loc_port; dh->dccph_dport = inet_rsk(req)->rmt_port; dh->dccph_doff = (dccp_header_size + DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b043eda60b04..1a9dd66511fc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -663,7 +663,7 @@ out: void arp_xmit(struct sk_buff *skb) { /* Send it off, maybe filter it using firewalling first. */ - NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); + NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); } /* @@ -928,7 +928,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); freeskb: kfree_skb(skb); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ffeaffc3fffe..8303e4b406c0 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -742,6 +742,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); if (*obj == NULL) { + kfree(p); kfree(id); if (net_ratelimit()) printk("OOM in bsalg (%d)\n", __LINE__); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index ec394cf5a19b..676c80b5b14b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -204,6 +204,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) req->mss = mss; ireq->rmt_port = th->source; + ireq->loc_port = th->dest; ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr); ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr); if (ipv6_opt_accepted(sk, skb) || diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5310c9b84dc..b6b356b7912a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -476,7 +476,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req) fl.fl6_flowlabel = 0; fl.oif = treq->iif; fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; + fl.fl_ip_sport = inet_rsk(req)->loc_port; security_req_classify_flow(req, &fl); opt = np->opt; @@ -1309,7 +1309,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr); fl.oif = sk->sk_bound_dev_if; fl.fl_ip_dport = inet_rsk(req)->rmt_port; - fl.fl_ip_sport = inet_sk(sk)->sport; + fl.fl_ip_sport = inet_rsk(req)->loc_port; security_req_classify_flow(req, &fl); if (ip6_dst_lookup(sk, &dst, &fl)) @@ -1865,7 +1865,7 @@ static void get_openreq6(struct seq_file *seq, i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], - ntohs(inet_sk(sk)->sport), + ntohs(inet_rsk(req)->loc_port), dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->rmt_port), diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 78892cf2b021..25dcef9f2194 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -271,7 +271,6 @@ config NF_CONNTRACK_TFTP config NF_CT_NETLINK tristate 'Connection tracking netlink interface' select NETFILTER_NETLINK - depends on NF_NAT=n || NF_NAT default m if NETFILTER_ADVANCED=n help This option enables support for a netlink-based userspace interface diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 05048e403266..79a698052218 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -25,11 +25,13 @@ menuconfig IP_VS if IP_VS config IP_VS_IPV6 - bool "IPv6 support for IPVS (DANGEROUS)" + bool "IPv6 support for IPVS" depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) ---help--- Add IPv6 support to IPVS. This is incomplete and might be dangerous. + See http://www.mindbasket.com/ipvs for more information. + Say N if unsure. config IP_VS_DEBUG diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 2e4ad9671e19..a040d46f85d6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -813,6 +813,7 @@ out: return err; } +#ifdef CONFIG_NF_NAT_NEEDED static int ctnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, @@ -840,6 +841,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, return parse_nat_setup(ct, manip, attr); } +#endif static int ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[]) diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 2cc1fff49307..f9977b3311f7 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -48,7 +48,7 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { }, { .name = "NFQUEUE", - .family = NF_ARP, + .family = NFPROTO_ARP, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index 6f62c36948d9..7ac54eab0b00 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -61,7 +61,7 @@ iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par) if (info->flags & IPRANGE_SRC) { m = ntohl(iph->saddr) < ntohl(info->src_min.ip); m |= ntohl(iph->saddr) > ntohl(info->src_max.ip); - m ^= info->flags & IPRANGE_SRC_INV; + m ^= !!(info->flags & IPRANGE_SRC_INV); if (m) { pr_debug("src IP " NIPQUAD_FMT " NOT in range %s" NIPQUAD_FMT "-" NIPQUAD_FMT "\n", @@ -75,7 +75,7 @@ iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par) if (info->flags & IPRANGE_DST) { m = ntohl(iph->daddr) < ntohl(info->dst_min.ip); m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip); - m ^= info->flags & IPRANGE_DST_INV; + m ^= !!(info->flags & IPRANGE_DST_INV); if (m) { pr_debug("dst IP " NIPQUAD_FMT " NOT in range %s" NIPQUAD_FMT "-" NIPQUAD_FMT "\n", @@ -114,14 +114,14 @@ iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par) if (info->flags & IPRANGE_SRC) { m = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0; m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0; - m ^= info->flags & IPRANGE_SRC_INV; + m ^= !!(info->flags & IPRANGE_SRC_INV); if (m) return false; } if (info->flags & IPRANGE_DST) { m = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0; m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0; - m ^= info->flags & IPRANGE_DST_INV; + m ^= !!(info->flags & IPRANGE_DST_INV); if (m) return false; } diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 4ebd4ca9a991..280c471bcdf4 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -318,15 +318,15 @@ static bool recent_mt_check(const struct xt_mtchk_param *par) for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS - t->proc = proc_create(t->name, ip_list_perms, recent_proc_dir, - &recent_mt_fops); + t->proc = proc_create_data(t->name, ip_list_perms, recent_proc_dir, + &recent_mt_fops, t); if (t->proc == NULL) { kfree(t); goto out; } #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT - t->proc_old = proc_create(t->name, ip_list_perms, proc_old_dir, - &recent_old_fops); + t->proc_old = proc_create_data(t->name, ip_list_perms, proc_old_dir, + &recent_old_fops, t); if (t->proc_old == NULL) { remove_proc_entry(t->name, proc_old_dir); kfree(t); @@ -334,11 +334,9 @@ static bool recent_mt_check(const struct xt_mtchk_param *par) } t->proc_old->uid = ip_list_uid; t->proc_old->gid = ip_list_gid; - t->proc_old->data = t; #endif t->proc->uid = ip_list_uid; t->proc->gid = ip_list_gid; - t->proc->data = t; #endif spin_lock_bh(&recent_lock); list_add_tail(&t->list, &tables); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7b5572d6beb5..93cd30ce6501 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -326,6 +326,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = { static struct netdev_queue noop_netdev_queue = { .qdisc = &noop_qdisc, + .qdisc_sleeping = &noop_qdisc, }; struct Qdisc noop_qdisc = { @@ -352,6 +353,7 @@ static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { static struct Qdisc noqueue_qdisc; static struct netdev_queue noqueue_netdev_queue = { .qdisc = &noqueue_qdisc, + .qdisc_sleeping = &noqueue_qdisc, }; static struct Qdisc noqueue_qdisc = { diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 46f23971f7e4..5ba78701adc3 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -1,5 +1,5 @@ /* - * dev_cgroup.c - device cgroup subsystem + * device_cgroup.c - device cgroup subsystem * * Copyright 2007 IBM Corp */ @@ -10,6 +10,7 @@ #include <linux/list.h> #include <linux/uaccess.h> #include <linux/seq_file.h> +#include <linux/rcupdate.h> #define ACC_MKNOD 1 #define ACC_READ 2 @@ -22,18 +23,8 @@ /* * whitelist locking rules: - * cgroup_lock() cannot be taken under dev_cgroup->lock. - * dev_cgroup->lock can be taken with or without cgroup_lock(). - * - * modifications always require cgroup_lock - * modifications to a list which is visible require the - * dev_cgroup->lock *and* cgroup_lock() - * walking the list requires dev_cgroup->lock or cgroup_lock(). - * - * reasoning: dev_whitelist_copy() needs to kmalloc, so needs - * a mutex, which the cgroup_lock() is. Since modifying - * a visible list requires both locks, either lock can be - * taken for walking the list. + * hold cgroup_lock() for update/read. + * hold rcu_read_lock() for read. */ struct dev_whitelist_item { @@ -47,7 +38,6 @@ struct dev_whitelist_item { struct dev_cgroup { struct cgroup_subsys_state css; struct list_head whitelist; - spinlock_t lock; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) @@ -84,13 +74,9 @@ static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig) struct dev_whitelist_item *wh, *tmp, *new; list_for_each_entry(wh, orig, list) { - new = kmalloc(sizeof(*wh), GFP_KERNEL); + new = kmemdup(wh, sizeof(*wh), GFP_KERNEL); if (!new) goto free_and_exit; - new->major = wh->major; - new->minor = wh->minor; - new->type = wh->type; - new->access = wh->access; list_add_tail(&new->list, dest); } @@ -107,19 +93,16 @@ free_and_exit: /* Stupid prototype - don't bother combining existing entries */ /* * called under cgroup_lock() - * since the list is visible to other tasks, we need the spinlock also */ static int dev_whitelist_add(struct dev_cgroup *dev_cgroup, struct dev_whitelist_item *wh) { struct dev_whitelist_item *whcopy, *walk; - whcopy = kmalloc(sizeof(*whcopy), GFP_KERNEL); + whcopy = kmemdup(wh, sizeof(*wh), GFP_KERNEL); if (!whcopy) return -ENOMEM; - memcpy(whcopy, wh, sizeof(*whcopy)); - spin_lock(&dev_cgroup->lock); list_for_each_entry(walk, &dev_cgroup->whitelist, list) { if (walk->type != wh->type) continue; @@ -135,7 +118,6 @@ static int dev_whitelist_add(struct dev_cgroup *dev_cgroup, if (whcopy != NULL) list_add_tail_rcu(&whcopy->list, &dev_cgroup->whitelist); - spin_unlock(&dev_cgroup->lock); return 0; } @@ -149,14 +131,12 @@ static void whitelist_item_free(struct rcu_head *rcu) /* * called under cgroup_lock() - * since the list is visible to other tasks, we need the spinlock also */ static void dev_whitelist_rm(struct dev_cgroup *dev_cgroup, struct dev_whitelist_item *wh) { struct dev_whitelist_item *walk, *tmp; - spin_lock(&dev_cgroup->lock); list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) { if (walk->type == DEV_ALL) goto remove; @@ -174,7 +154,6 @@ remove: call_rcu(&walk->rcu, whitelist_item_free); } } - spin_unlock(&dev_cgroup->lock); } /* @@ -214,7 +193,6 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, } } - spin_lock_init(&dev_cgroup->lock); return &dev_cgroup->css; } @@ -330,15 +308,11 @@ static int parent_has_perm(struct dev_cgroup *childcg, { struct cgroup *pcg = childcg->css.cgroup->parent; struct dev_cgroup *parent; - int ret; if (!pcg) return 1; parent = cgroup_to_devcgroup(pcg); - spin_lock(&parent->lock); - ret = may_access_whitelist(parent, wh); - spin_unlock(&parent->lock); - return ret; + return may_access_whitelist(parent, wh); } /* @@ -357,17 +331,14 @@ static int parent_has_perm(struct dev_cgroup *childcg, static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, const char *buffer) { - struct dev_cgroup *cur_devcgroup; const char *b; char *endp; - int retval = 0, count; + int count; struct dev_whitelist_item wh; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - cur_devcgroup = task_devcgroup(current); - memset(&wh, 0, sizeof(wh)); b = buffer; @@ -437,7 +408,6 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, } handle: - retval = 0; switch (filetype) { case DEVCG_ALLOW: if (!parent_has_perm(devcgroup, &wh)) diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c index 89b7f549bebd..ea2bf82c9373 100644 --- a/sound/core/pcm_misc.c +++ b/sound/core/pcm_misc.c @@ -319,6 +319,7 @@ EXPORT_SYMBOL(snd_pcm_format_physical_width); /** * snd_pcm_format_size - return the byte size of samples on the given format * @format: the format to check + * @samples: sampling rate * * Returns the byte size of the given samples for the format, or a * negative error code if unknown format. diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c index e5e749f3e0ef..73be7e14a603 100644 --- a/sound/drivers/dummy.c +++ b/sound/drivers/dummy.c @@ -51,7 +51,7 @@ static int emu10k1_playback_constraints(struct snd_pcm_runtime *runtime) if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 256, UINT_MAX); - if (err) < 0) + if (err < 0) return err; return 0; } diff --git a/sound/pci/ca0106/ca0106_main.c b/sound/pci/ca0106/ca0106_main.c index a7d89662acf6..88fbf285d2b7 100644 --- a/sound/pci/ca0106/ca0106_main.c +++ b/sound/pci/ca0106/ca0106_main.c @@ -759,7 +759,6 @@ static int snd_ca0106_pcm_prepare_playback(struct snd_pcm_substream *substream) SPCS_CHANNELNUM_LEFT | SPCS_SOURCENUM_UNSPEC | SPCS_GENERATIONSTATUS | 0x00001200 | 0x00000000 | SPCS_EMPHASIS_NONE | SPCS_COPYRIGHT ); - } #endif return 0; diff --git a/sound/ppc/snd_ps3.c b/sound/ppc/snd_ps3.c index 20d0e328288a..8f9e3859c37c 100644 --- a/sound/ppc/snd_ps3.c +++ b/sound/ppc/snd_ps3.c @@ -666,6 +666,7 @@ static int snd_ps3_init_avsetting(struct snd_ps3_card_info *card) card->avs.avs_audio_width = PS3AV_CMD_AUDIO_WORD_BITS_16; card->avs.avs_audio_format = PS3AV_CMD_AUDIO_FORMAT_PCM; card->avs.avs_audio_source = PS3AV_CMD_AUDIO_SOURCE_SERIAL; + memcpy(card->avs.avs_cs_info, ps3av_mode_cs_info, 8); ret = snd_ps3_change_avsetting(card); @@ -685,6 +686,7 @@ static int snd_ps3_set_avsetting(struct snd_pcm_substream *substream) { struct snd_ps3_card_info *card = snd_pcm_substream_chip(substream); struct snd_ps3_avsetting_info avs; + int ret; avs = card->avs; @@ -729,19 +731,92 @@ static int snd_ps3_set_avsetting(struct snd_pcm_substream *substream) return 1; } - if ((card->avs.avs_audio_width != avs.avs_audio_width) || - (card->avs.avs_audio_rate != avs.avs_audio_rate)) { - card->avs = avs; - snd_ps3_change_avsetting(card); + memcpy(avs.avs_cs_info, ps3av_mode_cs_info, 8); + if (memcmp(&card->avs, &avs, sizeof(avs))) { pr_debug("%s: after freq=%d width=%d\n", __func__, card->avs.avs_audio_rate, card->avs.avs_audio_width); - return 0; + card->avs = avs; + snd_ps3_change_avsetting(card); + ret = 0; } else + ret = 1; + + /* check CS non-audio bit and mute accordingly */ + if (avs.avs_cs_info[0] & 0x02) + ps3av_audio_mute_analog(1); /* mute if non-audio */ + else + ps3av_audio_mute_analog(0); + + return ret; +} + +/* + * SPDIF status bits controls + */ +static int snd_ps3_spdif_mask_info(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_info *uinfo) +{ + uinfo->type = SNDRV_CTL_ELEM_TYPE_IEC958; + uinfo->count = 1; + return 0; +} + +/* FIXME: ps3av_set_audio_mode() assumes only consumer mode */ +static int snd_ps3_spdif_cmask_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + memset(ucontrol->value.iec958.status, 0xff, 8); + return 0; +} + +static int snd_ps3_spdif_pmask_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return 0; +} + +static int snd_ps3_spdif_default_get(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + memcpy(ucontrol->value.iec958.status, ps3av_mode_cs_info, 8); + return 0; +} + +static int snd_ps3_spdif_default_put(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + if (memcmp(ps3av_mode_cs_info, ucontrol->value.iec958.status, 8)) { + memcpy(ps3av_mode_cs_info, ucontrol->value.iec958.status, 8); return 1; + } + return 0; } +static struct snd_kcontrol_new spdif_ctls[] = { + { + .access = SNDRV_CTL_ELEM_ACCESS_READ, + .iface = SNDRV_CTL_ELEM_IFACE_PCM, + .name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,CON_MASK), + .info = snd_ps3_spdif_mask_info, + .get = snd_ps3_spdif_cmask_get, + }, + { + .access = SNDRV_CTL_ELEM_ACCESS_READ, + .iface = SNDRV_CTL_ELEM_IFACE_PCM, + .name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,PRO_MASK), + .info = snd_ps3_spdif_mask_info, + .get = snd_ps3_spdif_pmask_get, + }, + { + .iface = SNDRV_CTL_ELEM_IFACE_PCM, + .name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,DEFAULT), + .info = snd_ps3_spdif_mask_info, + .get = snd_ps3_spdif_default_get, + .put = snd_ps3_spdif_default_put, + }, +}; static int snd_ps3_map_mmio(void) @@ -842,7 +917,7 @@ static void snd_ps3_audio_set_base_addr(uint64_t ioaddr_start) static int __init snd_ps3_driver_probe(struct ps3_system_bus_device *dev) { - int ret; + int i, ret; u64 lpar_addr, lpar_size; BUG_ON(!firmware_has_feature(FW_FEATURE_PS3_LV1)); @@ -903,6 +978,15 @@ static int __init snd_ps3_driver_probe(struct ps3_system_bus_device *dev) strcpy(the_card.card->driver, "PS3"); strcpy(the_card.card->shortname, "PS3"); strcpy(the_card.card->longname, "PS3 sound"); + + /* create control elements */ + for (i = 0; i < ARRAY_SIZE(spdif_ctls); i++) { + ret = snd_ctl_add(the_card.card, + snd_ctl_new1(&spdif_ctls[i], &the_card)); + if (ret < 0) + goto clean_card; + } + /* create PCM devices instance */ /* NOTE:this driver works assuming pcm:substream = 1:1 */ ret = snd_pcm_new(the_card.card, diff --git a/sound/ppc/snd_ps3.h b/sound/ppc/snd_ps3.h index 4b7e6fbbe500..326fb29e82d8 100644 --- a/sound/ppc/snd_ps3.h +++ b/sound/ppc/snd_ps3.h @@ -51,6 +51,7 @@ struct snd_ps3_avsetting_info { uint32_t avs_audio_width; uint32_t avs_audio_format; /* fixed */ uint32_t avs_audio_source; /* fixed */ + unsigned char avs_cs_info[8]; }; /* * PS3 audio 'card' instance diff --git a/sound/soc/omap/omap-mcbsp.c b/sound/soc/omap/omap-mcbsp.c index 0a063a98a661..853b33ae3435 100644 --- a/sound/soc/omap/omap-mcbsp.c +++ b/sound/soc/omap/omap-mcbsp.c @@ -43,6 +43,7 @@ struct omap_mcbsp_data { unsigned int bus_id; struct omap_mcbsp_reg_cfg regs; + unsigned int fmt; /* * Flags indicating is the bus already activated and configured by * another substream @@ -200,6 +201,7 @@ static int omap_mcbsp_dai_hw_params(struct snd_pcm_substream *substream, struct omap_mcbsp_data *mcbsp_data = to_mcbsp(cpu_dai->private_data); struct omap_mcbsp_reg_cfg *regs = &mcbsp_data->regs; int dma, bus_id = mcbsp_data->bus_id, id = cpu_dai->id; + int wlen; unsigned long port; if (cpu_class_is_omap1()) { @@ -244,19 +246,29 @@ static int omap_mcbsp_dai_hw_params(struct snd_pcm_substream *substream, switch (params_format(params)) { case SNDRV_PCM_FORMAT_S16_LE: /* Set word lengths */ + wlen = 16; regs->rcr2 |= RWDLEN2(OMAP_MCBSP_WORD_16); regs->rcr1 |= RWDLEN1(OMAP_MCBSP_WORD_16); regs->xcr2 |= XWDLEN2(OMAP_MCBSP_WORD_16); regs->xcr1 |= XWDLEN1(OMAP_MCBSP_WORD_16); - /* Set FS period and length in terms of bit clock periods */ - regs->srgr2 |= FPER(16 * 2 - 1); - regs->srgr1 |= FWID(16 - 1); break; default: /* Unsupported PCM format */ return -EINVAL; } + /* Set FS period and length in terms of bit clock periods */ + switch (mcbsp_data->fmt & SND_SOC_DAIFMT_FORMAT_MASK) { + case SND_SOC_DAIFMT_I2S: + regs->srgr2 |= FPER(wlen * 2 - 1); + regs->srgr1 |= FWID(wlen - 1); + break; + case SND_SOC_DAIFMT_DSP_A: + regs->srgr2 |= FPER(wlen * 2 - 1); + regs->srgr1 |= FWID(0); + break; + } + omap_mcbsp_config(bus_id, &mcbsp_data->regs); mcbsp_data->configured = 1; @@ -272,10 +284,12 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai, { struct omap_mcbsp_data *mcbsp_data = to_mcbsp(cpu_dai->private_data); struct omap_mcbsp_reg_cfg *regs = &mcbsp_data->regs; + unsigned int temp_fmt = fmt; if (mcbsp_data->configured) return 0; + mcbsp_data->fmt = fmt; memset(regs, 0, sizeof(*regs)); /* Generic McBSP register settings */ regs->spcr2 |= XINTM(3) | FREE; @@ -293,6 +307,8 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai, /* 0-bit data delay */ regs->rcr2 |= RDATDLY(0); regs->xcr2 |= XDATDLY(0); + /* Invert bit clock and FS polarity configuration for DSP_A */ + temp_fmt ^= SND_SOC_DAIFMT_IB_IF; break; default: /* Unsupported data format */ @@ -316,7 +332,7 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai, } /* Set bit clock (CLKX/CLKR) and FS polarities */ - switch (fmt & SND_SOC_DAIFMT_INV_MASK) { + switch (temp_fmt & SND_SOC_DAIFMT_INV_MASK) { case SND_SOC_DAIFMT_NB_NF: /* * Normal BCLK + FS. |