From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 20 Nov 2009 20:13:39 +0100 Subject: [LogFS] add new flash file system This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel --- Documentation/filesystems/00-INDEX | 2 + Documentation/filesystems/logfs.txt | 241 ++++++++++++++++++++++++++++++++++++ 2 files changed, 243 insertions(+) create mode 100644 Documentation/filesystems/logfs.txt (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621ee5599..d362aa543b27 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -62,6 +62,8 @@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. +logfs.txt + - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..e64c94ba401a --- /dev/null +++ b/Documentation/filesystems/logfs.txt @@ -0,0 +1,241 @@ + +The LogFS Flash Filesystem +========================== + +Specification +============= + +Superblocks +----------- + +Two superblocks exist at the beginning and end of the filesystem. +Each superblock is 256 Bytes large, with another 3840 Bytes reserved +for future purposes, making a total of 4096 Bytes. + +Superblock locations may differ for MTD and block devices. On MTD the +first non-bad block contains a superblock in the first 4096 Bytes and +the last non-bad block contains a superblock in the last 4096 Bytes. +On block devices, the first 4096 Bytes of the device contain the first +superblock and the last aligned 4096 Byte-block contains the second +superblock. + +For the most part, the superblocks can be considered read-only. They +are written only to correct errors detected within the superblocks, +move the journal and change the filesystem parameters through tunefs. +As a result, the superblock does not contain any fields that require +constant updates, like the amount of free space, etc. + +Segments +-------- + +The space in the device is split up into equal-sized segments. +Segments are the primary write unit of LogFS. Within each segments, +writes happen from front (low addresses) to back (high addresses. If +only a partial segment has been written, the segment number, the +current position within and optionally a write buffer are stored in +the journal. + +Segments are erased as a whole. Therefore Garbage Collection may be +required to completely free a segment before doing so. + +Journal +-------- + +The journal contains all global information about the filesystem that +is subject to frequent change. At mount time, it has to be scanned +for the most recent commit entry, which contains a list of pointers to +all currently valid entries. + +Object Store +------------ + +All space except for the superblocks and journal is part of the object +store. Each segment contains a segment header and a number of +objects, each consisting of the object header and the payload. +Objects are either inodes, directory entries (dentries), file data +blocks or indirect blocks. + +Levels +------ + +Garbage collection (GC) may fail if all data is written +indiscriminately. One requirement of GC is that data is seperated +roughly according to the distance between the tree root and the data. +Effectively that means all file data is on level 0, indirect blocks +are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, +respectively. Inode file data is on level 6 for the inodes and 7-11 +for indirect blocks. + +Each segment contains objects of a single level only. As a result, +each level requires its own seperate segment to be open for writing. + +Inode File +---------- + +All inodes are stored in a special file, the inode file. Single +exception is the inode file's inode (master inode) which for obvious +reasons is stored in the journal instead. Instead of data blocks, the +leaf nodes of the inode files are inodes. + +Aliases +------- + +Writes in LogFS are done by means of a wandering tree. A naïve +implementation would require that for each write or a block, all +parent blocks are written as well, since the block pointers have +changed. Such an implementation would not be very efficient. + +In LogFS, the block pointer changes are cached in the journal by means +of alias entries. Each alias consists of its logical address - inode +number, block index, level and child number (index into block) - and +the changed data. Any 8-byte word can be changes in this manner. + +Currently aliases are used for block pointers, file size, file used +bytes and the height of an inodes indirect tree. + +Segment Aliases +--------------- + +Related to regular aliases, these are used to handle bad blocks. +Initially, bad blocks are handled by moving the affected segment +content to a spare segment and noting this move in the journal with a +segment alias, a simple (to, from) tupel. GC will later empty this +segment and the alias can be removed again. This is used on MTD only. + +Vim +--- + +By cleverly predicting the life time of data, it is possible to +seperate long-living data from short-living data and thereby reduce +the GC overhead later. Each type of distinc life expectency (vim) can +have a seperate segment open for writing. Each (level, vim) tupel can +be open just once. If an open segment with unknown vim is encountered +at mount time, it is closed and ignored henceforth. + +Indirect Tree +------------- + +Inodes in LogFS are similar to FFS-style filesystems with direct and +indirect block pointers. One difference is that LogFS uses a single +indirect pointer that can be either a 1x, 2x, etc. indirect pointer. +A height field in the inode defines the height of the indirect tree +and thereby the indirection of the pointer. + +Another difference is the addressing of indirect blocks. In LogFS, +the first 16 pointers in the first indirect block are left empty, +corresponding to the 16 direct pointers in the inode. In ext2 (maybe +others as well) the first pointer in the first indirect block +corresponds to logical block 12, skipping the 12 direct pointers. +So where ext2 is using arithmetic to better utilize space, LogFS keeps +arithmetic simple and uses compression to save space. + +Compression +----------- + +Both file data and metadata can be compressed. Compression for file +data can be enabled with chattr +c and disabled with chattr -c. Doing +so has no effect on existing data, but new data will be stored +accordingly. New inodes will inherit the compression flag of the +parent directory. + +Metadata is always compressed. However, the space accounting ignores +this and charges for the uncompressed size. Failing to do so could +result in GC failures when, after moving some data, indirect blocks +compress worse than previously. Even on a 100% full medium, GC may +not consume any extra space, so the compression gains are lost space +to the user. + +However, they are not lost space to the filesystem internals. By +cheating the user for those bytes, the filesystem gained some slack +space and GC will run less often and faster. + +Garbage Collection and Wear Leveling +------------------------------------ + +Garbage collection is invoked whenever the number of free segments +falls below a threshold. The best (known) candidate is picked based +on the least amount of valid data contained in the segment. All +remaining valid data is copied elsewhere, thereby invalidating it. + +The GC code also checks for aliases and writes then back if their +number gets too large. + +Wear leveling is done by occasionally picking a suboptimal segment for +garbage collection. If a stale segments erase count is significantly +lower than the active segments' erase counts, it will be picked. Wear +leveling is rate limited, so it will never monopolize the device for +more than one segment worth at a time. + +Values for "occasionally", "significantly lower" are compile time +constants. + +Hashed directories +------------------ + +To satisfy efficient lookup(), directory entries are hashed and +located based on the hash. In order to both support large directories +and not be overly inefficient for small directories, several hash +tables of increasing size are used. For each table, the hash value +modulo the table size gives the table index. + +Tables sizes are chosen to limit the number of indirect blocks with a +fully populated table to 0, 1, 2 or 3 respectively. So the first +table contains 16 entries, the second 512-16, etc. + +The last table is special in several ways. First its size depends on +the effective 32bit limit on telldir/seekdir cookies. Since logfs +uses the upper half of the address space for indirect blocks, the size +is limited to 2^31. Secondly the table contains hash buckets with 16 +entries each. + +Using single-entry buckets would result in birthday "attacks". At +just 2^16 used entries, hash collisions would be likely (P >= 0.5). +My math skills are insufficient to do the combinatorics for the 17x +collisions necessary to overflow a bucket, but testing showed that in +10,000 runs the lowest directory fill before a bucket overflow was +188,057,130 entries with an average of 315,149,915 entries. So for +directory sizes of up to a million, bucket overflows should be +virtually impossible under normal circumstances. + +With carefully chosen filenames, it is obviously possible to cause an +overflow with just 21 entries (4 higher tables + 16 entries + 1). So +there may be a security concern if a malicious user has write access +to a directory. + +Open For Discussion +=================== + +Device Address Space +-------------------- + +A device address space is used for caching. Both block devices and +MTD provide functions to either read a single page or write a segment. +Partial segments may be written for data integrity, but where possible +complete segments are written for performance on simple block device +flash media. + +Meta Inodes +----------- + +Inodes are stored in the inode file, which is just a regular file for +most purposes. At umount time, however, the inode file needs to +remain open until all dirty inodes are written. So +generic_shutdown_super() may not close this inode, but shouldn't +complain about remaining inodes due to the inode file either. Same +goes for mapping inode of the device address space. + +Currently logfs uses a hack that essentially copies part of fs/inode.c +code over. A general solution would be preferred. + +Indirect block mapping +---------------------- + +With compression, the block device (or mapping inode) cannot be used +to cache indirect blocks. Some other place is required. Currently +logfs uses the top half of each inode's address space. The low 8TB +(on 32bit) are filled with file data, the high 8TB are used for +indirect blocks. + +One problem is that 16TB files created on 64bit systems actually have +data in the top 8TB. But files >16TB would cause problems anyway, so +only the limit has changed. -- cgit v1.2.3 From 4c54005ca438a8b46dd542b497d4f0dc2ca375e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Jan 2010 16:10:57 -0800 Subject: rcu: 1Q2010 update for RCU documentation Add expedited functions. Review documentation and update obsolete verbiage. Also fix the advice for the RCU CPU-stall kernel configuration parameter, and document RCU CPU-stall warnings. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <12635142581866-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/filesystems/dentry-locking.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 4c0c575a4012..79334ed5daa7 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt @@ -62,7 +62,8 @@ changes are : 2. Insertion of a dentry into the hash table is done using hlist_add_head_rcu() which take care of ordering the writes - the writes to the dentry must be visible before the dentry is - inserted. This works in conjunction with hlist_for_each_rcu() while + inserted. This works in conjunction with hlist_for_each_rcu(), + which has since been replaced by hlist_for_each_entry_rcu(), while walking the hash chain. The only requirement is that all initialization to the dentry must be done before hlist_add_head_rcu() since we don't have dcache_lock protection -- cgit v1.2.3 From 73834d6f90f6833663f9effd4cf9b79b63bc36e1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Jan 2010 17:17:04 -0500 Subject: nfsd: 4.1 has an rfc number No need to refer to an internet draft; there's an RFC now. Signed-off-by: J. Bruce Fields --- Documentation/filesystems/nfs/nfs41-server.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nfs/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 1bd0d0c05171..6a53a84afc72 100644 --- a/Documentation/filesystems/nfs/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 on or off; rpc.nfsd does this correctly.) The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based -on the latest NFSv4.1 Internet Draft: -http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 +on RFC 5661. From the many new features in NFSv4.1 the current implementation focuses on the mandatory-to-implement NFSv4.1 Sessions, providing @@ -44,7 +43,7 @@ interoperability problems with future clients. Known issues: trunking, but this is a mandatory feature, and its use is recommended to clients in a number of places. (E.g. to ensure timely renewal in case an existing connection's retry timeouts - have gotten too long; see section 8.3 of the draft.) + have gotten too long; see section 8.3 of the RFC.) Therefore, lack of this feature may cause future clients to fail. - Incomplete backchannel support: incomplete backchannel gss -- cgit v1.2.3 From e902ec9906e844f4613fa6190c6fa65f162dc86e Mon Sep 17 00:00:00 2001 From: Jiro SEKIBA Date: Sat, 30 Jan 2010 18:06:35 +0900 Subject: nilfs2: issue discard request after cleaning segments This adds a function to send discard requests for given array of segment numbers, and calls the function when garbage collection succeeded. Signed-off-by: Jiro SEKIBA Signed-off-by: Ryusuke Konishi --- Documentation/filesystems/nilfs2.txt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 839efd8a8a8c..cf6d0d85ca82 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -74,6 +74,9 @@ norecovery Disable recovery of the filesystem on mount. This disables every write access on the device for read-only mounts or snapshots. This option will fail for r/w mounts on an unclean volume. +discard Issue discard/TRIM commands to the underlying block + device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs. NILFS2 usage ============ -- cgit v1.2.3 From cb2992a60b7e73fbabe9ffe54056eed0022f2ed2 Mon Sep 17 00:00:00 2001 From: Mulyadi Santosa Date: Thu, 18 Feb 2010 01:22:40 +0700 Subject: doc: typo - Table 1-2 should refer to "status", not "statm" Fixes a typo in proc.txt documentation. Table 1-2 should refer to "status", not "statm" Signed-off-by: Mulyadi Santosa Signed-off-by: Jiri Kosina --- Documentation/filesystems/proc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..bb314f60a76a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,7 +188,7 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. -Table 1-2: Contents of the statm files (as of 2.6.30-rc7) +Table 1-2: Contents of the status files (as of 2.6.30-rc7) .............................................................................. Field Content Name filename of the executable -- cgit v1.2.3 From 2f99cc6e4688faf5f03f9afe5e1c6097278c75b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 16 Jan 2010 14:10:21 -0500 Subject: add several pieces to shared subtree documentation * document locking * add the missing part of data structure invariants (relationship between mnt_share and mnt_slave lists in case of a peer group among slaves). Signed-off-by: Al Viro --- Documentation/filesystems/sharedsubtree.txt | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 23a181074f94..fc0e39af43c3 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -837,6 +837,9 @@ replicas continue to be exactly same. individual lists does not affect propagation or the way propagation tree is modified by operations. + All vfsmounts in a peer group have the same ->mnt_master. If it is + non-NULL, they form a contiguous (ordered) segment of slave list. + A example propagation tree looks as shown in the figure below. [ NOTE: Though it looks like a forest, if we consider all the shared mounts as a conceptual entity called 'pnode', it becomes a tree] @@ -874,8 +877,19 @@ replicas continue to be exactly same. NOTE: The propagation tree is orthogonal to the mount tree. +8B Locking: + + ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected + by namespace_sem (exclusive for modifications, shared for reading). + + Normally we have ->mnt_flags modifications serialized by vfsmount_lock. + There are two exceptions: do_add_mount() and clone_mnt(). + The former modifies a vfsmount that has not been visible in any shared + data structures yet. + The latter holds namespace_sem and the only references to vfsmount + are in lists that can't be traversed without namespace_sem. -8B Algorithm: +8C Algorithm: The crux of the implementation resides in rbind/move operation. -- cgit v1.2.3 From 5dd4056db84387975140ff2568eaa0406f07985e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:00 -0500 Subject: dquot: cleanup space allocation / freeing routines Get rid of the alloc_space, free_space, reserve_space, claim_space and release_rsv dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. Move shared logic into the common __dquot_alloc_space, dquot_claim_space_nodirty and __dquot_free_space low-level methods, and rationalize the wrappers around it to move as much as possible code into the common block for CONFIG_QUOTA vs not. Also rename all these helpers to be named dquot_* instead of vfs_dq_*. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..1192fde11638 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,9 +462,7 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); @@ -481,9 +479,7 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_space: ->mark_dirty() - alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem @@ -495,7 +491,7 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called +->alloc_inode(), ->free_inode() are called only directly by the filesystem and do not call any fs functions only the ->mark_dirty() operation. -- cgit v1.2.3 From 63936ddaa16b9486e2d426ed7b09f559a5c60f87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:01 -0500 Subject: dquot: cleanup inode allocation / freeing routines Get rid of the alloc_inode and free_inode dquot operations - they are always called from the filesystem and if a filesystem really needs their own (which none currently does) it can just call into it's own routine directly. Also get rid of the vfs_dq_alloc/vfs_dq_free wrappers and always call the lowlevel dquot_alloc_inode / dqout_free_inode routines directly, which now lose the number argument which is always 1. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 8 -------- 1 file changed, 8 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1192fde11638..4428f55f2131 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,8 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_inode) (const struct inode *, unsigned long); int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); @@ -479,8 +477,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -alloc_inode: ->mark_dirty() - -free_inode: ->mark_dirty() - transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem @@ -491,10 +487,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_inode(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. --------------------------- vm_operations_struct ----------------------------- -- cgit v1.2.3 From b43fa8284d7790d9cca32c9c55e24f29be2fa33b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:03 -0500 Subject: dquot: cleanup dquot transfer routine Get rid of the transfer dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_transfer helper to __dquot_transfer and vfs_dq_transfer to dquot_transfer to have a consistent namespace, and make the new dquot_transfer return a normal negative errno value which all callers expect. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4428f55f2131..4574e0272bdd 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -462,7 +462,6 @@ in sys_read() and friends. prototypes: int (*initialize) (struct inode *, int); int (*drop) (struct inode *); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -477,7 +476,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem drop: yes - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem -- cgit v1.2.3 From 9f7547580263d4a55efe06ce5cfd567f568be6e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:05 -0500 Subject: dquot: cleanup dquot drop routine Get rid of the drop dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_drop helper to __dquot_drop and vfs_dq_drop to dquot_drop to have a consistent namespace. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 4574e0272bdd..fa10e4bf8e5e 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -461,7 +461,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -475,7 +474,6 @@ What filesystem should expect from the generic quota functions: FS recursion Held locks when called initialize: yes maybe dqonoff_sem -drop: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem -- cgit v1.2.3 From 871a293155a24554e153538d36e3a80fa169aefb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 Mar 2010 09:05:07 -0500 Subject: dquot: cleanup dquot initialize routine Get rid of the initialize dquot operation - it is now always called from the filesystem and if a filesystem really needs it's own (which none currently does) it can just call into it's own routine directly. Rename the now static low-level dquot_initialize helper to __dquot_initialize and vfs_dq_init to dquot_initialize to have a consistent namespace. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- Documentation/filesystems/Locking | 2 -- 1 file changed, 2 deletions(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fa10e4bf8e5e..06bbbed71206 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,7 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -473,7 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem -- cgit v1.2.3 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0d07513a67a6..e418f3d8f427 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -188,6 +188,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. +(for SMP CONFIG users) +For making accounting scalable, RSS related information are handled in +asynchronous manner and the vaule may not be very precise. To see a precise +snapshot of a moment, you can see /proc//smaps file and scan page table. +It's slow but very precise. + Table 1-2: Contents of the statm files (as of 2.6.30-rc7) .............................................................................. Field Content -- cgit v1.2.3 From b084d4353ff99d824d3bc5a5c2c22c70b1fba722 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:42 -0800 Subject: mm: count swap usage A frequent questions from users about memory management is what numbers of swap ents are user for processes. And this information will give some hints to oom-killer. Besides we can count the number of swapents per a process by scanning /proc//smaps, this is very slow and not good for usual process information handler which works like 'ps' or 'top'. (ps or top is now enough slow..) This patch adds a counter of swapents to mm_counter and update is at each swap events. Information is exported via /proc//status file as [kamezawa@bluextal memory]$ cat /proc/self/status Name: cat State: R (running) Tgid: 2910 Pid: 2910 PPid: 2823 TracerPid: 0 Uid: 500 500 500 500 Gid: 500 500 500 500 FDSize: 256 Groups: 500 VmPeak: 82696 kB VmSize: 82696 kB VmLck: 0 kB VmHWM: 432 kB VmRSS: 432 kB VmData: 172 kB VmStk: 84 kB VmExe: 48 kB VmLib: 1568 kB VmPTE: 40 kB VmSwap: 0 kB <=============== this. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e418f3d8f427..b5c5fc657a88 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -164,6 +164,7 @@ read the file /proc/PID/status: VmExe: 68 kB VmLib: 1412 kB VmPTE: 20 kb + VmSwap: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -219,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries + VmSwap size of swap usage (the number of referred swapents) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread -- cgit v1.2.3 From a1b57ac061b0e348c9a878f8fa3a93a67fe6af42 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Mar 2010 13:42:15 -0800 Subject: mm: document /proc/pagetypeinfo Add documentation for /proc/pagetypeinfo. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 45 +++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b5c5fc657a88..96a44dd95e03 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -438,6 +438,7 @@ Table 1-5: Kernel info in /proc modules List of loaded modules mounts Mounted filesystems net Networking info (see text) + pagetypeinfo Additional page allocator information (see text) (2.5) partitions Table of partitions known to the system pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, decoupled by lspci (2.4) @@ -592,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... Node 0, zone Normal 1 0 0 1 101 8 ... Node 0, zone HighMem 2 0 0 1 1 0 ... -Memory fragmentation is a problem under some workloads, and buddyinfo is a +External fragmentation is a problem under some workloads, and buddyinfo is a useful tool for helping diagnose these problems. Buddyinfo will give you a clue as to how big an area you can safely allocate, or why a previous allocation failed. @@ -602,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE available in ZONE_NORMAL, etc... +More information relevant to external fragmentation can be found in +pagetypeinfo. + +> cat /proc/pagetypeinfo +Page block order: 9 +Pages per block: 512 + +Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 +Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 +Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 +Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 +Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 +Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 +Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 +Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 +Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 + +Number of blocks type Unmovable Reclaimable Movable Reserve Isolate +Node 0, zone DMA 2 0 5 1 0 +Node 0, zone DMA32 41 6 967 2 0 + +Fragmentation avoidance in the kernel works by grouping pages of different +migrate types into the same contiguous regions of memory called page blocks. +A page block is typically the size of the default hugepage size e.g. 2MB on +X86-64. By keeping pages grouped based on their ability to move, the kernel +can reclaim pages within a page block to satisfy a high-order allocation. + +The pagetypinfo begins with information on the size of a page block. It +then gives the same type of information as buddyinfo except broken down +by migrate-type and finishes with details on how many page blocks of each +type exist. + +If min_free_kbytes has been tuned correctly (recommendations made by hugeadm +from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can +make an estimate of the likely number of huge pages that can be allocated +at a given point in time. All the "Movable" blocks should be allocatable +unless memory has been mlock()'d. Some of the Reclaimable blocks should +also be allocatable although a lot of filesystem metadata may have to be +reclaimed to achieve this. + .............................................................................. meminfo: -- cgit v1.2.3 From 1e0051ae48a253685e4309256f9c1ec2bdb74b5d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 10 Mar 2010 15:21:57 -0800 Subject: Documentation/fs/: split txt and source files Make dnotify_test.c source file and add it to Makefile so that bitrot can be prevented. Signed-off-by: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/00-INDEX | 2 ++ Documentation/filesystems/Makefile | 8 +++++++ Documentation/filesystems/dnotify.txt | 39 ++++---------------------------- Documentation/filesystems/dnotify_test.c | 34 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 34 deletions(-) create mode 100644 Documentation/filesystems/Makefile create mode 100644 Documentation/filesystems/dnotify_test.c (limited to 'Documentation/filesystems') diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 5139b8c9d5af..3bae418c6ad3 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -32,6 +32,8 @@ dlmfs.txt - info on the userspace interface to the OCFS2 DLM. dnotify.txt - info about directory notification in Linux. +dnotify_test.c + - example program for dnotify ecryptfs.txt - docs on eCryptfs: stacked cryptographic filesystem for Linux. exofs.txt diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile new file mode 100644 index 000000000000..a5dd114da14f --- /dev/null +++ b/Documentation/filesystems/Makefile @@ -0,0 +1,8 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. +obj- := dummy.o + +# List of programs to build +hostprogs-y := dnotify_test + +# Tell kbuild to always build the programs +always := $(hostprogs-y) diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt index 9f5d338ddbb8..6baf88f46859 100644 --- a/Documentation/filesystems/dnotify.txt +++ b/Documentation/filesystems/dnotify.txt @@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL. Example ------- +See Documentation/filesystems/dnotify_test.c for an example. - #define _GNU_SOURCE /* needed to get the defines */ - #include /* in glibc 2.2 this has the needed - values defined */ - #include - #include - #include - - static volatile int event_fd; - - static void handler(int sig, siginfo_t *si, void *data) - { - event_fd = si->si_fd; - } - - int main(void) - { - struct sigaction act; - int fd; - - act.sa_sigaction = handler; - sigemptyset(&act.sa_mask); - act.sa_flags = SA_SIGINFO; - sigaction(SIGRTMIN + 1, &act, NULL); - - fd = open(".", O_RDONLY); - fcntl(fd, F_SETSIG, SIGRTMIN + 1); - fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); - /* we will now be notified if any of the files - in "." is modified or new files are created */ - while (1) { - pause(); - printf("Got event on fd=%d\n", event_fd); - } - } +NOTE +---- +Beginning with Linux 2.6.13, dnotify has been replaced by inotify. +See Documentation/filesystems/inotify.txt for more information on it. diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c new file mode 100644 index 000000000000..8b37b4a1e18d --- /dev/null +++ b/Documentation/filesystems/dnotify_test.c @@ -0,0 +1,34 @@ +#define _GNU_SOURCE /* needed to get the defines */ +#include /* in glibc 2.2 this has the needed + values defined */ +#include +#include +#include + +static volatile int event_fd; + +static void handler(int sig, siginfo_t *si, void *data) +{ + event_fd = si->si_fd; +} + +int main(void) +{ + struct sigaction act; + int fd; + + act.sa_sigaction = handler; + sigemptyset(&act.sa_mask); + act.sa_flags = SA_SIGINFO; + sigaction(SIGRTMIN + 1, &act, NULL); + + fd = open(".", O_RDONLY); + fcntl(fd, F_SETSIG, SIGRTMIN + 1); + fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT); + /* we will now be notified if any of the files + in "." is modified or new files are created */ + while (1) { + pause(); + printf("Got event on fd=%d\n", event_fd); + } +} -- cgit v1.2.3