From 7121064b214bbc97007ec4d0e70aaeffef1b55f7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Jul 2013 15:00:45 -0700 Subject: configfs: use capped length for ->store_attribute() The difference between "count" and "len" is that "len" is capped at 4095. Changing it like this makes it match how sysfs_write_file() is implemented. This is a static analysis patch. I haven't found any store_attribute() functions where this change makes a difference. Signed-off-by: Dan Carpenter Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2b6cb23dd14e..1d1c41f1014d 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -203,7 +203,7 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof mutex_lock(&buffer->mutex); len = fill_write_buffer(buffer, buf, count); if (len > 0) - len = flush_write_buffer(file->f_path.dentry, buffer, count); + len = flush_write_buffer(file->f_path.dentry, buffer, len); if (len > 0) *ppos += len; mutex_unlock(&buffer->mutex); -- cgit v1.2.3 From 82d627cf1f41c6ca550bb404dfbc4bae2c954611 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 3 Jul 2013 15:00:46 -0700 Subject: fs/ocfs2/dlm/dlmrecovery.c: remove duplicate declarations Below 3 functions have already been declared in dlmcommon.h, so we have no need to declare them again in dlmrecovery.c: dlm_complete_recovery_thread dlm_launch_recovery_thread dlm_kick_recovery_thread Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Acked-by: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index e68588e6b1e8..3e18641626b4 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -55,9 +55,6 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); static int dlm_recovery_thread(void *data); -void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); -int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); -void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); static int dlm_do_recovery(struct dlm_ctxt *dlm); static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); -- cgit v1.2.3 From 22ab9014bfdaf97449759aef946871aabe78bb40 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 3 Jul 2013 15:00:47 -0700 Subject: fs/ocfs2/dlm/dlmrecovery.c:dlm_request_all_locks(): ret should be int instead of enum In dlm_request_all_locks, ret is type enum. But o2net_send_message returns a type int value. Then it will never run into the following error branch. So we should change the ret type from enum to int. Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Acked-by: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 3e18641626b4..53e5c0284fa2 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -786,7 +786,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, u8 dead_node) { struct dlm_lock_request lr; - enum dlm_status ret; + int ret; mlog(0, "\n"); @@ -799,7 +799,6 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, lr.dead_node = dead_node; // send message - ret = DLM_NOLOCKMGR; ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, &lr, sizeof(lr), request_from, NULL); -- cgit v1.2.3 From 13eb98874c837b4ca0cb2db6a4fe3551e51b7632 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 3 Jul 2013 15:00:48 -0700 Subject: ocfs2: should not use le32_add_cpu to set ocfs2_dinode i_flags If we use le32_add_cpu to set ocfs2_dinode i_flags, it may lead to the corresponding flag corrupted. So we should change it to bitwise and/or operation. Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: shencanquan Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b4a5cdf9dbc5..75964addfe32 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -522,7 +522,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); - le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = @@ -2044,7 +2044,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } - le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; /* Record which orphan dir our inode now resides @@ -2434,7 +2434,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } di = (struct ocfs2_dinode *)di_bh->b_data; - le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); + di->i_flags &= ~cpu_to_le32(OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; set_nlink(inode, 1); ocfs2_set_links_count(di, inode->i_nlink); -- cgit v1.2.3 From 40c7f2eaf59230135fd91a398924bdbf873378ec Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 3 Jul 2013 15:00:50 -0700 Subject: ocfs2: add missing dlm_put() in dlm_begin_reco_handler() dlm_begin_reco_handler() returns without putting dlm when dlm recovery state is DLM_RECO_STATE_FINALIZE. Signed-off-by: joyce Reviewed-by: Jie Liu Acked-by: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 53e5c0284fa2..773bd32bfd8c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2692,6 +2692,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); + dlm_put(dlm); return -EAGAIN; } spin_unlock(&dlm->spinlock); -- cgit v1.2.3 From 8fa9d17f93ee8eb79d595c98cef0944f6ee5de75 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 3 Jul 2013 15:00:51 -0700 Subject: ocfs2: remove unecessary variable needs_checkpoint Code cleanup: needs_checkpoint is assigned to but never used. Delete the variable. Signed-off-by: Goldwyn Rodrigues Cc: Jeff Liu Acked-by: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 1 - fs/ocfs2/ocfs2.h | 1 - fs/ocfs2/super.c | 6 ++---- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index a3385b63ff5e..0a992737dcaf 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -200,7 +200,6 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { - atomic_set(&osb->needs_checkpoint, 1); wake_up(&osb->checkpoint_event); } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d355e6e36b36..3a903470c794 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -347,7 +347,6 @@ struct ocfs2_super struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; - atomic_t needs_checkpoint; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 01b85165552b..854d80955bf8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -286,10 +286,9 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_unlock(&osb->osb_lock); out += snprintf(buf + out, len - out, - "%10s => Pid: %d Interval: %lu Needs: %d\n", "Commit", + "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), - osb->osb_commit_interval, - atomic_read(&osb->needs_checkpoint)); + osb->osb_commit_interval); out += snprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", @@ -2154,7 +2153,6 @@ static int ocfs2_initialize_super(struct super_block *sb, } init_waitqueue_head(&osb->checkpoint_event); - atomic_set(&osb->needs_checkpoint, 0); osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; -- cgit v1.2.3 From 33add0e3a09a62c7796ea9838243c1cd933d8543 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 3 Jul 2013 15:00:52 -0700 Subject: ocfs2: fix mutex_unlock and possible memory leak in ocfs2_remove_btree_range In ocfs2_remove_btree_range, when calling ocfs2_lock_refcount_tree and ocfs2_prepare_refcount_change_for_del failed, it goes to out and then tries to call mutex_unlock without mutex_lock before. And when calling ocfs2_reserve_blocks_for_rec_trunc failed, it should free ref_tree before return. Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b8a9d87231b1..17e6bdde96c5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5655,7 +5655,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &ref_tree, NULL); if (ret) { mlog_errno(ret); - goto out; + goto bail; } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -5666,7 +5666,7 @@ int ocfs2_remove_btree_range(struct inode *inode, &extra_blocks); if (ret < 0) { mlog_errno(ret); - goto out; + goto bail; } } @@ -5674,7 +5674,7 @@ int ocfs2_remove_btree_range(struct inode *inode, extra_blocks); if (ret) { mlog_errno(ret); - return ret; + goto bail; } mutex_lock(&tl_inode->i_mutex); @@ -5734,7 +5734,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out: mutex_unlock(&tl_inode->i_mutex); - +bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); -- cgit v1.2.3 From 40bd62eb7fb8447798732e809a676ebaf2a7f910 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 3 Jul 2013 15:00:53 -0700 Subject: fs/ocfs2/journal.h: add bits_wanted while calculating credits in ocfs2_calc_extend_credits While adding extends to a file, the credits are calculated incorrectly and if the requested clusters is more than one (or more because we used a conservative limit) then we run out of journal credits and we hit an assert in journalling code. The function parameter bits_wanted variable was not used at all. Signed-off-by: Goldwyn Rodrigues Reviewed-by: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0a992737dcaf..96f9ac237e86 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -537,7 +537,7 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + - ocfs2_quota_trans_credits(sb); + ocfs2_quota_trans_credits(sb) + bits_wanted; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) -- cgit v1.2.3 From d3e3b41b3dffff50c66651d60146b155d6ce0484 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Wed, 3 Jul 2013 15:00:54 -0700 Subject: fs/ocfs2/cluster/tcp.c: free sc->sc_page in sc_kref_release() There is a memory leak in sc_kref_release(). When free struct o2net_sock_container (sc), we should release sc->sc_page. Signed-off-by: Younger Liu Reviewed-by: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index aa88bd8bcedc..bb9a48bfbd01 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -406,6 +406,9 @@ static void sc_kref_release(struct kref *kref) sc->sc_node = NULL; o2net_debug_del_sc(sc); + + if (sc->sc_page) + __free_page(sc->sc_page); kfree(sc); } -- cgit v1.2.3 From b30f14c490d0e3ff1b7c0952ccd343408557484e Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 3 Jul 2013 15:00:55 -0700 Subject: ocfs2: xattr: remove useless free space checking Free space checking will be done in ocfs2_xattr_ibody_init(). So remove here. [akpm@linux-foundation.org: remove unused local] Signed-off-by: Junxiao Bi Reviewed-by: Jie Liu Acked-by: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2e3ea308c144..0deabcdaf916 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2751,20 +2751,12 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, { int ret; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; struct ocfs2_xa_loc loc; if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) return -ENOSPC; down_write(&oi->ip_alloc_sem); - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { - if (!ocfs2_xattr_has_space_inline(inode, di)) { - ret = -ENOSPC; - goto out; - } - } - if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { ret = ocfs2_xattr_ibody_init(inode, xs->inode_bh, ctxt); if (ret) { -- cgit v1.2.3 From 096b2ef83c87d7a7e0d2838f7f1391c74fdad0e6 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 3 Jul 2013 15:00:56 -0700 Subject: ocfs2: dlmlock_master() should return DLM_NORMAL after adding lock to blocked list dlmlock_master() returns DLM_RECOVERING/DLM_MIGRATING/ DLM_FORWAR after adding lock to blocked list if lockres has the state DLM_LOCK_RES_RECOVERING/DLM_LOCK_RES_MIGRATING/ DLM_LOCK_RES_IN_PROGRESS. so it will retry in dlmlock(). And this may cause dlm_thread fall into an infinite loop Thread1 dlm_thread calls dlm_lock->dlmlock_master, if lockresA is in state DLM_LOCK_RES_RECOVERING, calls __dlm_wait_on_lockres() and waits until others threads clear this state; If cannot grant this lock, adding lock to blocked list, and return DLM_RECOVERING; Grant this lock and move it to grant list; After a while, retry and calls list_add_tail(), adding lock to blocked list again. Granted and blocked list of this lockres will become the following conditions: lock_res->granted.next = dlm_lock->list_head; lock_res->blocked.next = dlm_lock->list_head; dlm_lock->list_head.next = dlm_lock_resource->blocked; When dlm_thread traverses the granted list, it will fall into an endless loop, checking dlm_lock.list_head, dlm_lock->list_head.next (i.e.lock_res->blocked), lock_res->blocked.next(i.e.dlm_lock.list_head again) ..... Signed-off-by: joyce Reviewed-by: jensen Cc: Jeff Liu Acked-by: Sunil Mushran Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmlock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 975810b98492..47e67c2d228f 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -178,6 +178,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, lock->ml.node); } } else { + status = DLM_NORMAL; dlm_lock_get(lock); list_add_tail(&lock->list, &res->blocked); kick_thread = 1; -- cgit v1.2.3 From ea45466aec98b68faaf354901c42e281e58af50a Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Wed, 3 Jul 2013 15:00:58 -0700 Subject: ocfs2: need rollback when journal_access failed in ocfs2_orphan_add() While adding a file into orphan dir in ocfs2_orphan_add(), it calls __ocfs2_add_entry() before ocfs2_journal_access_di(). If ocfs2_journal_access_di() failed, the file is added into orphan dir, and orphan dir dinode updated, but file dinode has not been updated. Accordingly, the data is not consistent between file dinode and orphan dir. So, need to call ocfs2_journal_access_di() before __ocfs2_add_entry(), and if ocfs2_journal_access_di() failed, orphan_fe and orphan_dir_inode->i_nlink need rollback. This bug was added by 3939fda4 ("Ocfs2: Journaling i_flags and i_orphaned_slot when adding inode to orphan dir."). Signed-off-by: Younger Liu Acked-by: Jeff Liu Cc: Sunil Mushran Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 75964addfe32..30842f5cf590 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2012,6 +2012,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* we're a cluster, and nlink can change on disk from * underneath us... */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; @@ -2026,22 +2041,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, orphan_dir_bh, lookup); if (status < 0) { mlog_errno(status); - goto leave; - } - - /* - * We're going to journal the change of i_flags and i_orphaned_slot. - * It's safe anyway, though some callers may duplicate the journaling. - * Journaling within the func just make the logic look more - * straightforward. - */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(inode), - fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; + goto rollback; } fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); @@ -2057,11 +2057,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); +rollback: + if (status < 0) { + if (S_ISDIR(inode->i_mode)) + ocfs2_add_links_count(orphan_fe, -1); + set_nlink(orphan_dir_inode, ocfs2_read_links_count(orphan_fe)); + } + leave: brelse(orphan_dir_bh); - if (status) - mlog_errno(status); return status; } -- cgit v1.2.3 From 493098413bdb45f223ff0552e2f734849491dbbe Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 3 Jul 2013 15:00:59 -0700 Subject: ocfs2: rework transaction rollback in ocfs2_relink_block_group() In ocfs2_relink_block_group(), we roll back all those changes if notify intent to modify buffers for metadata update failed even if the relevant buffer has not yet been modified/got dirty at that point, that are not quite right because of: - None buffer has been modified/dirty if failed to call ocfs2_journal_access_gd() against the previous block group buffer - Only the previous block group buffer has got dirty if failed to call ocfs2_journal_access_gd() against the block group buffer - There is no need to roll back the change for file entry buffer at all Those problems will not cause anything wrong but unnecessary. This patch fix them and kill the useless bg_ptr variable as well. Signed-off-by: Jie Liu Cc: Younger Liu Cc: Sunil Mushran Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b7e74b580c0f..101d16d78162 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1422,7 +1422,7 @@ static int ocfs2_relink_block_group(handle_t *handle, int status; /* there is a really tiny chance the journal calls could fail, * but we wouldn't want inconsistent blocks in *any* case. */ - u64 fe_ptr, bg_ptr, prev_bg_ptr; + u64 bg_ptr, prev_bg_ptr; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; @@ -1437,7 +1437,6 @@ static int ocfs2_relink_block_group(handle_t *handle, (unsigned long long)le64_to_cpu(bg->bg_blkno), (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); - fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); @@ -1446,7 +1445,7 @@ static int ocfs2_relink_block_group(handle_t *handle, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto out_rollback; + goto out; } prev_bg->bg_next_group = bg->bg_next_group; @@ -1456,7 +1455,7 @@ static int ocfs2_relink_block_group(handle_t *handle, bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto out_rollback; + goto out_rollback_prev_bg; } bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; @@ -1466,21 +1465,21 @@ static int ocfs2_relink_block_group(handle_t *handle, fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); - goto out_rollback; + goto out_rollback_bg; } fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; ocfs2_journal_dirty(handle, fe_bh); -out_rollback: - if (status < 0) { - fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); - bg->bg_next_group = cpu_to_le64(bg_ptr); - prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); - } +out: + return status; - if (status) - mlog_errno(status); +out_rollback_bg: + bg->bg_next_group = cpu_to_le64(bg_ptr); +out_rollback_prev_bg: + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + + mlog_errno(status); return status; } -- cgit v1.2.3 From 25e2892101ba541dce8593c698d70ccc278bc1fd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 3 Jul 2013 15:01:00 -0700 Subject: ocfs2: remove duplicated mlog_errno() in ocfs2_relink_block_group Cc: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Cc: Younger Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 101d16d78162..5397c07ce608 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1443,44 +1443,38 @@ static int ocfs2_relink_block_group(handle_t *handle, status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), prev_bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); + if (status < 0) goto out; - } prev_bg->bg_next_group = bg->bg_next_group; ocfs2_journal_dirty(handle, prev_bg_bh); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), bg_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); + if (status < 0) goto out_rollback_prev_bg; - } bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; ocfs2_journal_dirty(handle, bg_bh); status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), fe_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); + if (status < 0) goto out_rollback_bg; - } fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; ocfs2_journal_dirty(handle, fe_bh); out: + if (status < 0) + mlog_errno(status); return status; out_rollback_bg: bg->bg_next_group = cpu_to_le64(bg_ptr); out_rollback_prev_bg: prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); - - mlog_errno(status); - return status; + goto out; } static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, -- cgit v1.2.3 From b5a8bb717e29b549584cc27ac8e109a803c4f8c4 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Wed, 3 Jul 2013 15:01:01 -0700 Subject: ocfs2: fix readonly issue in ocfs2_unlink() While deleting a file with ocfs2_unlink(), there is a bug in this function. This bug will result in filesystem read-only. After calling ocfs2_orphan_add(), the file which will be deleted is added into orphan dir. If ocfs2_delete_entry() fails, the file still exists in the parent dir. And this scenario introduces a conflict of metadata. If a file is added into orphan dir, when we put inode of the file with iput(), the inode i_flags is setted (~OCFS2_VALID_FL) in ocfs2_remove_inode(), and then write back to disk. But as previously mentioned, the file still exists in the parent dir. On other nodes, the file can be still accessed. When first read the file with ocfs2_read_blocks() from disk, It will check and avalidate inode using ocfs2_validate_inode_block(). So File system will be readonly because the inode is invalid. In other words, the inode i_flags has been set (~OCFS2_VALID_FL). [akpm@linux-foundation.org: cleanups] [jeff.liu@oracle.com: s/inode_is_unlinkable/ocfs2_inode_is_unlinkable/] Signed-off-by: Younger Liu Signed-off-by: Jensen Cc: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 30842f5cf590..be3f8676a438 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -773,7 +773,7 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry) return ret; } -static inline int inode_is_unlinkable(struct inode *inode) +static inline int ocfs2_inode_is_unlinkable(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { if (inode->i_nlink == 2) @@ -791,6 +791,7 @@ static int ocfs2_unlink(struct inode *dir, { int status; int child_locked = 0; + bool is_unlinkable = false; struct inode *inode = dentry->d_inode; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -865,7 +866,7 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - if (inode_is_unlinkable(inode)) { + if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, orphan_name, &orphan_insert); @@ -873,6 +874,7 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); goto leave; } + is_unlinkable = true; } handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); @@ -892,15 +894,6 @@ static int ocfs2_unlink(struct inode *dir, fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (inode_is_unlinkable(inode)) { - status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, - &orphan_insert, orphan_dir); - if (status < 0) { - mlog_errno(status); - goto leave; - } - } - /* delete the name from the parent dir */ status = ocfs2_delete_entry(handle, dir, &lookup); if (status < 0) { @@ -923,6 +916,14 @@ static int ocfs2_unlink(struct inode *dir, mlog_errno(status); if (S_ISDIR(inode->i_mode)) inc_nlink(dir); + goto leave; + } + + if (is_unlinkable) { + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, + orphan_name, &orphan_insert, orphan_dir); + if (status < 0) + mlog_errno(status); } leave: -- cgit v1.2.3 From ef962df057aaafd714f5c22ba3de1be459571fdf Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 3 Jul 2013 15:01:03 -0700 Subject: ocfs2: xattr: fix inlined xattr reflink Inlined xattr shared free space of inode block with inlined data or data extent record, so the size of the later two should be adjusted when inlined xattr is enabled. See ocfs2_xattr_ibody_init(). But this isn't done well when reflink. For inode with inlined data, its max inlined data size is adjusted in ocfs2_duplicate_inline_data(), no problem. But for inode with data extent record, its record count isn't adjusted. Fix it, or data extent record and inlined xattr may overwrite each other, then cause data corruption or xattr failure. One panic caused by this bug in our test environment is the following: kernel BUG at fs/ocfs2/xattr.c:1435! invalid opcode: 0000 [#1] SMP Pid: 10871, comm: multi_reflink_t Not tainted 2.6.39-300.17.1.el5uek #1 RIP: ocfs2_xa_offset_pointer+0x17/0x20 [ocfs2] RSP: e02b:ffff88007a587948 EFLAGS: 00010283 RAX: 0000000000000000 RBX: 0000000000000010 RCX: 00000000000051e4 RDX: ffff880057092060 RSI: 0000000000000f80 RDI: ffff88007a587a68 RBP: ffff88007a587948 R08: 00000000000062f4 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000010 R13: ffff88007a587a68 R14: 0000000000000001 R15: ffff88007a587c68 FS: 00007fccff7f06e0(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000015cf000 CR3: 000000007aa76000 CR4: 0000000000000660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process multi_reflink_t Call Trace: ocfs2_xa_reuse_entry+0x60/0x280 [ocfs2] ocfs2_xa_prepare_entry+0x17e/0x2a0 [ocfs2] ocfs2_xa_set+0xcc/0x250 [ocfs2] ocfs2_xattr_ibody_set+0x98/0x230 [ocfs2] __ocfs2_xattr_set_handle+0x4f/0x700 [ocfs2] ocfs2_xattr_set+0x6c6/0x890 [ocfs2] ocfs2_xattr_user_set+0x46/0x50 [ocfs2] generic_setxattr+0x70/0x90 __vfs_setxattr_noperm+0x80/0x1a0 vfs_setxattr+0xa9/0xb0 setxattr+0xc3/0x120 sys_fsetxattr+0xa8/0xd0 system_call_fastpath+0x16/0x1b Signed-off-by: Junxiao Bi Reviewed-by: Jie Liu Acked-by: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 0deabcdaf916..317ef0abccbb 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6491,6 +6491,16 @@ static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args) } new_oi = OCFS2_I(args->new_inode); + /* + * Adjust extent record count to reserve space for extended attribute. + * Inline data count had been adjusted in ocfs2_duplicate_inline_data(). + */ + if (!(new_oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) && + !(ocfs2_inode_is_fast_symlink(args->new_inode))) { + struct ocfs2_extent_list *el = &new_di->id2.i_list; + le16_add_cpu(&el->l_count, -(inline_size / + sizeof(struct ocfs2_extent_rec))); + } spin_lock(&new_oi->ip_lock); new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL; new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features); -- cgit v1.2.3 From e873fdb5259ac62cf099621590645470c12a1d21 Mon Sep 17 00:00:00 2001 From: Noboru Iwamatsu Date: Wed, 3 Jul 2013 15:01:04 -0700 Subject: ocfs2: submit disk heartbeat bio using WRITE_SYNC Under heavy I/O load, writing the disk heartbeat can be forced to wait for minutes, and this causes the node to be fenced. This patch tries to use WRITE_SYNC in submitting the heartbeat bio, so that writing the heartbeat will have a priority over other requests. Signed-off-by: Noboru Iwamatsu Acked-by: Tao Ma Acked-by: Sunil Mushran Cc: Srinivas Eeeda Reviewed-by: Jie Liu Tested-by: Gurudas Pai Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 42252bf64b51..f89b46b66235 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -500,7 +500,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE, bio); + submit_bio(WRITE_SYNC, bio); status = 0; bail: -- cgit v1.2.3 From 70f651edb75b68e3c039d137a56be84f53211558 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 3 Jul 2013 15:01:06 -0700 Subject: ocfs2: consolidate o2hb_global_hearbeat_mode_set() naming convention s/o2hb_global_hearbeat_mode_set/o2hb_global_heartbeat_mode_set/ to make the signature of those routines in a consistent manner with others for heartbeating. Signed-off-by: Jie Liu Acked-by: Sunil Mushran Cc: Gurudas Pai Cc: Joel Becker Cc: Mark Fasheh Cc: Noboru Iwamatsu Cc: Srinivas Eeeda Cc: Tao Ma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f89b46b66235..956d79d1fdde 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,7 @@ static void o2hb_dead_threshold_set(unsigned int threshold) } } -static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode) +static int o2hb_global_heartbeat_mode_set(unsigned int hb_mode) { int ret = -1; @@ -2271,7 +2271,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; - ret = o2hb_global_hearbeat_mode_set(i); + ret = o2hb_global_heartbeat_mode_set(i); if (!ret) printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n", o2hb_heartbeat_mode_desc[i]); @@ -2304,7 +2304,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { NULL, }; -static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { +static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { .show_attribute = o2hb_heartbeat_group_show, .store_attribute = o2hb_heartbeat_group_store, }; @@ -2316,7 +2316,7 @@ static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { static struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; -- cgit v1.2.3 From b4d8ed4f8e527751c7ece6cef73f6212808e6130 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 3 Jul 2013 15:01:07 -0700 Subject: ocfs2: fix a comments typo at o2quo_hb_still_up() Fix a comment typo in o2quo_hb_still_up() Signed-off-by: Jie Liu Cc: Gurudas Pai Cc: Joel Becker Cc: Mark Fasheh Cc: Noboru Iwamatsu Cc: Srinivas Eeeda Cc: Sunil Mushran Cc: Tao Ma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/quorum.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index c19897d0fe14..1ec141e758d7 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -264,7 +264,7 @@ void o2quo_hb_still_up(u8 node) /* This is analogous to hb_up. as a node's connection comes up we delay the * quorum decision until we see it heartbeating. the hold will be droped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if - * it's already heartbeating we we might be dropping a hold that conn_up got. + * it's already heartbeating we might be dropping a hold that conn_up got. * */ void o2quo_conn_up(u8 node) { -- cgit v1.2.3 From 44e89cb8e279abdf83581a10e592c2451bae7824 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 3 Jul 2013 15:01:08 -0700 Subject: ocfs2: adjust switch_case syntax at o2net_state_change() Adjust switch..case syntax at o2net_state_change to meet the kernel coding standard. s/printk/pr_info/. [akpm@linux-foundation.org: revert pr_foo() change] Signed-off-by: Jie Liu Acked-by: Joel Becker Cc: Gurudas Pai Cc: Mark Fasheh Cc: Noboru Iwamatsu Cc: Srinivas Eeeda Cc: Sunil Mushran Cc: Tao Ma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index bb9a48bfbd01..d644dc611425 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -633,19 +633,19 @@ static void o2net_state_change(struct sock *sk) state_change = sc->sc_state_change; switch(sk->sk_state) { - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - o2net_sc_queue_work(sc, &sc->sc_connect_work); - break; - default: - printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT - " shutdown, state %d\n", - SC_NODEF_ARGS(sc), sk->sk_state); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); - break; + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT + " shutdown, state %d\n", + SC_NODEF_ARGS(sc), sk->sk_state); + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; } out: read_unlock(&sk->sk_callback_lock); -- cgit v1.2.3 From 4a184b4ff424e544359f081087723fc36efe603e Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 3 Jul 2013 15:01:10 -0700 Subject: ocfs2: fix NULL pointer dereference when traversing o2hb_all_regions There may exist NULL pointer dereference in config_item_name() when one volume (say Volume A) unmounts while another (say Volume B) mounting. Volume A Volume B already Mounted. Unmounting, call o2hb_heartbeat_group_drop_item() -> config_item_put(item) set reg(A)->item.ci_name to NULL in function config_item_cleanup(). begin mounting, call o2hb_region_pin() and tranverse all regions. When reading reg(A)->item.ci_name, it causes NULL pointer dereference. call o2hb_region_release() and del reg(A) from list. So we should skip accessing regions that is going to release when tranverse o2hb_all_regions. Signed-off-by: Yiwen Jiang Signed-off-by: joyce Acked-by: Joel Becker Cc: Mark Fasheh Cc: Sunil Mushran Cc: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 956d79d1fdde..5c1c864e81cc 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2389,6 +2389,9 @@ static int o2hb_region_pin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); /* local heartbeat */ @@ -2439,6 +2442,9 @@ static void o2hb_region_unpin(const char *region_uuid) assert_spin_locked(&o2hb_live_lock); list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + uuid = config_item_name(®->hr_item); if (region_uuid) { if (strcmp(region_uuid, uuid)) @@ -2654,6 +2660,9 @@ int o2hb_get_all_regions(char *region_uuids, u8 max_regions) p = region_uuids; list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) { + if (reg->hr_item_dropped) + continue; + mlog(0, "Region: %s\n", config_item_name(®->hr_item)); if (numregs < max_regions) { memcpy(p, config_item_name(®->hr_item), -- cgit v1.2.3 From 040fa02077de01c7e08fa75be6125e4ca5636011 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:16 -0700 Subject: clear_refs: sanitize accepted commands declaration This is the implementation of the soft-dirty bit concept that should help keep track of changes in user memory, which in turn is very-very required by the checkpoint-restore project (http://criu.org). To create a dump of an application(s) we save all the information about it to files, and the biggest part of such dump is the contents of tasks' memory. However, there are usage scenarios where it's not required to get _all_ the task memory while creating a dump. For example, when doing periodical dumps, it's only required to take full memory dump only at the first step and then take incremental changes of memory. Another example is live migration. We copy all the memory to the destination node without stopping all tasks, then stop them, check for what pages has changed, dump it and the rest of the state, then copy it to the destination node. This decreases freeze time significantly. That said, some help from kernel to watch how processes modify the contents of their memory is required. The proposal is to track changes with the help of new soft-dirty bit this way: 1. First do "echo 4 > /proc/$pid/clear_refs". At that point kernel clears the soft dirty _and_ the writable bits from all ptes of process $pid. From now on every write to any page will result in #pf and the subsequent call to pte_mkdirty/pmd_mkdirty, which in turn will set the soft dirty flag. 2. Then read the /proc/$pid/pagemap2 and check the soft-dirty bit reported there (the 55'th one). If set, the respective pte was written to since last call to clear refs. The soft-dirty bit is the _PAGE_BIT_HIDDEN one. Although it's used by kmemcheck, the latter one marks kernel pages with it, while the former bit is put on user pages so they do not conflict to each other. This patch: A new clear-refs type will be added in the next patch, so prepare code for that. [akpm@linux-foundation.org: don't assume that sizeof(enum clear_refs_types) == sizeof(int)] Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3e636d864d56..dad0809db551 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -688,6 +688,13 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +enum clear_refs_types { + CLEAR_REFS_ALL = 1, + CLEAR_REFS_ANON, + CLEAR_REFS_MAPPED, + CLEAR_REFS_LAST, +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -719,10 +726,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } -#define CLEAR_REFS_ALL 1 -#define CLEAR_REFS_ANON 2 -#define CLEAR_REFS_MAPPED 3 - static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -730,7 +733,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, char buffer[PROC_NUMBUF]; struct mm_struct *mm; struct vm_area_struct *vma; - int type; + enum clear_refs_types type; + int itype; int rv; memset(buffer, 0, sizeof(buffer)); @@ -738,10 +742,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; - rv = kstrtoint(strstrip(buffer), 10, &type); + rv = kstrtoint(strstrip(buffer), 10, &itype); if (rv < 0) return rv; - if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED) + type = (enum clear_refs_types)itype; + if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; task = get_proc_task(file_inode(file)); if (!task) -- cgit v1.2.3 From af9de7eb180fa9b74c2cdc256349304a58c63c02 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:18 -0700 Subject: clear_refs: introduce private struct for mm_walk In the next patch the clear-refs-type will be required in clear_refs_pte_range funciton, so prepare the walk->private to carry this info. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dad0809db551..ef6f6c62dfee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -695,10 +695,15 @@ enum clear_refs_types { CLEAR_REFS_LAST, }; +struct clear_refs_private { + struct vm_area_struct *vma; +}; + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->private; + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = cp->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -753,13 +758,16 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct clear_refs_private cp = { + }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, .mm = mm, + .private = &cp, }; down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { - clear_refs_walk.private = vma; + cp.vma = vma; if (is_vm_hugetlb_page(vma)) continue; /* -- cgit v1.2.3 From 2b0a9f017548f05e42fbf7e67c4a626c1ebd5e12 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:19 -0700 Subject: pagemap: introduce pagemap_entry_t without pmshift bits These bits are always constant (== PAGE_SHIFT) and just occupy space in the entry. Moreover, in next patch we will need to report one more bit in the pagemap, but all bits are already busy on it. That said, describe the pagemap entry that has 6 more free zero bits. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef6f6c62dfee..39d641292579 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -807,6 +807,7 @@ typedef struct { struct pagemapread { int pos, len; pagemap_entry_t *buffer; + bool v2; }; #define PAGEMAP_WALK_SIZE (PMD_SIZE) @@ -820,14 +821,16 @@ struct pagemapread { #define PM_PSHIFT_BITS 6 #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) -#define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) +#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +/* in "new" pagemap pshift bits are occupied with more status bits */ +#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) -#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) +#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) #define PM_END_OF_BUFFER 1 static inline pagemap_entry_t make_pme(u64 val) @@ -850,7 +853,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, struct pagemapread *pm = walk->private; unsigned long addr; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); for (addr = start; addr < end; addr += PAGE_SIZE) { err = add_to_pagemap(addr, &pme, pm); @@ -860,7 +863,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -static void pte_to_pagemap_entry(pagemap_entry_t *pme, +static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { u64 frame, flags; @@ -879,18 +882,18 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { /* @@ -900,12 +903,12 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else -static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, +static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pmd_t pmd, int offset) { } @@ -918,7 +921,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct pagemapread *pm = walk->private; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -928,7 +931,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -945,7 +948,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT); + pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* check that 'vma' actually covers this address, @@ -953,7 +956,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && (vma->vm_start <= addr) && !is_vm_hugetlb_page(vma)) { pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, vma, addr, *pte); + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); /* unmap before userspace copy */ pte_unmap(pte); } @@ -968,14 +971,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, } #ifdef CONFIG_HUGETLB_PAGE -static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, +static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, pte_t pte, int offset) { if (pte_present(pte)) *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); + | PM_STATUS2(pm->v2, 0) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT); + *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } /* This function walks within one hugetlb entry in the single call */ @@ -989,7 +992,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); err = add_to_pagemap(addr, &pme, pm); if (err) return err; @@ -1051,6 +1054,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; + pm.v2 = false; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; -- cgit v1.2.3 From 0f8975ec4db2c8b5bd111b211292ca9be0feb6b8 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:20 -0700 Subject: mm: soft-dirty bits for user memory changes tracking The soft-dirty is a bit on a PTE which helps to track which pages a task writes to. In order to do this tracking one should 1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs) 2. Wait some time. 3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries) To do this tracking, the writable bit is cleared from PTEs when the soft-dirty bit is. Thus, after this, when the task tries to modify a page at some virtual address the #PF occurs and the kernel sets the soft-dirty bit on the respective PTE. Note, that although all the task's address space is marked as r/o after the soft-dirty bits clear, the #PF-s that occur after that are processed fast. This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts back writable, dirty and soft-dirty bits on the PTE. Another thing to note, is that when mremap moves PTEs they are marked with soft-dirty as well, since from the user perspective mremap modifies the virtual memory at mremap's new address. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 39d641292579..a18e065c1c3e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -692,13 +693,32 @@ enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, + CLEAR_REFS_SOFT_DIRTY, CLEAR_REFS_LAST, }; struct clear_refs_private { struct vm_area_struct *vma; + enum clear_refs_types type; }; +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * The soft-dirty tracker uses #PF-s to catch writes + * to pages, so write-protect the pte as well. See the + * Documentation/vm/soft-dirty.txt for full description + * of how soft-dirty works. + */ + pte_t ptent = *pte; + ptent = pte_wrprotect(ptent); + ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); + set_pte_at(vma->vm_mm, addr, pte, ptent); +#endif +} + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -718,6 +738,11 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(ptent)) continue; + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty(vma, addr, pte); + continue; + } + page = vm_normal_page(vma, addr, ptent); if (!page) continue; @@ -759,6 +784,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mm = get_task_mm(task); if (mm) { struct clear_refs_private cp = { + .type = type, }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, @@ -766,6 +792,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .private = &cp, }; down_read(&mm->mmap_sem); + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_start(mm, 0, -1); for (vma = mm->mmap; vma; vma = vma->vm_next) { cp.vma = vma; if (is_vm_hugetlb_page(vma)) @@ -786,6 +814,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } + if (type == CLEAR_REFS_SOFT_DIRTY) + mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); mmput(mm); @@ -827,6 +857,7 @@ struct pagemapread { /* in "new" pagemap pshift bits are occupied with more status bits */ #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) +#define __PM_SOFT_DIRTY (1LL) #define PM_PRESENT PM_STATUS(4LL) #define PM_SWAP PM_STATUS(2LL) #define PM_FILE PM_STATUS(1LL) @@ -868,6 +899,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, { u64 frame, flags; struct page *page = NULL; + int flags2 = 0; if (pte_present(pte)) { frame = pte_pfn(pte); @@ -888,13 +920,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; - *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, 0) | flags); + *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { /* * Currently pmd for thp is always present because thp can not be @@ -903,13 +937,13 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p */ if (pmd_present(pmd)) *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else *pme = make_pme(PM_NOT_PRESENT(pm->v2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pmd_t pmd, int offset) + pmd_t pmd, int offset, int pmd_flags2) { } #endif @@ -926,12 +960,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + int pmd_flags2; + + pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0); for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; - thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset); + thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); err = add_to_pagemap(addr, &pme, pm); if (err) break; -- cgit v1.2.3 From 541c237c0923f567c9c4cabb8a81635baadc713f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 3 Jul 2013 15:01:22 -0700 Subject: pagemap: prepare to reuse constant bits with page-shift In order to reuse bits from pagemap entries gracefully, we leave the entries as is but on pagemap open emit a warning in dmesg, that bits 55-60 are about to change in a couple of releases. Next, if a user issues soft-dirty clear command via the clear_refs file (it was disabled before v3.9) we assume that he's aware of the new pagemap format, note that fact and report the bits in pagemap in the new manner. The "migration strategy" looks like this then: 1. existing users are not affected -- they don't touch soft-dirty feature, thus see old bits in pagemap, but are warned and have time to fix themselves 2. those who use soft-dirty know about new pagemap format 3. some time soon we get rid of any signs of page-shift in pagemap as well as this trick with clear-soft-dirty affecting pagemap format. Signed-off-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Glauber Costa Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a18e065c1c3e..dbf61f6174f0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -689,6 +689,23 @@ const struct file_operations proc_tid_smaps_operations = { .release = seq_release_private, }; +/* + * We do not want to have constant page-shift bits sitting in + * pagemap entries and are about to reuse them some time soon. + * + * Here's the "migration strategy": + * 1. when the system boots these bits remain what they are, + * but a warning about future change is printed in log; + * 2. once anyone clears soft-dirty bits via clear_refs file, + * these flag is set to denote, that user is aware of the + * new API and those page-shift bits change their meaning. + * The respective warning is printed in dmesg; + * 3. In a couple of releases we will remove all the mentions + * of page-shift in pagemap entries. + */ + +static bool soft_dirty_cleared __read_mostly; + enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -778,6 +795,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, type = (enum clear_refs_types)itype; if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) return -EINVAL; + + if (type == CLEAR_REFS_SOFT_DIRTY) { + soft_dirty_cleared = true; + pr_warn_once("The pagemap bits 55-60 has changed their meaning! " + "See the linux/Documentation/vm/pagemap.txt for details.\n"); + } + task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; @@ -1091,7 +1115,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!count) goto out_task; - pm.v2 = false; + pm.v2 = soft_dirty_cleared; pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; @@ -1164,9 +1188,18 @@ out: return ret; } +static int pagemap_open(struct inode *inode, struct file *file) +{ + pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " + "to stop being page-shift some time soon. See the " + "linux/Documentation/vm/pagemap.txt for details.\n"); + return 0; +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, + .open = pagemap_open, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ -- cgit v1.2.3 From ef9f515a4c38c759c56c7d1e760cc243b6d516f4 Mon Sep 17 00:00:00 2001 From: Libin Date: Wed, 3 Jul 2013 15:01:26 -0700 Subject: ncpfs: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT (*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented as a inline funcion vma_pages() in linux/mm.h, so using it. Signed-off-by: Libin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ncpfs/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index ee24df5af1f9..3c5dd55d284c 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -117,7 +117,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; /* we do not support files bigger than 4GB... We eventually supports just 4GB... */ - if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff + if (vma_pages(vma) + vma->vm_pgoff > (1U << (32 - PAGE_SHIFT))) return -EFBIG; -- cgit v1.2.3 From b45972265f823ed01eae0867a176320071665787 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 3 Jul 2013 15:02:05 -0700 Subject: mm: vmscan: take page buffers dirty and locked state into account Page reclaim keeps track of dirty and under writeback pages and uses it to determine if wait_iff_congested() should stall or if kswapd should begin writing back pages. This fails to account for buffer pages that can be under writeback but not PageWriteback which is the case for filesystems like ext3 ordered mode. Furthermore, PageDirty buffer pages can have all the buffers clean and writepage does no IO so it should not be accounted as congested. This patch adds an address_space operation that filesystems may optionally use to check if a page is really dirty or really under writeback. An implementation is provided for for buffer_heads is added and used for block operations and ext3 in ordered mode. By default the page flags are obeyed. Credit goes to Jan Kara for identifying that the page flags alone are not sufficient for ext3 and sanity checking a number of ideas on how the problem could be addressed. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 1 + fs/buffer.c | 34 ++++++++++++++++++++++++++++++++++ fs/ext3/inode.c | 1 + 3 files changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 431b6a04ebfd..bb43ce081d6e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1562,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = { .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, }; const struct file_operations def_blk_fops = { diff --git a/fs/buffer.c b/fs/buffer.c index f93392e2df12..4d7433534f5c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -82,6 +82,40 @@ void unlock_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(unlock_buffer); +/* + * Returns if the page has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the PageDirty information is stale. If + * any of the pages are locked, it is assumed they are locked for IO. + */ +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct buffer_head *head, *bh; + *dirty = false; + *writeback = false; + + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + return; + + if (PageWriteback(page)) + *writeback = true; + + head = page_buffers(page); + bh = head; + do { + if (buffer_locked(bh)) + *writeback = true; + + if (buffer_dirty(bh)) + *dirty = true; + + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(buffer_check_dirty_writeback); + /* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f67668f724ba..2bd85486b879 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, .error_remove_page = generic_error_remove_page, }; -- cgit v1.2.3 From f919b19614f06711cba300c1bb1e3d94c9ca21b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 3 Jul 2013 15:02:06 -0700 Subject: fs: nfs: inform the VM about pages being committed or unstable VM page reclaim uses dirty and writeback page states to determine if flushers are cleaning pages too slowly and that page reclaim should stall waiting on flushers to catch up. Page state in NFS is a bit more complex and a clean page can be unreclaimable due to being unstable which is effectively "dirty" from the perspective of the VM from reclaim context. Similarly, if the inode is currently being committed then it's similar to being under writeback. This patch adds a is_dirty_writeback() handled for NFS that checks if a pages backing inode is being committed and should be accounted as writeback and if a page has private state indicating that it is effectively dirty. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Jiri Slaby Cc: Valdis Kletnieks Cc: Zlatko Calusic Cc: dormando Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfs/file.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b4a79f4ad1d..94e94bd11aae 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -495,6 +495,35 @@ static int nfs_release_page(struct page *page, gfp_t gfp) return nfs_fscache_release_page(page, gfp); } +static void nfs_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct nfs_inode *nfsi; + struct address_space *mapping = page_file_mapping(page); + + if (!mapping || PageSwapCache(page)) + return; + + /* + * Check if an unstable page is currently being committed and + * if so, have the VM treat it as if the page is under writeback + * so it will not block due to pages that will shortly be freeable. + */ + nfsi = NFS_I(mapping->host); + if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + *writeback = true; + return; + } + + /* + * If PagePrivate() is set, then the page is not freeable and as the + * inode is not being committed, it's not going to be cleaned in the + * near future so treat it as dirty + */ + if (PagePrivate(page)) + *dirty = true; +} + /* * Attempt to clear the private state associated with a page when an error * occurs that requires the cached contents of an inode to be written back or @@ -542,6 +571,7 @@ const struct address_space_operations nfs_file_aops = { .direct_IO = nfs_direct_IO, .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, + .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, #ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, -- cgit v1.2.3 From b27eb186608c89ef0979ae47c649859ceaa1b2e7 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:13 -0700 Subject: vmcore: clean up read_vmcore() Rewrite part of read_vmcore() that reads objects in vmcore_list in the same way as part reading ELF headers, by which some duplicated and redundant codes are removed. Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 68 +++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 17f7e080d7ff..ab0c92e64411 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -118,27 +118,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } -/* Maps vmcore file offset to respective physical address in memroy. */ -static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, - struct vmcore **m_ptr) -{ - struct vmcore *m; - u64 paddr; - - list_for_each_entry(m, vc_list, list) { - u64 start, end; - start = m->offset; - end = m->offset + m->size - 1; - if (offset >= start && offset <= end) { - paddr = m->paddr + offset - start; - *m_ptr = m; - return paddr; - } - } - *m_ptr = NULL; - return 0; -} - /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -147,8 +126,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, { ssize_t acc = 0, tmp; size_t tsz; - u64 start, nr_bytes; - struct vmcore *curr_m = NULL; + u64 start; + struct vmcore *m = NULL; if (buflen == 0 || *fpos >= vmcore_size) return 0; @@ -174,33 +153,26 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } - start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); - if (!curr_m) - return -EINVAL; - - while (buflen) { - tsz = min_t(size_t, buflen, PAGE_SIZE - (start & ~PAGE_MASK)); - - /* Calculate left bytes in current memory segment. */ - nr_bytes = (curr_m->size - (start - curr_m->paddr)); - if (tsz > nr_bytes) - tsz = nr_bytes; - - tmp = read_from_oldmem(buffer, tsz, &start, 1); - if (tmp < 0) - return tmp; - buflen -= tsz; - *fpos += tsz; - buffer += tsz; - acc += tsz; - if (start >= (curr_m->paddr + curr_m->size)) { - if (curr_m->list.next == &vmcore_list) - return acc; /*EOF*/ - curr_m = list_entry(curr_m->list.next, - struct vmcore, list); - start = curr_m->paddr; + list_for_each_entry(m, &vmcore_list, list) { + if (*fpos < m->offset + m->size) { + tsz = m->offset + m->size - *fpos; + if (buflen < tsz) + tsz = buflen; + start = m->paddr + *fpos - m->offset; + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; } } + return acc; } -- cgit v1.2.3 From f2bdacdd597d8d05c3d5f5d36273084f7ef7e6f5 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:14 -0700 Subject: vmcore: allocate buffer for ELF headers on page-size alignment Allocate ELF headers on page-size boundary using __get_free_pages() instead of kmalloc(). Later patch will merge PT_NOTE entries into a single unique one and decrease the buffer size actually used. Keep original buffer size in variable elfcorebuf_sz_orig to kfree the buffer later and actually used buffer size with rounded up to page-size boundary in variable elfcorebuf_sz separately. The size of part of the ELF buffer exported from /proc/vmcore is elfcorebuf_sz. The merged, removed PT_NOTE entries, i.e. the range [elfcorebuf_sz, elfcorebuf_sz_orig], is filled with 0. Use size of the ELF headers as an initial offset value in set_vmcore_list_offsets_elf{64,32} and process_ptload_program_headers_elf{64,32} in order to indicate that the offset includes the holes towards the page boundary. As a result, both set_vmcore_list_offsets_elf{64,32} have the same definition. Merge them as set_vmcore_list_offsets. [akpm@linux-foundation.org: add free_elfcorebuf(), cleanups] Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 119 ++++++++++++++++++++++++------------------------------- 1 file changed, 51 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index ab0c92e64411..0b1c04e5e2c5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -32,6 +32,7 @@ static LIST_HEAD(vmcore_list); /* Stores the pointer to the buffer containing kernel elf core headers. */ static char *elfcorebuf; static size_t elfcorebuf_sz; +static size_t elfcorebuf_sz_orig; /* Total size of vmcore file. */ static u64 vmcore_size; @@ -186,7 +187,7 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size_elf64(char *elfptr) +static u64 __init get_vmcore_size_elf64(char *elfptr, size_t elfsz) { int i; u64 size; @@ -195,7 +196,7 @@ static u64 __init get_vmcore_size_elf64(char *elfptr) ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); - size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + size = elfsz; for (i = 0; i < ehdr_ptr->e_phnum; i++) { size += phdr_ptr->p_memsz; phdr_ptr++; @@ -203,7 +204,7 @@ static u64 __init get_vmcore_size_elf64(char *elfptr) return size; } -static u64 __init get_vmcore_size_elf32(char *elfptr) +static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz) { int i; u64 size; @@ -212,7 +213,7 @@ static u64 __init get_vmcore_size_elf32(char *elfptr) ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); - size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + size = elfsz; for (i = 0; i < ehdr_ptr->e_phnum; i++) { size += phdr_ptr->p_memsz; phdr_ptr++; @@ -294,6 +295,8 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -375,6 +378,8 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); *elfsz = *elfsz - i; memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + memset(elfptr + *elfsz, 0, i); + *elfsz = roundup(*elfsz, PAGE_SIZE); /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; @@ -398,8 +403,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + vmcore_off = elfsz + phdr_ptr->p_memsz; /* Note sections */ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -435,8 +439,7 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ /* First program header is PT_NOTE header. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + vmcore_off = elfsz + phdr_ptr->p_memsz; /* Note sections */ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { @@ -459,18 +462,14 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf64(char *elfptr, - struct list_head *vc_list) +static void __init set_vmcore_list_offsets(size_t elfsz, + struct list_head *vc_list) { loff_t vmcore_off; - Elf64_Ehdr *ehdr_ptr; struct vmcore *m; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf64_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + vmcore_off = elfsz; list_for_each_entry(m, vc_list, list) { m->offset = vmcore_off; @@ -478,24 +477,10 @@ static void __init set_vmcore_list_offsets_elf64(char *elfptr, } } -/* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets_elf32(char *elfptr, - struct list_head *vc_list) +static void free_elfcorebuf(void) { - loff_t vmcore_off; - Elf32_Ehdr *ehdr_ptr; - struct vmcore *m; - - ehdr_ptr = (Elf32_Ehdr *)elfptr; - - /* Skip Elf header and program headers. */ - vmcore_off = sizeof(Elf32_Ehdr) + - (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); - - list_for_each_entry(m, vc_list, list) { - m->offset = vmcore_off; - vmcore_off += m->size; - } + free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); + elfcorebuf = NULL; } static int __init parse_crash_elf64_headers(void) @@ -526,31 +511,31 @@ static int __init parse_crash_elf64_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf64_Ehdr) + + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + if (rc) + goto fail; rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf32_headers(void) @@ -581,31 +566,30 @@ static int __init parse_crash_elf32_headers(void) } /* Read in all elf headers. */ - elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); - elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + elfcorebuf_sz_orig = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf_sz = elfcorebuf_sz_orig; + elfcorebuf = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(elfcorebuf_sz_orig)); if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); - if (rc < 0) { - kfree(elfcorebuf); - return rc; - } + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + if (rc < 0) + goto fail; /* Merge all PT_NOTE headers into one. */ rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } + if (rc) + goto fail; rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, &vmcore_list); - if (rc) { - kfree(elfcorebuf); - return rc; - } - set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + if (rc) + goto fail; + set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list); return 0; +fail: + free_elfcorebuf(); + return rc; } static int __init parse_crash_elf_headers(void) @@ -629,14 +613,14 @@ static int __init parse_crash_elf_headers(void) return rc; /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf64(elfcorebuf); + vmcore_size = get_vmcore_size_elf64(elfcorebuf, elfcorebuf_sz); } else if (e_ident[EI_CLASS] == ELFCLASS32) { rc = parse_crash_elf32_headers(); if (rc) return rc; /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf32(elfcorebuf); + vmcore_size = get_vmcore_size_elf32(elfcorebuf, elfcorebuf_sz); } else { pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; @@ -683,7 +667,6 @@ void vmcore_cleanup(void) list_del(&m->list); kfree(m); } - kfree(elfcorebuf); - elfcorebuf = NULL; + free_elfcorebuf(); } EXPORT_SYMBOL_GPL(vmcore_cleanup); -- cgit v1.2.3 From 7f614cd1e052ebbddee7ea49c725dc75fee74a5a Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:15 -0700 Subject: vmcore: treat memory chunks referenced by PT_LOAD program header entries in page-size boundary in vmcore_list Treat memory chunks referenced by PT_LOAD program header entries in page-size boundary in vmcore_list. Formally, for each range [start, end], we set up the corresponding vmcore object in vmcore_list to [rounddown(start, PAGE_SIZE), roundup(end, PAGE_SIZE)]. This change affects layout of /proc/vmcore. The gaps generated by the rearrangement are newly made visible to applications as holes. Concretely, they are two ranges [rounddown(start, PAGE_SIZE), start] and [end, roundup(end, PAGE_SIZE)]. Suppose variable m points at a vmcore object in vmcore_list, and variable phdr points at the program header of PT_LOAD type the variable m corresponds to. Then, pictorially: m->offset +---------------+ | hole | phdr->p_offset = +---------------+ m->offset + (paddr - start) | |\ | kernel memory | phdr->p_memsz | |/ +---------------+ | hole | m->offset + m->size +---------------+ where m->offset and m->offset + m->size are always page-size aligned. Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 0b1c04e5e2c5..78c87a1ac01a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -407,20 +407,27 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, phdr_ptr->p_memsz; /* Note sections */ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset. */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } @@ -443,20 +450,27 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, phdr_ptr->p_memsz; /* Note sections */ for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 paddr, start, end, size; + if (phdr_ptr->p_type != PT_LOAD) continue; + paddr = phdr_ptr->p_offset; + start = rounddown(paddr, PAGE_SIZE); + end = roundup(paddr + phdr_ptr->p_memsz, PAGE_SIZE); + size = end - start; + /* Add this contiguous chunk of memory to vmcore list.*/ new = get_new_element(); if (!new) return -ENOMEM; - new->paddr = phdr_ptr->p_offset; - new->size = phdr_ptr->p_memsz; + new->paddr = start; + new->size = size; list_add_tail(&new->list, vc_list); /* Update the program header offset */ - phdr_ptr->p_offset = vmcore_off; - vmcore_off = vmcore_off + phdr_ptr->p_memsz; + phdr_ptr->p_offset = vmcore_off + (paddr - start); + vmcore_off = vmcore_off + size; } return 0; } -- cgit v1.2.3 From 087350c9dcf1b38c597b31d7761f7366e2866e6b Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:19 -0700 Subject: vmcore: allocate ELF note segment in the 2nd kernel vmalloc memory The reasons why we don't allocate ELF note segment in the 1st kernel (old memory) on page boundary is to keep backward compatibility for old kernels, and that if doing so, we waste not a little memory due to round-up operation to fit the memory to page boundary since most of the buffers are in per-cpu area. ELF notes are per-cpu, so total size of ELF note segments depends on number of CPUs. The current maximum number of CPUs on x86_64 is 5192, and there's already system with 4192 CPUs in SGI, where total size amounts to 1MB. This can be larger in the near future or possibly even now on another architecture that has larger size of note per a single cpu. Thus, to avoid the case where memory allocation for large block fails, we allocate vmcore objects on vmalloc memory. This patch adds elfnotes_buf and elfnotes_sz variables to keep pointer to the ELF note segment buffer and its size. There's no longer the vmcore object that corresponds to the ELF note segment in vmcore_list. Accordingly, read_vmcore() has new case for ELF note segment and set_vmcore_list_offsets_elf{64,32}() and other helper functions starts calculating offset from sum of size of ELF headers and size of ELF note segment. [akpm@linux-foundation.org: use min(), fix error-path vzalloc() leaks] Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 359 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 286 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 78c87a1ac01a..9b9270eb0599 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -34,6 +34,9 @@ static char *elfcorebuf; static size_t elfcorebuf_sz; static size_t elfcorebuf_sz_orig; +static char *elfnotes_buf; +static size_t elfnotes_sz; + /* Total size of vmcore file. */ static u64 vmcore_size; @@ -139,9 +142,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = elfcorebuf_sz - *fpos; - if (buflen < tsz) - tsz = buflen; + tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) return -EFAULT; buflen -= tsz; @@ -154,11 +155,27 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } + /* Read Elf note segment */ + if (*fpos < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + if (copy_to_user(buffer, kaddr, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + list_for_each_entry(m, &vmcore_list, list) { if (*fpos < m->offset + m->size) { - tsz = m->offset + m->size - *fpos; - if (buflen < tsz) - tsz = buflen; + tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; tmp = read_from_oldmem(buffer, tsz, &start, 1); if (tmp < 0) @@ -221,27 +238,27 @@ static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz) return size; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf64 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf64_Phdr *phdr_ptr; Elf64_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -253,7 +270,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf64_Nhdr) + @@ -262,26 +279,122 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; } + return 0; +} + +/** + * get_note_number_and_size_elf64 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf64(const Elf64_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf64_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf64_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; + } + + return 0; +} + +/** + * copy_notes_elf64 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf64 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf64_Phdr *phdr_ptr; + + phdr_ptr = (Elf64_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + rc = update_note_header_size_elf64(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf64(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = vzalloc(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf64_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -304,27 +417,27 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return 0; } -/* Merges all the PT_NOTE headers into one. */ -static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, - struct list_head *vc_list) +/** + * update_note_header_size_elf32 - update p_memsz member of each PT_NOTE entry + * + * @ehdr_ptr: ELF header + * + * This function updates p_memsz member of each PT_NOTE entry in the + * program header table pointed to by @ehdr_ptr to real size of ELF + * note segment. + */ +static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) { - int i, nr_ptnote=0, rc=0; - char *tmp; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr phdr, *phdr_ptr; + int i, rc=0; + Elf32_Phdr *phdr_ptr; Elf32_Nhdr *nhdr_ptr; - u64 phdr_sz = 0, note_off; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { - int j; void *notes_section; - struct vmcore *new; u64 offset, max_sz, sz, real_sz = 0; if (phdr_ptr->p_type != PT_NOTE) continue; - nr_ptnote++; max_sz = phdr_ptr->p_memsz; offset = phdr_ptr->p_offset; notes_section = kmalloc(max_sz, GFP_KERNEL); @@ -336,7 +449,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; } nhdr_ptr = notes_section; - for (j = 0; j < max_sz; j += sz) { + while (real_sz < max_sz) { if (nhdr_ptr->n_namesz == 0) break; sz = sizeof(Elf32_Nhdr) + @@ -345,26 +458,122 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, real_sz += sz; nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); } - - /* Add this contiguous chunk of notes section to vmcore list.*/ - new = get_new_element(); - if (!new) { - kfree(notes_section); - return -ENOMEM; - } - new->paddr = phdr_ptr->p_offset; - new->size = real_sz; - list_add_tail(&new->list, vc_list); - phdr_sz += real_sz; kfree(notes_section); + phdr_ptr->p_memsz = real_sz; + } + + return 0; +} + +/** + * get_note_number_and_size_elf32 - get the number of PT_NOTE program + * headers and sum of real size of their ELF note segment headers and + * data. + * + * @ehdr_ptr: ELF header + * @nr_ptnote: buffer for the number of PT_NOTE program headers + * @sz_ptnote: buffer for size of unique PT_NOTE program header + * + * This function is used to merge multiple PT_NOTE program headers + * into a unique single one. The resulting unique entry will have + * @sz_ptnote in its phdr->p_mem. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init get_note_number_and_size_elf32(const Elf32_Ehdr *ehdr_ptr, + int *nr_ptnote, u64 *sz_ptnote) +{ + int i; + Elf32_Phdr *phdr_ptr; + + *nr_ptnote = *sz_ptnote = 0; + + phdr_ptr = (Elf32_Phdr *)(ehdr_ptr + 1); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_NOTE) + continue; + *nr_ptnote += 1; + *sz_ptnote += phdr_ptr->p_memsz; } + return 0; +} + +/** + * copy_notes_elf32 - copy ELF note segments in a given buffer + * + * @ehdr_ptr: ELF header + * @notes_buf: buffer into which ELF note segments are copied + * + * This function is used to copy ELF note segment in the 1st kernel + * into the buffer @notes_buf in the 2nd kernel. It is assumed that + * size of the buffer @notes_buf is equal to or larger than sum of the + * real ELF note segment headers and data. + * + * It is assumed that program headers with PT_NOTE type pointed to by + * @ehdr_ptr has already been updated by update_note_header_size_elf32 + * and each of PT_NOTE program headers has actual ELF note segment + * size in its p_memsz member. + */ +static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) +{ + int i, rc=0; + Elf32_Phdr *phdr_ptr; + + phdr_ptr = (Elf32_Phdr*)(ehdr_ptr + 1); + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + u64 offset; + if (phdr_ptr->p_type != PT_NOTE) + continue; + offset = phdr_ptr->p_offset; + rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + if (rc < 0) + return rc; + notes_buf += phdr_ptr->p_memsz; + } + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + char **notes_buf, size_t *notes_sz) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + rc = update_note_header_size_elf32(ehdr_ptr); + if (rc < 0) + return rc; + + rc = get_note_number_and_size_elf32(ehdr_ptr, &nr_ptnote, &phdr_sz); + if (rc < 0) + return rc; + + *notes_sz = roundup(phdr_sz, PAGE_SIZE); + *notes_buf = vzalloc(*notes_sz); + if (!*notes_buf) + return -ENOMEM; + + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); + if (rc < 0) + return rc; + /* Prepare merged PT_NOTE program header. */ phdr.p_type = PT_NOTE; phdr.p_flags = 0; note_off = sizeof(Elf32_Ehdr) + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); - phdr.p_offset = note_off; + phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; phdr.p_align = 0; @@ -391,6 +600,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, * the new offset fields of exported program headers. */ static int __init process_ptload_program_headers_elf64(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -402,9 +612,8 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, ehdr_ptr = (Elf64_Ehdr *)elfptr; phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = elfsz + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { u64 paddr, start, end, size; @@ -434,6 +643,7 @@ static int __init process_ptload_program_headers_elf64(char *elfptr, static int __init process_ptload_program_headers_elf32(char *elfptr, size_t elfsz, + size_t elfnotes_sz, struct list_head *vc_list) { int i; @@ -445,9 +655,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, ehdr_ptr = (Elf32_Ehdr *)elfptr; phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ - /* First program header is PT_NOTE header. */ - vmcore_off = elfsz + - phdr_ptr->p_memsz; /* Note sections */ + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { u64 paddr, start, end, size; @@ -476,14 +685,14 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, +static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, struct list_head *vc_list) { loff_t vmcore_off; struct vmcore *m; - /* Skip Elf header and program headers. */ - vmcore_off = elfsz; + /* Skip Elf header, program headers and Elf note segment. */ + vmcore_off = elfsz + elfnotes_sz; list_for_each_entry(m, vc_list, list) { m->offset = vmcore_off; @@ -495,6 +704,8 @@ static void free_elfcorebuf(void) { free_pages((unsigned long)elfcorebuf, get_order(elfcorebuf_sz_orig)); elfcorebuf = NULL; + vfree(elfnotes_buf); + elfnotes_buf = NULL; } static int __init parse_crash_elf64_headers(void) @@ -538,14 +749,15 @@ static int __init parse_crash_elf64_headers(void) goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); if (rc) goto fail; rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, - &vmcore_list); + elfnotes_sz, &vmcore_list); if (rc) goto fail; - set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list); + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; fail: free_elfcorebuf(); @@ -592,14 +804,15 @@ static int __init parse_crash_elf32_headers(void) goto fail; /* Merge all PT_NOTE headers into one. */ - rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, + &elfnotes_buf, &elfnotes_sz); if (rc) goto fail; rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, - &vmcore_list); + elfnotes_sz, &vmcore_list); if (rc) goto fail; - set_vmcore_list_offsets(elfcorebuf_sz, &vmcore_list); + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); return 0; fail: free_elfcorebuf(); -- cgit v1.2.3 From ef9e78fd2753213ea01d77f7a76a9cb6ad0f50a7 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:21 -0700 Subject: vmcore: allow user process to remap ELF note segment buffer Now ELF note segment has been copied in the buffer on vmalloc memory. To allow user process to remap the ELF note segment buffer with remap_vmalloc_page, the corresponding VM area object has to have VM_USERMAP flag set. [akpm@linux-foundation.org: use the conventional comment layout] Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9b9270eb0599..1082492e02fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -369,6 +369,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, Elf64_Ehdr *ehdr_ptr; Elf64_Phdr phdr; u64 phdr_sz = 0, note_off; + struct vm_struct *vm; ehdr_ptr = (Elf64_Ehdr *)elfptr; @@ -385,6 +386,14 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, if (!*notes_buf) return -ENOMEM; + /* + * Allow users to remap ELF note segment buffer on vmalloc memory using + * remap_vmalloc_range.() + */ + vm = find_vm_area(*notes_buf); + BUG_ON(!vm); + vm->flags |= VM_USERMAP; + rc = copy_notes_elf64(ehdr_ptr, *notes_buf); if (rc < 0) return rc; @@ -548,6 +557,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, Elf32_Ehdr *ehdr_ptr; Elf32_Phdr phdr; u64 phdr_sz = 0, note_off; + struct vm_struct *vm; ehdr_ptr = (Elf32_Ehdr *)elfptr; @@ -564,6 +574,14 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, if (!*notes_buf) return -ENOMEM; + /* + * Allow users to remap ELF note segment buffer on vmalloc memory using + * remap_vmalloc_range() + */ + vm = find_vm_area(*notes_buf); + BUG_ON(!vm); + vm->flags |= VM_USERMAP; + rc = copy_notes_elf32(ehdr_ptr, *notes_buf); if (rc < 0) return rc; -- cgit v1.2.3 From 591ff71664e764a3806e341370f3c758cb2e7e3c Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:22 -0700 Subject: vmcore: calculate vmcore file size from buffer size and total size of vmcore objects The previous patches newly added holes before each chunk of memory and the holes need to be count in vmcore file size. There are two ways to count file size in such a way: 1) suppose m is a poitner to the last vmcore object in vmcore_list. Then file size is (m->offset + m->size), or 2) calculate sum of size of buffers for ELF header, program headers, ELF note segments and objects in vmcore_list. Although 1) is more direct and simpler than 2), 2) seems better in that it reflects internal object structure of /proc/vmcore. Thus, this patch changes get_vmcore_size_elf{64, 32} so that it calculates size in the way of 2). As a result, both get_vmcore_size_elf{64, 32} have the same definition. Merge them as get_vmcore_size. Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 1082492e02fc..8ec648368985 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -204,36 +204,15 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size_elf64(char *elfptr, size_t elfsz) +static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { - int i; u64 size; - Elf64_Ehdr *ehdr_ptr; - Elf64_Phdr *phdr_ptr; - - ehdr_ptr = (Elf64_Ehdr *)elfptr; - phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); - size = elfsz; - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; - } - return size; -} - -static u64 __init get_vmcore_size_elf32(char *elfptr, size_t elfsz) -{ - int i; - u64 size; - Elf32_Ehdr *ehdr_ptr; - Elf32_Phdr *phdr_ptr; + struct vmcore *m; - ehdr_ptr = (Elf32_Ehdr *)elfptr; - phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); - size = elfsz; - for (i = 0; i < ehdr_ptr->e_phnum; i++) { - size += phdr_ptr->p_memsz; - phdr_ptr++; + size = elfsz + elfnotesegsz; + list_for_each_entry(m, vc_list, list) { + size += m->size; } return size; } @@ -856,20 +835,19 @@ static int __init parse_crash_elf_headers(void) rc = parse_crash_elf64_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf64(elfcorebuf, elfcorebuf_sz); } else if (e_ident[EI_CLASS] == ELFCLASS32) { rc = parse_crash_elf32_headers(); if (rc) return rc; - - /* Determine vmcore size. */ - vmcore_size = get_vmcore_size_elf32(elfcorebuf, elfcorebuf_sz); } else { pr_warn("Warning: Core image elf header is not sane\n"); return -EINVAL; } + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + return 0; } -- cgit v1.2.3 From 83086978c63afd7c73e1c173c84aeab184c1e916 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 3 Jul 2013 15:02:23 -0700 Subject: vmcore: support mmap() on /proc/vmcore This patch introduces mmap_vmcore(). Don't permit writable nor executable mapping even with mprotect() because this mmap() is aimed at reading crash dump memory. Non-writable mapping is also requirement of remap_pfn_range() when mapping linear pages on non-consecutive physical pages; see is_cow_mapping(). Set VM_MIXEDMAP flag to remap memory by remap_pfn_range and by remap_vmalloc_range_pertial at the same time for a single vma. do_munmap() can correctly clean partially remapped vma with two functions in abnormal case. See zap_pte_range(), vm_normal_page() and their comments for details. On x86-32 PAE kernels, mmap() supports at most 16TB memory only. This limitation comes from the fact that the third argument of remap_pfn_range(), pfn, is of 32-bit length on x86-32: unsigned long. [akpm@linux-foundation.org: use min(), switch to conventional error-unwinding approach] Signed-off-by: HATAYAMA Daisuke Acked-by: Vivek Goyal Cc: KOSAKI Motohiro Cc: Atsushi Kumagai Cc: Lisa Mitchell Cc: Zhang Yanfei Tested-by: Maxim Uvarov Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 116 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 8ec648368985..28503172f2e4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -194,9 +195,122 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } +/** + * alloc_elfnotes_buf - allocate buffer for ELF note segment in + * vmalloc memory + * + * @notes_sz: size of buffer + * + * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap + * the buffer to user-space by means of remap_vmalloc_range(). + * + * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is + * disabled and there's no need to allow users to mmap the buffer. + */ +static inline char *alloc_elfnotes_buf(size_t notes_sz) +{ +#ifdef CONFIG_MMU + return vmalloc_user(notes_sz); +#else + return vzalloc(notes_sz); +#endif +} + +/* + * Disable mmap_vmcore() if CONFIG_MMU is not defined. MMU is + * essential for mmap_vmcore() in order to map physically + * non-contiguous objects (ELF header, ELF note segment and memory + * regions in the 1st kernel pointed to by PT_LOAD entries) into + * virtually contiguous user-space in ELF layout. + */ +#ifdef CONFIG_MMU +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + u64 start, end, len, tsz; + struct vmcore *m; + + start = (u64)vma->vm_pgoff << PAGE_SHIFT; + end = start + size; + + if (size > vmcore_size || end > vmcore_size) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + vma->vm_flags |= VM_MIXEDMAP; + + len = 0; + + if (start < elfcorebuf_sz) { + u64 pfn; + + tsz = min(elfcorebuf_sz - (size_t)start, size); + pfn = __pa(elfcorebuf + start) >> PAGE_SHIFT; + if (remap_pfn_range(vma, vma->vm_start, pfn, tsz, + vma->vm_page_prot)) + return -EAGAIN; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + if (start < elfcorebuf_sz + elfnotes_sz) { + void *kaddr; + + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); + kaddr = elfnotes_buf + start - elfcorebuf_sz; + if (remap_vmalloc_range_partial(vma, vma->vm_start + len, + kaddr, tsz)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + + list_for_each_entry(m, &vmcore_list, list) { + if (start < m->offset + m->size) { + u64 paddr = 0; + + tsz = min_t(size_t, m->offset + m->size - start, size); + paddr = m->paddr + start - m->offset; + if (remap_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) + goto fail; + size -= tsz; + start += tsz; + len += tsz; + + if (size == 0) + return 0; + } + } + + return 0; +fail: + do_munmap(vma->vm_mm, vma->vm_start, len); + return -EAGAIN; +} +#else +static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif + static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, .llseek = default_llseek, + .mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -348,7 +462,6 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, Elf64_Ehdr *ehdr_ptr; Elf64_Phdr phdr; u64 phdr_sz = 0, note_off; - struct vm_struct *vm; ehdr_ptr = (Elf64_Ehdr *)elfptr; @@ -361,18 +474,10 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = vzalloc(*notes_sz); + *notes_buf = alloc_elfnotes_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; - /* - * Allow users to remap ELF note segment buffer on vmalloc memory using - * remap_vmalloc_range.() - */ - vm = find_vm_area(*notes_buf); - BUG_ON(!vm); - vm->flags |= VM_USERMAP; - rc = copy_notes_elf64(ehdr_ptr, *notes_buf); if (rc < 0) return rc; @@ -536,7 +641,6 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, Elf32_Ehdr *ehdr_ptr; Elf32_Phdr phdr; u64 phdr_sz = 0, note_off; - struct vm_struct *vm; ehdr_ptr = (Elf32_Ehdr *)elfptr; @@ -549,18 +653,10 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = vzalloc(*notes_sz); + *notes_buf = alloc_elfnotes_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; - /* - * Allow users to remap ELF note segment buffer on vmalloc memory using - * remap_vmalloc_range() - */ - vm = find_vm_area(*notes_buf); - BUG_ON(!vm); - vm->flags |= VM_USERMAP; - rc = copy_notes_elf32(ehdr_ptr, *notes_buf); if (rc < 0) return rc; -- cgit v1.2.3 From a0b8cab3b9b2efadabdcff264c450ca515e2619c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 3 Jul 2013 15:02:32 -0700 Subject: mm: remove lru parameter from __pagevec_lru_add and remove parts of pagevec API Now that the LRU to add a page to is decided at LRU-add time, remove the misleading lru parameter from __pagevec_lru_add. A consequence of this is that the pagevec_lru_add_file, pagevec_lru_add_anon and similar helpers are misleading as the caller no longer has direct control over what LRU the page is added to. Unused helpers are removed by this patch and existing users of pagevec_lru_add_file() are converted to use lru_cache_add_file() directly and use the per-cpu pagevecs instead of creating their own pagevec. Signed-off-by: Mel Gorman Reviewed-by: Jan Kara Reviewed-by: Rik van Riel Acked-by: Johannes Weiner Cc: Alexey Lyahkov Cc: Andrew Perepechko Cc: Robin Dong Cc: Theodore Tso Cc: Hugh Dickins Cc: Rik van Riel Cc: Bernd Schubert Cc: David Howells Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cachefiles/rdwr.c | 30 +++++++----------------------- fs/nfs/dir.c | 7 ++----- 2 files changed, 9 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 317f9ee9c991..ebaff368120d 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internal.h" /* @@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) */ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, struct fscache_retrieval *op, - struct page *netpage, - struct pagevec *pagevec) + struct page *netpage) { struct cachefiles_one_read *monitor; struct address_space *bmapping; @@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, _enter(""); - pagevec_reinit(pagevec); - _debug("read back %p{%lu,%d}", netpage, netpage->index, page_count(netpage)); @@ -283,9 +281,7 @@ installed_new_backing_page: backpage = newpage; newpage = NULL; - page_cache_get(backpage); - pagevec_add(pagevec, backpage); - __pagevec_lru_add_file(pagevec); + lru_cache_add_file(backpage); read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (block) { /* submit the apparently valid page to the backing fs to be * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page, - &pagevec); + ret = cachefiles_read_backing_file_one(object, op, page); } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); @@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, { struct cachefiles_one_read *monitor = NULL; struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct pagevec lru_pvec; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; _enter(""); - pagevec_init(&lru_pvec, 0); - list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); @@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backpage = newpage; newpage = NULL; - page_cache_get(backpage); - if (!pagevec_add(&lru_pvec, backpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(backpage); reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* install a monitor */ page_cache_get(netpage); @@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); @@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, out: /* tidy up */ - pagevec_lru_add_file(&lru_pvec); - if (newpage) page_cache_release(newpage); if (netpage) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d051419527b..d7ed697133f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1758,7 +1759,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink); */ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct pagevec lru_pvec; struct page *page; char *kaddr; struct iattr attr; @@ -1798,11 +1798,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - pagevec_init(&lru_pvec, 0); - if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { - pagevec_add(&lru_pvec, page); - pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else -- cgit v1.2.3 From 0ed5fd138539940a493dc69359cb2f49de70ad89 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 3 Jul 2013 15:03:43 -0700 Subject: mm: use totalram_pages instead of num_physpages at runtime The global variable num_physpages is scheduled to be removed, so use totalram_pages instead of num_physpages at runtime. Signed-off-by: Jiang Liu Cc: Miklos Szeredi Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9a0cdde14a08..0b578598c6ac 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -785,7 +785,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((num_physpages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / sizeof(struct fuse_req); if (*limit >= 1 << 16) -- cgit v1.2.3 From 096a8aac6bf4a5a0b2ef812ad76d056bbf3fb2af Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jul 2013 15:04:55 -0700 Subject: clean up scary strncpy(dst, src, strlen(src)) uses Fix various weird constructions of strncpy(dst, src, strlen(src)). Length limits should be about the space available in the destination, not repurposed as a method to either always include or always exclude a trailing NULL byte. Either the NULL should always be copied (using strlcpy), or it should not be copied (using something like memcpy). Readable code should not depend on the weird behavior of strncpy when it hits the length limit. Better to avoid the anti-pattern entirely. [akpm@linux-foundation.org: revert getdelays.c part due to missing bsd/string.h] Signed-off-by: Kees Cook Acked-by: Greg Kroah-Hartman [staging] Acked-by: Rafael J. Wysocki [acpi] Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ursula Braun Cc: Frank Blaschka Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hppfs/hppfs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index fc90ab11c340..4338ff32959d 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -69,7 +69,7 @@ static char *dentry_name(struct dentry *dentry, int extra) struct dentry *parent; char *root, *name; const char *seg_name; - int len, seg_len; + int len, seg_len, root_len; len = 0; parent = dentry; @@ -81,7 +81,8 @@ static char *dentry_name(struct dentry *dentry, int extra) } root = "proc"; - len += strlen(root); + root_len = strlen(root); + len += root_len; name = kmalloc(len + extra + 1, GFP_KERNEL); if (name == NULL) return NULL; @@ -91,7 +92,7 @@ static char *dentry_name(struct dentry *dentry, int extra) while (parent->d_parent != parent) { if (is_pid(parent)) { seg_name = "pid"; - seg_len = strlen("pid"); + seg_len = strlen(seg_name); } else { seg_name = parent->d_name.name; @@ -100,10 +101,10 @@ static char *dentry_name(struct dentry *dentry, int extra) len -= seg_len + 1; name[len] = '/'; - strncpy(&name[len + 1], seg_name, seg_len); + memcpy(&name[len + 1], seg_name, seg_len); parent = parent->d_parent; } - strncpy(name, root, strlen(root)); + memcpy(name, root, root_len); return name; } -- cgit v1.2.3 From f170168b9a0b61ea1e647b082b38f605f1d3de3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jul 2013 15:04:58 -0700 Subject: drivers: avoid parsing names as kthread_run() format strings Calling kthread_run with a single name parameter causes it to be handled as a format string. Many callers are passing potentially dynamic string content, so use "%s" in those cases to avoid any potential accidents. Signed-off-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/lockd/svc.c | 2 +- fs/nfs/callback.c | 5 ++--- fs/nfs/nfs4state.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a2aa97d45670..10d6c41aecad 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -305,7 +305,7 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); + nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index cff089a412c7..da6a43d19aa3 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -211,7 +211,6 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_rqst *rqstp; int (*callback_svc)(void *vrqstp); struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; - char svc_name[12]; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); @@ -235,10 +234,10 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, svc_sock_update_bufs(serv); - sprintf(svc_name, "nfsv4.%u-svc", minorversion); cb_info->serv = serv; cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, svc_name); + cb_info->task = kthread_run(callback_svc, cb_info->rqst, + "nfsv4.%u-svc", minorversion); if (IS_ERR(cb_info->task)) { ret = PTR_ERR(cb_info->task); svc_exit_thread(cb_info->rqst); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ff10b4aa534c..55418811a55a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1194,7 +1194,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) snprintf(buf, sizeof(buf), "%s-manager", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); rcu_read_unlock(); - task = kthread_run(nfs4_run_state_manager, clp, buf); + task = kthread_run(nfs4_run_state_manager, clp, "%s", buf); if (IS_ERR(task)) { printk(KERN_ERR "%s: kthread_run: %ld\n", __func__, PTR_ERR(task)); -- cgit v1.2.3 From c7ef972c440fc9f1eda28b450cd30ad15c4d60cf Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 3 Jul 2013 15:08:05 -0700 Subject: nilfs2: implement calculation of free inodes count Currently, NILFS2 returns 0 as free inodes count (f_ffree) and current used inodes count as total file nodes in file system (f_files): df -i Filesystem Inodes IUsed IFree IUse% Mounted on /dev/loop0 2 2 0 100% /mnt/nilfs2 This patch implements real calculation of free inodes count. First of all, it is calculated total file nodes in file system as (desc_blocks_count * groups_per_desc_block * entries_per_group). Then, it is calculated free inodes count as difference the total file nodes and used inodes count. As a result, we have such output for NILFS2: df -i Filesystem Inodes IUsed IFree IUse% Mounted on /dev/loop0 4194304 2114701 2079603 51% /mnt/nilfs2 Reported-by: Clemens Eisserer Signed-off-by: Vyacheslav Dubeyko Signed-off-by: Ryusuke Konishi Tested-by: Vyacheslav Dubeyko Cc: Joern Engel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/alloc.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/alloc.h | 2 ++ fs/nilfs2/ifile.c | 22 +++++++++++++++++++ fs/nilfs2/ifile.h | 2 ++ fs/nilfs2/super.c | 25 ++++++++++++++++++++-- 5 files changed, 112 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index eed4d7b26249..741fd02e0444 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -397,6 +397,69 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, max - curr + 1); } +/** + * nilfs_palloc_count_desc_blocks - count descriptor blocks number + * @inode: inode of metadata file using this allocator + * @desc_blocks: descriptor blocks number [out] + */ +static int nilfs_palloc_count_desc_blocks(struct inode *inode, + unsigned long *desc_blocks) +{ + unsigned long blknum; + int ret; + + ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); + if (likely(!ret)) + *desc_blocks = DIV_ROUND_UP( + blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); + return ret; +} + +/** + * nilfs_palloc_mdt_file_can_grow - check potential opportunity for + * MDT file growing + * @inode: inode of metadata file using this allocator + * @desc_blocks: known current descriptor blocks count + */ +static inline bool nilfs_palloc_mdt_file_can_grow(struct inode *inode, + unsigned long desc_blocks) +{ + return (nilfs_palloc_groups_per_desc_block(inode) * desc_blocks) < + nilfs_palloc_groups_count(inode); +} + +/** + * nilfs_palloc_count_max_entries - count max number of entries that can be + * described by descriptor blocks count + * @inode: inode of metadata file using this allocator + * @nused: current number of used entries + * @nmaxp: max number of entries [out] + */ +int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp) +{ + unsigned long desc_blocks = 0; + u64 entries_per_desc_block, nmax; + int err; + + err = nilfs_palloc_count_desc_blocks(inode, &desc_blocks); + if (unlikely(err)) + return err; + + entries_per_desc_block = (u64)nilfs_palloc_entries_per_group(inode) * + nilfs_palloc_groups_per_desc_block(inode); + nmax = entries_per_desc_block * desc_blocks; + + if (nused == nmax && + nilfs_palloc_mdt_file_can_grow(inode, desc_blocks)) + nmax += entries_per_desc_block; + + if (nused > nmax) + return -ERANGE; + + *nmaxp = nmax; + return 0; +} + /** * nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object * @inode: inode of metadata file using this allocator diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index fb7238100548..4bd6451b5703 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -48,6 +48,8 @@ int nilfs_palloc_get_entry_block(struct inode *, __u64, int, void *nilfs_palloc_block_get_entry(const struct inode *, __u64, const struct buffer_head *, void *); +int nilfs_palloc_count_max_entries(struct inode *, u64, u64 *); + /** * nilfs_palloc_req - persistent allocator request and reply * @pr_entry_nr: entry number (vblocknr or inode number) diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index d8e65bde083c..d788a5928351 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -159,6 +159,28 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, return err; } +/** + * nilfs_ifile_count_free_inodes - calculate free inodes count + * @ifile: ifile inode + * @nmaxinodes: current maximum of available inodes count [out] + * @nfreeinodes: free inodes count [out] + */ +int nilfs_ifile_count_free_inodes(struct inode *ifile, + u64 *nmaxinodes, u64 *nfreeinodes) +{ + u64 nused; + int err; + + *nmaxinodes = 0; + *nfreeinodes = 0; + + nused = atomic_read(&NILFS_I(ifile)->i_root->inodes_count); + err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); + if (likely(!err)) + *nfreeinodes = *nmaxinodes - nused; + return err; +} + /** * nilfs_ifile_read - read or get ifile inode * @sb: super block instance diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 59b6f2b51df6..679674d13372 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -49,6 +49,8 @@ int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); int nilfs_ifile_delete_inode(struct inode *, ino_t); int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); +int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *); + int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, size_t inode_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c7d1f9f18b09..7d257e78fbad 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -609,6 +609,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) unsigned long overhead; unsigned long nrsvblocks; sector_t nfreeblocks; + u64 nmaxinodes, nfreeinodes; int err; /* @@ -633,14 +634,34 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) if (unlikely(err)) return err; + err = nilfs_ifile_count_free_inodes(root->ifile, + &nmaxinodes, &nfreeinodes); + if (unlikely(err)) { + printk(KERN_WARNING + "NILFS warning: fail to count free inodes: err %d.\n", + err); + if (err == -ERANGE) { + /* + * If nilfs_palloc_count_max_entries() returns + * -ERANGE error code then we simply treat + * curent inodes count as maximum possible and + * zero as free inodes value. + */ + nmaxinodes = atomic_read(&root->inodes_count); + nfreeinodes = 0; + err = 0; + } else + return err; + } + buf->f_type = NILFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = blocks - overhead; buf->f_bfree = nfreeblocks; buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? (buf->f_bfree - nrsvblocks) : 0; - buf->f_files = atomic_read(&root->inodes_count); - buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_files = nmaxinodes; + buf->f_ffree = nfreeinodes; buf->f_namelen = NILFS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); -- cgit v1.2.3 From e5f7f84843154db8b6ef5b2ac5e286f72212f54e Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 3 Jul 2013 15:08:06 -0700 Subject: ] nilfs2: use atomic64_t type for inodes_count and blocks_count fields in nilfs_root struct The cp_inodes_count and cp_blocks_count are represented as __le64 type in on-disk structure (struct nilfs_checkpoint). But analogous fields in in-core structure (struct nilfs_root) are represented by atomic_t type. This patch replaces atomic_t on atomic64_t type in representation of inodes_count and blocks_count fields in struct nilfs_root. Signed-off-by: Vyacheslav Dubeyko Acked-by: Ryusuke Konishi Acked-by: Joern Engel Cc: Clemens Eisserer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ifile.c | 2 +- fs/nilfs2/inode.c | 8 ++++---- fs/nilfs2/segment.c | 4 ++-- fs/nilfs2/super.c | 8 +++++--- fs/nilfs2/the_nilfs.c | 4 ++-- fs/nilfs2/the_nilfs.h | 4 ++-- 6 files changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index d788a5928351..6548c7851b48 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -174,7 +174,7 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile, *nmaxinodes = 0; *nfreeinodes = 0; - nused = atomic_read(&NILFS_I(ifile)->i_root->inodes_count); + nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); if (likely(!err)) *nfreeinodes = *nmaxinodes - nused; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index bccfec8343c5..b1a5277cfd18 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -54,7 +54,7 @@ void nilfs_inode_add_blocks(struct inode *inode, int n) inode_add_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_add(n, &root->blocks_count); + atomic64_add(n, &root->blocks_count); } void nilfs_inode_sub_blocks(struct inode *inode, int n) @@ -63,7 +63,7 @@ void nilfs_inode_sub_blocks(struct inode *inode, int n) inode_sub_bytes(inode, (1 << inode->i_blkbits) * n); if (root) - atomic_sub(n, &root->blocks_count); + atomic64_sub(n, &root->blocks_count); } /** @@ -369,7 +369,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) goto failed_ifile_create_inode; /* reference count of i_bh inherits from nilfs_mdt_read_block() */ - atomic_inc(&root->inodes_count); + atomic64_inc(&root->inodes_count); inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -801,7 +801,7 @@ void nilfs_evict_inode(struct inode *inode) ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); if (!ret) - atomic_dec(&ii->i_root->inodes_count); + atomic64_dec(&ii->i_root->inodes_count); nilfs_clear_inode(inode); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a5752a589932..bd88a7461063 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -835,9 +835,9 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) raw_cp->cp_snapshot_list.ssl_next = 0; raw_cp->cp_snapshot_list.ssl_prev = 0; raw_cp->cp_inodes_count = - cpu_to_le64(atomic_read(&sci->sc_root->inodes_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count)); raw_cp->cp_blocks_count = - cpu_to_le64(atomic_read(&sci->sc_root->blocks_count)); + cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count)); raw_cp->cp_nblk_inc = cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7d257e78fbad..1427de5ebf4d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -554,8 +554,10 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, if (err) goto failed_bh; - atomic_set(&root->inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); - atomic_set(&root->blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + atomic64_set(&root->inodes_count, + le64_to_cpu(raw_cp->cp_inodes_count)); + atomic64_set(&root->blocks_count, + le64_to_cpu(raw_cp->cp_blocks_count)); nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); @@ -647,7 +649,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) * curent inodes count as maximum possible and * zero as free inodes value. */ - nmaxinodes = atomic_read(&root->inodes_count); + nmaxinodes = atomic64_read(&root->inodes_count); nfreeinodes = 0; err = 0; } else diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 41e6a04a561f..94c451ce6d24 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -764,8 +764,8 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->ifile = NULL; new->nilfs = nilfs; atomic_set(&new->count, 1); - atomic_set(&new->inodes_count, 0); - atomic_set(&new->blocks_count, 0); + atomic64_set(&new->inodes_count, 0); + atomic64_set(&new->blocks_count, 0); rb_link_node(&new->rb_node, parent, p); rb_insert_color(&new->rb_node, &nilfs->ns_cptree); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index be1267a34cea..de8cc53b4a5c 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -241,8 +241,8 @@ struct nilfs_root { struct the_nilfs *nilfs; struct inode *ifile; - atomic_t inodes_count; - atomic_t blocks_count; + atomic64_t inodes_count; + atomic64_t blocks_count; }; /* Special checkpoint number */ -- cgit v1.2.3 From e68e96d2a7b2391cc1f70e207ec9cbe9d092adc9 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 3 Jul 2013 15:08:08 -0700 Subject: fs/fat: use fat_msg() to replace printk() in __fat_fs_error() Signed-off-by: Gu Zheng Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/misc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 359d307b5507..628e22a5a543 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -30,7 +30,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); + fat_msg(sb, KERN_ERR, "error, %pV", &vaf); va_end(args); } @@ -38,8 +38,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { sb->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT-fs (%s): Filesystem has been " - "set read-only\n", sb->s_id); + fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } EXPORT_SYMBOL_GPL(__fat_fs_error); -- cgit v1.2.3 From 77d55918029ef4626bfd6ba716728dcc7d919de3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:14 -0700 Subject: signals: eventpoll: do not use sigprocmask() sigprocmask() should die. None of the current callers actually need this strange interface. Change fs/eventpoll.c to use set_current_blocked(). This also means we should not worry about SIGKILL/SIGSTOP. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Denys Vlasenko Cc: Eric Wong Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index deecc7294a67..0f0f73624a88 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1975,8 +1975,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return -EINVAL; if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } error = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -1993,7 +1993,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return error; @@ -2020,8 +2020,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) return -EFAULT; sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + sigsaved = current->blocked; + set_current_blocked(&ksigmask); } err = sys_epoll_wait(epfd, events, maxevents, timeout); @@ -2038,7 +2038,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, sizeof(sigsaved)); set_restore_sigmask(); } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + set_current_blocked(&sigsaved); } return err; -- cgit v1.2.3 From e7fd1549aeb83e34ee0955cdf5dee5d4088508f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:16 -0700 Subject: coredump: format_corename() can leak cn->corename do_coredump() assumes that format_corename() can only fail if expand_corename() fails and frees cn->corename. This is not true, for example cn_print_exe_file() can fail and in this case nobody frees cn->corename. Change do_coredump() to always do kfree(cn->corename) after it calls format_corename() (NULL is fine), change expand_corename() to do nothing if kmalloc() fails. Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index dafafbafa731..11bc368e0017 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -58,16 +58,14 @@ static atomic_t call_count = ATOMIC_INIT(1); static int expand_corename(struct core_name *cn) { - char *old_corename = cn->corename; + int size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); + char *corename = krealloc(cn->corename, size, GFP_KERNEL); - cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); - cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL); - - if (!cn->corename) { - kfree(old_corename); + if (!corename) return -ENOMEM; - } + cn->size = size; + cn->corename = corename; return 0; } @@ -157,10 +155,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) int pid_in_pattern = 0; int err = 0; + cn->used = 0; cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); cn->corename = kmalloc(cn->size, GFP_KERNEL); - cn->used = 0; - if (!cn->corename) return -ENOMEM; @@ -549,7 +546,7 @@ void do_coredump(siginfo_t *siginfo) if (ispipe < 0) { printk(KERN_WARNING "format_corename failed\n"); printk(KERN_WARNING "Aborting core\n"); - goto fail_corename; + goto fail_unlock; } if (cprm.limit == 1) { @@ -669,7 +666,6 @@ fail_dropcount: atomic_dec(&core_dump_count); fail_unlock: kfree(cn.corename); -fail_corename: coredump_finish(mm, core_dumped); revert_creds(old_cred); fail_creds: -- cgit v1.2.3 From bc03c691aa86948af4e272ebdcdd4203018210f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:17 -0700 Subject: coredump: introduce cn_vprintf() Turn cn_printf(...) into cn_vprintf(va_list args), reintroduce cn_printf() as a trivial wrapper. This simplifies the next change and cn_vprintf() will have more callers. Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 11bc368e0017..c10a43aae220 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -69,17 +69,13 @@ static int expand_corename(struct core_name *cn) return 0; } -static int cn_printf(struct core_name *cn, const char *fmt, ...) +static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { char *cur; int need; int ret; - va_list arg; - va_start(arg, fmt); need = vsnprintf(NULL, 0, fmt, arg); - va_end(arg); - if (likely(need < cn->size - cn->used - 1)) goto out_printf; @@ -89,9 +85,7 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...) out_printf: cur = cn->corename + cn->used; - va_start(arg, fmt); vsnprintf(cur, need + 1, fmt, arg); - va_end(arg); cn->used += need; return 0; @@ -99,6 +93,18 @@ expand_fail: return ret; } +static int cn_printf(struct core_name *cn, const char *fmt, ...) +{ + va_list arg; + int ret; + + va_start(arg, fmt); + ret = cn_vprintf(cn, fmt, arg); + va_end(arg); + + return ret; +} + static void cn_escape(char *str) { for (; *str; str++) -- cgit v1.2.3 From 5fe9d8ca21cc1517258fe448639392d5d542eec6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:19 -0700 Subject: coredump: cn_vprintf() has no reason to call vsnprintf() twice cn_vprintf() looks really overcomplicated and sub-optimal. We do not need vsnprintf(NULL) to calculate the size we need, we can simply try to print into the current buffer and expand/retry only if necessary. Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index c10a43aae220..2b1d1f54e630 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -71,26 +71,20 @@ static int expand_corename(struct core_name *cn) static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) { - char *cur; - int need; - int ret; - - need = vsnprintf(NULL, 0, fmt, arg); - if (likely(need < cn->size - cn->used - 1)) - goto out_printf; - - ret = expand_corename(cn); - if (ret) - goto expand_fail; + int free, need; + +again: + free = cn->size - cn->used; + need = vsnprintf(cn->corename + cn->used, free, fmt, arg); + if (need < free) { + cn->used += need; + return 0; + } -out_printf: - cur = cn->corename + cn->used; - vsnprintf(cur, need + 1, fmt, arg); - cn->used += need; - return 0; + if (!expand_corename(cn)) + goto again; -expand_fail: - return ret; + return -ENOMEM; } static int cn_printf(struct core_name *cn, const char *fmt, ...) -- cgit v1.2.3 From 923bed030ff6e20b5176e10da151fade83097891 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:20 -0700 Subject: coredump: kill cn_escape(), introduce cn_esc_printf() The usage of cn_escape() looks really annoying, imho this sequence needs a wrapper. And it is buggy. If cn_printf() does expand_corename() cn_escape() writes to the freed memory. Introduce cn_esc_printf() which hopefully does this all right. It records the index before cn_vprintf(), not "char *" which is no longer valid (in general) after krealloc(). Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 2b1d1f54e630..90d7cee54347 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -99,11 +99,21 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...) return ret; } -static void cn_escape(char *str) +static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) { - for (; *str; str++) - if (*str == '/') - *str = '!'; + int cur = cn->used; + va_list arg; + int ret; + + va_start(arg, fmt); + ret = cn_vprintf(cn, fmt, arg); + va_end(arg); + + for (; cur < cn->used; ++cur) { + if (cn->corename[cur] == '/') + cn->corename[cur] = '!'; + } + return ret; } static int cn_print_exe_file(struct core_name *cn) @@ -113,12 +123,8 @@ static int cn_print_exe_file(struct core_name *cn) int ret; exe_file = get_mm_exe_file(current->mm); - if (!exe_file) { - char *commstart = cn->corename + cn->used; - ret = cn_printf(cn, "%s (path unknown)", current->comm); - cn_escape(commstart); - return ret; - } + if (!exe_file) + return cn_esc_printf(cn, "%s (path unknown)", current->comm); pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { @@ -132,9 +138,7 @@ static int cn_print_exe_file(struct core_name *cn) goto free_buf; } - cn_escape(path); - - ret = cn_printf(cn, "%s", path); + ret = cn_esc_printf(cn, "%s", path); free_buf: kfree(pathbuf); @@ -207,22 +211,16 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; } /* hostname */ - case 'h': { - char *namestart = cn->corename + cn->used; + case 'h': down_read(&uts_sem); - err = cn_printf(cn, "%s", + err = cn_esc_printf(cn, "%s", utsname()->nodename); up_read(&uts_sem); - cn_escape(namestart); break; - } /* executable */ - case 'e': { - char *commstart = cn->corename + cn->used; - err = cn_printf(cn, "%s", current->comm); - cn_escape(commstart); + case 'e': + err = cn_esc_printf(cn, "%s", current->comm); break; - } case 'E': err = cn_print_exe_file(cn); break; -- cgit v1.2.3 From 3ceadcf6d489650ade673b7197c11c521aecb038 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:22 -0700 Subject: coredump: kill call_count, add core_name_size Imho, "atomic_t call_count" is ugly and should die. It buys nothing and in fact it can grow more than necessary, expand doesn't check if it was already incremented by another task. Kill it, and introduce "static int core_name_size" updated by expand_corename(). This is obviously racy too but harmless, and core_name_size never grows for no reason. We do not bother to to calculate the "right" new size, we simply do kmalloc(size_we_need) and use ksize() to rely on kmalloc_index's decision. Finally change format_corename() to use expand_corename(), krealloc(NULL) is fine. Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 90d7cee54347..56a9ab963a40 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -45,26 +45,28 @@ #include int core_uses_pid; -char core_pattern[CORENAME_MAX_SIZE] = "core"; unsigned int core_pipe_limit; +char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_name_size = CORENAME_MAX_SIZE; struct core_name { char *corename; int used, size; }; -static atomic_t call_count = ATOMIC_INIT(1); /* The maximal length of core_pattern is also specified in sysctl.c */ -static int expand_corename(struct core_name *cn) +static int expand_corename(struct core_name *cn, int size) { - int size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count); char *corename = krealloc(cn->corename, size, GFP_KERNEL); if (!corename) return -ENOMEM; - cn->size = size; + if (size > core_name_size) /* racy but harmless */ + core_name_size = size; + + cn->size = ksize(corename); cn->corename = corename; return 0; } @@ -81,7 +83,7 @@ again: return 0; } - if (!expand_corename(cn)) + if (!expand_corename(cn, cn->size + need - free + 1)) goto again; return -ENOMEM; @@ -160,9 +162,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) int err = 0; cn->used = 0; - cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count); - cn->corename = kmalloc(cn->size, GFP_KERNEL); - if (!cn->corename) + cn->corename = NULL; + if (expand_corename(cn, core_name_size)) return -ENOMEM; /* Repeat as long as we have more pattern to process and more output -- cgit v1.2.3 From 888ffc5923e4343a78575918ab781e85fa22d244 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:23 -0700 Subject: coredump: '% at the end' shouldn't bypass core_uses_pid logic "goto end" should not bypass the "Backward compatibility with core_uses_pid" code, move this label up. While at it, - It is ugly to copy '|' into cn->corename and then inc the pointer for argv_split(). Change format_corename() to increment pat_ptr instead. - Remove the dead "if (*pat_ptr == 0)" in format_corename(), we already checked it is not zero. Signed-off-by: Oleg Nesterov Cc: Andi Kleen Cc: Colin Walters Cc: Denys Vlasenko Cc: Jiri Slaby Cc: Lennart Poettering Cc: Lucas De Marchi Acked-by: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/coredump.c b/fs/coredump.c index 56a9ab963a40..72f816d6cad9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -165,13 +165,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) cn->corename = NULL; if (expand_corename(cn, core_name_size)) return -ENOMEM; + cn->corename[0] = '\0'; + + if (ispipe) + ++pat_ptr; /* Repeat as long as we have more pattern to process and more output space */ while (*pat_ptr) { if (*pat_ptr != '%') { - if (*pat_ptr == 0) - goto out; err = cn_printf(cn, "%c", *pat_ptr++); } else { switch (*++pat_ptr) { @@ -240,6 +242,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) return err; } +out: /* Backward compatibility with core_uses_pid: * * If core_pattern does not include a %p (as is the default) @@ -250,7 +253,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) if (err) return err; } -out: return ispipe; } @@ -580,7 +582,7 @@ void do_coredump(siginfo_t *siginfo) goto fail_dropcount; } - helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL); + helper_argv = argv_split(GFP_KERNEL, cn.corename, NULL); if (!helper_argv) { printk(KERN_WARNING "%s failed to allocate memory\n", __func__); @@ -597,7 +599,7 @@ void do_coredump(siginfo_t *siginfo) argv_free(helper_argv); if (retval) { - printk(KERN_INFO "Core dump to %s pipe failed\n", + printk(KERN_INFO "Core dump to |%s pipe failed\n", cn.corename); goto close_fail; } -- cgit v1.2.3 From 3f4185483832ccf3d2977923db576fa689c2abce Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:25 -0700 Subject: fs/exec.c:de_thread(): use change_pid() rather than detach_pid/attach_pid de_thread() can use change_pid() instead of detach + attach. This looks better and this ensures that, say, next_thread() can never see a task with ->pid == NULL. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Cc: Michal Hocko Cc: Pavel Emelyanov Cc: Sergey Dyasly Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 03b907cfd765..7619dddd5622 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -947,9 +947,8 @@ static int de_thread(struct task_struct *tsk) * Note: The old leader also uses this pid until release_task * is called. Odd but simple and correct. */ - detach_pid(tsk, PIDTYPE_PID); tsk->pid = leader->pid; - attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); + change_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); -- cgit v1.2.3 From 1d98a5fa11e9a66a7cf7b03f73cab60184781065 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:27 -0700 Subject: fs/proc/uptime.c:uptime_proc_show(): use get_monotonic_boottime() Change uptime_proc_show() to use get_monotonic_boottime() instead of do_posix_clock_monotonic_gettime() + monotonic_to_bootbased(). Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: John Stultz Cc: Tomas Janousek Cc: Tomas Smetana Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/uptime.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 9610ac772d7e..061894625903 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -20,8 +20,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) for_each_possible_cpu(i) idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - do_posix_clock_monotonic_gettime(&uptime); - monotonic_to_bootbased(&uptime); + get_monotonic_boottime(&uptime); nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; -- cgit v1.2.3 From 30bc30df102b2d0c003d93477e04b97e6c528573 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Wed, 3 Jul 2013 15:08:28 -0700 Subject: fs/proc/kcore.c: using strlcpy() instead of strncpy() For NUL terminated string, set '\0' at the end. Signed-off-by: Zhao Hongjiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 0a22194e5d58..06ea155e1a59 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -408,7 +408,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) prpsinfo.pr_zomb = 0; strcpy(prpsinfo.pr_fname, "vmlinux"); - strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); + strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); nhdr->p_filesz += notesize(¬es[1]); bufp = storenote(¬es[1], bufp); -- cgit v1.2.3 From bd9d43f47d6944019918a801398c587ee86c4b52 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:34 -0700 Subject: fs/exec.c: do_execve_common(): use current_user() Trivial cleanup. do_execve_common() can use current_user() and avoid the unnecessary "struct cred *cred" var. Signed-off-by: Oleg Nesterov Cc: Vasiliy Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 7619dddd5622..396aec2dae90 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1464,7 +1464,6 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; bool clear_in_exec; int retval; - const struct cred *cred = current_cred(); /* * We move the actual failure in case of RLIMIT_NPROC excess from @@ -1473,7 +1472,7 @@ static int do_execve_common(const char *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) { + atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { retval = -EAGAIN; goto out_ret; } -- cgit v1.2.3 From 266b7a021f7dcc4d4531961a47f4ef74c3c4ab6b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 3 Jul 2013 15:08:35 -0700 Subject: fs/exec.c:de_thread: mt-exec should update ->real_start_time 924b42d5 ("Use boot based time for process start time and boot time in /proc") updated copy_process/do_task_stat but forgot about de_thread(). This breaks "ps axOT" if a sub-thread execs. Note: I think that task->start_time should die. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Acked-by: John Stultz Cc: Tomas Janousek Cc: Tomas Smetana Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 396aec2dae90..9c73def87642 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -932,6 +932,7 @@ static int de_thread(struct task_struct *tsk) * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; + tsk->real_start_time = leader->real_start_time; BUG_ON(!same_thread_group(leader, tsk)); BUG_ON(has_group_leader_pid(tsk)); -- cgit v1.2.3 From 4b30f07e74d2df673925019ad58bc59a719df3bb Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Wed, 3 Jul 2013 15:09:16 -0700 Subject: aio: fix wrong comment in aio_complete() ctx->ctx_lock should be ctx->completion_lock. Signed-off-by: Tang Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index a8ecc8313fb0..9b5ca1137419 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -625,7 +625,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* * Add a completion event to the ring buffer. Must be done holding - * ctx->ctx_lock to prevent other code from messing with the tail + * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. */ spin_lock_irqsave(&ctx->completion_lock, flags); -- cgit v1.2.3