diff options
author | Dima Zavin <dima@android.com> | 2010-10-21 15:00:17 -0700 |
---|---|---|
committer | Dima Zavin <dima@android.com> | 2010-10-21 15:00:17 -0700 |
commit | 5ff1b9cdb1791cab2759a2a073f0ba6b24579bc3 (patch) | |
tree | b375bea8dbd645df03a14a74481918b4f6657867 /fs | |
parent | d3c5752794d700393fc3ba5ad4eb4b09c269b23c (diff) | |
parent | f6f94e2ab1b33f0082ac22d71f66385a60d8157f (diff) |
Merge commit 'v2.6.36' into android-2.6.36
Diffstat (limited to 'fs')
-rw-r--r-- | fs/binfmt_aout.c | 4 | ||||
-rw-r--r-- | fs/ceph/caps.c | 31 | ||||
-rw-r--r-- | fs/ceph/export.c | 21 | ||||
-rw-r--r-- | fs/ceph/file.c | 2 | ||||
-rw-r--r-- | fs/ceph/osd_client.c | 2 | ||||
-rw-r--r-- | fs/cifs/cifssmb.c | 49 | ||||
-rw-r--r-- | fs/cifs/inode.c | 2 | ||||
-rw-r--r-- | fs/exec.c | 40 | ||||
-rw-r--r-- | fs/exofs/inode.c | 8 | ||||
-rw-r--r-- | fs/fs-writeback.c | 19 | ||||
-rw-r--r-- | fs/fuse/dev.c | 2 | ||||
-rw-r--r-- | fs/nfsd/nfsfh.h | 2 | ||||
-rw-r--r-- | fs/notify/Kconfig | 2 | ||||
-rw-r--r-- | fs/ocfs2/symlink.c | 2 | ||||
-rw-r--r-- | fs/proc/base.c | 4 | ||||
-rw-r--r-- | fs/reiserfs/ioctl.c | 7 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 19 | ||||
-rw-r--r-- | fs/xfs/xfs_log_cil.c | 12 | ||||
-rw-r--r-- | fs/xfs/xfs_log_priv.h | 37 |
19 files changed, 172 insertions, 93 deletions
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f96eff04e11a..a6395bdb26ae 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm) if (!dump_write(file, dump_start, dump_size)) goto end_coredump; } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ - set_fs(KERNEL_DS); - if (!dump_write(file, current, sizeof(*current))) - goto end_coredump; end_coredump: set_fs(fs); return has_dumped; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 73c153092f72..5e9da996a151 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2283,7 +2283,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - int seq = le32_to_cpu(grant->seq); + unsigned seq = le32_to_cpu(grant->seq); + unsigned issue_seq = le32_to_cpu(grant->issue_seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2295,8 +2296,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", - inode, cap, mds, seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", + inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2392,6 +2393,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; + cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; @@ -2774,15 +2776,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (op == CEPH_CAP_OP_IMPORT) __queue_cap_release(session, vino.ino, cap_id, mseq, seq); - - /* - * send any full release message to try to move things - * along for the mds (who clearly thinks we still have this - * cap). - */ - ceph_add_cap_releases(mdsc, session); - ceph_send_cap_releases(mdsc, session); - goto done; + goto flush_cap_releases; } /* these will work even if we don't have a cap yet */ @@ -2810,7 +2804,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); spin_unlock(&inode->i_lock); - goto done; + goto flush_cap_releases; } /* note that each of these drops i_lock for us */ @@ -2834,6 +2828,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_cap_op_name(op)); } + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + done: mutex_unlock(&session->s_mutex); done_unlocked: diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 4480cb1c63e7..e38423e82f2e 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -42,32 +42,37 @@ struct ceph_nfs_confh { static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, int connectable) { + int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; struct dentry *parent = dentry->d_parent; struct inode *inode = dentry->d_inode; - int type; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - if (*max_len >= sizeof(*cfh)) { + if (*max_len >= connected_handle_length) { dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); cfh->parent_name_hash = parent->d_name.hash; - *max_len = sizeof(*cfh); + *max_len = connected_handle_length; type = 2; - } else if (*max_len > sizeof(*fh)) { - if (connectable) - return -ENOSPC; + } else if (*max_len >= handle_length) { + if (connectable) { + *max_len = connected_handle_length; + return 255; + } dout("encode_fh %p\n", dentry); fh->ino = ceph_ino(dentry->d_inode); - *max_len = sizeof(*fh); + *max_len = handle_length; type = 1; } else { - return -ENOSPC; + *max_len = handle_length; + return 255; } return type; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c044a4f0457..66e4da6dba22 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -697,7 +697,7 @@ more: * start_request so that a tid has been assigned. */ spin_lock(&ci->i_unsafe_lock); - list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); + list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); } diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index dfced1dacbcd..3b5571b8ce22 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -549,7 +549,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, */ static void __cancel_request(struct ceph_osd_request *req) { - if (req->r_sent) { + if (req->r_sent && req->r_osd) { ceph_con_revoke(&req->r_osd->o_con, req->r_request); req->r_sent = 0; } diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd37..7e83b356cc9e 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -232,7 +232,7 @@ static int small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, void **request_buf) { - int rc = 0; + int rc; rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) @@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; } int @@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct, /* If the return code is zero, this function must fill in request_buf pointer */ static int -smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf /* returned */ , - void **response_buf /* returned */ ) +__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) { - int rc = 0; - - rc = cifs_reconnect_tcon(tcon, smb_command); - if (rc) - return rc; - *request_buf = cifs_buf_get(); if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? */ @@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, if (tcon != NULL) cifs_stats_inc(&tcon->num_smbs_sent); - return rc; + return 0; +} + +/* If the return code is zero, this function must fill in request_buf pointer */ +static int +smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + int rc; + + rc = cifs_reconnect_tcon(tcon, smb_command); + if (rc) + return rc; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); +} + +static int +smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf, void **response_buf) +{ + if (tcon->ses->need_reconnect || tcon->need_reconnect) + return -EHOSTDOWN; + + return __smb_init(smb_command, wct, tcon, request_buf, response_buf); } static int validate_t2(struct smb_t2_rsp *pSMB) @@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon) cFYI(1, "In QFSUnixInfo"); QFSUnixRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; @@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) cFYI(1, "In SETFSUnixInfo"); SETFSUnixRetry: /* BB switch to small buf init to save memory */ - rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, - (void **) &pSMBr); + rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon, + (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 93f77d438d3c..53cce8cc2224 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -801,6 +801,8 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; + if (S_ISREG(inode->i_mode)) + inode->i_data.backing_dev_info = sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/exec.c b/fs/exec.c index 828dd2461d6b..6d2b6f936858 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2014,3 +2014,43 @@ fail_creds: fail: return; } + +/* + * Core dumping helper functions. These are the only things you should + * do on a core-file: use only these functions to write out all the + * necessary info. + */ +int dump_write(struct file *file, const void *addr, int nr) +{ + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; +} +EXPORT_SYMBOL(dump_write); + +int dump_seek(struct file *file, loff_t off) +{ + int ret = 1; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + if (file->f_op->llseek(file, off, SEEK_CUR) < 0) + return 0; + } else { + char *buf = (char *)get_zeroed_page(GFP_KERNEL); + + if (!buf) + return 0; + while (off > 0) { + unsigned long n = off; + + if (n > PAGE_SIZE) + n = PAGE_SIZE; + if (!dump_write(file, buf, n)) { + ret = 0; + break; + } + off -= n; + } + free_page((unsigned long)buf); + } + return ret; +} +EXPORT_SYMBOL(dump_seek); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index eb7368ebd8cd..3eadd97324b1 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -54,6 +54,9 @@ struct page_collect { unsigned nr_pages; unsigned long length; loff_t pg_first; /* keep 64bit also in 32-arches */ + bool read_4_write; /* This means two things: that the read is sync + * And the pages should not be unlocked. + */ }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->nr_pages = 0; pcol->length = 0; pcol->pg_first = -1; + pcol->read_4_write = false; } static void _pcol_reset(struct page_collect *pcol) @@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page) if (PageError(page)) ClearPageError(page); - unlock_page(page); + if (!pcol->read_4_write) + unlock_page(page); EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," " splitting\n", inode->i_ino, page->index); @@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync) /* readpage_strip might call read_exec(,is_sync==false) at several * places but not if we have a single page. */ + pcol.read_4_write = is_sync; ret = readpage_strip(&pcol, page); if (ret) { EXOFS_ERR("_readpage => %d\n", ret); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 72f4d0babc64..07d235864542 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi) static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; - /* - * For inodes on standard filesystems, we use superblock's bdi. For - * inodes on virtual filesystems, we want to use inode mapping's bdi - * because they can possibly point to something useful (think about - * block_dev filesystem). - */ - if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) { - /* Some device inodes could play dirty tricks. Catch them... */ - WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi), - "Dirtiable inode bdi %s != sb bdi %s\n", - bdi->name, sb->s_bdi->name); - return sb->s_bdi; - } - return bdi; + if (strcmp(sb->s_type->name, "bdev") == 0) + return inode->i_mapping->backing_dev_info; + + return sb->s_bdi; } static void bdi_queue_work(struct backing_dev_info *bdi, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d367af1514ef..cde755cca564 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, loff_t file_size; unsigned int num; unsigned int offset; - size_t total_len; + size_t total_len = 0; req = fuse_get_req(fc); if (IS_ERR(req)) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index cdfb8c6a4206..c16f8d8331b5 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp) static inline void fh_unlock(struct svc_fh *fhp) { - BUG_ON(!fhp->fh_dentry); - if (fhp->fh_locked) { fill_post_wcc(fhp); mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629eedd82..b388443c3a09 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -3,4 +3,4 @@ config FSNOTIFY source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" -source "fs/notify/fanotify/Kconfig" +#source "fs/notify/fanotify/Kconfig" diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc4..9975457c981f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, } /* Fast symlinks can't be large */ - len = strlen(target); + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); link = kzalloc(len + 1, GFP_NOFS); if (!link) { status = -ENOMEM; diff --git a/fs/proc/base.c b/fs/proc/base.c index a495be632cf7..7ed7ea18b20a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2709,7 +2709,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif @@ -3045,7 +3045,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), - INF("limits", S_IRUSR, proc_pid_limits), + INF("limits", S_IRUGO, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de0712..5cbb81e134ac 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; + int depth; int index; struct page *page; struct address_space *mapping; @@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) /* we need to make sure nobody is changing the file size beneath ** us */ - mutex_lock(&inode->i_mutex); - reiserfs_write_lock(inode->i_sb); + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); return retval; } diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index d59c4a65d492..81976ffed7d6 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag( xfs_perag_put(pag); } -void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, +STATIC void +__xfs_inode_clear_reclaim( xfs_perag_t *pag, xfs_inode_t *ip) { - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; if (!pag->pag_ici_reclaimable) { /* clear the reclaim tag from the perag radix tree */ @@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag( } } +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + /* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table @@ -838,6 +846,7 @@ reclaim: if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); write_unlock(&pag->pag_ici_lock); /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ed575fb4b495..7e206fc1fa36 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -405,9 +405,15 @@ xlog_cil_push( new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); - /* lock out transaction commit, but don't block on background push */ + /* + * Lock out transaction commit, but don't block for background pushes + * unless we are well over the CIL space limit. See the definition of + * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic + * used here. + */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_seq) + if (!push_seq && + cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log)) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -422,7 +428,7 @@ xlog_cil_push( goto out_skip; /* check for a previously pushed seqeunce */ - if (push_seq < cil->xc_ctx->sequence) + if (push_seq && push_seq < cil->xc_ctx->sequence) goto out_skip; /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ced52b98b322..edcdfe01617f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -426,13 +426,13 @@ struct xfs_cil { }; /* - * The amount of log space we should the CIL to aggregate is difficult to size. - * Whatever we chose we have to make we can get a reservation for the log space - * effectively, that it is large enough to capture sufficient relogging to - * reduce log buffer IO significantly, but it is not too large for the log or - * induces too much latency when writing out through the iclogs. We track both - * space consumed and the number of vectors in the checkpoint context, so we - * need to decide which to use for limiting. + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. * * Every log buffer we write out during a push needs a header reserved, which * is at least one sector and more for v2 logs. Hence we need a reservation of @@ -459,16 +459,21 @@ struct xfs_cil { * checkpoint transaction ticket is specific to the checkpoint context, rather * than the CIL itself. * - * With dynamic reservations, we can basically make up arbitrary limits for the - * checkpoint size so long as they don't violate any other size rules. Hence - * the initial maximum size for the checkpoint transaction will be set to a - * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit - * right now based on the latency of writing out a large amount of data through - * the circular iclog buffers. + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. */ - -#define XLOG_CIL_SPACE_LIMIT(log) \ - (min((log->l_logsize >> 2), (8 * 1024 * 1024))) +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4)) /* * The reservation head lsn is not made up of a cycle number and block number. |