From 6c0f3af72cb1622a66962a1180c36ef8c41be8e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Nov 2010 11:14:34 -0800 Subject: ceph: add dir_layout to inode Add a ceph_dir_layout to the inode, and calculate dentry hash values based on the parent directory's specified dir_hash function. This is needed because the old default Linux dcache hash function is extremely week and leads to a poor distribution of files among dir fragments. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 20 ++++++++++++++++++++ fs/ceph/export.c | 2 +- fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 2 ++ 4 files changed, 25 insertions(+), 1 deletion(-) (limited to 'fs/ceph') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d902948a90d8..562f9884a4d9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1216,6 +1216,26 @@ void ceph_dentry_lru_del(struct dentry *dn) } } +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, .readdir = ceph_readdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d9426992..e41056174bf8 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; + cfh->parent_name_hash = ceph_dentry_hash(parent); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bf1286588f26..045283ce4413 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_release_count = 0; ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a4657..6e0826695112 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_inode_info { unsigned i_ceph_flags; unsigned long i_release_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct dentry *dn); /* * our d_ops vary depending on whether the inode is live, -- cgit v1.2.3 From 14303d20f3ae3e6ab626c77a4aac202b3bafd377 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Dec 2010 17:37:52 -0800 Subject: ceph: implement DIRLAYOUTHASH feature to get dir layout from MDS This implements the DIRLAYOUTHASH protocol feature, which passes the dir layout over the wire from the MDS. This gives the client knowledge of the correct hash function to use for mapping dentries among dir fragments. Note that if this feature is _not_ present on the client but is on the MDS, the client may misdirect requests. This will result in a forward and degrade performance. It may also result in inaccurate NFS filehandle generation, which will prevent fh resolution when the inode is not present in the client cache and the parent directories have been fragmented. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 2 ++ fs/ceph/mds_client.c | 42 +++++++++++++++++++++++++++--------------- fs/ceph/mds_client.h | 1 + fs/ceph/super.c | 3 ++- 4 files changed, 32 insertions(+), 16 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 045283ce4413..e791fa34b23d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -682,6 +682,8 @@ static int fill_inode(struct inode *inode, inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; + ci->i_dir_layout = iinfo->dir_layout; + ci->i_files = le64_to_cpu(info->files); ci->i_subdirs = le64_to_cpu(info->subdirs); ci->i_rbytes = le64_to_cpu(info->rbytes); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 38800eaa81d0..9be29b06a2d9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops; * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, - struct ceph_mds_reply_info_in *info) + struct ceph_mds_reply_info_in *info, + int features) { int err = -EIO; @@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -88,12 +95,13 @@ bad: * target inode. */ static int parse_reply_info_trace(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { int err; if (info->head->is_dentry) { - err = parse_reply_info_in(p, end, &info->diri); + err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; @@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end, } if (info->head->is_target) { - err = parse_reply_info_in(p, end, &info->targeti); + err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } @@ -134,7 +142,8 @@ out_bad: * parse readdir results */ static int parse_reply_info_dir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { u32 num, i = 0; int err; @@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end, *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i]); + err = parse_reply_info_in(p, end, &info->dir_in[i], features); if (err < 0) goto out_bad; i++; @@ -205,7 +214,8 @@ out_bad: * parse fcntl F_GETLK results */ static int parse_reply_info_filelock(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -225,19 +235,21 @@ bad: * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) - return parse_reply_info_filelock(p, end, info); + return parse_reply_info_filelock(p, end, info, features); else - return parse_reply_info_dir(p, end, info); + return parse_reply_info_dir(p, end, info, features); } /* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { void *p, *end; u32 len; @@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_trace(&p, p+len, info); + err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_extra(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = req->r_dentry->d_name.hash; + hash = ceph_dentry_hash(req->r_dentry); is_hash = true; } } @@ -2101,7 +2113,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo); + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index aabe563b54db..f8f27f6eaa90 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -35,6 +35,7 @@ struct ceph_cap; */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 08b460ae0539..1417f3f3e246 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features |= CEPH_FEATURE_FLOCK; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; -- cgit v1.2.3 From 4af25fdda6943f311a63034f80933e4d6d6e3a19 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Nov 2010 13:41:47 -0700 Subject: ceph: drop redundant r_mds field The r_mds field is redundant, since we can find the same information at r_session->s_mds, and when r_session is NULL then r_mds is meaningless. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 9 ++++++--- fs/ceph/mds_client.c | 8 +++++--- fs/ceph/mds_client.h | 1 - 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7ae1b3d55b58..08f65faac112 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p) for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { req = rb_entry(rp, struct ceph_mds_request, r_node); - if (req->r_request) - seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); - else + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9be29b06a2d9..e22e8b41d572 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1705,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int flags = 0; - req->r_mds = mds; req->r_attempts++; if (req->r_inode) { struct ceph_cap *cap = @@ -2068,8 +2067,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } else { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = - ceph_get_cap_for_mds(ci, req->r_mds);; + struct ceph_cap *cap = NULL; + + if (req->r_session) + cap = ceph_get_cap_for_mds(ci, + req->r_session->s_mds); dout("already using auth"); if ((!cap || cap != ci->i_auth_cap) || diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f8f27f6eaa90..4e3a9cc0bba6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -166,7 +166,6 @@ struct ceph_mds_request { struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ - int r_mds; /* operation on what? */ struct inode *r_inode; /* arg1 */ -- cgit v1.2.3 From dc69e2e9fcd7c613eb744ea3b9c4ee9ca554e822 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Nov 2010 13:49:00 -0700 Subject: ceph: associate requests with opening sessions Associate request with sessions that aren't yep open. This makes the debugfs mdsc request list more informative. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e22e8b41d572..509339ceef72 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1791,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } + put_request_session(req); + mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { @@ -1808,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } } + req->r_session = get_session(session); + dout("do_request mds%d session %p state %s\n", mds, session, session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && @@ -1820,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc, } /* send request */ - req->r_session = get_session(session); req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ @@ -1874,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); - put_request_session(req); __do_request(mdsc, req); } } -- cgit v1.2.3 From 582c86e69045f37da8be445c265f72a7a73b18c6 Mon Sep 17 00:00:00 2001 From: Tracey Dent Date: Tue, 14 Dec 2010 19:32:37 -0500 Subject: ceph: Makefile: Remove unnessary code Remove the if and else conditional because the code is in mainline and there is no need in it being there. Also, Changed Makefile to use -y instead of -objs because -objs is deprecated and not mentioned in Documentation/kbuild/makefiles.txt. Signed-off-by: Tracey Dent Signed-off-by: Sage Weil --- fs/ceph/Makefile | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 9e6c4f2e8ff1..bd352125e829 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -2,31 +2,10 @@ # Makefile for CEPH filesystem. # -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif -- cgit v1.2.3 From 01e6acc4ea4c284c44bfb3d46c76f4ae580c6435 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 14:49:45 +0100 Subject: ceph: fsc->*_wq's aren't used in memory reclaim path fsc->*_wq's aren't depended upon during memory reclaim. Convert to alloc_workqueue() w/o WQ_MEM_RECLAIM. Signed-off-by: Tejun Heo Cc: Sage Weil Cc: ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- fs/ceph/super.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs/ceph') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1417f3f3e246..bf6f0f34082a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -444,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail_client; err = -ENOMEM; - fsc->wb_wq = create_workqueue("ceph-writeback"); + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. + */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) goto fail_bdi; - fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; -- cgit v1.2.3