From 6e6f09231a134e7523514ed504380f5caafc9334 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 27 Feb 2015 08:54:08 +0800 Subject: ceph: drop cap releases in requests composed before cap reconnect These cap releases are stale because MDS will re-establish client caps according to the cap reconnect messages. Note: MDS can detect stale cap messages, so these stale cap releases are harmless even we don't drop them. Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 71c073f38e54..0a1a9e452148 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1853,7 +1853,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, */ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_msg *msg; struct ceph_mds_request_head *head; @@ -1937,6 +1937,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, releases += ceph_encode_inode_release(&p, req->r_old_dentry->d_inode, mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + head->num_releases = cpu_to_le16(releases); /* time stamp */ @@ -1989,7 +1995,7 @@ static void complete_request(struct ceph_mds_client *mdsc, */ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_mds_request_head *rhead; struct ceph_msg *msg; @@ -2048,7 +2054,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds); + msg = create_request_message(mdsc, req, mds, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); complete_request(mdsc, req); @@ -2132,7 +2138,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds); + err = __prepare_send_request(mdsc, req, mds, false); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2658,7 +2664,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2679,7 +2685,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; /* only old requests */ if (req->r_session && req->r_session->s_mds == session->s_mds) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, + session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); -- cgit v1.2.3 From e2c3de046c5a1f3525772b4cacc7731cb626ab61 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Mar 2015 16:05:04 +0800 Subject: ceph: fix dcache/nocache mount option Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 2 ++ fs/ceph/super.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83e9976f7189..92a6b6018511 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); if ((ctx->pos == 2 || fi->dentry) && + ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -629,6 +630,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 04c8124ed30e..f8ea7ebe9d1f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,7 +36,8 @@ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ -#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ + CEPH_MOUNT_OPT_DCACHE) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; -- cgit v1.2.3 From e1eba3ea02dbd2a4b6be1a5708af31dc9b9aaaa6 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 3 Mar 2015 18:57:49 +0100 Subject: ceph: remove redundant declaration ceph_aops was already defined extern in addr.c section Signed-off-by: Fabian Frederick Signed-off-by: Yan, Zheng --- fs/ceph/super.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f8ea7ebe9d1f..fa20e1318939 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -882,7 +882,6 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; -extern const struct address_space_operations ceph_aops; extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, -- cgit v1.2.3 From 3563dbdd99603d4339467ddffafe3851aae2e398 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Fri, 6 Feb 2015 06:52:17 -0500 Subject: ceph: use msecs_to_jiffies for time conversion This is only an API consolidation and should make things more readable it replaces var * HZ / 1000 by msecs_to_jiffies(var). Signed-off-by: Nicholas Mc Guire Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0a1a9e452148..0122491d947c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3140,7 +3140,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = - le32_to_cpu(h->duration_ms) * HZ / 1000; + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; -- cgit v1.2.3 From 57e95460f0b360c4d29c0320922b46b55dd2b79f Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Tue, 10 Mar 2015 11:18:15 -0400 Subject: ceph: match wait_for_completion_timeout return type return type of wait_for_completion_timeout is unsigned long not int. An appropriately named unsigned long is added and the assignment fixed up. Signed-off-by: Nicholas Mc Guire Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 92a6b6018511..d486f2a5a88d 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1242,11 +1242,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { - ret = wait_for_completion_timeout( - &req->r_safe_completion, req->r_timeout); - if (ret > 0) + unsigned long time_left = wait_for_completion_timeout( + &req->r_safe_completion, + req->r_timeout); + if (time_left > 0) ret = 0; - else if (ret == 0) + else ret = -EIO; /* timed out */ } else { wait_for_completion(&req->r_safe_completion); -- cgit v1.2.3 From 1fe480235ad7236e8ea6c167af5a5d1ac24f8a88 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 5 Mar 2015 10:47:22 +0300 Subject: rbd: be more informative on -ENOENT failures pr_info what exactly was the culprit: missing pool, image or snap. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b40af3203089..83f5733f1a7a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5301,8 +5301,13 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) if (mapping) { ret = rbd_dev_header_watch_sync(rbd_dev); - if (ret) + if (ret) { + if (ret == -ENOENT) + pr_info("image %s/%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name); goto out_header_name; + } } ret = rbd_dev_header_info(rbd_dev); @@ -5319,8 +5324,14 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) ret = rbd_spec_fill_snap_id(rbd_dev); else ret = rbd_spec_fill_names(rbd_dev); - if (ret) + if (ret) { + if (ret == -ENOENT) + pr_info("snap %s/%s@%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name, + rbd_dev->spec->snap_name); goto err_out_probe; + } if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { ret = rbd_dev_v2_parent_info(rbd_dev); @@ -5390,8 +5401,11 @@ static ssize_t do_rbd_add(struct bus_type *bus, /* pick the pool */ rc = rbd_add_get_pool_id(rbdc, spec->pool_name); - if (rc < 0) + if (rc < 0) { + if (rc == -ENOENT) + pr_info("pool %s does not exist\n", spec->pool_name); goto err_out_client; + } spec->pool_id = (u64)rc; /* The ceph file layout needs to fit pool id in 32 bits */ -- cgit v1.2.3 From c1d00b2d9c4fc821e33c5cdfbdbc32677cb0e2e0 Mon Sep 17 00:00:00 2001 From: Taesoo Kim Date: Fri, 20 Mar 2015 17:36:56 -0400 Subject: ceph: properly release page upon error When ceph_update_writeable_page fails (including -EAGAIN), it unlocks (w/ unlock_page) the page but does not 'release' (w/ page_cache_release) properly. Upon error, properly set *pagep to NULL, indicating an error. Signed-off-by: Taesoo Kim Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index fd5599d32362..e723482c5f37 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r < 0) + page_cache_release(page); + else + *pagep = page; } while (r == -EAGAIN); return r; -- cgit v1.2.3 From a149bb9a281c5c2904cf6fcdf9ed386340032ce3 Mon Sep 17 00:00:00 2001 From: Sanidhya Kashyap Date: Sat, 21 Mar 2015 12:54:58 -0400 Subject: ceph: kstrdup() memory handling Currently, there is no check for the kstrdup() for r_path2, r_path1 and snapdir_name as various locations as there is a possibility of failure during memory pressure. Therefore, returning ENOMEM where the checks have been missed. Signed-off-by: Sanidhya Kashyap Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 24 ++++++++++++++++++------ fs/ceph/super.c | 10 ++++++++++ fs/ceph/xattr.c | 23 ++++++++++++++++------- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d486f2a5a88d..98c71e895e81 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -337,16 +337,23 @@ more: ceph_mdsc_put_request(req); return err; } - req->r_inode = inode; - ihold(inode); - req->r_dentry = dget(file->f_path.dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (fi->last_name) { + req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -757,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_dentry = dget(dentry); - req->r_num_caps = 2; req->r_path2 = kstrdup(dest, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } req->r_locked_dir = dir; + req->r_dentry = dget(dentry); + req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a63997b8bcff..9f035ccb6191 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -345,6 +345,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_RSIZE_DEFAULT; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) { + err = -ENOMEM; + goto out; + } + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; @@ -730,6 +735,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5a492caf34cb..5c4c9c256931 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = inode; - ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } req->r_pagelist = pagelist; pagelist = NULL; + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -1019,12 +1026,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) + return -ENOMEM; + req->r_inode = inode; ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; - req->r_path2 = kstrdup(name, GFP_NOFS); - + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; -- cgit v1.2.3 From 3ef650d3989677bd460497f9c3b0d58caaba0116 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 23 Mar 2015 13:35:03 -0700 Subject: libceph: osdmap.h: Add missing format newlines To avoid possible interleaving, add missing '\n' to formats. Convert pr_warning to pr_warn while there. Signed-off-by: Joe Perches Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 561ea896c657..e55c08bc3a96 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -175,13 +175,12 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) __u8 version; if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { - pr_warning("incomplete pg encoding"); - + pr_warn("incomplete pg encoding\n"); return -EINVAL; } version = ceph_decode_8(p); if (version > 1) { - pr_warning("do not understand pg encoding %d > 1", + pr_warn("do not understand pg encoding %d > 1\n", (int)version); return -EINVAL; } -- cgit v1.2.3 From db40cc1702d6a7049740d269cf6c1a42f979c7a7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 23 Mar 2015 20:12:20 +0800 Subject: ceph: keep i_snap_realm while there are writers when reconnecting to MDS is denied, we remove session caps forcibly. But it's possible there are ongoing write, the write code needs to reference i_snap_realm. So if there are ongoing write, we keep i_snap_realm. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8172775428a0..4ddadc13d162 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode) return ret; } +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * @@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); - } + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); } @@ -2309,6 +2319,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) wake = 1; } } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); -- cgit v1.2.3 From 571ade336ac89f5db602f7df992152e4f4b945bc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 24 Mar 2015 11:36:08 +0800 Subject: ceph: don't mark dirty caps when there is no auth cap No i_auth_cap means reconnecting to MDS was denied. So don't add new dirty caps. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 8 +++++++- fs/ceph/mds_client.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 4ddadc13d162..37a42353b983 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1404,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) int was = ci->i_dirty_caps; int dirty = 0; + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); @@ -1414,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ci->i_snap_realm->cached_context); dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); - WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0122491d947c..0cfc2d163549 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1098,7 +1098,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); - if (!__ceph_is_any_real_caps(ci)) { + if (!ci->i_auth_cap) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; -- cgit v1.2.3 From a9f6eb61850e1599f9aa5141f25ccc1d8248e174 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 24 Mar 2015 15:49:36 +0800 Subject: ceph: don't zero i_wrbuffer_ref when reconnecting is denied remove_session_caps_cb() does not truncate dirty data in page cache, but zeros i_wrbuffer_ref/i_wrbuffer_ref_head. This will result negtive i_wrbuffer_ref/i_wrbuffer_ref_head Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0cfc2d163549..c3bac611a097 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1120,13 +1120,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, mdsc->num_cap_flushing--; drop = 1; } - if (drop && ci->i_wrbuffer_ref) { - pr_info(" dropping dirty data for %p %lld\n", - inode, ceph_ino(inode)); - ci->i_wrbuffer_ref = 0; - ci->i_wrbuffer_ref_head = 0; - drop++; - } spin_unlock(&mdsc->cap_dirty_lock); } spin_unlock(&ci->i_ceph_lock); -- cgit v1.2.3 From 1c841a96b5f369cbb0b169d13825c7ff7d0fba8d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 24 Mar 2015 20:15:36 +0800 Subject: ceph: cleanup unsafe requests when reconnecting is denied Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c3bac611a097..fd5585b8d382 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) spin_unlock(&session->s_cap_lock); } +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + list_del_init(&req->r_unsafe_item); + pr_info(" dropping unsafe request %llu\n", req->r_tid); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); +} + /* * Helper to safely iterate over all caps associated with a session, with * special care taken to handle a racing __ceph_remove_cap(). @@ -2589,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); + cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); -- cgit v1.2.3 From 67c64eb742a49d3d3f5dcef75d0c32a3394e5519 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 23 Mar 2015 14:52:40 +0300 Subject: libceph: don't overwrite specific con error msgs - specific con->error_msg messages (e.g. "protocol version mismatch") end up getting overwritten by a catch-all "socket error on read / write", introduced in commit 3a140a0d5c4b ("libceph: report socket read/write error message") - "bad message sequence # for incoming message" loses to "bad crc" due to the fact that -EBADMSG is used for both Fix it, and tidy up con->error_msg assignments and pr_errs while at it. Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a9f4ae45b7fb..967080a9f043 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -505,8 +505,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); - con->error_msg = "connect error"; - return ret; } @@ -2145,12 +2143,10 @@ static int process_connect(struct ceph_connection *con) * to WAIT. This shouldn't happen if we are the * client. */ - pr_err("process_connect got WAIT as client\n"); con->error_msg = "protocol error, got WAIT as client"; return -1; default: - pr_err("connect protocol error, will retry\n"); con->error_msg = "protocol error, garbage tag during connect"; return -1; } @@ -2282,8 +2278,7 @@ static int read_partial_message(struct ceph_connection *con) crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", + pr_err("read_partial_message bad hdr crc %u != expected %u\n", crc, con->in_hdr.crc); return -EBADMSG; } @@ -2313,7 +2308,7 @@ static int read_partial_message(struct ceph_connection *con) pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); con->error_msg = "bad message sequence # for incoming message"; - return -EBADMSG; + return -EBADE; } /* allocate message? */ @@ -2660,6 +2655,8 @@ more: switch (ret) { case -EBADMSG: con->error_msg = "bad crc"; + /* fall through */ + case -EBADE: ret = -EIO; break; case -EIO: @@ -2838,7 +2835,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on read"; + if (!con->error_msg) + con->error_msg = "socket error on read"; fault = true; break; } @@ -2847,7 +2845,8 @@ static void con_work(struct work_struct *work) if (ret < 0) { if (ret == -EAGAIN) continue; - con->error_msg = "socket error on write"; + if (!con->error_msg) + con->error_msg = "socket error on write"; fault = true; } @@ -2869,11 +2868,13 @@ static void con_work(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); + pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); + con->error_msg = NULL; + WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); @@ -3295,8 +3296,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) */ if (*skip) return 0; - con->error_msg = "error allocating memory for incoming message"; + con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); -- cgit v1.2.3 From d8a2c89c8636405ad0b234f111d22c00c37e452b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Mar 2015 16:15:17 +0300 Subject: rbd: mark block queue as non-rotational Set QUEUE_FLAG_NONROT. Following commit b277da0a8a59 ("block: disable entropy contributions for nonrot devices") we should also clear QUEUE_FLAG_ADD_RANDOM, but it's off by default for blk-mq drivers, so just note it in the comment. Also remove physical block size assignment - no sense in repeating defaults that are not going to change. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 83f5733f1a7a..2002d28c5d5b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3762,8 +3762,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) goto out_tag_set; } - /* We use the default size, but let's be explicit about it. */ - blk_queue_physical_block_size(q, SECTOR_SIZE); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ /* set io sizes to object size */ segment_size = rbd_obj_bytes(&rbd_dev->header); -- cgit v1.2.3 From ff40f9ae95917b72b6acb6057471c99054b6ee24 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Mar 2015 21:02:16 +0300 Subject: libceph, ceph: split ceph_show_options() Split ceph_show_options() into two pieces and move the piece responsible for printing client (libceph) options into net/ceph. This way people adding a libceph option wouldn't have to remember to update code in fs/ceph. Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 40 +++++++++++++++------------------------- include/linux/ceph/libceph.h | 1 + net/ceph/ceph_common.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 25 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9f035ccb6191..34a779edd421 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -411,31 +411,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) { struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; - struct ceph_options *opt = fsc->client->options; - - if (opt->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &opt->fsid); - if (opt->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (opt->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (opt->flags & CEPH_OPT_NOMSGAUTH) - seq_puts(m, ",nocephx_require_signatures"); - if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) - seq_puts(m, ",notcp_nodelay"); - - if (opt->name) - seq_printf(m, ",name=%s", opt->name); - if (opt->key) - seq_puts(m, ",secret="); - - if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); - if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - opt->osd_keepalive_timeout); + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); @@ -482,6 +471,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 16fff9608848..303be6ef7f94 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -191,6 +191,7 @@ extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, int (*parse_extra_token)(char *c, void *private), void *private); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index ec565508e904..79e8f71aef5b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -490,6 +490,43 @@ out: } EXPORT_SYMBOL(ceph_parse_options); +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +{ + struct ceph_options *opt = client->options; + size_t pos = m->count; + + if (opt->name) + seq_printf(m, "name=%s,", opt->name); + if (opt->key) + seq_puts(m, "secret=,"); + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, "fsid=%pU,", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, "noshare,"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, "nocrc,"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, "nocephx_require_signatures,"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, "notcp_nodelay,"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, "osdkeepalivetimeout=%d,", + opt->osd_keepalive_timeout); + + /* drop redundant comma */ + if (m->count != pos) + m->count--; + + return 0; +} +EXPORT_SYMBOL(ceph_print_client_options); + u64 ceph_client_id(struct ceph_client *client) { return client->monc.auth->global_id; -- cgit v1.2.3 From 5cf7bd30120ead3db43ef9074be38018d9acf22f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Mar 2015 21:07:41 +0300 Subject: libceph: expose client options through debugfs Add a client_options attribute for showing libceph options. Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + net/ceph/debugfs.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 303be6ef7f94..30f92cefaa72 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -135,6 +135,7 @@ struct ceph_client { struct dentry *debugfs_dir; struct dentry *debugfs_monmap; struct dentry *debugfs_osdmap; + struct dentry *debugfs_options; #endif }; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 14d9995097cc..593dc2eabcc8 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -22,6 +22,7 @@ * .../monmap - current monmap * .../osdc - active osd requests * .../monc - mon client state + * .../client_options - libceph-only (i.e. not rbd or cephfs) options * .../dentry_lru - dump contents of dentry lru * .../caps - expose cap (reservation) stats * .../bdi - symlink to ../../bdi/something @@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp) return 0; } +static int client_options_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + int ret; + + ret = ceph_print_client_options(s, client); + if (ret) + return ret; + + seq_putc(s, '\n'); + return 0; +} + CEPH_DEFINE_SHOW_FUNC(monmap_show) CEPH_DEFINE_SHOW_FUNC(osdmap_show) CEPH_DEFINE_SHOW_FUNC(monc_show) CEPH_DEFINE_SHOW_FUNC(osdc_show) +CEPH_DEFINE_SHOW_FUNC(client_options_show) int ceph_debugfs_init(void) { @@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client) if (!client->debugfs_osdmap) goto out; + client->debugfs_options = debugfs_create_file("client_options", + 0600, + client->debugfs_dir, + client, + &client_options_show_fops); + if (!client->debugfs_options) + goto out; + return 0; out: @@ -252,6 +275,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { dout("ceph_debugfs_client_cleanup %p\n", client); + debugfs_remove(client->debugfs_options); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); -- cgit v1.2.3 From ff7eeb82cc16f25203b69f817cbbb85845c817fe Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Mar 2015 21:10:09 +0300 Subject: ceph: show non-default options only Don't pollute /proc/mounts with default options (presently these are dcache, nofsc and acl). Leave the acl/noacl however - it's a bit of a special case due to CONFIG_CEPH_FS_POSIX_ACL. Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 34a779edd421..e463ebd69a9c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -432,14 +432,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) - seq_puts(m, ",dcache"); - else + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); - else - seq_puts(m, ",nofsc"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) -- cgit v1.2.3 From 9571eb4f9617e89b3f979a3856b1296eba277bb1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Mar 2015 21:15:17 +0300 Subject: libceph: simplify our debugfs attr macro No need to do single_open()'s job ourselves. Signed-off-by: Ilya Dryomov --- include/linux/ceph/debugfs.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h index 1df086d7882d..29cf897cc5cd 100644 --- a/include/linux/ceph/debugfs.h +++ b/include/linux/ceph/debugfs.h @@ -7,13 +7,7 @@ #define CEPH_DEFINE_SHOW_FUNC(name) \ static int name##_open(struct inode *inode, struct file *file) \ { \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ + return single_open(file, name, inode->i_private); \ } \ \ static const struct file_operations name##_fops = { \ -- cgit v1.2.3 From 32ec4397756d072873ee778cbf41b9f6a335b953 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 26 Mar 2015 19:06:00 +0800 Subject: ceph: hold on to exclusive caps on complete directories If a directory is complete, we want to keep the exclusive cap. So that MDS does not end up revoking the shared cap on every create/unlink operation. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 37a42353b983..11631c4c7d14 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1561,7 +1561,19 @@ retry_locked: if (!mdsc->stopping && inode->i_nlink > 0) { if (want) { retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= want; } else { + retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, -- cgit v1.2.3 From c0bd50e2eeddf139d8f61e709d7003210301e93a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 7 Apr 2015 15:51:08 +0800 Subject: ceph: fix null pointer dereference in send_mds_reconnect() sb->s_root can be null when umounting Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index fd5585b8d382..0a2eb32ffe43 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2892,7 +2892,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_cap_lock); /* trim unused caps to reduce MDS's cache rejoin time */ - shrink_dcache_parent(mdsc->fsc->sb->s_root); + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, -- cgit v1.2.3 From 0ea611a3bc5fb8f6a0bb1a76fe2dbf8ebe4bdf77 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 7 Apr 2015 15:36:32 +0800 Subject: ceph: rename snapshot support Signed-off-by: Yan, Zheng --- fs/ceph/dir.c | 13 +++++++++---- fs/ceph/strings.c | 1 + include/linux/ceph/ceph_fs.h | 1 + 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 98c71e895e81..e729b79812b4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -947,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; int err; if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; - if (ceph_snap(old_dir) != CEPH_NOSNAP || - ceph_snap(new_dir) != CEPH_NOSNAP) - return -EROFS; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); ihold(old_dir); @@ -1387,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = { .getattr = ceph_getattr, .mkdir = ceph_mkdir, .rmdir = ceph_unlink, + .rename = ceph_rename, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 51cc23e48111..89e6bc321df3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 31eb03d0c766..d7d072a25c27 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -323,6 +323,7 @@ enum { CEPH_MDS_OP_MKSNAP = 0x01400, CEPH_MDS_OP_RMSNAP = 0x01401, CEPH_MDS_OP_LSSNAP = 0x00402, + CEPH_MDS_OP_RENAMESNAP = 0x01403, }; extern const char *ceph_mds_op_name(int op); -- cgit v1.2.3 From ec137c10e720e5cf085504332ee1bf380241ed69 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 13 Apr 2015 11:25:07 +0800 Subject: ceph: fix uninline data function For CEPH_OSD_CMPXATTR_MODE_U64, OSD expects the u64 to be encoded as string in object's xattr. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e723482c5f37..cab1cf5a330b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1539,19 +1539,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); - err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, - "inline_version", &inline_version, - sizeof(inline_version), - CEPH_OSD_CMPXATTR_OP_GT, - CEPH_OSD_CMPXATTR_MODE_U64); - if (err) - goto out_put; - - err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, - "inline_version", &inline_version, - sizeof(inline_version), 0, 0); - if (err) - goto out_put; + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } + + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); -- cgit v1.2.3 From 9be6df215a1baa57ccad0c388daeb2ad8f8d99ee Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 14 Apr 2015 20:55:13 +0300 Subject: crush: drop unnecessary include from mapper.c Signed-off-by: Ilya Dryomov --- net/ceph/crush/mapper.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a1ef53c04415..5549fb609358 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -20,7 +20,6 @@ #include #include -#include /* * Implement the core CRUSH mapping algorithm. -- cgit v1.2.3 From 45002267e8d2699bf9b022315bee3dd13b044843 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 14 Apr 2015 16:04:23 +0300 Subject: crush: ensuring at most num-rep osds are selected Crush temporary buffers are allocated as per replica size configured by the user. When there are more final osds (to be selected as per rule) than the replicas, buffer overlaps and it causes crash. Now, it ensures that at most num-rep osds are selected even if more number of osds are allowed by the rule. Reflects ceph.git commits 6b4d1aa99718e3b367496326c1e64551330fabc0, 234b066ba04976783d15ff2abc3e81b6cc06fb10. Signed-off-by: Ilya Dryomov --- net/ceph/crush/mapper.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 5549fb609358..91c41fe83113 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -289,6 +289,7 @@ static int is_out(const struct crush_map *map, * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector + * @out_size: size of the out vector * @tries: number of attempts to make * @recurse_tries: number of attempts to have recursive chooseleaf make * @local_retries: localized retries @@ -303,6 +304,7 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, + int out_size, unsigned int tries, unsigned int recurse_tries, unsigned int local_retries, @@ -321,6 +323,7 @@ static int crush_choose_firstn(const struct crush_map *map, int item = 0; int itemtype; int collide, reject; + int count = out_size; dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", recurse_to_leaf ? "_LEAF" : "", @@ -328,7 +331,7 @@ static int crush_choose_firstn(const struct crush_map *map, tries, recurse_tries, local_retries, local_fallback_retries, parent_r); - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < numrep && count > 0 ; rep++) { /* keep trying until we get a non-out, non-colliding item */ ftotal = 0; skip_rep = 0; @@ -402,7 +405,7 @@ static int crush_choose_firstn(const struct crush_map *map, map->buckets[-1-item], weight, weight_max, x, outpos+1, 0, - out2, outpos, + out2, outpos, count, recurse_tries, 0, local_retries, local_fallback_retries, @@ -462,6 +465,7 @@ reject: dprintk("CHOOSE got %d\n", item); out[outpos] = item; outpos++; + count--; } dprintk("CHOOSE returns %d\n", outpos); @@ -653,6 +657,7 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; + int out_size; /* * the original choose_total_tries value was off by one (it * counted "retries" and not "tries"). add one. @@ -760,6 +765,7 @@ int crush_do_rule(const struct crush_map *map, x, numrep, curstep->arg2, o+osize, j, + result_max-osize, choose_tries, recurse_tries, choose_local_retries, @@ -769,11 +775,13 @@ int crush_do_rule(const struct crush_map *map, c+osize, 0); } else { + out_size = ((numrep < (result_max-osize)) ? + numrep : (result_max-osize)); crush_choose_indep( map, map->buckets[-1-w[i]], weight, weight_max, - x, numrep, numrep, + x, out_size, numrep, curstep->arg2, o+osize, j, choose_tries, @@ -782,7 +790,7 @@ int crush_do_rule(const struct crush_map *map, recurse_to_leaf, c+osize, 0); - osize += numrep; + osize += out_size; } } -- cgit v1.2.3 From 958a27658d94cf212caeb0ffb04ee0b0bc89cc40 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 14 Apr 2015 16:54:52 +0300 Subject: crush: straw2 bucket type with an efficient 64-bit crush_ln() This is an improved straw bucket that correctly avoids any data movement between items A and B when neither A nor B's weights are changed. Said differently, if we adjust the weight of item C (including adding it anew or removing it completely), we will only see inputs move to or from C, never between other items in the bucket. Notably, there is not intermediate scaling factor that needs to be calculated. The mapping function is a simple function of the item weights. The below commits were squashed together into this one (mostly to avoid adding and then yanking a ~6000 lines worth of crush_ln_table): - crush: add a straw2 bucket type - crush: add crush_ln to calculate nature log efficently - crush: improve straw2 adjustment slightly - crush: change crush_ln to provide 32 more digits - crush: fix crush_get_bucket_item_weight and bucket destroy for straw2 - crush/mapper: fix divide-by-0 in straw2 (with div64_s64() for draw = ln / w and INT64_MIN -> S64_MIN - need to create a proper compat.h in ceph.git) Reflects ceph.git commits 242293c908e923d474910f2b8203fa3b41eb5a53, 32a1ead92efcd351822d22a5fc37d159c65c1338, 6289912418c4a3597a11778bcf29ed5415117ad9, 35fcb04e2945717cf5cfe150b9fa89cb3d2303a1, 6445d9ee7290938de1e4ee9563912a6ab6d8ee5f, b5921d55d16796e12d66ad2c4add7305f9ce2353. Signed-off-by: Ilya Dryomov --- include/linux/crush/crush.h | 12 ++- net/ceph/crush/crush.c | 14 ++++ net/ceph/crush/crush_ln_table.h | 166 ++++++++++++++++++++++++++++++++++++++++ net/ceph/crush/mapper.c | 101 ++++++++++++++++++++++++ net/ceph/osdmap.c | 25 ++++++ 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 net/ceph/crush/crush_ln_table.h diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 4fad5f8ee01d..48a1a7d100f1 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -96,13 +96,15 @@ struct crush_rule { * uniform O(1) poor poor * list O(n) optimal poor * tree O(log n) good good - * straw O(n) optimal optimal + * straw O(n) better better + * straw2 O(n) optimal optimal */ enum { CRUSH_BUCKET_UNIFORM = 1, CRUSH_BUCKET_LIST = 2, CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 + CRUSH_BUCKET_STRAW = 4, + CRUSH_BUCKET_STRAW2 = 5, }; extern const char *crush_bucket_alg_name(int alg); @@ -149,6 +151,11 @@ struct crush_bucket_straw { __u32 *straws; /* 16-bit fixed point */ }; +struct crush_bucket_straw2 { + struct crush_bucket h; + __u32 *item_weights; /* 16-bit fixed point */ +}; + /* @@ -189,6 +196,7 @@ extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); +extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b); extern void crush_destroy_bucket(struct crush_bucket *b); extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map *map); diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 16bc199d9a62..9d84ce4ea0df 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -17,6 +17,7 @@ const char *crush_bucket_alg_name(int alg) case CRUSH_BUCKET_LIST: return "list"; case CRUSH_BUCKET_TREE: return "tree"; case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_STRAW2: return "straw2"; default: return "unknown"; } } @@ -40,6 +41,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_STRAW2: + return ((struct crush_bucket_straw2 *)b)->item_weights[p]; } return 0; } @@ -77,6 +80,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) kfree(b); } +void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) +{ + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + void crush_destroy_bucket(struct crush_bucket *b) { switch (b->alg) { @@ -92,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b) case CRUSH_BUCKET_STRAW: crush_destroy_bucket_straw((struct crush_bucket_straw *)b); break; + case CRUSH_BUCKET_STRAW2: + crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); + break; } } diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h new file mode 100644 index 000000000000..6192c7fc958c --- /dev/null +++ b/net/ceph/crush/crush_ln_table.h @@ -0,0 +1,166 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif + +#ifndef CEPH_CRUSH_LN_H +#define CEPH_CRUSH_LN_H + + +// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) +// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + +static int64_t __RH_LH_tbl[128*2+2] = { + 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, + 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, + 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, + 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll, + 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll, + 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll, + 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll, + 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell, + 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll, + 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll, + 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll, + 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll, + 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll, + 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll, + 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all, + 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll, + 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all, + 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell, + 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll, + 0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll, + 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll, + 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll, + 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll, + 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll, + 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll, + 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll, + 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell, + 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll, + 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll, + 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll, + 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll, + 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll, + 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll, + 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll, + 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll, + 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll, + 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll, + 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll, + 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll, + 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll, + 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll, + 0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll, + 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll, + 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll, + 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll, + 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll, + 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll, + 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll, + 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll, + 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll, + 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll, + 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll, + 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll, + 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell, + 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell, + 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll, + 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell, + 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll, + 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll, + 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll, + 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll, + 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll, + 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, + 0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, + 0x0000800000000000ll, 0x0000ffff00000000ll, + }; + + + // LL_tbl[k] = 2^48*log2(1.0+k/2^15); +static int64_t __LL_tbl[256] = { + 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, + 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, + 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, + 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull, + 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull, + 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull, + 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull, + 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull, + 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull, + 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull, + 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull, + 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull, + 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull, + 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull, + 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull, + 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull, + 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull, + 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull, + 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull, + 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull, + 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull, + 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull, + 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull, + 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull, + 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull, + 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull, + 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull, + 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull, + 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull, + 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull, + 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull, + 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull, + 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull, + 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull, + 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull, + 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull, + 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull, + 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull, + 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull, + 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull, + 0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull, + 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull, + 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull, + 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull, + 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull, + 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull, + 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull, + 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull, + 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull, + 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull, + 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull, + 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull, + 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull, + 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull, + 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull, + 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull, + 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull, + 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull, + 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull, + 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull, + 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull, + 0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull, + 0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull, + 0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull, +}; + + + + +#endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 91c41fe83113..5b47736d27d9 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -20,6 +20,7 @@ #include #include +#include "crush_ln_table.h" /* * Implement the core CRUSH mapping algorithm. @@ -237,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, return bucket->h.items[high]; } +// compute 2^44*log2(input+1) +uint64_t crush_ln(unsigned xin) +{ + unsigned x=xin, x1; + int iexpon, index1, index2; + uint64_t RH, LH, LL, xl64, result; + + x++; + + // normalize input + iexpon = 15; + while(!(x&0x18000)) { x<<=1; iexpon--; } + + index1 = (x>>8)<<1; + // RH ~ 2^56/index1 + RH = __RH_LH_tbl[index1 - 256]; + // LH ~ 2^48 * log2(index1/256) + LH = __RH_LH_tbl[index1 + 1 - 256]; + + // RH*x ~ 2^48 * (2^15 + xf), xf<2^8 + xl64 = (int64_t)x * RH; + xl64 >>= 48; + x1 = xl64; + + result = iexpon; + result <<= (12 + 32); + + index2 = x1 & 0xff; + // LL ~ 2^48*log2(1.0+index2/2^15) + LL = __LL_tbl[index2]; + + LH = LH + LL; + + LH >>= (48-12 - 32); + result += LH; + + return result; +} + + +/* + * straw2 + * + * for reference, see: + * + * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables + * + */ + +static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, + int x, int r) +{ + unsigned i, high = 0; + unsigned u; + unsigned w; + __s64 ln, draw, high_draw = 0; + + for (i = 0; i < bucket->h.size; i++) { + w = bucket->item_weights[i]; + if (w) { + u = crush_hash32_3(bucket->h.hash, x, + bucket->h.items[i], r); + u &= 0xffff; + + /* + * for some reason slightly less than 0x10000 produces + * a slightly more accurate distribution... probably a + * rounding effect. + * + * the natural log lookup table maps [0,0xffff] + * (corresponding to real numbers [1/0x10000, 1] to + * [0, 0xffffffffffff] (corresponding to real numbers + * [-11.090355,0]). + */ + ln = crush_ln(u) - 0x1000000000000ll; + + /* + * divide by 16.16 fixed-point weight. note + * that the ln value is negative, so a larger + * weight means a larger (less negative) value + * for draw. + */ + draw = div64_s64(ln, w); + } else { + draw = S64_MIN; + } + + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + + static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); @@ -254,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) case CRUSH_BUCKET_STRAW: return bucket_straw_choose((struct crush_bucket_straw *)in, x, r); + case CRUSH_BUCKET_STRAW2: + return bucket_straw2_choose((struct crush_bucket_straw2 *)in, + x, r); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } + /* * true if device is marked "out" (failed, fully offloaded) * of the cluster diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index b8c3fde5b04f..15796696d64e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -122,6 +122,22 @@ bad: return -EINVAL; } +static int crush_decode_straw2_bucket(void **p, void *end, + struct crush_bucket_straw2 *b) +{ + int j; + dout("crush_decode_straw2_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) + b->item_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + static int skip_name_map(void **p, void *end) { int len; @@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) case CRUSH_BUCKET_STRAW: size = sizeof(struct crush_bucket_straw); break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(struct crush_bucket_straw2); + break; default: err = -EINVAL; goto bad; @@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (err < 0) goto bad; break; + case CRUSH_BUCKET_STRAW2: + err = crush_decode_straw2_bucket(p, end, + (struct crush_bucket_straw2 *)b); + if (err < 0) + goto bad; + break; } } -- cgit v1.2.3 From 7c1c4747f28f59f2939687168203863f7e095d10 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 14 Apr 2015 23:13:31 +0300 Subject: libceph: announce support for straw2 buckets Sync up feature bits and enable CEPH_FEATURE_CRUSH_V4. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_features.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 71e05bbf8ceb..4763ad64e832 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -50,6 +50,19 @@ #define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) #define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) #define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ +#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42) +#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43) +#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44) +#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) +#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) +#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */ +#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */ +#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */ +#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) +#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */ +#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) +// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY +#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature @@ -93,7 +106,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ - CEPH_FEATURE_OSD_PRIMARY_AFFINITY) + CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_CRUSH_V4) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From f77303bddabf73ebccb60f613b77da391f933cf6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 22 Apr 2015 18:28:13 +0300 Subject: rbd: rbd_wq comment is obsolete After the switch to blk-mq rbd_wq processes requests, not devices. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2002d28c5d5b..812523330a78 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5687,7 +5687,7 @@ static int __init rbd_init(void) /* * The number of active work items is limited by the number of - * rbd devices, so leave @max_active at default. + * rbd devices * queue depth, so leave @max_active at default. */ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); if (!rbd_wq) { -- cgit v1.2.3