diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-28 11:02:23 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-01-28 11:02:23 -0800 |
commit | d891ea23d5203e5c47439b2a174f86a00b356a6c (patch) | |
tree | 3876cefcced9df5519f437cd8eb275cb979b93f6 /drivers/block | |
parent | 08d21b5f93eb92a781daea71b6fcb3a340909141 (diff) | |
parent | 125d725c923527a85876c031028c7f55c28b74b3 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull ceph updates from Sage Weil:
"This is a big batch. From Ilya we have:
- rbd support for more than ~250 mapped devices (now uses same scheme
that SCSI does for device major/minor numbering)
- crush updates for new mapping behaviors (will be needed for coming
erasure coding support, among other things)
- preliminary support for tiered storage pools
There is also a big series fixing a pile of cephfs bugs with clustered
MDSs from Yan Zheng, ACL support for cephfs from Guangliang Zhao, ceph
fscache improvements from Li Wang, improved behavior when we get
ENOSPC from Josh Durgin, some readv/writev improvements from
Majianpeng, and the usual mix of small cleanups"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (76 commits)
ceph: cast PAGE_SIZE to size_t in ceph_sync_write()
ceph: fix dout() compile warnings in ceph_filemap_fault()
libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature
libceph: follow redirect replies from osds
libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid}
libceph: follow {read,write}_tier fields on osd request submission
libceph: add ceph_pg_pool_by_id()
libceph: CEPH_OSD_FLAG_* enum update
libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg()
libceph: introduce and start using oid abstraction
libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN
libceph: move ceph_file_layout helpers to ceph_fs.h
libceph: start using oloc abstraction
libceph: dout() is missing a newline
libceph: add ceph_kv{malloc,free}() and switch to them
libceph: support CEPH_FEATURE_EXPORT_PEER
ceph: add imported caps when handling cap export message
ceph: add open export target session helper
ceph: remove exported caps when handling cap import message
ceph: handle session flush message
...
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/rbd.c | 303 |
1 files changed, 204 insertions, 99 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cb1db2979d3d..16cab6635163 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -41,6 +41,7 @@ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/idr.h> #include "rbd_types.h" @@ -89,9 +90,9 @@ static int atomic_dec_return_safe(atomic_t *v) } #define RBD_DRV_NAME "rbd" -#define RBD_DRV_NAME_LONG "rbd (rados block device)" -#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ +#define RBD_MINORS_PER_MAJOR 256 +#define RBD_SINGLE_MAJOR_PART_SHIFT 4 #define RBD_SNAP_DEV_NAME_PREFIX "snap_" #define RBD_MAX_SNAP_NAME_LEN \ @@ -323,6 +324,7 @@ struct rbd_device { int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ + int minor; struct gendisk *disk; /* blkdev's gendisk and rq */ u32 image_format; /* Either 1 or 2 */ @@ -386,6 +388,17 @@ static struct kmem_cache *rbd_img_request_cache; static struct kmem_cache *rbd_obj_request_cache; static struct kmem_cache *rbd_segment_name_cache; +static int rbd_major; +static DEFINE_IDA(rbd_dev_id_ida); + +/* + * Default to false for now, as single-major requires >= 0.75 version of + * userspace rbd utility. 
+ */ +static bool single_major = false; +module_param(single_major, bool, S_IRUGO); +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); + static int rbd_img_request_submit(struct rbd_img_request *img_request); static void rbd_dev_device_release(struct device *dev); @@ -394,18 +407,52 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, size_t count); +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, + size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); static void rbd_spec_put(struct rbd_spec *spec); +static int rbd_dev_id_to_minor(int dev_id) +{ + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static int minor_to_rbd_dev_id(int minor) +{ + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; +} + static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); static struct attribute *rbd_bus_attrs[] = { &bus_attr_add.attr, &bus_attr_remove.attr, + &bus_attr_add_single_major.attr, + &bus_attr_remove_single_major.attr, NULL, }; -ATTRIBUTE_GROUPS(rbd_bus); + +static umode_t rbd_bus_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + if (!single_major && + (attr == &bus_attr_add_single_major.attr || + attr == &bus_attr_remove_single_major.attr)) + return 0; + + return attr->mode; +} + +static const struct attribute_group rbd_bus_group = { + .attrs = rbd_bus_attrs, + .is_visible = rbd_bus_is_visible, +}; +__ATTRIBUTE_GROUPS(rbd_bus); static struct bus_type rbd_bus_type = { .name = "rbd", @@ -1041,9 +1088,9 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, 
u64 offset) name_format = "%s.%012llx"; if (rbd_dev->image_format == 2) name_format = "%s.%016llx"; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@ -1761,11 +1808,8 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@ -1802,11 +1846,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@ -2866,7 +2907,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) * Request sync osd watch/unwatch. The value of "start" determines * whether a watch request is being initiated or torn down. 
*/ -static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) +static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; @@ -2941,6 +2982,22 @@ out_cancel: return ret; } +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) +{ + return __rbd_dev_header_watch_sync(rbd_dev, true); +} + +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + int ret; + + ret = __rbd_dev_header_watch_sync(rbd_dev, false); + if (ret) { + rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", + ret); + } +} + /* * Synchronous osd object method call. Returns the number of bytes * returned in the outbound buffer, or a negative error code. @@ -3388,14 +3445,18 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) u64 segment_size; /* create gendisk info */ - disk = alloc_disk(RBD_MINORS_PER_MAJOR); + disk = alloc_disk(single_major ? 
+ (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : + RBD_MINORS_PER_MAJOR); if (!disk) return -ENOMEM; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", rbd_dev->dev_id); disk->major = rbd_dev->major; - disk->first_minor = 0; + disk->first_minor = rbd_dev->minor; + if (single_major) + disk->flags |= GENHD_FL_EXT_DEVT; disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; @@ -3467,7 +3528,14 @@ static ssize_t rbd_major_show(struct device *dev, return sprintf(buf, "%d\n", rbd_dev->major); return sprintf(buf, "(none)\n"); +} +static ssize_t rbd_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->minor); } static ssize_t rbd_client_id_show(struct device *dev, @@ -3589,6 +3657,7 @@ static ssize_t rbd_image_refresh(struct device *dev, static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); @@ -3602,6 +3671,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_size.attr, &dev_attr_features.attr, &dev_attr_major.attr, + &dev_attr_minor.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, &dev_attr_pool_id.attr, @@ -4372,21 +4442,29 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); - /* * Get a unique rbd identifier for the given new rbd_dev, and add - * the rbd_dev to the global list. The minimum rbd id is 1. + * the rbd_dev to the global list. 
*/ -static void rbd_dev_id_get(struct rbd_device *rbd_dev) +static int rbd_dev_id_get(struct rbd_device *rbd_dev) { - rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max); + int new_dev_id; + + new_dev_id = ida_simple_get(&rbd_dev_id_ida, + 0, minor_to_rbd_dev_id(1 << MINORBITS), + GFP_KERNEL); + if (new_dev_id < 0) + return new_dev_id; + + rbd_dev->dev_id = new_dev_id; spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); spin_unlock(&rbd_dev_list_lock); - dout("rbd_dev %p given dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); + + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); + + return 0; } /* @@ -4395,49 +4473,13 @@ static void rbd_dev_id_get(struct rbd_device *rbd_dev) */ static void rbd_dev_id_put(struct rbd_device *rbd_dev) { - struct list_head *tmp; - int rbd_id = rbd_dev->dev_id; - int max_id; - - rbd_assert(rbd_id > 0); - - dout("rbd_dev %p released dev id %llu\n", rbd_dev, - (unsigned long long) rbd_dev->dev_id); spin_lock(&rbd_dev_list_lock); list_del_init(&rbd_dev->node); - - /* - * If the id being "put" is not the current maximum, there - * is nothing special we need to do. - */ - if (rbd_id != atomic64_read(&rbd_dev_id_max)) { - spin_unlock(&rbd_dev_list_lock); - return; - } - - /* - * We need to update the current maximum id. Search the - * list to find out what it is. We're more likely to find - * the maximum at the end, so search the list backward. - */ - max_id = 0; - list_for_each_prev(tmp, &rbd_dev_list) { - struct rbd_device *rbd_dev; - - rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->dev_id > max_id) - max_id = rbd_dev->dev_id; - } spin_unlock(&rbd_dev_list_lock); - /* - * The max id could have been updated by rbd_dev_id_get(), in - * which case it now accurately reflects the new maximum. - * Be careful not to overwrite the maximum value in that - * case. 
- */ - atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id); - dout(" max dev id has been reset\n"); + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); } /* @@ -4860,20 +4902,29 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) { int ret; - /* generate unique id: find highest unique id, add one */ - rbd_dev_id_get(rbd_dev); + /* Get an id and fill in device name. */ + + ret = rbd_dev_id_get(rbd_dev); + if (ret) + return ret; - /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); - /* Get our block major device number. */ + /* Record our major and minor device numbers. */ - ret = register_blkdev(0, rbd_dev->name); - if (ret < 0) - goto err_out_id; - rbd_dev->major = ret; + if (!single_major) { + ret = register_blkdev(0, rbd_dev->name); + if (ret < 0) + goto err_out_id; + + rbd_dev->major = ret; + rbd_dev->minor = 0; + } else { + rbd_dev->major = rbd_major; + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); + } /* Set up the blkdev mapping. */ @@ -4905,7 +4956,8 @@ err_out_mapping: err_out_disk: rbd_free_disk(rbd_dev); err_out_blkdev: - unregister_blkdev(rbd_dev->major, rbd_dev->name); + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); @@ -4961,7 +5013,6 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) { int ret; - int tmp; /* * Get the id from the image id object. 
Unless there's an @@ -4980,7 +5031,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) goto err_out_format; if (mapping) { - ret = rbd_dev_header_watch_sync(rbd_dev, true); + ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) goto out_header_name; } @@ -5007,12 +5058,8 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - if (mapping) { - tmp = rbd_dev_header_watch_sync(rbd_dev, false); - if (tmp) - rbd_warn(rbd_dev, "unable to tear down " - "watch request (%d)\n", tmp); - } + if (mapping) + rbd_dev_header_unwatch_sync(rbd_dev); out_header_name: kfree(rbd_dev->header_name); rbd_dev->header_name = NULL; @@ -5026,9 +5073,9 @@ err_out_format: return ret; } -static ssize_t rbd_add(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_add(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; @@ -5090,6 +5137,12 @@ static ssize_t rbd_add(struct bus_type *bus, rc = rbd_dev_device_setup(rbd_dev); if (rc) { + /* + * rbd_dev_header_unwatch_sync() can't be moved into + * rbd_dev_image_release() without refactoring, see + * commit 1f3ef78861ac. 
+ */ + rbd_dev_header_unwatch_sync(rbd_dev); rbd_dev_image_release(rbd_dev); goto err_out_module; } @@ -5110,6 +5163,23 @@ err_out_module: return (ssize_t)rc; } +static ssize_t rbd_add(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_add(bus, buf, count); +} + +static ssize_t rbd_add_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_add(bus, buf, count); +} + static void rbd_dev_device_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); @@ -5117,8 +5187,8 @@ static void rbd_dev_device_release(struct device *dev) rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_dev_mapping_clear(rbd_dev); - unregister_blkdev(rbd_dev->major, rbd_dev->name); - rbd_dev->major = 0; + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev_id_put(rbd_dev); rbd_dev_mapping_clear(rbd_dev); } @@ -5149,9 +5219,9 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } } -static ssize_t rbd_remove(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) { struct rbd_device *rbd_dev = NULL; struct list_head *tmp; @@ -5191,16 +5261,14 @@ static ssize_t rbd_remove(struct bus_type *bus, if (ret < 0 || already) return ret; - ret = rbd_dev_header_watch_sync(rbd_dev, false); - if (ret) - rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret); - + rbd_dev_header_unwatch_sync(rbd_dev); /* * flush remaining watch callbacks - these must be complete * before the osd_client is shutdown */ dout("%s: flushing notifies", __func__); ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); + /* * Don't free anything from rbd_dev->disk until after all * notifies are completely processed. 
Otherwise @@ -5214,6 +5282,23 @@ static ssize_t rbd_remove(struct bus_type *bus, return count; } +static ssize_t rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_remove(bus, buf, count); +} + +static ssize_t rbd_remove_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_remove(bus, buf, count); +} + /* * create control files in sysfs * /sys/bus/rbd/... @@ -5259,7 +5344,7 @@ static int rbd_slab_init(void) rbd_assert(!rbd_segment_name_cache); rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); if (rbd_segment_name_cache) return 0; out_err: @@ -5295,24 +5380,45 @@ static int __init rbd_init(void) if (!libceph_compatible(NULL)) { rbd_warn(NULL, "libceph incompatibility (quitting)"); - return -EINVAL; } + rc = rbd_slab_init(); if (rc) return rc; + + if (single_major) { + rbd_major = register_blkdev(0, RBD_DRV_NAME); + if (rbd_major < 0) { + rc = rbd_major; + goto err_out_slab; + } + } + rc = rbd_sysfs_init(); if (rc) - rbd_slab_exit(); + goto err_out_blkdev; + + if (single_major) + pr_info("loaded (major %d)\n", rbd_major); else - pr_info("loaded " RBD_DRV_NAME_LONG "\n"); + pr_info("loaded\n"); + + return 0; +err_out_blkdev: + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); +err_out_slab: + rbd_slab_exit(); return rc; } static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); rbd_slab_exit(); } @@ -5322,9 +5428,8 @@ module_exit(rbd_exit); MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); -MODULE_DESCRIPTION("rados block device"); - /* following authorship retained from original osdblk.c */ MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); +MODULE_DESCRIPTION("RADOS Block Device (RBD) 
driver"); MODULE_LICENSE("GPL"); |