summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/autofs/Kconfig1
-rw-r--r--fs/binfmt_aout.c4
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/ceph/Kconfig14
-rw-r--r--fs/ceph/Makefile11
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c65
-rw-r--r--fs/ceph/armor.c103
-rw-r--r--fs/ceph/auth.c259
-rw-r--r--fs/ceph/auth.h92
-rw-r--r--fs/ceph/auth_none.c131
-rw-r--r--fs/ceph/auth_none.h30
-rw-r--r--fs/ceph/auth_x.c687
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c65
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c81
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c3
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c72
-rw-r--r--fs/ceph/ceph_fs.h728
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c609
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c412
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c406
-rw-r--r--fs/ceph/decode.h196
-rw-r--r--fs/ceph/dir.c97
-rw-r--r--fs/ceph/export.c26
-rw-r--r--fs/ceph/file.c209
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/ioctl.c77
-rw-r--r--fs/ceph/ioctl.h4
-rw-r--r--fs/ceph/locks.c23
-rw-r--r--fs/ceph/mds_client.c129
-rw-r--r--fs/ceph/mds_client.h20
-rw-r--r--fs/ceph/mdsmap.c11
-rw-r--r--fs/ceph/mdsmap.h62
-rw-r--r--fs/ceph/messenger.c2277
-rw-r--r--fs/ceph/messenger.h253
-rw-r--r--fs/ceph/mon_client.c1018
-rw-r--r--fs/ceph/mon_client.h121
-rw-r--r--fs/ceph/msgpool.c64
-rw-r--r--fs/ceph/msgpool.h25
-rw-r--r--fs/ceph/msgr.h175
-rw-r--r--fs/ceph/osd_client.c1539
-rw-r--r--fs/ceph/osd_client.h167
-rw-r--r--fs/ceph/osdmap.c1110
-rw-r--r--fs/ceph/osdmap.h128
-rw-r--r--fs/ceph/pagelist.c63
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h405
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c)82
-rw-r--r--fs/ceph/super.c1154
-rw-r--r--fs/ceph/super.h400
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c18
-rw-r--r--fs/cifs/cifssmb.c49
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/compat_ioctl.c70
-rw-r--r--fs/exec.c40
-rw-r--r--fs/exofs/inode.c8
-rw-r--r--fs/ext3/super.c4
-rw-r--r--fs/ext4/super.c8
-rw-r--r--fs/fs-writeback.c19
-rw-r--r--fs/fuse/dev.c2
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/aops.c24
-rw-r--r--fs/gfs2/bmap.c255
-rw-r--r--fs/gfs2/bmap.h20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/dir.c31
-rw-r--r--fs/gfs2/dir.h34
-rw-r--r--fs/gfs2/export.c9
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c23
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/inode.h15
-rw-r--r--fs/gfs2/lock_dlm.c4
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/ops_fstype.c79
-rw-r--r--fs/gfs2/ops_inode.c326
-rw-r--r--fs/gfs2/quota.c16
-rw-r--r--fs/gfs2/recovery.c15
-rw-r--r--fs/gfs2/rgrp.c50
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/sys.c22
-rw-r--r--fs/gfs2/trace_gfs2.h3
-rw-r--r--fs/gfs2/trans.h9
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/bfind.c4
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/btree.h2
-rw-r--r--fs/hfsplus/bfind.c17
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/brec.c29
-rw-r--r--fs/hfsplus/btree.c67
-rw-r--r--fs/hfsplus/catalog.c50
-rw-r--r--fs/hfsplus/dir.c201
-rw-r--r--fs/hfsplus/extents.c223
-rw-r--r--fs/hfsplus/hfsplus_fs.h85
-rw-r--r--fs/hfsplus/hfsplus_raw.h3
-rw-r--r--fs/hfsplus/inode.c185
-rw-r--r--fs/hfsplus/ioctl.c153
-rw-r--r--fs/hfsplus/options.c10
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/super.c310
-rw-r--r--fs/hfsplus/unicode.c16
-rw-r--r--fs/hfsplus/wrapper.c40
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/jbd2/journal.c4
-rw-r--r--fs/libfs.c29
-rw-r--r--fs/nfs/Kconfig1
-rw-r--r--fs/nfsd/Kconfig1
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/ocfs2/aops.c9
-rw-r--r--fs/ocfs2/aops.h3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c532
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h6
-rw-r--r--fs/ocfs2/cluster/tcp.c5
-rw-r--r--fs/ocfs2/dcache.c33
-rw-r--r--fs/ocfs2/dcache.h1
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h29
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c400
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/file.c73
-rw-r--r--fs/ocfs2/inode.c1
-rw-r--r--fs/ocfs2/inode.h12
-rw-r--r--fs/ocfs2/ioctl.c356
-rw-r--r--fs/ocfs2/journal.c9
-rw-r--r--fs/ocfs2/journal.h3
-rw-r--r--fs/ocfs2/mmap.c7
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2.h63
-rw-r--r--fs/ocfs2/ocfs2_fs.h46
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h95
-rw-r--r--fs/ocfs2/refcounttree.c43
-rw-r--r--fs/ocfs2/refcounttree.h7
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/ocfs2/stack_o2cb.c2
-rw-r--r--fs/ocfs2/suballoc.c16
-rw-r--r--fs/ocfs2/super.c163
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/sysfile.c60
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/reiserfs/ioctl.c7
-rw-r--r--fs/smbfs/Kconfig1
-rw-r--r--fs/sysfs/group.c59
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c19
-rw-r--r--fs/xfs/xfs_log_cil.c12
-rw-r--r--fs/xfs/xfs_log_priv.h37
175 files changed, 4574 insertions, 15044 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 3d185308ec88..65781de44fc0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,6 +50,7 @@ endif # BLOCK
config FILE_LOCKING
bool "Enable POSIX file locking API" if EMBEDDED
default y
+ select BKL # while lockd still uses it.
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..1dd5f34b3cf2 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,7 @@
config ADFS_FS
tristate "ADFS file system support (EXPERIMENTAL)"
depends on BLOCK && EXPERIMENTAL
+ depends on BKL # need to fix
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/super.c b/fs/affs/super.c
index a167f96d79f7..fa4fbe1e238a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -104,8 +104,8 @@ static void init_once(void *foo)
{
struct affs_inode_info *ei = (struct affs_inode_info *) foo;
- init_MUTEX(&ei->i_link_lock);
- init_MUTEX(&ei->i_ext_lock);
+ sema_init(&ei->i_link_lock, 1);
+ sema_init(&ei->i_ext_lock, 1);
inode_init_once(&ei->vfs_inode);
}
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
index 5f3bea90911e..480e210c83ab 100644
--- a/fs/autofs/Kconfig
+++ b/fs/autofs/Kconfig
@@ -1,5 +1,6 @@
config AUTOFS_FS
tristate "Kernel automounter support"
+ depends on BKL # unfixable, just use autofs4
help
The automounter is a tool to automatically mount remote file systems
on demand. This implementation is partially kernel-based to reduce
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f96eff04e11a..a6395bdb26ae 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -134,10 +134,6 @@ static int aout_core_dump(struct coredump_params *cprm)
if (!dump_write(file, dump_start, dump_size))
goto end_coredump;
}
-/* Finally dump the task struct. Not be used by gdb, but could be useful */
- set_fs(KERNEL_DS);
- if (!dump_write(file, current, sizeof(*current)))
- goto end_coredump;
end_coredump:
set_fs(fs);
return has_dumped;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 535e763ab1a6..6884e198e0c7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -800,7 +800,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* default mmap base, as well as whatever program they
* might try to exec. This is because the brk will
* follow the loader, and is not movable. */
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_ARM)
load_bias = 0;
#else
load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
config CEPH_FS
tristate "Ceph distributed file system (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
+ select CEPH_LIB
select LIBCRC32C
select CRYPTO_AES
select CRYPTO
+ default n
help
Choose Y or M here to include support for mounting the
experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
If unsure, say N.
-config CEPH_FS_PRETTYDEBUG
- bool "Include file:line in ceph debug output"
- depends on CEPH_FS
- default n
- help
- If you say Y here, debug output will include a filename and
- line to aid debugging. This icnreases kernel size and slows
- execution slightly when debug call sites are enabled (e.g.,
- via CONFIG_DYNAMIC_DEBUG).
-
- If unsure, say N.
-
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o \
- messenger.o msgpool.o buffer.o pagelist.o \
- mds_client.o mdsmap.o \
- mon_client.o \
- osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
- debugfs.o \
- auth.o auth_none.o \
- crypto.o armor.o \
- auth_x.o \
- ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+ mds_client.o mdsmap.o strings.o ceph_frag.o \
+ debugfs.o
else
#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# The following files are shared by (and manually synchronized
-# between) the Ceph userland and kernel client.
-#
-# userland kernel
-src/include/ceph_fs.h fs/ceph/ceph_fs.h
-src/include/ceph_fs.cc fs/ceph/ceph_fs.c
-src/include/msgr.h fs/ceph/msgr.h
-src/include/rados.h fs/ceph/rados.h
-src/include/ceph_strings.cc fs/ceph/ceph_strings.c
-src/include/ceph_frag.h fs/ceph/ceph_frag.h
-src/include/ceph_frag.cc fs/ceph/ceph_frag.c
-src/include/ceph_hash.h fs/ceph/ceph_hash.h
-src/include/ceph_hash.cc fs/ceph/ceph_hash.c
-src/crush/crush.c fs/ceph/crush/crush.c
-src/crush/crush.h fs/ceph/crush/crush.h
-src/crush/mapper.c fs/ceph/crush/mapper.c
-src/crush/mapper.h fs/ceph/crush/mapper.h
-src/crush/hash.h fs/ceph/crush/hash.h
-src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
@@ -10,7 +10,8 @@
#include <linux/task_io_accounting_ops.h>
#include "super.h"
-#include "osd_client.h"
+#include "mds_client.h"
+#include <linux/ceph/osd_client.h>
/*
* Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
{
struct inode *inode = filp->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 len = PAGE_CACHE_SIZE;
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_inode_to_client(inode)->client->osdc;
int rc = 0;
struct page **pages;
loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
struct inode *inode;
struct ceph_inode_info *ci;
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
struct ceph_osd_client *osdc;
loff_t page_off = page->index << PAGE_CACHE_SHIFT;
int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
inode = page->mapping->host;
ci = ceph_inode(inode);
- client = ceph_inode_to_client(inode);
- osdc = &client->osdc;
+ fsc = ceph_inode_to_client(inode);
+ osdc = &fsc->client->osdc;
/* verify this is a writeable snap context */
snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
inode, page, page->index, page_off, len, snapc);
- writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
- CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
- set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
+ set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
struct address_space *mapping = inode->i_mapping;
__s32 rc = -EIO;
u64 bytes = 0;
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat;
unsigned issued = ceph_caps_issued(ci);
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
WARN_ON(!PageUptodate(page));
writeback_stat =
- atomic_long_dec_return(&client->writeback_count);
+ atomic_long_dec_return(&fsc->writeback_count);
if (writeback_stat <
- CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
- clear_bdi_congested(&client->backing_dev_info,
+ CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+ clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
* mempool. we avoid the mempool if we can because req->r_num_pages
* may be less than the maximum write size.
*/
-static void alloc_page_vec(struct ceph_client *client,
+static void alloc_page_vec(struct ceph_fs_client *fsc,
struct ceph_osd_request *req)
{
req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
GFP_NOFS);
if (!req->r_pages) {
- req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+ req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
req->r_pages_from_pool = 1;
WARN_ON(!req->r_pages);
}
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
pgoff_t index, start, end;
int range_whole = 0;
int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- client = ceph_inode_to_client(inode);
- if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ fsc = ceph_inode_to_client(inode);
+ if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
pr_warning("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
}
- if (client->mount_args->wsize && client->mount_args->wsize < wsize)
- wsize = client->mount_args->wsize;
+ if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+ wsize = fsc->mount_options->wsize;
if (wsize < PAGE_CACHE_SIZE)
wsize = PAGE_CACHE_SIZE;
max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
offset = (unsigned long long)page->index
<< PAGE_CACHE_SHIFT;
len = wsize;
- req = ceph_osdc_new_request(&client->osdc,
+ req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout,
ceph_vino(inode),
offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
&inode->i_mtime, true, 1);
max_pages = req->r_num_pages;
- alloc_page_vec(client, req);
+ alloc_page_vec(fsc, req);
req->r_callback = writepages_finish;
req->r_inode = inode;
}
@@ -794,10 +797,10 @@ get_more_pages:
inode, page, page->index);
writeback_stat =
- atomic_long_inc_return(&client->writeback_count);
+ atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat > CONGESTION_ON_THRESH(
- client->mount_args->congestion_kb)) {
- set_bdi_congested(&client->backing_dev_info,
+ fsc->mount_options->congestion_kb)) {
+ set_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC);
}
@@ -846,7 +849,7 @@ get_more_pages:
op->payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
- ceph_osdc_start_request(&client->osdc, req, true);
+ ceph_osdc_start_request(&fsc->client->osdc, req, true);
req = NULL;
/* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t page_off = pos & PAGE_CACHE_MASK;
int pos_in_page = pos & ~PAGE_CACHE_MASK;
int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file->f_dentry->d_inode;
- struct ceph_client *client = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct page *page = vmf->page;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
loff_t off = page->index << PAGE_CACHE_SHIFT;
loff_t size, len;
int ret;
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
deleted file mode 100644
index eb2a666b0be7..000000000000
--- a/fs/ceph/armor.c
+++ /dev/null
@@ -1,103 +0,0 @@
-
-#include <linux/errno.h>
-
-int ceph_armor(char *dst, const char *src, const char *end);
-int ceph_unarmor(char *dst, const char *src, const char *end);
-
-/*
- * base64 encode/decode.
- */
-
-static const char *pem_key =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-static int encode_bits(int c)
-{
- return pem_key[c];
-}
-
-static int decode_bits(char c)
-{
- if (c >= 'A' && c <= 'Z')
- return c - 'A';
- if (c >= 'a' && c <= 'z')
- return c - 'a' + 26;
- if (c >= '0' && c <= '9')
- return c - '0' + 52;
- if (c == '+')
- return 62;
- if (c == '/')
- return 63;
- if (c == '=')
- return 0; /* just non-negative, please */
- return -EINVAL;
-}
-
-int ceph_armor(char *dst, const char *src, const char *end)
-{
- int olen = 0;
- int line = 0;
-
- while (src < end) {
- unsigned char a, b, c;
-
- a = *src++;
- *dst++ = encode_bits(a >> 2);
- if (src < end) {
- b = *src++;
- *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
- if (src < end) {
- c = *src++;
- *dst++ = encode_bits(((b & 15) << 2) |
- (c >> 6));
- *dst++ = encode_bits(c & 63);
- } else {
- *dst++ = encode_bits((b & 15) << 2);
- *dst++ = '=';
- }
- } else {
- *dst++ = encode_bits(((a & 3) << 4));
- *dst++ = '=';
- *dst++ = '=';
- }
- olen += 4;
- line += 4;
- if (line == 64) {
- line = 0;
- *(dst++) = '\n';
- olen++;
- }
- }
- return olen;
-}
-
-int ceph_unarmor(char *dst, const char *src, const char *end)
-{
- int olen = 0;
-
- while (src < end) {
- int a, b, c, d;
-
- if (src < end && src[0] == '\n')
- src++;
- if (src + 4 > end)
- return -EINVAL;
- a = decode_bits(src[0]);
- b = decode_bits(src[1]);
- c = decode_bits(src[2]);
- d = decode_bits(src[3]);
- if (a < 0 || b < 0 || c < 0 || d < 0)
- return -EINVAL;
-
- *dst++ = (a << 2) | (b >> 4);
- if (src[2] == '=')
- return olen + 1;
- *dst++ = ((b & 15) << 4) | (c >> 2);
- if (src[3] == '=')
- return olen + 2;
- *dst++ = ((c & 3) << 6) | d;
- olen += 3;
- src += 4;
- }
- return olen;
-}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
deleted file mode 100644
index 6d2e30600627..000000000000
--- a/fs/ceph/auth.c
+++ /dev/null
@@ -1,259 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/module.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-
-#include "types.h"
-#include "auth_none.h"
-#include "auth_x.h"
-#include "decode.h"
-#include "super.h"
-
-#include "messenger.h"
-
-/*
- * get protocol handler
- */
-static u32 supported_protocols[] = {
- CEPH_AUTH_NONE,
- CEPH_AUTH_CEPHX
-};
-
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
-{
- switch (protocol) {
- case CEPH_AUTH_NONE:
- return ceph_auth_none_init(ac);
- case CEPH_AUTH_CEPHX:
- return ceph_x_init(ac);
- default:
- return -ENOENT;
- }
-}
-
-/*
- * setup, teardown.
- */
-struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
-{
- struct ceph_auth_client *ac;
- int ret;
-
- dout("auth_init name '%s' secret '%s'\n", name, secret);
-
- ret = -ENOMEM;
- ac = kzalloc(sizeof(*ac), GFP_NOFS);
- if (!ac)
- goto out;
-
- ac->negotiating = true;
- if (name)
- ac->name = name;
- else
- ac->name = CEPH_AUTH_NAME_DEFAULT;
- dout("auth_init name %s secret %s\n", ac->name, secret);
- ac->secret = secret;
- return ac;
-
-out:
- return ERR_PTR(ret);
-}
-
-void ceph_auth_destroy(struct ceph_auth_client *ac)
-{
- dout("auth_destroy %p\n", ac);
- if (ac->ops)
- ac->ops->destroy(ac);
- kfree(ac);
-}
-
-/*
- * Reset occurs when reconnecting to the monitor.
- */
-void ceph_auth_reset(struct ceph_auth_client *ac)
-{
- dout("auth_reset %p\n", ac);
- if (ac->ops && !ac->negotiating)
- ac->ops->reset(ac);
- ac->negotiating = true;
-}
-
-int ceph_entity_name_encode(const char *name, void **p, void *end)
-{
- int len = strlen(name);
-
- if (*p + 2*sizeof(u32) + len > end)
- return -ERANGE;
- ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
- ceph_encode_32(p, len);
- ceph_encode_copy(p, name, len);
- return 0;
-}
-
-/*
- * Initiate protocol negotiation with monitor. Include entity name
- * and list supported protocols.
- */
-int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
-{
- struct ceph_mon_request_header *monhdr = buf;
- void *p = monhdr + 1, *end = buf + len, *lenp;
- int i, num;
- int ret;
-
- dout("auth_build_hello\n");
- monhdr->have_version = 0;
- monhdr->session_mon = cpu_to_le16(-1);
- monhdr->session_mon_tid = 0;
-
- ceph_encode_32(&p, 0); /* no protocol, yet */
-
- lenp = p;
- p += sizeof(u32);
-
- ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
- ceph_encode_8(&p, 1);
- num = ARRAY_SIZE(supported_protocols);
- ceph_encode_32(&p, num);
- ceph_decode_need(&p, end, num * sizeof(u32), bad);
- for (i = 0; i < num; i++)
- ceph_encode_32(&p, supported_protocols[i]);
-
- ret = ceph_entity_name_encode(ac->name, &p, end);
- if (ret < 0)
- return ret;
- ceph_decode_need(&p, end, sizeof(u64), bad);
- ceph_encode_64(&p, ac->global_id);
-
- ceph_encode_32(&lenp, p - lenp - sizeof(u32));
- return p - buf;
-
-bad:
- return -ERANGE;
-}
-
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len)
-{
- struct ceph_mon_request_header *monhdr = msg_buf;
- void *p = monhdr + 1;
- void *end = msg_buf + msg_len;
- int ret;
-
- monhdr->have_version = 0;
- monhdr->session_mon = cpu_to_le16(-1);
- monhdr->session_mon_tid = 0;
-
- ceph_encode_32(&p, ac->protocol);
-
- ret = ac->ops->build_request(ac, p + sizeof(u32), end);
- if (ret < 0) {
- pr_err("error %d building auth method %s request\n", ret,
- ac->ops->name);
- return ret;
- }
- dout(" built request %d bytes\n", ret);
- ceph_encode_32(&p, ret);
- return p + ret - msg_buf;
-}
-
-/*
- * Handle auth message from monitor.
- */
-int ceph_handle_auth_reply(struct ceph_auth_client *ac,
- void *buf, size_t len,
- void *reply_buf, size_t reply_len)
-{
- void *p = buf;
- void *end = buf + len;
- int protocol;
- s32 result;
- u64 global_id;
- void *payload, *payload_end;
- int payload_len;
- char *result_msg;
- int result_msg_len;
- int ret = -EINVAL;
-
- dout("handle_auth_reply %p %p\n", p, end);
- ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
- protocol = ceph_decode_32(&p);
- result = ceph_decode_32(&p);
- global_id = ceph_decode_64(&p);
- payload_len = ceph_decode_32(&p);
- payload = p;
- p += payload_len;
- ceph_decode_need(&p, end, sizeof(u32), bad);
- result_msg_len = ceph_decode_32(&p);
- result_msg = p;
- p += result_msg_len;
- if (p != end)
- goto bad;
-
- dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
- result_msg, global_id, payload_len);
-
- payload_end = payload + payload_len;
-
- if (global_id && ac->global_id != global_id) {
- dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
- ac->global_id = global_id;
- }
-
- if (ac->negotiating) {
- /* server does not support our protocols? */
- if (!protocol && result < 0) {
- ret = result;
- goto out;
- }
- /* set up (new) protocol handler? */
- if (ac->protocol && ac->protocol != protocol) {
- ac->ops->destroy(ac);
- ac->protocol = 0;
- ac->ops = NULL;
- }
- if (ac->protocol != protocol) {
- ret = ceph_auth_init_protocol(ac, protocol);
- if (ret) {
- pr_err("error %d on auth protocol %d init\n",
- ret, protocol);
- goto out;
- }
- }
-
- ac->negotiating = false;
- }
-
- ret = ac->ops->handle_reply(ac, result, payload, payload_end);
- if (ret == -EAGAIN) {
- return ceph_build_auth_request(ac, reply_buf, reply_len);
- } else if (ret) {
- pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
- return ret;
- }
- return 0;
-
-bad:
- pr_err("failed to decode auth msg\n");
-out:
- return ret;
-}
-
-int ceph_build_auth(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len)
-{
- if (!ac->protocol)
- return ceph_auth_build_hello(ac, msg_buf, msg_len);
- BUG_ON(!ac->ops);
- if (ac->ops->should_authenticate(ac))
- return ceph_build_auth_request(ac, msg_buf, msg_len);
- return 0;
-}
-
-int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
-{
- if (!ac->ops)
- return 0;
- return ac->ops->is_authenticated(ac);
-}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
deleted file mode 100644
index d38a2fb4a137..000000000000
--- a/fs/ceph/auth.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifndef _FS_CEPH_AUTH_H
-#define _FS_CEPH_AUTH_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * Abstract interface for communicating with the authenticate module.
- * There is some handshake that takes place between us and the monitor
- * to acquire the necessary keys. These are used to generate an
- * 'authorizer' that we use when connecting to a service (mds, osd).
- */
-
-struct ceph_auth_client;
-struct ceph_authorizer;
-
-struct ceph_auth_client_ops {
- const char *name;
-
- /*
- * true if we are authenticated and can connect to
- * services.
- */
- int (*is_authenticated)(struct ceph_auth_client *ac);
-
- /*
- * true if we should (re)authenticate, e.g., when our tickets
- * are getting old and crusty.
- */
- int (*should_authenticate)(struct ceph_auth_client *ac);
-
- /*
- * build requests and process replies during monitor
- * handshake. if handle_reply returns -EAGAIN, we build
- * another request.
- */
- int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
- int (*handle_reply)(struct ceph_auth_client *ac, int result,
- void *buf, void *end);
-
- /*
- * Create authorizer for connecting to a service, and verify
- * the response to authenticate the service.
- */
- int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
- struct ceph_authorizer **a,
- void **buf, size_t *len,
- void **reply_buf, size_t *reply_len);
- int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len);
- void (*destroy_authorizer)(struct ceph_auth_client *ac,
- struct ceph_authorizer *a);
- void (*invalidate_authorizer)(struct ceph_auth_client *ac,
- int peer_type);
-
- /* reset when we (re)connect to a monitor */
- void (*reset)(struct ceph_auth_client *ac);
-
- void (*destroy)(struct ceph_auth_client *ac);
-};
-
-struct ceph_auth_client {
- u32 protocol; /* CEPH_AUTH_* */
- void *private; /* for use by protocol implementation */
- const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
-
- bool negotiating; /* true if negotiating protocol */
- const char *name; /* entity name */
- u64 global_id; /* our unique id in system */
- const char *secret; /* our secret key */
- unsigned want_keys; /* which services we want */
-};
-
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
- const char *secret);
-extern void ceph_auth_destroy(struct ceph_auth_client *ac);
-
-extern void ceph_auth_reset(struct ceph_auth_client *ac);
-
-extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
- void *buf, size_t len);
-extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
- void *buf, size_t len,
- void *reply_buf, size_t reply_len);
-extern int ceph_entity_name_encode(const char *name, void **p, void *end);
-
-extern int ceph_build_auth(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len);
-
-extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
-
-#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
deleted file mode 100644
index ad1dc21286c7..000000000000
--- a/fs/ceph/auth_none.c
+++ /dev/null
@@ -1,131 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_none.h"
-#include "auth.h"
-#include "decode.h"
-
-static void reset(struct ceph_auth_client *ac)
-{
- struct ceph_auth_none_info *xi = ac->private;
-
- xi->starting = true;
- xi->built_authorizer = false;
-}
-
-static void destroy(struct ceph_auth_client *ac)
-{
- kfree(ac->private);
- ac->private = NULL;
-}
-
-static int is_authenticated(struct ceph_auth_client *ac)
-{
- struct ceph_auth_none_info *xi = ac->private;
-
- return !xi->starting;
-}
-
-static int should_authenticate(struct ceph_auth_client *ac)
-{
- struct ceph_auth_none_info *xi = ac->private;
-
- return xi->starting;
-}
-
-/*
- * the generic auth code decode the global_id, and we carry no actual
- * authenticate state, so nothing happens here.
- */
-static int handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
-{
- struct ceph_auth_none_info *xi = ac->private;
-
- xi->starting = false;
- return result;
-}
-
-/*
- * build an 'authorizer' with our entity_name and global_id. we can
- * reuse a single static copy since it is identical for all services
- * we connect to.
- */
-static int ceph_auth_none_create_authorizer(
- struct ceph_auth_client *ac, int peer_type,
- struct ceph_authorizer **a,
- void **buf, size_t *len,
- void **reply_buf, size_t *reply_len)
-{
- struct ceph_auth_none_info *ai = ac->private;
- struct ceph_none_authorizer *au = &ai->au;
- void *p, *end;
- int ret;
-
- if (!ai->built_authorizer) {
- p = au->buf;
- end = p + sizeof(au->buf);
- ceph_encode_8(&p, 1);
- ret = ceph_entity_name_encode(ac->name, &p, end - 8);
- if (ret < 0)
- goto bad;
- ceph_decode_need(&p, end, sizeof(u64), bad2);
- ceph_encode_64(&p, ac->global_id);
- au->buf_len = p - (void *)au->buf;
- ai->built_authorizer = true;
- dout("built authorizer len %d\n", au->buf_len);
- }
-
- *a = (struct ceph_authorizer *)au;
- *buf = au->buf;
- *len = au->buf_len;
- *reply_buf = au->reply_buf;
- *reply_len = sizeof(au->reply_buf);
- return 0;
-
-bad2:
- ret = -ERANGE;
-bad:
- return ret;
-}
-
-static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
-{
- /* nothing to do */
-}
-
-static const struct ceph_auth_client_ops ceph_auth_none_ops = {
- .name = "none",
- .reset = reset,
- .destroy = destroy,
- .is_authenticated = is_authenticated,
- .should_authenticate = should_authenticate,
- .handle_reply = handle_reply,
- .create_authorizer = ceph_auth_none_create_authorizer,
- .destroy_authorizer = ceph_auth_none_destroy_authorizer,
-};
-
-int ceph_auth_none_init(struct ceph_auth_client *ac)
-{
- struct ceph_auth_none_info *xi;
-
- dout("ceph_auth_none_init %p\n", ac);
- xi = kzalloc(sizeof(*xi), GFP_NOFS);
- if (!xi)
- return -ENOMEM;
-
- xi->starting = true;
- xi->built_authorizer = false;
-
- ac->protocol = CEPH_AUTH_NONE;
- ac->private = xi;
- ac->ops = &ceph_auth_none_ops;
- return 0;
-}
-
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
deleted file mode 100644
index 8164df1a08be..000000000000
--- a/fs/ceph/auth_none.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef _FS_CEPH_AUTH_NONE_H
-#define _FS_CEPH_AUTH_NONE_H
-
-#include <linux/slab.h>
-
-#include "auth.h"
-
-/*
- * null security mode.
- *
- * we use a single static authorizer that simply encodes our entity name
- * and global id.
- */
-
-struct ceph_none_authorizer {
- char buf[128];
- int buf_len;
- char reply_buf[0];
-};
-
-struct ceph_auth_none_info {
- bool starting;
- bool built_authorizer;
- struct ceph_none_authorizer au; /* we only need one; it's static */
-};
-
-extern int ceph_auth_none_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
deleted file mode 100644
index a2d002cbdec2..000000000000
--- a/fs/ceph/auth_x.c
+++ /dev/null
@@ -1,687 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-
-#include "auth_x.h"
-#include "auth_x_protocol.h"
-#include "crypto.h"
-#include "auth.h"
-#include "decode.h"
-
-#define TEMP_TICKET_BUF_LEN 256
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
-
-static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
-{
- struct ceph_x_info *xi = ac->private;
- int need;
-
- ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return (ac->want_keys & xi->have_keys) == ac->want_keys;
-}
-
-static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
-{
- struct ceph_x_info *xi = ac->private;
- int need;
-
- ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return need != 0;
-}
-
-static int ceph_x_encrypt_buflen(int ilen)
-{
- return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
- sizeof(u32);
-}
-
-static int ceph_x_encrypt(struct ceph_crypto_key *secret,
- void *ibuf, int ilen, void *obuf, size_t olen)
-{
- struct ceph_x_encrypt_header head = {
- .struct_v = 1,
- .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
- };
- size_t len = olen - sizeof(u32);
- int ret;
-
- ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
- &head, sizeof(head), ibuf, ilen);
- if (ret)
- return ret;
- ceph_encode_32(&obuf, len);
- return len + sizeof(u32);
-}
-
-static int ceph_x_decrypt(struct ceph_crypto_key *secret,
- void **p, void *end, void *obuf, size_t olen)
-{
- struct ceph_x_encrypt_header head;
- size_t head_len = sizeof(head);
- int len, ret;
-
- len = ceph_decode_32(p);
- if (*p + len > end)
- return -EINVAL;
-
- dout("ceph_x_decrypt len %d\n", len);
- ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
- *p, len);
- if (ret)
- return ret;
- if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
- return -EPERM;
- *p += len;
- return olen;
-}
-
-/*
- * get existing (or insert new) ticket handler
- */
-static struct ceph_x_ticket_handler *
-get_ticket_handler(struct ceph_auth_client *ac, int service)
-{
- struct ceph_x_ticket_handler *th;
- struct ceph_x_info *xi = ac->private;
- struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
-
- while (*p) {
- parent = *p;
- th = rb_entry(parent, struct ceph_x_ticket_handler, node);
- if (service < th->service)
- p = &(*p)->rb_left;
- else if (service > th->service)
- p = &(*p)->rb_right;
- else
- return th;
- }
-
- /* add it */
- th = kzalloc(sizeof(*th), GFP_NOFS);
- if (!th)
- return ERR_PTR(-ENOMEM);
- th->service = service;
- rb_link_node(&th->node, parent, p);
- rb_insert_color(&th->node, &xi->ticket_handlers);
- return th;
-}
-
-static void remove_ticket_handler(struct ceph_auth_client *ac,
- struct ceph_x_ticket_handler *th)
-{
- struct ceph_x_info *xi = ac->private;
-
- dout("remove_ticket_handler %p %d\n", th, th->service);
- rb_erase(&th->node, &xi->ticket_handlers);
- ceph_crypto_key_destroy(&th->session_key);
- if (th->ticket_blob)
- ceph_buffer_put(th->ticket_blob);
- kfree(th);
-}
-
-static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
- struct ceph_crypto_key *secret,
- void *buf, void *end)
-{
- struct ceph_x_info *xi = ac->private;
- int num;
- void *p = buf;
- int ret;
- char *dbuf;
- char *ticket_buf;
- u8 reply_struct_v;
-
- dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
- if (!dbuf)
- return -ENOMEM;
-
- ret = -ENOMEM;
- ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
- if (!ticket_buf)
- goto out_dbuf;
-
- ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
- reply_struct_v = ceph_decode_8(&p);
- if (reply_struct_v != 1)
- goto bad;
- num = ceph_decode_32(&p);
- dout("%d tickets\n", num);
- while (num--) {
- int type;
- u8 tkt_struct_v, blob_struct_v;
- struct ceph_x_ticket_handler *th;
- void *dp, *dend;
- int dlen;
- char is_enc;
- struct timespec validity;
- struct ceph_crypto_key old_key;
- void *tp, *tpend;
- struct ceph_timespec new_validity;
- struct ceph_crypto_key new_session_key;
- struct ceph_buffer *new_ticket_blob;
- unsigned long new_expires, new_renew_after;
- u64 new_secret_id;
-
- ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
-
- type = ceph_decode_32(&p);
- dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-
- tkt_struct_v = ceph_decode_8(&p);
- if (tkt_struct_v != 1)
- goto bad;
-
- th = get_ticket_handler(ac, type);
- if (IS_ERR(th)) {
- ret = PTR_ERR(th);
- goto out;
- }
-
- /* blob for me */
- dlen = ceph_x_decrypt(secret, &p, end, dbuf,
- TEMP_TICKET_BUF_LEN);
- if (dlen <= 0) {
- ret = dlen;
- goto out;
- }
- dout(" decrypted %d bytes\n", dlen);
- dend = dbuf + dlen;
- dp = dbuf;
-
- tkt_struct_v = ceph_decode_8(&dp);
- if (tkt_struct_v != 1)
- goto bad;
-
- memcpy(&old_key, &th->session_key, sizeof(old_key));
- ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
- if (ret)
- goto out;
-
- ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
- ceph_decode_timespec(&validity, &new_validity);
- new_expires = get_seconds() + validity.tv_sec;
- new_renew_after = new_expires - (validity.tv_sec / 4);
- dout(" expires=%lu renew_after=%lu\n", new_expires,
- new_renew_after);
-
- /* ticket blob for service */
- ceph_decode_8_safe(&p, end, is_enc, bad);
- tp = ticket_buf;
- if (is_enc) {
- /* encrypted */
- dout(" encrypted ticket\n");
- dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
- TEMP_TICKET_BUF_LEN);
- if (dlen < 0) {
- ret = dlen;
- goto out;
- }
- dlen = ceph_decode_32(&tp);
- } else {
- /* unencrypted */
- ceph_decode_32_safe(&p, end, dlen, bad);
- ceph_decode_need(&p, end, dlen, bad);
- ceph_decode_copy(&p, ticket_buf, dlen);
- }
- tpend = tp + dlen;
- dout(" ticket blob is %d bytes\n", dlen);
- ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
- blob_struct_v = ceph_decode_8(&tp);
- new_secret_id = ceph_decode_64(&tp);
- ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
- if (ret)
- goto out;
-
- /* all is well, update our ticket */
- ceph_crypto_key_destroy(&th->session_key);
- if (th->ticket_blob)
- ceph_buffer_put(th->ticket_blob);
- th->session_key = new_session_key;
- th->ticket_blob = new_ticket_blob;
- th->validity = new_validity;
- th->secret_id = new_secret_id;
- th->expires = new_expires;
- th->renew_after = new_renew_after;
- dout(" got ticket service %d (%s) secret_id %lld len %d\n",
- type, ceph_entity_type_name(type), th->secret_id,
- (int)th->ticket_blob->vec.iov_len);
- xi->have_keys |= th->service;
- }
-
- ret = 0;
-out:
- kfree(ticket_buf);
-out_dbuf:
- kfree(dbuf);
- return ret;
-
-bad:
- ret = -EINVAL;
- goto out;
-}
-
-static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
- struct ceph_x_ticket_handler *th,
- struct ceph_x_authorizer *au)
-{
- int maxlen;
- struct ceph_x_authorize_a *msg_a;
- struct ceph_x_authorize_b msg_b;
- void *p, *end;
- int ret;
- int ticket_blob_len =
- (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
-
- dout("build_authorizer for %s %p\n",
- ceph_entity_type_name(th->service), au);
-
- maxlen = sizeof(*msg_a) + sizeof(msg_b) +
- ceph_x_encrypt_buflen(ticket_blob_len);
- dout(" need len %d\n", maxlen);
- if (au->buf && au->buf->alloc_len < maxlen) {
- ceph_buffer_put(au->buf);
- au->buf = NULL;
- }
- if (!au->buf) {
- au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
- if (!au->buf)
- return -ENOMEM;
- }
- au->service = th->service;
-
- msg_a = au->buf->vec.iov_base;
- msg_a->struct_v = 1;
- msg_a->global_id = cpu_to_le64(ac->global_id);
- msg_a->service_id = cpu_to_le32(th->service);
- msg_a->ticket_blob.struct_v = 1;
- msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
- msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
- if (ticket_blob_len) {
- memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
- th->ticket_blob->vec.iov_len);
- }
- dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
- le64_to_cpu(msg_a->ticket_blob.secret_id));
-
- p = msg_a + 1;
- p += ticket_blob_len;
- end = au->buf->vec.iov_base + au->buf->vec.iov_len;
-
- get_random_bytes(&au->nonce, sizeof(au->nonce));
- msg_b.struct_v = 1;
- msg_b.nonce = cpu_to_le64(au->nonce);
- ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
- p, end - p);
- if (ret < 0)
- goto out_buf;
- p += ret;
- au->buf->vec.iov_len = p - au->buf->vec.iov_base;
- dout(" built authorizer nonce %llx len %d\n", au->nonce,
- (int)au->buf->vec.iov_len);
- BUG_ON(au->buf->vec.iov_len > maxlen);
- return 0;
-
-out_buf:
- ceph_buffer_put(au->buf);
- au->buf = NULL;
- return ret;
-}
-
-static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
- void **p, void *end)
-{
- ceph_decode_need(p, end, 1 + sizeof(u64), bad);
- ceph_encode_8(p, 1);
- ceph_encode_64(p, th->secret_id);
- if (th->ticket_blob) {
- const char *buf = th->ticket_blob->vec.iov_base;
- u32 len = th->ticket_blob->vec.iov_len;
-
- ceph_encode_32_safe(p, end, len, bad);
- ceph_encode_copy_safe(p, end, buf, len, bad);
- } else {
- ceph_encode_32_safe(p, end, 0, bad);
- }
-
- return 0;
-bad:
- return -ERANGE;
-}
-
-static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
-{
- int want = ac->want_keys;
- struct ceph_x_info *xi = ac->private;
- int service;
-
- *pneed = ac->want_keys & ~(xi->have_keys);
-
- for (service = 1; service <= want; service <<= 1) {
- struct ceph_x_ticket_handler *th;
-
- if (!(ac->want_keys & service))
- continue;
-
- if (*pneed & service)
- continue;
-
- th = get_ticket_handler(ac, service);
-
- if (IS_ERR(th)) {
- *pneed |= service;
- continue;
- }
-
- if (get_seconds() >= th->renew_after)
- *pneed |= service;
- if (get_seconds() >= th->expires)
- xi->have_keys &= ~service;
- }
-}
-
-
-static int ceph_x_build_request(struct ceph_auth_client *ac,
- void *buf, void *end)
-{
- struct ceph_x_info *xi = ac->private;
- int need;
- struct ceph_x_request_header *head = buf;
- int ret;
- struct ceph_x_ticket_handler *th =
- get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-
- if (IS_ERR(th))
- return PTR_ERR(th);
-
- ceph_x_validate_tickets(ac, &need);
-
- dout("build_request want %x have %x need %x\n",
- ac->want_keys, xi->have_keys, need);
-
- if (need & CEPH_ENTITY_TYPE_AUTH) {
- struct ceph_x_authenticate *auth = (void *)(head + 1);
- void *p = auth + 1;
- struct ceph_x_challenge_blob tmp;
- char tmp_enc[40];
- u64 *u;
-
- if (p > end)
- return -ERANGE;
-
- dout(" get_auth_session_key\n");
- head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
-
- /* encrypt and hash */
- get_random_bytes(&auth->client_challenge, sizeof(u64));
- tmp.client_challenge = auth->client_challenge;
- tmp.server_challenge = cpu_to_le64(xi->server_challenge);
- ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
- tmp_enc, sizeof(tmp_enc));
- if (ret < 0)
- return ret;
-
- auth->struct_v = 1;
- auth->key = 0;
- for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
- auth->key ^= *(__le64 *)u;
- dout(" server_challenge %llx client_challenge %llx key %llx\n",
- xi->server_challenge, le64_to_cpu(auth->client_challenge),
- le64_to_cpu(auth->key));
-
- /* now encode the old ticket if exists */
- ret = ceph_x_encode_ticket(th, &p, end);
- if (ret < 0)
- return ret;
-
- return p - buf;
- }
-
- if (need) {
- void *p = head + 1;
- struct ceph_x_service_ticket_request *req;
-
- if (p > end)
- return -ERANGE;
- head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
- ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
- if (ret)
- return ret;
- ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
- xi->auth_authorizer.buf->vec.iov_len);
-
- req = p;
- req->keys = cpu_to_le32(need);
- p += sizeof(*req);
- return p - buf;
- }
-
- return 0;
-}
-
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
-{
- struct ceph_x_info *xi = ac->private;
- struct ceph_x_reply_header *head = buf;
- struct ceph_x_ticket_handler *th;
- int len = end - buf;
- int op;
- int ret;
-
- if (result)
- return result; /* XXX hmm? */
-
- if (xi->starting) {
- /* it's a hello */
- struct ceph_x_server_challenge *sc = buf;
-
- if (len != sizeof(*sc))
- return -EINVAL;
- xi->server_challenge = le64_to_cpu(sc->server_challenge);
- dout("handle_reply got server challenge %llx\n",
- xi->server_challenge);
- xi->starting = false;
- xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
- return -EAGAIN;
- }
-
- op = le16_to_cpu(head->op);
- result = le32_to_cpu(head->result);
- dout("handle_reply op %d result %d\n", op, result);
- switch (op) {
- case CEPHX_GET_AUTH_SESSION_KEY:
- /* verify auth key */
- ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
- buf + sizeof(*head), end);
- break;
-
- case CEPHX_GET_PRINCIPAL_SESSION_KEY:
- th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
- if (IS_ERR(th))
- return PTR_ERR(th);
- ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
- buf + sizeof(*head), end);
- break;
-
- default:
- return -EINVAL;
- }
- if (ret)
- return ret;
- if (ac->want_keys == xi->have_keys)
- return 0;
- return -EAGAIN;
-}
-
-static int ceph_x_create_authorizer(
- struct ceph_auth_client *ac, int peer_type,
- struct ceph_authorizer **a,
- void **buf, size_t *len,
- void **reply_buf, size_t *reply_len)
-{
- struct ceph_x_authorizer *au;
- struct ceph_x_ticket_handler *th;
- int ret;
-
- th = get_ticket_handler(ac, peer_type);
- if (IS_ERR(th))
- return PTR_ERR(th);
-
- au = kzalloc(sizeof(*au), GFP_NOFS);
- if (!au)
- return -ENOMEM;
-
- ret = ceph_x_build_authorizer(ac, th, au);
- if (ret) {
- kfree(au);
- return ret;
- }
-
- *a = (struct ceph_authorizer *)au;
- *buf = au->buf->vec.iov_base;
- *len = au->buf->vec.iov_len;
- *reply_buf = au->reply_buf;
- *reply_len = sizeof(au->reply_buf);
- return 0;
-}
-
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a, size_t len)
-{
- struct ceph_x_authorizer *au = (void *)a;
- struct ceph_x_ticket_handler *th;
- int ret = 0;
- struct ceph_x_authorize_reply reply;
- void *p = au->reply_buf;
- void *end = p + sizeof(au->reply_buf);
-
- th = get_ticket_handler(ac, au->service);
- if (IS_ERR(th))
- return PTR_ERR(th);
- ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
- if (ret < 0)
- return ret;
- if (ret != sizeof(reply))
- return -EPERM;
-
- if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
- ret = -EPERM;
- else
- ret = 0;
- dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
- return ret;
-}
-
-static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
-{
- struct ceph_x_authorizer *au = (void *)a;
-
- ceph_buffer_put(au->buf);
- kfree(au);
-}
-
-
-static void ceph_x_reset(struct ceph_auth_client *ac)
-{
- struct ceph_x_info *xi = ac->private;
-
- dout("reset\n");
- xi->starting = true;
- xi->server_challenge = 0;
-}
-
-static void ceph_x_destroy(struct ceph_auth_client *ac)
-{
- struct ceph_x_info *xi = ac->private;
- struct rb_node *p;
-
- dout("ceph_x_destroy %p\n", ac);
- ceph_crypto_key_destroy(&xi->secret);
-
- while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
- struct ceph_x_ticket_handler *th =
- rb_entry(p, struct ceph_x_ticket_handler, node);
- remove_ticket_handler(ac, th);
- }
-
- if (xi->auth_authorizer.buf)
- ceph_buffer_put(xi->auth_authorizer.buf);
-
- kfree(ac->private);
- ac->private = NULL;
-}
-
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
- int peer_type)
-{
- struct ceph_x_ticket_handler *th;
-
- th = get_ticket_handler(ac, peer_type);
- if (!IS_ERR(th))
- remove_ticket_handler(ac, th);
-}
-
-
-static const struct ceph_auth_client_ops ceph_x_ops = {
- .name = "x",
- .is_authenticated = ceph_x_is_authenticated,
- .should_authenticate = ceph_x_should_authenticate,
- .build_request = ceph_x_build_request,
- .handle_reply = ceph_x_handle_reply,
- .create_authorizer = ceph_x_create_authorizer,
- .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
- .destroy_authorizer = ceph_x_destroy_authorizer,
- .invalidate_authorizer = ceph_x_invalidate_authorizer,
- .reset = ceph_x_reset,
- .destroy = ceph_x_destroy,
-};
-
-
-int ceph_x_init(struct ceph_auth_client *ac)
-{
- struct ceph_x_info *xi;
- int ret;
-
- dout("ceph_x_init %p\n", ac);
- ret = -ENOMEM;
- xi = kzalloc(sizeof(*xi), GFP_NOFS);
- if (!xi)
- goto out;
-
- ret = -EINVAL;
- if (!ac->secret) {
- pr_err("no secret set (for auth_x protocol)\n");
- goto out_nomem;
- }
-
- ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
- if (ret)
- goto out_nomem;
-
- xi->starting = true;
- xi->ticket_handlers = RB_ROOT;
-
- ac->protocol = CEPH_AUTH_CEPHX;
- ac->private = xi;
- ac->ops = &ceph_x_ops;
- return 0;
-
-out_nomem:
- kfree(xi);
-out:
- return ret;
-}
-
-
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
deleted file mode 100644
index ff6f8180e681..000000000000
--- a/fs/ceph/auth_x.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _FS_CEPH_AUTH_X_H
-#define _FS_CEPH_AUTH_X_H
-
-#include <linux/rbtree.h>
-
-#include "crypto.h"
-#include "auth.h"
-#include "auth_x_protocol.h"
-
-/*
- * Handle ticket for a single service.
- */
-struct ceph_x_ticket_handler {
- struct rb_node node;
- unsigned service;
-
- struct ceph_crypto_key session_key;
- struct ceph_timespec validity;
-
- u64 secret_id;
- struct ceph_buffer *ticket_blob;
-
- unsigned long renew_after, expires;
-};
-
-
-struct ceph_x_authorizer {
- struct ceph_buffer *buf;
- unsigned service;
- u64 nonce;
- char reply_buf[128]; /* big enough for encrypted blob */
-};
-
-struct ceph_x_info {
- struct ceph_crypto_key secret;
-
- bool starting;
- u64 server_challenge;
-
- unsigned have_keys;
- struct rb_root ticket_handlers;
-
- struct ceph_x_authorizer auth_authorizer;
-};
-
-extern int ceph_x_init(struct ceph_auth_client *ac);
-
-#endif
-
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
deleted file mode 100644
index 671d30576c4f..000000000000
--- a/fs/ceph/auth_x_protocol.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __FS_CEPH_AUTH_X_PROTOCOL
-#define __FS_CEPH_AUTH_X_PROTOCOL
-
-#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
-#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
-#define CEPHX_GET_ROTATING_KEY 0x0400
-
-/* common bits */
-struct ceph_x_ticket_blob {
- __u8 struct_v;
- __le64 secret_id;
- __le32 blob_len;
- char blob[];
-} __attribute__ ((packed));
-
-
-/* common request/reply headers */
-struct ceph_x_request_header {
- __le16 op;
-} __attribute__ ((packed));
-
-struct ceph_x_reply_header {
- __le16 op;
- __le32 result;
-} __attribute__ ((packed));
-
-
-/* authenticate handshake */
-
-/* initial hello (no reply header) */
-struct ceph_x_server_challenge {
- __u8 struct_v;
- __le64 server_challenge;
-} __attribute__ ((packed));
-
-struct ceph_x_authenticate {
- __u8 struct_v;
- __le64 client_challenge;
- __le64 key;
- /* ticket blob */
-} __attribute__ ((packed));
-
-struct ceph_x_service_ticket_request {
- __u8 struct_v;
- __le32 keys;
-} __attribute__ ((packed));
-
-struct ceph_x_challenge_blob {
- __le64 server_challenge;
- __le64 client_challenge;
-} __attribute__ ((packed));
-
-
-
-/* authorize handshake */
-
-/*
- * The authorizer consists of two pieces:
- * a - service id, ticket blob
- * b - encrypted with session key
- */
-struct ceph_x_authorize_a {
- __u8 struct_v;
- __le64 global_id;
- __le32 service_id;
- struct ceph_x_ticket_blob ticket_blob;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_b {
- __u8 struct_v;
- __le64 nonce;
-} __attribute__ ((packed));
-
-struct ceph_x_authorize_reply {
- __u8 struct_v;
- __le64 nonce_plus_one;
-} __attribute__ ((packed));
-
-
-/*
- * encyption bundle
- */
-#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
-
-struct ceph_x_encrypt_header {
- __u8 struct_v;
- __le64 magic;
-} __attribute__ ((packed));
-
-#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
deleted file mode 100644
index cd39f17021de..000000000000
--- a/fs/ceph/buffer.c
+++ /dev/null
@@ -1,65 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-
-#include "buffer.h"
-#include "decode.h"
-
-struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
-{
- struct ceph_buffer *b;
-
- b = kmalloc(sizeof(*b), gfp);
- if (!b)
- return NULL;
-
- b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
- if (b->vec.iov_base) {
- b->is_vmalloc = false;
- } else {
- b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
- if (!b->vec.iov_base) {
- kfree(b);
- return NULL;
- }
- b->is_vmalloc = true;
- }
-
- kref_init(&b->kref);
- b->alloc_len = len;
- b->vec.iov_len = len;
- dout("buffer_new %p\n", b);
- return b;
-}
-
-void ceph_buffer_release(struct kref *kref)
-{
- struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
-
- dout("buffer_release %p\n", b);
- if (b->vec.iov_base) {
- if (b->is_vmalloc)
- vfree(b->vec.iov_base);
- else
- kfree(b->vec.iov_base);
- }
- kfree(b);
-}
-
-int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
-{
- size_t len;
-
- ceph_decode_need(p, end, sizeof(u32), bad);
- len = ceph_decode_32(p);
- dout("decode_buffer len %d\n", (int)len);
- ceph_decode_need(p, end, len, bad);
- *b = ceph_buffer_new(len, GFP_NOFS);
- if (!*b)
- return -ENOMEM;
- ceph_decode_copy(p, (*b)->vec.iov_base, len);
- return 0;
-bad:
- return -EINVAL;
-}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
deleted file mode 100644
index 58d19014068f..000000000000
--- a/fs/ceph/buffer.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __FS_CEPH_BUFFER_H
-#define __FS_CEPH_BUFFER_H
-
-#include <linux/kref.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/types.h>
-#include <linux/uio.h>
-
-/*
- * a simple reference counted buffer.
- *
- * use kmalloc for small sizes (<= one page), vmalloc for larger
- * sizes.
- */
-struct ceph_buffer {
- struct kref kref;
- struct kvec vec;
- size_t alloc_len;
- bool is_vmalloc;
-};
-
-extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
-extern void ceph_buffer_release(struct kref *kref);
-
-static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
-{
- kref_get(&b->kref);
- return b;
-}
-
-static inline void ceph_buffer_put(struct ceph_buffer *b)
-{
- kref_put(&b->kref, ceph_buffer_release);
-}
-
-extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
-
-#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 73c153092f72..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/fs.h>
#include <linux/kernel.h>
@@ -9,8 +9,9 @@
#include <linux/writeback.h>
#include "super.h"
-#include "decode.h"
-#include "messenger.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h>
/*
* Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
spin_unlock(&mdsc->caps_list_lock);
}
-void ceph_reservation_status(struct ceph_client *client,
+void ceph_reservation_status(struct ceph_fs_client *fsc,
int *total, int *avail, int *used, int *reserved,
int *min)
{
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
if (total)
*total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- struct ceph_mount_args *ma = mdsc->client->mount_args;
+ struct ceph_mount_options *ma = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap_reservation *caps_reservation)
{
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
int removed = 0;
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
int mds;
struct ceph_cap_snap *capsnap;
u32 mseq;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
session->s_mutex */
u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct inode *inode = &ci->vfs_inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing;
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
/*
* try to invalidate mapping pages without blocking.
*/
-static int mapping_is_empty(struct address_space *mapping)
-{
- struct page *page = find_get_page(mapping, 0);
-
- if (!page)
- return 1;
-
- put_page(page);
- return 0;
-}
-
static int try_nonblocking_invalidate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
invalidate_mapping_pages(&inode->i_data, 0, -1);
spin_lock(&inode->i_lock);
- if (mapping_is_empty(&inode->i_data) &&
+ if (inode->i_data.nrpages == 0 &&
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
*/
if ((!is_delayed || mdsc->stopping) &&
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
- ci->i_rdcache_gen && /* may have cached pages */
+ inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */
(revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
unsigned *flush_tid)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int unlock_session = session ? 0 : 1;
int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
caps_are_flushed(inode, flush_tid));
} else {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&inode->i_lock);
if (__ceph_caps_dirty(ci))
@@ -2283,7 +2273,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
- int seq = le32_to_cpu(grant->seq);
+ unsigned seq = le32_to_cpu(grant->seq);
+ unsigned issue_seq = le32_to_cpu(grant->issue_seq);
int newcaps = le32_to_cpu(grant->caps);
int issued, implemented, used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
@@ -2295,8 +2286,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
int revoked_rdcache = 0;
int queue_invalidate = 0;
- dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
- inode, cap, mds, seq, ceph_cap_string(newcaps));
+ dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
+ inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
@@ -2392,6 +2383,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
cap->seq = seq;
+ cap->issue_seq = issue_seq;
/* file layout may have changed */
ci->i_layout = grant->layout;
@@ -2463,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(inode->i_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
@@ -2711,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg)
{
struct ceph_mds_client *mdsc = session->s_mdsc;
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
@@ -2774,15 +2766,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
if (op == CEPH_CAP_OP_IMPORT)
__queue_cap_release(session, vino.ino, cap_id,
mseq, seq);
-
- /*
- * send any full release message to try to move things
- * along for the mds (who clearly thinks we still have this
- * cap).
- */
- ceph_add_cap_releases(mdsc, session);
- ceph_send_cap_releases(mdsc, session);
- goto done;
+ goto flush_cap_releases;
}
/* these will work even if we don't have a cap yet */
@@ -2810,7 +2794,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" no cap on %p ino %llx.%llx from mds%d\n",
inode, ceph_ino(inode), ceph_snap(inode), mds);
spin_unlock(&inode->i_lock);
- goto done;
+ goto flush_cap_releases;
}
/* note that each of these drops i_lock for us */
@@ -2834,6 +2818,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_cap_op_name(op));
}
+ goto done;
+
+flush_cap_releases:
+ /*
+ * send any full release message to try to move things
+ * along for the mds (who clearly thinks we still have this
+ * cap).
+ */
+ ceph_add_cap_releases(mdsc, session);
+ ceph_send_cap_releases(mdsc, session);
+
done:
mutex_unlock(&session->s_mutex);
done_unlocked:
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
deleted file mode 100644
index 1818c2305610..000000000000
--- a/fs/ceph/ceph_debug.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef _FS_CEPH_DEBUG_H
-#define _FS_CEPH_DEBUG_H
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
-
-/*
- * wrap pr_debug to include a filename:lineno prefix on each line.
- * this incurs some overhead (kernel size and execution time) due to
- * the extra function call at each call site.
- */
-
-# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
-extern const char *ceph_file_part(const char *s, int len);
-# define dout(fmt, ...) \
- pr_debug(" %12.12s:%-4d : " fmt, \
- ceph_file_part(__FILE__, sizeof(__FILE__)), \
- __LINE__, ##__VA_ARGS__)
-# else
-/* faux printk call just to see any compiler warnings. */
-# define dout(fmt, ...) do { \
- if (0) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
- } while (0)
-# endif
-
-#else
-
-/*
- * or, just wrap pr_debug
- */
-# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
-
-#endif
-
-#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
/*
* Ceph 'frag' type
*/
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
int ceph_frag_compare(__u32 a, __u32 b)
{
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
deleted file mode 100644
index 5babb8e95352..000000000000
--- a/fs/ceph/ceph_frag.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#ifndef FS_CEPH_FRAG_H
-#define FS_CEPH_FRAG_H
-
-/*
- * "Frags" are a way to describe a subset of a 32-bit number space,
- * using a mask and a value to match against that mask. Any given frag
- * (subset of the number space) can be partitioned into 2^n sub-frags.
- *
- * Frags are encoded into a 32-bit word:
- * 8 upper bits = "bits"
- * 24 lower bits = "value"
- * (We could go to 5+27 bits, but who cares.)
- *
- * We use the _most_ significant bits of the 24 bit value. This makes
- * values logically sort.
- *
- * Unfortunately, because the "bits" field is still in the high bits, we
- * can't sort encoded frags numerically. However, it does allow you
- * to feed encoded frags as values into frag_contains_value.
- */
-static inline __u32 ceph_frag_make(__u32 b, __u32 v)
-{
- return (b << 24) |
- (v & (0xffffffu << (24-b)) & 0xffffffu);
-}
-static inline __u32 ceph_frag_bits(__u32 f)
-{
- return f >> 24;
-}
-static inline __u32 ceph_frag_value(__u32 f)
-{
- return f & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask(__u32 f)
-{
- return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
-}
-static inline __u32 ceph_frag_mask_shift(__u32 f)
-{
- return 24 - ceph_frag_bits(f);
-}
-
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
-{
- return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
- /* is sub as specific as us, and contained by us? */
- return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
- (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
-
-static inline __u32 ceph_frag_parent(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f) - 1,
- ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
- return ceph_frag_bits(f) > 0 &&
- (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
- return ceph_frag_bits(f) > 0 &&
- (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f),
- ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f)+1,
- ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
-static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
-{
- int newbits = ceph_frag_bits(f) + by;
- return ceph_frag_make(newbits,
- ceph_frag_value(f) | (i << (24 - newbits)));
-}
-static inline int ceph_frag_is_leftmost(__u32 f)
-{
- return ceph_frag_value(f) == 0;
-}
-static inline int ceph_frag_is_rightmost(__u32 f)
-{
- return ceph_frag_value(f) == ceph_frag_mask(f);
-}
-static inline __u32 ceph_frag_next(__u32 f)
-{
- return ceph_frag_make(ceph_frag_bits(f),
- ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
-}
-
-/*
- * comparator to sort frags logically, as when traversing the
- * number space in ascending order...
- */
-int ceph_frag_compare(__u32 a, __u32 b);
-
-#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
deleted file mode 100644
index 3ac6cc7c1156..000000000000
--- a/fs/ceph/ceph_fs.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Some non-inline ceph helpers
- */
-#include "types.h"
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
- __u32 su = le32_to_cpu(layout->fl_stripe_unit);
- __u32 sc = le32_to_cpu(layout->fl_stripe_count);
- __u32 os = le32_to_cpu(layout->fl_object_size);
-
- /* stripe unit, object size must be non-zero, 64k increment */
- if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- /* object size must be a multiple of stripe unit */
- if (os < su || os % su)
- return 0;
- /* stripe count must be non-zero */
- if (!sc)
- return 0;
- return 1;
-}
-
-
-int ceph_flags_to_mode(int flags)
-{
- int mode;
-
-#ifdef O_DIRECTORY /* fixme */
- if ((flags & O_DIRECTORY) == O_DIRECTORY)
- return CEPH_FILE_MODE_PIN;
-#endif
- if ((flags & O_APPEND) == O_APPEND)
- flags |= O_WRONLY;
-
- if ((flags & O_ACCMODE) == O_RDWR)
- mode = CEPH_FILE_MODE_RDWR;
- else if ((flags & O_ACCMODE) == O_WRONLY)
- mode = CEPH_FILE_MODE_WR;
- else
- mode = CEPH_FILE_MODE_RD;
-
-#ifdef O_LAZY
- if (flags & O_LAZY)
- mode |= CEPH_FILE_MODE_LAZY;
-#endif
-
- return mode;
-}
-
-int ceph_caps_for_mode(int mode)
-{
- int caps = CEPH_CAP_PIN;
-
- if (mode & CEPH_FILE_MODE_RD)
- caps |= CEPH_CAP_FILE_SHARED |
- CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
- if (mode & CEPH_FILE_MODE_WR)
- caps |= CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
- CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
- CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
- if (mode & CEPH_FILE_MODE_LAZY)
- caps |= CEPH_CAP_FILE_LAZYIO;
-
- return caps;
-}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
deleted file mode 100644
index d5619ac86711..000000000000
--- a/fs/ceph/ceph_fs.h
+++ /dev/null
@@ -1,728 +0,0 @@
-/*
- * ceph_fs.h - Ceph constants and data types to share between kernel and
- * user space.
- *
- * Most types in this file are defined as little-endian, and are
- * primarily intended to describe data structures that pass over the
- * wire or that are stored on disk.
- *
- * LGPL2
- */
-
-#ifndef CEPH_FS_H
-#define CEPH_FS_H
-
-#include "msgr.h"
-#include "rados.h"
-
-/*
- * subprotocol versions. when specific messages types or high-level
- * protocols change, bump the affected components. we keep rev
- * internal cluster protocols separately from the public,
- * client-facing protocol.
- */
-#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
-#define CEPH_MON_PROTOCOL 5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL 24 /* server/client */
-#define CEPH_MDSC_PROTOCOL 32 /* server/client */
-#define CEPH_MONC_PROTOCOL 15 /* server/client */
-
-
-#define CEPH_INO_ROOT 1
-#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
-
-/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
-#define CEPH_MAX_MON 31
-
-
-/*
- * feature bits
- */
-#define CEPH_FEATURE_UID (1<<0)
-#define CEPH_FEATURE_NOSRCADDR (1<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
-#define CEPH_FEATURE_FLOCK (1<<3)
-
-
-/*
- * ceph_file_layout - describe data layout for a file/inode
- */
-struct ceph_file_layout {
- /* file -> object mapping */
- __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
- of page size. */
- __le32 fl_stripe_count; /* over this many objects */
- __le32 fl_object_size; /* until objects are this big, then move to
- new objects */
- __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
-
- /* pg -> disk layout */
- __le32 fl_object_stripe_unit; /* for per-object parity, if any */
-
- /* object -> pg layout */
- __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
- __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
-} __attribute__ ((packed));
-
-#define CEPH_MIN_STRIPE_UNIT 65536
-
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
-
-/* crypto algorithms */
-#define CEPH_CRYPTO_NONE 0x0
-#define CEPH_CRYPTO_AES 0x1
-
-#define CEPH_AES_IV "cephsageyudagreg"
-
-/* security/authentication protocols */
-#define CEPH_AUTH_UNKNOWN 0x0
-#define CEPH_AUTH_NONE 0x1
-#define CEPH_AUTH_CEPHX 0x2
-
-#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
-
-
-/*********************************************
- * message layer
- */
-
-/*
- * message types
- */
-
-/* misc */
-#define CEPH_MSG_SHUTDOWN 1
-#define CEPH_MSG_PING 2
-
-/* client <-> monitor */
-#define CEPH_MSG_MON_MAP 4
-#define CEPH_MSG_MON_GET_MAP 5
-#define CEPH_MSG_STATFS 13
-#define CEPH_MSG_STATFS_REPLY 14
-#define CEPH_MSG_MON_SUBSCRIBE 15
-#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
-#define CEPH_MSG_AUTH 17
-#define CEPH_MSG_AUTH_REPLY 18
-
-/* client <-> mds */
-#define CEPH_MSG_MDS_MAP 21
-
-#define CEPH_MSG_CLIENT_SESSION 22
-#define CEPH_MSG_CLIENT_RECONNECT 23
-
-#define CEPH_MSG_CLIENT_REQUEST 24
-#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
-#define CEPH_MSG_CLIENT_REPLY 26
-#define CEPH_MSG_CLIENT_CAPS 0x310
-#define CEPH_MSG_CLIENT_LEASE 0x311
-#define CEPH_MSG_CLIENT_SNAP 0x312
-#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
-
-/* pool ops */
-#define CEPH_MSG_POOLOP_REPLY 48
-#define CEPH_MSG_POOLOP 49
-
-
-/* osd */
-#define CEPH_MSG_OSD_MAP 41
-#define CEPH_MSG_OSD_OP 42
-#define CEPH_MSG_OSD_OPREPLY 43
-
-/* pool operations */
-enum {
- POOL_OP_CREATE = 0x01,
- POOL_OP_DELETE = 0x02,
- POOL_OP_AUID_CHANGE = 0x03,
- POOL_OP_CREATE_SNAP = 0x11,
- POOL_OP_DELETE_SNAP = 0x12,
- POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
- POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
-};
-
-struct ceph_mon_request_header {
- __le64 have_version;
- __le16 session_mon;
- __le64 session_mon_tid;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_statfs {
- __le64 kb, kb_used, kb_avail;
- __le64 num_objects;
-} __attribute__ ((packed));
-
-struct ceph_mon_statfs_reply {
- struct ceph_fsid fsid;
- __le64 version;
- struct ceph_statfs st;
-} __attribute__ ((packed));
-
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 pool;
- __le32 op;
- __le64 auid;
- __le64 snapid;
- __le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 reply_code;
- __le32 epoch;
- char has_data;
- char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
- __le64 snapid;
-} __attribute__ ((packed));
-
-struct ceph_osd_getmap {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 start;
-} __attribute__ ((packed));
-
-struct ceph_mds_getmap {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-struct ceph_client_mount {
- struct ceph_mon_request_header monhdr;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_item {
- __le64 have_version; __le64 have;
- __u8 onetime;
-} __attribute__ ((packed));
-
-struct ceph_mon_subscribe_ack {
- __le32 duration; /* seconds */
- struct ceph_fsid fsid;
-} __attribute__ ((packed));
-
-/*
- * mds states
- * > 0 -> in
- * <= 0 -> out
- */
-#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
-#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
- empty log. */
-#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
-#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
-#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
-#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
-#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
-
-#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
-#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
- operations (import, rename, etc.) */
-#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
-#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
-#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
-#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
-#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
-
-extern const char *ceph_mds_state_name(int s);
-
-
-/*
- * metadata lock types.
- * - these are bitmasks.. we can compose them
- * - they also define the lock ordering by the MDS
- * - a few of these are internal to the mds
- */
-#define CEPH_LOCK_DVERSION 1
-#define CEPH_LOCK_DN 2
-#define CEPH_LOCK_ISNAP 16
-#define CEPH_LOCK_IVERSION 32 /* mds internal */
-#define CEPH_LOCK_IFILE 64
-#define CEPH_LOCK_IAUTH 128
-#define CEPH_LOCK_ILINK 256
-#define CEPH_LOCK_IDFT 512 /* dir frag tree */
-#define CEPH_LOCK_INEST 1024 /* mds internal */
-#define CEPH_LOCK_IXATTR 2048
-#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
-#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
-
-/* client_session ops */
-enum {
- CEPH_SESSION_REQUEST_OPEN,
- CEPH_SESSION_OPEN,
- CEPH_SESSION_REQUEST_CLOSE,
- CEPH_SESSION_CLOSE,
- CEPH_SESSION_REQUEST_RENEWCAPS,
- CEPH_SESSION_RENEWCAPS,
- CEPH_SESSION_STALE,
- CEPH_SESSION_RECALL_STATE,
-};
-
-extern const char *ceph_session_op_name(int op);
-
-struct ceph_mds_session_head {
- __le32 op;
- __le64 seq;
- struct ceph_timespec stamp;
- __le32 max_caps, max_leases;
-} __attribute__ ((packed));
-
-/* client_request */
-/*
- * metadata ops.
- * & 0x001000 -> write op
- * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
- & & 0x100000 -> use weird ino/path trace
- */
-#define CEPH_MDS_OP_WRITE 0x001000
-enum {
- CEPH_MDS_OP_LOOKUP = 0x00100,
- CEPH_MDS_OP_GETATTR = 0x00101,
- CEPH_MDS_OP_LOOKUPHASH = 0x00102,
- CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
-
- CEPH_MDS_OP_SETXATTR = 0x01105,
- CEPH_MDS_OP_RMXATTR = 0x01106,
- CEPH_MDS_OP_SETLAYOUT = 0x01107,
- CEPH_MDS_OP_SETATTR = 0x01108,
- CEPH_MDS_OP_SETFILELOCK= 0x01109,
- CEPH_MDS_OP_GETFILELOCK= 0x00110,
-
- CEPH_MDS_OP_MKNOD = 0x01201,
- CEPH_MDS_OP_LINK = 0x01202,
- CEPH_MDS_OP_UNLINK = 0x01203,
- CEPH_MDS_OP_RENAME = 0x01204,
- CEPH_MDS_OP_MKDIR = 0x01220,
- CEPH_MDS_OP_RMDIR = 0x01221,
- CEPH_MDS_OP_SYMLINK = 0x01222,
-
- CEPH_MDS_OP_CREATE = 0x01301,
- CEPH_MDS_OP_OPEN = 0x00302,
- CEPH_MDS_OP_READDIR = 0x00305,
-
- CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
- CEPH_MDS_OP_MKSNAP = 0x01400,
- CEPH_MDS_OP_RMSNAP = 0x01401,
- CEPH_MDS_OP_LSSNAP = 0x00402,
-};
-
-extern const char *ceph_mds_op_name(int op);
-
-
-#define CEPH_SETATTR_MODE 1
-#define CEPH_SETATTR_UID 2
-#define CEPH_SETATTR_GID 4
-#define CEPH_SETATTR_MTIME 8
-#define CEPH_SETATTR_ATIME 16
-#define CEPH_SETATTR_SIZE 32
-#define CEPH_SETATTR_CTIME 64
-
-union ceph_mds_request_args {
- struct {
- __le32 mask; /* CEPH_CAP_* */
- } __attribute__ ((packed)) getattr;
- struct {
- __le32 mode;
- __le32 uid;
- __le32 gid;
- struct ceph_timespec mtime;
- struct ceph_timespec atime;
- __le64 size, old_size; /* old_size needed by truncate */
- __le32 mask; /* CEPH_SETATTR_* */
- } __attribute__ ((packed)) setattr;
- struct {
- __le32 frag; /* which dir fragment */
- __le32 max_entries; /* how many dentries to grab */
- __le32 max_bytes;
- } __attribute__ ((packed)) readdir;
- struct {
- __le32 mode;
- __le32 rdev;
- } __attribute__ ((packed)) mknod;
- struct {
- __le32 mode;
- } __attribute__ ((packed)) mkdir;
- struct {
- __le32 flags;
- __le32 mode;
- __le32 stripe_unit; /* layout for newly created file */
- __le32 stripe_count; /* ... */
- __le32 object_size;
- __le32 file_replication;
- __le32 preferred;
- } __attribute__ ((packed)) open;
- struct {
- __le32 flags;
- } __attribute__ ((packed)) setxattr;
- struct {
- struct ceph_file_layout layout;
- } __attribute__ ((packed)) setlayout;
- struct {
- __u8 rule; /* currently fcntl or flock */
- __u8 type; /* shared, exclusive, remove*/
- __le64 pid; /* process id requesting the lock */
- __le64 pid_namespace;
- __le64 start; /* initial location to lock */
- __le64 length; /* num bytes to lock from start */
- __u8 wait; /* will caller wait for lock to become available? */
- } __attribute__ ((packed)) filelock_change;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
-#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
-
-struct ceph_mds_request_head {
- __le64 oldest_client_tid;
- __le32 mdsmap_epoch; /* on client */
- __le32 flags; /* CEPH_MDS_FLAG_* */
- __u8 num_retry, num_fwd; /* count retry, fwd attempts */
- __le16 num_releases; /* # include cap/lease release records */
- __le32 op; /* mds op code */
- __le32 caller_uid, caller_gid;
- __le64 ino; /* use this ino for openc, mkdir, mknod,
- etc. (if replaying) */
- union ceph_mds_request_args args;
-} __attribute__ ((packed));
-
-/* cap/lease release record */
-struct ceph_mds_request_release {
- __le64 ino, cap_id; /* ino and unique cap id */
- __le32 caps, wanted; /* new issued, wanted */
- __le32 seq, issue_seq, mseq;
- __le32 dname_seq; /* if releasing a dentry lease, a */
- __le32 dname_len; /* string follows. */
-} __attribute__ ((packed));
-
-/* client reply */
-struct ceph_mds_reply_head {
- __le32 op;
- __le32 result;
- __le32 mdsmap_epoch;
- __u8 safe; /* true if committed to disk */
- __u8 is_dentry, is_target; /* true if dentry, target inode records
- are included with reply */
-} __attribute__ ((packed));
-
-/* one for each node split */
-struct ceph_frag_tree_split {
- __le32 frag; /* this frag splits... */
- __le32 by; /* ...by this many bits */
-} __attribute__ ((packed));
-
-struct ceph_frag_tree_head {
- __le32 nsplits; /* num ceph_frag_tree_split records */
- struct ceph_frag_tree_split splits[];
-} __attribute__ ((packed));
-
-/* capability issue, for bundling with mds reply */
-struct ceph_mds_reply_cap {
- __le32 caps, wanted; /* caps issued, wanted */
- __le64 cap_id;
- __le32 seq, mseq;
- __le64 realm; /* snap realm */
- __u8 flags; /* CEPH_CAP_FLAG_* */
-} __attribute__ ((packed));
-
-#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
-
-/* inode record, for bundling with mds reply */
-struct ceph_mds_reply_inode {
- __le64 ino;
- __le64 snapid;
- __le32 rdev;
- __le64 version; /* inode version */
- __le64 xattr_version; /* version for xattr blob */
- struct ceph_mds_reply_cap cap; /* caps issued for this inode */
- struct ceph_file_layout layout;
- struct ceph_timespec ctime, mtime, atime;
- __le32 time_warp_seq;
- __le64 size, max_size, truncate_size;
- __le32 truncate_seq;
- __le32 mode, uid, gid;
- __le32 nlink;
- __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
- struct ceph_timespec rctime;
- struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
-} __attribute__ ((packed));
-/* followed by frag array, then symlink string, then xattr blob */
-
-/* reply_lease follows dname, and reply_inode */
-struct ceph_mds_reply_lease {
- __le16 mask; /* lease type(s) */
- __le32 duration_ms; /* lease duration */
- __le32 seq;
-} __attribute__ ((packed));
-
-struct ceph_mds_reply_dirfrag {
- __le32 frag; /* fragment */
- __le32 auth; /* auth mds, if this is a delegation point */
- __le32 ndist; /* number of mds' this is replicated on */
- __le32 dist[];
-} __attribute__ ((packed));
-
-#define CEPH_LOCK_FCNTL 1
-#define CEPH_LOCK_FLOCK 2
-
-#define CEPH_LOCK_SHARED 1
-#define CEPH_LOCK_EXCL 2
-#define CEPH_LOCK_UNLOCK 4
-
-struct ceph_filelock {
- __le64 start;/* file offset to start lock at */
- __le64 length; /* num bytes to lock; 0 for all following start */
- __le64 client; /* which client holds the lock */
- __le64 pid; /* process id holding the lock on the client */
- __le64 pid_namespace;
- __u8 type; /* shared lock, exclusive lock, or unlock */
-} __attribute__ ((packed));
-
-
-/* file access modes */
-#define CEPH_FILE_MODE_PIN 0
-#define CEPH_FILE_MODE_RD 1
-#define CEPH_FILE_MODE_WR 2
-#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
-#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
-#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
-
-int ceph_flags_to_mode(int flags);
-
-
-/* capability bits */
-#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
-
-/* generic cap bits */
-#define CEPH_CAP_GSHARED 1 /* client can reads */
-#define CEPH_CAP_GEXCL 2 /* client can read and update */
-#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
-#define CEPH_CAP_GRD 8 /* (file) client can read */
-#define CEPH_CAP_GWR 16 /* (file) client can write */
-#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
-#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
-#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
-
-/* per-lock shift */
-#define CEPH_CAP_SAUTH 2
-#define CEPH_CAP_SLINK 4
-#define CEPH_CAP_SXATTR 6
-#define CEPH_CAP_SFILE 8
-#define CEPH_CAP_SFLOCK 20
-
-#define CEPH_CAP_BITS 22
-
-/* composed values */
-#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
-#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
-#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
-#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
-#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
-#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
-#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
-#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
-#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
-#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
-
-
-/* cap masks (for getattr) */
-#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
-#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
-#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
-#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
-#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
-#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
-#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
-#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
-#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
- CEPH_CAP_AUTH_SHARED | \
- CEPH_CAP_LINK_SHARED | \
- CEPH_CAP_FILE_SHARED | \
- CEPH_CAP_XATTR_SHARED)
-
-#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
- CEPH_CAP_LINK_SHARED | \
- CEPH_CAP_XATTR_SHARED | \
- CEPH_CAP_FILE_SHARED)
-#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
- CEPH_CAP_FILE_CACHE)
-
-#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
- CEPH_CAP_LINK_EXCL | \
- CEPH_CAP_XATTR_EXCL | \
- CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
- CEPH_CAP_FILE_EXCL)
-#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
-#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
- CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
- CEPH_CAP_PIN)
-
-#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
- CEPH_LOCK_IXATTR)
-
-int ceph_caps_for_mode(int mode);
-
-enum {
- CEPH_CAP_OP_GRANT, /* mds->client grant */
- CEPH_CAP_OP_REVOKE, /* mds->client revoke */
- CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
- CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
- CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
- CEPH_CAP_OP_UPDATE, /* client->mds update */
- CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
- CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
- CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
- CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
- CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
- CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
- CEPH_CAP_OP_RENEW, /* client->mds renewal request */
-};
-
-extern const char *ceph_cap_op_name(int op);
-
-/*
- * caps message, used for capability callbacks, acks, requests, etc.
- */
-struct ceph_mds_caps {
- __le32 op; /* CEPH_CAP_OP_* */
- __le64 ino, realm;
- __le64 cap_id;
- __le32 seq, issue_seq;
- __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
- __le32 migrate_seq;
- __le64 snap_follows;
- __le32 snap_trace_len;
-
- /* authlock */
- __le32 uid, gid, mode;
-
- /* linklock */
- __le32 nlink;
-
- /* xattrlock */
- __le32 xattr_len;
- __le64 xattr_version;
-
- /* filelock */
- __le64 size, max_size, truncate_size;
- __le32 truncate_seq;
- struct ceph_timespec mtime, atime, ctime;
- struct ceph_file_layout layout;
- __le32 time_warp_seq;
-} __attribute__ ((packed));
-
-/* cap release msg head */
-struct ceph_mds_cap_release {
- __le32 num; /* number of cap_items that follow */
-} __attribute__ ((packed));
-
-struct ceph_mds_cap_item {
- __le64 ino;
- __le64 cap_id;
- __le32 migrate_seq, seq;
-} __attribute__ ((packed));
-
-#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
-#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
-#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
-#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
-
-extern const char *ceph_lease_op_name(int o);
-
-/* lease msg header */
-struct ceph_mds_lease {
- __u8 action; /* CEPH_MDS_LEASE_* */
- __le16 mask; /* which lease */
- __le64 ino;
- __le64 first, last; /* snap range */
- __le32 seq;
- __le32 duration_ms; /* duration of renewal */
-} __attribute__ ((packed));
-/* followed by a __le32+string for dname */
-
-/* client reconnect */
-struct ceph_mds_cap_reconnect {
- __le64 cap_id;
- __le32 wanted;
- __le32 issued;
- __le64 snaprealm;
- __le64 pathbase; /* base ino for our path to this ino */
- __le32 flock_len; /* size of flock state blob, if any */
-} __attribute__ ((packed));
-/* followed by flock blob */
-
-struct ceph_mds_cap_reconnect_v1 {
- __le64 cap_id;
- __le32 wanted;
- __le32 issued;
- __le64 size;
- struct ceph_timespec mtime, atime;
- __le64 snaprealm;
- __le64 pathbase; /* base ino for our path to this ino */
-} __attribute__ ((packed));
-
-struct ceph_mds_snaprealm_reconnect {
- __le64 ino; /* snap realm base */
- __le64 seq; /* snap seq for this snap realm */
- __le64 parent; /* parent realm */
-} __attribute__ ((packed));
-
-/*
- * snaps
- */
-enum {
- CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
- CEPH_SNAP_OP_CREATE,
- CEPH_SNAP_OP_DESTROY,
- CEPH_SNAP_OP_SPLIT,
-};
-
-extern const char *ceph_snap_op_name(int o);
-
-/* snap msg header */
-struct ceph_mds_snap_head {
- __le32 op; /* CEPH_SNAP_OP_* */
- __le64 split; /* ino to split off, if any */
- __le32 num_split_inos; /* # inos belonging to new child realm */
- __le32 num_split_realms; /* # child realms udner new child realm */
- __le32 trace_len; /* size of snap trace blob */
-} __attribute__ ((packed));
-/* followed by split ino list, then split realms, then the trace blob */
-
-/*
- * encode info about a snaprealm, as viewed by a client
- */
-struct ceph_mds_snap_realm {
- __le64 ino; /* ino */
- __le64 created; /* snap: when created */
- __le64 parent; /* ino: parent realm */
- __le64 parent_since; /* snap: same parent since */
- __le64 seq; /* snap: version */
- __le32 num_snaps;
- __le32 num_prior_parent_snaps;
-} __attribute__ ((packed));
-/* followed by my snap list, then prior parent snap list */
-
-#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
deleted file mode 100644
index bd570015d147..000000000000
--- a/fs/ceph/ceph_hash.c
+++ /dev/null
@@ -1,118 +0,0 @@
-
-#include "types.h"
-
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c) \
- do { \
- a = a - b; a = a - c; a = a ^ (c >> 13); \
- b = b - c; b = b - a; b = b ^ (a << 8); \
- c = c - a; c = c - b; c = c ^ (b >> 13); \
- a = a - b; a = a - c; a = a ^ (c >> 12); \
- b = b - c; b = b - a; b = b ^ (a << 16); \
- c = c - a; c = c - b; c = c ^ (b >> 5); \
- a = a - b; a = a - c; a = a ^ (c >> 3); \
- b = b - c; b = b - a; b = b ^ (a << 10); \
- c = c - a; c = c - b; c = c ^ (b >> 15); \
- } while (0)
-
-unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
-{
- const unsigned char *k = (const unsigned char *)str;
- __u32 a, b, c; /* the internal state */
- __u32 len; /* how many key bytes still need mixing */
-
- /* Set up the internal state */
- len = length;
- a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
- b = a;
- c = 0; /* variable initialization of internal state */
-
- /* handle most of the key */
- while (len >= 12) {
- a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
- ((__u32)k[3] << 24));
- b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
- ((__u32)k[7] << 24));
- c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
- ((__u32)k[11] << 24));
- mix(a, b, c);
- k = k + 12;
- len = len - 12;
- }
-
- /* handle the last 11 bytes */
- c = c + length;
- switch (len) { /* all the case statements fall through */
- case 11:
- c = c + ((__u32)k[10] << 24);
- case 10:
- c = c + ((__u32)k[9] << 16);
- case 9:
- c = c + ((__u32)k[8] << 8);
- /* the first byte of c is reserved for the length */
- case 8:
- b = b + ((__u32)k[7] << 24);
- case 7:
- b = b + ((__u32)k[6] << 16);
- case 6:
- b = b + ((__u32)k[5] << 8);
- case 5:
- b = b + k[4];
- case 4:
- a = a + ((__u32)k[3] << 24);
- case 3:
- a = a + ((__u32)k[2] << 16);
- case 2:
- a = a + ((__u32)k[1] << 8);
- case 1:
- a = a + k[0];
- /* case 0: nothing left to add */
- }
- mix(a, b, c);
-
- return c;
-}
-
-/*
- * linux dcache hash
- */
-unsigned ceph_str_hash_linux(const char *str, unsigned length)
-{
- unsigned long hash = 0;
- unsigned char c;
-
- while (length--) {
- c = *str++;
- hash = (hash + (c << 4) + (c >> 4)) * 11;
- }
- return hash;
-}
-
-
-unsigned ceph_str_hash(int type, const char *s, unsigned len)
-{
- switch (type) {
- case CEPH_STR_HASH_LINUX:
- return ceph_str_hash_linux(s, len);
- case CEPH_STR_HASH_RJENKINS:
- return ceph_str_hash_rjenkins(s, len);
- default:
- return -1;
- }
-}
-
-const char *ceph_str_hash_name(int type)
-{
- switch (type) {
- case CEPH_STR_HASH_LINUX:
- return "linux";
- case CEPH_STR_HASH_RJENKINS:
- return "rjenkins";
- default:
- return "unknown";
- }
-}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
deleted file mode 100644
index d099c3f90236..000000000000
--- a/fs/ceph/ceph_hash.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef FS_CEPH_HASH_H
-#define FS_CEPH_HASH_H
-
-#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
-#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
-
-extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
-extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
-
-extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
-extern const char *ceph_str_hash_name(int type);
-
-#endif
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
deleted file mode 100644
index fabd302e5779..000000000000
--- a/fs/ceph/crush/crush.c
+++ /dev/null
@@ -1,151 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/slab.h>
-#else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
-#endif
-
-#include "crush.h"
-
-const char *crush_bucket_alg_name(int alg)
-{
- switch (alg) {
- case CRUSH_BUCKET_UNIFORM: return "uniform";
- case CRUSH_BUCKET_LIST: return "list";
- case CRUSH_BUCKET_TREE: return "tree";
- case CRUSH_BUCKET_STRAW: return "straw";
- default: return "unknown";
- }
-}
-
-/**
- * crush_get_bucket_item_weight - Get weight of an item in given bucket
- * @b: bucket pointer
- * @p: item index in bucket
- */
-int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
-{
- if (p >= b->size)
- return 0;
-
- switch (b->alg) {
- case CRUSH_BUCKET_UNIFORM:
- return ((struct crush_bucket_uniform *)b)->item_weight;
- case CRUSH_BUCKET_LIST:
- return ((struct crush_bucket_list *)b)->item_weights[p];
- case CRUSH_BUCKET_TREE:
- if (p & 1)
- return ((struct crush_bucket_tree *)b)->node_weights[p];
- return 0;
- case CRUSH_BUCKET_STRAW:
- return ((struct crush_bucket_straw *)b)->item_weights[p];
- }
- return 0;
-}
-
-/**
- * crush_calc_parents - Calculate parent vectors for the given crush map.
- * @map: crush_map pointer
- */
-void crush_calc_parents(struct crush_map *map)
-{
- int i, b, c;
-
- for (b = 0; b < map->max_buckets; b++) {
- if (map->buckets[b] == NULL)
- continue;
- for (i = 0; i < map->buckets[b]->size; i++) {
- c = map->buckets[b]->items[i];
- BUG_ON(c >= map->max_devices ||
- c < -map->max_buckets);
- if (c >= 0)
- map->device_parents[c] = map->buckets[b]->id;
- else
- map->bucket_parents[-1-c] = map->buckets[b]->id;
- }
- }
-}
-
-void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
-{
- kfree(b->h.perm);
- kfree(b->h.items);
- kfree(b);
-}
-
-void crush_destroy_bucket_list(struct crush_bucket_list *b)
-{
- kfree(b->item_weights);
- kfree(b->sum_weights);
- kfree(b->h.perm);
- kfree(b->h.items);
- kfree(b);
-}
-
-void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
-{
- kfree(b->node_weights);
- kfree(b);
-}
-
-void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
-{
- kfree(b->straws);
- kfree(b->item_weights);
- kfree(b->h.perm);
- kfree(b->h.items);
- kfree(b);
-}
-
-void crush_destroy_bucket(struct crush_bucket *b)
-{
- switch (b->alg) {
- case CRUSH_BUCKET_UNIFORM:
- crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
- break;
- case CRUSH_BUCKET_LIST:
- crush_destroy_bucket_list((struct crush_bucket_list *)b);
- break;
- case CRUSH_BUCKET_TREE:
- crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
- break;
- case CRUSH_BUCKET_STRAW:
- crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
- break;
- }
-}
-
-/**
- * crush_destroy - Destroy a crush_map
- * @map: crush_map pointer
- */
-void crush_destroy(struct crush_map *map)
-{
- int b;
-
- /* buckets */
- if (map->buckets) {
- for (b = 0; b < map->max_buckets; b++) {
- if (map->buckets[b] == NULL)
- continue;
- crush_destroy_bucket(map->buckets[b]);
- }
- kfree(map->buckets);
- }
-
- /* rules */
- if (map->rules) {
- for (b = 0; b < map->max_rules; b++)
- kfree(map->rules[b]);
- kfree(map->rules);
- }
-
- kfree(map->bucket_parents);
- kfree(map->device_parents);
- kfree(map);
-}
-
-
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
deleted file mode 100644
index 97e435b191f4..000000000000
--- a/fs/ceph/crush/crush.h
+++ /dev/null
@@ -1,180 +0,0 @@
-#ifndef CEPH_CRUSH_CRUSH_H
-#define CEPH_CRUSH_CRUSH_H
-
-#include <linux/types.h>
-
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-
-
-#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
-
-
-#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
-#define CRUSH_MAX_SET 10 /* max size of a mapping result */
-
-
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices. A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
- __u32 op;
- __s32 arg1;
- __s32 arg2;
-};
-
-/* step op codes */
-enum {
- CRUSH_RULE_NOOP = 0,
- CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
- CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
- /* arg2 = type */
- CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
- CRUSH_RULE_EMIT = 4, /* no args */
- CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
- CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
-};
-
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N 0
-#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
-
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
- __u8 ruleset;
- __u8 type;
- __u8 min_size;
- __u8 max_size;
-};
-
-struct crush_rule {
- __u32 len;
- struct crush_rule_mask mask;
- struct crush_rule_step steps[0];
-};
-
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
- (len)*sizeof(struct crush_rule_step))
-
-
-
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets). Items within a bucket are chosen using one of a
- * few different algorithms. The table summarizes how the speed of
- * each option measures up against mapping stability when items are
- * added or removed.
- *
- * Bucket Alg Speed Additions Removals
- * ------------------------------------------------
- * uniform O(1) poor poor
- * list O(n) optimal poor
- * tree O(log n) good good
- * straw O(n) optimal optimal
- */
-enum {
- CRUSH_BUCKET_UNIFORM = 1,
- CRUSH_BUCKET_LIST = 2,
- CRUSH_BUCKET_TREE = 3,
- CRUSH_BUCKET_STRAW = 4
-};
-extern const char *crush_bucket_alg_name(int alg);
-
-struct crush_bucket {
- __s32 id; /* this'll be negative */
- __u16 type; /* non-zero; type=0 is reserved for devices */
- __u8 alg; /* one of CRUSH_BUCKET_* */
- __u8 hash; /* which hash function to use, CRUSH_HASH_* */
- __u32 weight; /* 16-bit fixed point */
- __u32 size; /* num items */
- __s32 *items;
-
- /*
- * cached random permutation: used for uniform bucket and for
- * the linear search fallback for the other bucket types.
- */
- __u32 perm_x; /* @x for which *perm is defined */
- __u32 perm_n; /* num elements of *perm that are permuted/defined */
- __u32 *perm;
-};
-
-struct crush_bucket_uniform {
- struct crush_bucket h;
- __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
-};
-
-struct crush_bucket_list {
- struct crush_bucket h;
- __u32 *item_weights; /* 16-bit fixed point */
- __u32 *sum_weights; /* 16-bit fixed point. element i is sum
- of weights 0..i, inclusive */
-};
-
-struct crush_bucket_tree {
- struct crush_bucket h; /* note: h.size is _tree_ size, not number of
- actual items */
- __u8 num_nodes;
- __u32 *node_weights;
-};
-
-struct crush_bucket_straw {
- struct crush_bucket h;
- __u32 *item_weights; /* 16-bit fixed point */
- __u32 *straws; /* 16-bit fixed point */
-};
-
-
-
-/*
- * CRUSH map includes all buckets, rules, etc.
- */
-struct crush_map {
- struct crush_bucket **buckets;
- struct crush_rule **rules;
-
- /*
- * Parent pointers to identify the parent bucket a device or
- * bucket in the hierarchy. If an item appears more than
- * once, this is the _last_ time it appeared (where buckets
- * are processed in bucket id order, from -1 on down to
- * -max_buckets.
- */
- __u32 *bucket_parents;
- __u32 *device_parents;
-
- __s32 max_buckets;
- __u32 max_rules;
- __s32 max_devices;
-};
-
-
-/* crush.c */
-extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
-extern void crush_calc_parents(struct crush_map *map);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket(struct crush_bucket *b);
-extern void crush_destroy(struct crush_map *map);
-
-#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
deleted file mode 100644
index 5873aed694bf..000000000000
--- a/fs/ceph/crush/hash.c
+++ /dev/null
@@ -1,149 +0,0 @@
-
-#include <linux/types.h>
-#include "hash.h"
-
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do { \
- a = a-b; a = a-c; a = a^(c>>13); \
- b = b-c; b = b-a; b = b^(a<<8); \
- c = c-a; c = c-b; c = c^(b>>13); \
- a = a-b; a = a-c; a = a^(c>>12); \
- b = b-c; b = b-a; b = b^(a<<16); \
- c = c-a; c = c-b; c = c^(b>>5); \
- a = a-b; a = a-c; a = a^(c>>3); \
- b = b-c; b = b-a; b = b^(a<<10); \
- c = c-a; c = c-b; c = c^(b>>15); \
- } while (0)
-
-#define crush_hash_seed 1315423911
-
-static __u32 crush_hash32_rjenkins1(__u32 a)
-{
- __u32 hash = crush_hash_seed ^ a;
- __u32 b = a;
- __u32 x = 231232;
- __u32 y = 1232;
- crush_hashmix(b, x, hash);
- crush_hashmix(y, a, hash);
- return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
-{
- __u32 hash = crush_hash_seed ^ a ^ b;
- __u32 x = 231232;
- __u32 y = 1232;
- crush_hashmix(a, b, hash);
- crush_hashmix(x, a, hash);
- crush_hashmix(b, y, hash);
- return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
-{
- __u32 hash = crush_hash_seed ^ a ^ b ^ c;
- __u32 x = 231232;
- __u32 y = 1232;
- crush_hashmix(a, b, hash);
- crush_hashmix(c, x, hash);
- crush_hashmix(y, a, hash);
- crush_hashmix(b, x, hash);
- crush_hashmix(y, c, hash);
- return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
-{
- __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
- __u32 x = 231232;
- __u32 y = 1232;
- crush_hashmix(a, b, hash);
- crush_hashmix(c, d, hash);
- crush_hashmix(a, x, hash);
- crush_hashmix(y, b, hash);
- crush_hashmix(c, x, hash);
- crush_hashmix(y, d, hash);
- return hash;
-}
-
-static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
- __u32 e)
-{
- __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
- __u32 x = 231232;
- __u32 y = 1232;
- crush_hashmix(a, b, hash);
- crush_hashmix(c, d, hash);
- crush_hashmix(e, x, hash);
- crush_hashmix(y, a, hash);
- crush_hashmix(b, x, hash);
- crush_hashmix(y, c, hash);
- crush_hashmix(d, x, hash);
- crush_hashmix(y, e, hash);
- return hash;
-}
-
-
-__u32 crush_hash32(int type, __u32 a)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return crush_hash32_rjenkins1(a);
- default:
- return 0;
- }
-}
-
-__u32 crush_hash32_2(int type, __u32 a, __u32 b)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return crush_hash32_rjenkins1_2(a, b);
- default:
- return 0;
- }
-}
-
-__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return crush_hash32_rjenkins1_3(a, b, c);
- default:
- return 0;
- }
-}
-
-__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return crush_hash32_rjenkins1_4(a, b, c, d);
- default:
- return 0;
- }
-}
-
-__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return crush_hash32_rjenkins1_5(a, b, c, d, e);
- default:
- return 0;
- }
-}
-
-const char *crush_hash_name(int type)
-{
- switch (type) {
- case CRUSH_HASH_RJENKINS1:
- return "rjenkins1";
- default:
- return "unknown";
- }
-}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
deleted file mode 100644
index 91e884230d5d..000000000000
--- a/fs/ceph/crush/hash.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef CEPH_CRUSH_HASH_H
-#define CEPH_CRUSH_HASH_H
-
-#define CRUSH_HASH_RJENKINS1 0
-
-#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
-
-extern const char *crush_hash_name(int type);
-
-extern __u32 crush_hash32(int type, __u32 a);
-extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
-extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
-extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
- __u32 e);
-
-#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
deleted file mode 100644
index a4eec133258e..000000000000
--- a/fs/ceph/crush/mapper.c
+++ /dev/null
@@ -1,609 +0,0 @@
-
-#ifdef __KERNEL__
-# include <linux/string.h>
-# include <linux/slab.h>
-# include <linux/bug.h>
-# include <linux/kernel.h>
-# ifndef dprintk
-# define dprintk(args...)
-# endif
-#else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
-#endif
-
-#include "crush.h"
-#include "hash.h"
-
-/*
- * Implement the core CRUSH mapping algorithm.
- */
-
-/**
- * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
- * @map: the crush_map
- * @ruleset: the storage ruleset id (user defined)
- * @type: storage ruleset type (user defined)
- * @size: output set size
- */
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
-{
- int i;
-
- for (i = 0; i < map->max_rules; i++) {
- if (map->rules[i] &&
- map->rules[i]->mask.ruleset == ruleset &&
- map->rules[i]->mask.type == type &&
- map->rules[i]->mask.min_size <= size &&
- map->rules[i]->mask.max_size >= size)
- return i;
- }
- return -1;
-}
-
-
-/*
- * bucket choose methods
- *
- * For each bucket algorithm, we have a "choose" method that, given a
- * crush input @x and replica position (usually, position in output set) @r,
- * will produce an item in the bucket.
- */
-
-/*
- * Choose based on a random permutation of the bucket.
- *
- * We used to use some prime number arithmetic to do this, but it
- * wasn't very random, and had some other bad behaviors. Instead, we
- * calculate an actual random permutation of the bucket members.
- * Since this is expensive, we optimize for the r=0 case, which
- * captures the vast majority of calls.
- */
-static int bucket_perm_choose(struct crush_bucket *bucket,
- int x, int r)
-{
- unsigned pr = r % bucket->size;
- unsigned i, s;
-
- /* start a new permutation if @x has changed */
- if (bucket->perm_x != x || bucket->perm_n == 0) {
- dprintk("bucket %d new x=%d\n", bucket->id, x);
- bucket->perm_x = x;
-
- /* optimize common r=0 case */
- if (pr == 0) {
- s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
- bucket->size;
- bucket->perm[0] = s;
- bucket->perm_n = 0xffff; /* magic value, see below */
- goto out;
- }
-
- for (i = 0; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm_n = 0;
- } else if (bucket->perm_n == 0xffff) {
- /* clean up after the r=0 case above */
- for (i = 1; i < bucket->size; i++)
- bucket->perm[i] = i;
- bucket->perm[bucket->perm[0]] = 0;
- bucket->perm_n = 1;
- }
-
- /* calculate permutation up to pr */
- for (i = 0; i < bucket->perm_n; i++)
- dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
- while (bucket->perm_n <= pr) {
- unsigned p = bucket->perm_n;
- /* no point in swapping the final entry */
- if (p < bucket->size - 1) {
- i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
- (bucket->size - p);
- if (i) {
- unsigned t = bucket->perm[p + i];
- bucket->perm[p + i] = bucket->perm[p];
- bucket->perm[p] = t;
- }
- dprintk(" perm_choose swap %d with %d\n", p, p+i);
- }
- bucket->perm_n++;
- }
- for (i = 0; i < bucket->size; i++)
- dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
-
- s = bucket->perm[pr];
-out:
- dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
- bucket->size, x, r, pr, s);
- return bucket->items[s];
-}
-
-/* uniform */
-static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
- int x, int r)
-{
- return bucket_perm_choose(&bucket->h, x, r);
-}
-
-/* list */
-static int bucket_list_choose(struct crush_bucket_list *bucket,
- int x, int r)
-{
- int i;
-
- for (i = bucket->h.size-1; i >= 0; i--) {
- __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
- r, bucket->h.id);
- w &= 0xffff;
- dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
- "sw %x rand %llx",
- i, x, r, bucket->h.items[i], bucket->item_weights[i],
- bucket->sum_weights[i], w);
- w *= bucket->sum_weights[i];
- w = w >> 16;
- /*dprintk(" scaled %llx\n", w);*/
- if (w < bucket->item_weights[i])
- return bucket->h.items[i];
- }
-
- BUG_ON(1);
- return 0;
-}
-
-
-/* (binary) tree */
-static int height(int n)
-{
- int h = 0;
- while ((n & 1) == 0) {
- h++;
- n = n >> 1;
- }
- return h;
-}
-
-static int left(int x)
-{
- int h = height(x);
- return x - (1 << (h-1));
-}
-
-static int right(int x)
-{
- int h = height(x);
- return x + (1 << (h-1));
-}
-
-static int terminal(int x)
-{
- return x & 1;
-}
-
-static int bucket_tree_choose(struct crush_bucket_tree *bucket,
- int x, int r)
-{
- int n, l;
- __u32 w;
- __u64 t;
-
- /* start at root */
- n = bucket->num_nodes >> 1;
-
- while (!terminal(n)) {
- /* pick point in [0, w) */
- w = bucket->node_weights[n];
- t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
- bucket->h.id) * (__u64)w;
- t = t >> 32;
-
- /* descend to the left or right? */
- l = left(n);
- if (t < bucket->node_weights[l])
- n = l;
- else
- n = right(n);
- }
-
- return bucket->h.items[n >> 1];
-}
-
-
-/* straw */
-
-static int bucket_straw_choose(struct crush_bucket_straw *bucket,
- int x, int r)
-{
- int i;
- int high = 0;
- __u64 high_draw = 0;
- __u64 draw;
-
- for (i = 0; i < bucket->h.size; i++) {
- draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
- draw &= 0xffff;
- draw *= bucket->straws[i];
- if (i == 0 || draw > high_draw) {
- high = i;
- high_draw = draw;
- }
- }
- return bucket->h.items[high];
-}
-
-static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
-{
- dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
- switch (in->alg) {
- case CRUSH_BUCKET_UNIFORM:
- return bucket_uniform_choose((struct crush_bucket_uniform *)in,
- x, r);
- case CRUSH_BUCKET_LIST:
- return bucket_list_choose((struct crush_bucket_list *)in,
- x, r);
- case CRUSH_BUCKET_TREE:
- return bucket_tree_choose((struct crush_bucket_tree *)in,
- x, r);
- case CRUSH_BUCKET_STRAW:
- return bucket_straw_choose((struct crush_bucket_straw *)in,
- x, r);
- default:
- BUG_ON(1);
- return in->items[0];
- }
-}
-
-/*
- * true if device is marked "out" (failed, fully offloaded)
- * of the cluster
- */
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
-{
- if (weight[item] >= 0x10000)
- return 0;
- if (weight[item] == 0)
- return 1;
- if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
- < weight[item])
- return 0;
- return 1;
-}
-
-/**
- * crush_choose - choose numrep distinct items of given type
- * @map: the crush_map
- * @bucket: the bucket we are choose an item from
- * @x: crush input value
- * @numrep: the number of items to choose
- * @type: the type of item to choose
- * @out: pointer to output vector
- * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @out2: second output vector for leaf items (if @recurse_to_leaf)
- */
-static int crush_choose(struct crush_map *map,
- struct crush_bucket *bucket,
- __u32 *weight,
- int x, int numrep, int type,
- int *out, int outpos,
- int firstn, int recurse_to_leaf,
- int *out2)
-{
- int rep;
- int ftotal, flocal;
- int retry_descent, retry_bucket, skip_rep;
- struct crush_bucket *in = bucket;
- int r;
- int i;
- int item = 0;
- int itemtype;
- int collide, reject;
- const int orig_tries = 5; /* attempts before we fall back to search */
-
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
- bucket->id, x, outpos, numrep);
-
- for (rep = outpos; rep < numrep; rep++) {
- /* keep trying until we get a non-out, non-colliding item */
- ftotal = 0;
- skip_rep = 0;
- do {
- retry_descent = 0;
- in = bucket; /* initial bucket */
-
- /* choose through intervening buckets */
- flocal = 0;
- do {
- collide = 0;
- retry_bucket = 0;
- r = rep;
- if (in->alg == CRUSH_BUCKET_UNIFORM) {
- /* be careful */
- if (firstn || numrep >= in->size)
- /* r' = r + f_total */
- r += ftotal;
- else if (in->size % numrep == 0)
- /* r'=r+(n+1)*f_local */
- r += (numrep+1) *
- (flocal+ftotal);
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- } else {
- if (firstn)
- /* r' = r + f_total */
- r += ftotal;
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- }
-
- /* bucket choose */
- if (in->size == 0) {
- reject = 1;
- goto reject;
- }
- if (flocal >= (in->size>>1) &&
- flocal > orig_tries)
- item = bucket_perm_choose(in, x, r);
- else
- item = crush_bucket_choose(in, x, r);
- BUG_ON(item >= map->max_devices);
-
- /* desired type? */
- if (item < 0)
- itemtype = map->buckets[-1-item]->type;
- else
- itemtype = 0;
- dprintk(" item %d type %d\n", item, itemtype);
-
- /* keep going? */
- if (itemtype != type) {
- BUG_ON(item >= 0 ||
- (-1-item) >= map->max_buckets);
- in = map->buckets[-1-item];
- retry_bucket = 1;
- continue;
- }
-
- /* collision? */
- for (i = 0; i < outpos; i++) {
- if (out[i] == item) {
- collide = 1;
- break;
- }
- }
-
- reject = 0;
- if (recurse_to_leaf) {
- if (item < 0) {
- if (crush_choose(map,
- map->buckets[-1-item],
- weight,
- x, outpos+1, 0,
- out2, outpos,
- firstn, 0,
- NULL) <= outpos)
- /* didn't get leaf */
- reject = 1;
- } else {
- /* we already have a leaf! */
- out2[outpos] = item;
- }
- }
-
- if (!reject) {
- /* out? */
- if (itemtype == 0)
- reject = is_out(map, weight,
- item, x);
- else
- reject = 0;
- }
-
-reject:
- if (reject || collide) {
- ftotal++;
- flocal++;
-
- if (collide && flocal < 3)
- /* retry locally a few times */
- retry_bucket = 1;
- else if (flocal < in->size + orig_tries)
- /* exhaustive bucket search */
- retry_bucket = 1;
- else if (ftotal < 20)
- /* then retry descent */
- retry_descent = 1;
- else
- /* else give up */
- skip_rep = 1;
- dprintk(" reject %d collide %d "
- "ftotal %d flocal %d\n",
- reject, collide, ftotal,
- flocal);
- }
- } while (retry_bucket);
- } while (retry_descent);
-
- if (skip_rep) {
- dprintk("skip rep\n");
- continue;
- }
-
- dprintk("CHOOSE got %d\n", item);
- out[outpos] = item;
- outpos++;
- }
-
- dprintk("CHOOSE returns %d\n", outpos);
- return outpos;
-}
-
-
-/**
- * crush_do_rule - calculate a mapping with the given input and rule
- * @map: the crush_map
- * @ruleno: the rule id
- * @x: hash input
- * @result: pointer to result vector
- * @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
- */
-int crush_do_rule(struct crush_map *map,
- int ruleno, int x, int *result, int result_max,
- int force, __u32 *weight)
-{
- int result_len;
- int force_context[CRUSH_MAX_DEPTH];
- int force_pos = -1;
- int a[CRUSH_MAX_SET];
- int b[CRUSH_MAX_SET];
- int c[CRUSH_MAX_SET];
- int recurse_to_leaf;
- int *w;
- int wsize = 0;
- int *o;
- int osize;
- int *tmp;
- struct crush_rule *rule;
- int step;
- int i, j;
- int numrep;
- int firstn;
- int rc = -1;
-
- BUG_ON(ruleno >= map->max_rules);
-
- rule = map->rules[ruleno];
- result_len = 0;
- w = a;
- o = b;
-
- /*
- * determine hierarchical context of force, if any. note
- * that this may or may not correspond to the specific types
- * referenced by the crush rule.
- */
- if (force >= 0) {
- if (force >= map->max_devices ||
- map->device_parents[force] == 0) {
- /*dprintk("CRUSH: forcefed device dne\n");*/
- rc = -1; /* force fed device dne */
- goto out;
- }
- if (!is_out(map, weight, force, x)) {
- while (1) {
- force_context[++force_pos] = force;
- if (force >= 0)
- force = map->device_parents[force];
- else
- force = map->bucket_parents[-1-force];
- if (force == 0)
- break;
- }
- }
- }
-
- for (step = 0; step < rule->len; step++) {
- firstn = 0;
- switch (rule->steps[step].op) {
- case CRUSH_RULE_TAKE:
- w[0] = rule->steps[step].arg1;
- if (force_pos >= 0) {
- BUG_ON(force_context[force_pos] != w[0]);
- force_pos--;
- }
- wsize = 1;
- break;
-
- case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
- case CRUSH_RULE_CHOOSE_FIRSTN:
- firstn = 1;
- case CRUSH_RULE_CHOOSE_LEAF_INDEP:
- case CRUSH_RULE_CHOOSE_INDEP:
- BUG_ON(wsize == 0);
-
- recurse_to_leaf =
- rule->steps[step].op ==
- CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
- rule->steps[step].op ==
- CRUSH_RULE_CHOOSE_LEAF_INDEP;
-
- /* reset output */
- osize = 0;
-
- for (i = 0; i < wsize; i++) {
- /*
- * see CRUSH_N, CRUSH_N_MINUS macros.
- * basically, numrep <= 0 means relative to
- * the provided result_max
- */
- numrep = rule->steps[step].arg1;
- if (numrep <= 0) {
- numrep += result_max;
- if (numrep <= 0)
- continue;
- }
- j = 0;
- if (osize == 0 && force_pos >= 0) {
- /* skip any intermediate types */
- while (force_pos &&
- force_context[force_pos] < 0 &&
- rule->steps[step].arg2 !=
- map->buckets[-1 -
- force_context[force_pos]]->type)
- force_pos--;
- o[osize] = force_context[force_pos];
- if (recurse_to_leaf)
- c[osize] = force_context[0];
- j++;
- force_pos--;
- }
- osize += crush_choose(map,
- map->buckets[-1-w[i]],
- weight,
- x, numrep,
- rule->steps[step].arg2,
- o+osize, j,
- firstn,
- recurse_to_leaf, c+osize);
- }
-
- if (recurse_to_leaf)
- /* copy final _leaf_ values to output set */
- memcpy(o, c, osize*sizeof(*o));
-
- /* swap t and w arrays */
- tmp = o;
- o = w;
- w = tmp;
- wsize = osize;
- break;
-
-
- case CRUSH_RULE_EMIT:
- for (i = 0; i < wsize && result_len < result_max; i++) {
- result[result_len] = w[i];
- result_len++;
- }
- wsize = 0;
- break;
-
- default:
- BUG_ON(1);
- }
- }
- rc = result_len;
-
-out:
- return rc;
-}
-
-
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
deleted file mode 100644
index c46b99c18bb0..000000000000
--- a/fs/ceph/crush/mapper.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef CEPH_CRUSH_MAPPER_H
-#define CEPH_CRUSH_MAPPER_H
-
-/*
- * CRUSH functions for find rules and then mapping an input to an
- * output set.
- *
- * LGPL2
- */
-
-#include "crush.h"
-
-extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
-extern int crush_do_rule(struct crush_map *map,
- int ruleno,
- int x, int *result, int result_max,
- int forcefeed, /* -1 for none */
- __u32 *weights);
-
-#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
deleted file mode 100644
index a3e627f63293..000000000000
--- a/fs/ceph/crypto.c
+++ /dev/null
@@ -1,412 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/scatterlist.h>
-#include <linux/slab.h>
-#include <crypto/hash.h>
-
-#include "crypto.h"
-#include "decode.h"
-
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
- if (*p + sizeof(u16) + sizeof(key->created) +
- sizeof(u16) + key->len > end)
- return -ERANGE;
- ceph_encode_16(p, key->type);
- ceph_encode_copy(p, &key->created, sizeof(key->created));
- ceph_encode_16(p, key->len);
- ceph_encode_copy(p, key->key, key->len);
- return 0;
-}
-
-int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
-{
- ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
- key->type = ceph_decode_16(p);
- ceph_decode_copy(p, &key->created, sizeof(key->created));
- key->len = ceph_decode_16(p);
- ceph_decode_need(p, end, key->len, bad);
- key->key = kmalloc(key->len, GFP_NOFS);
- if (!key->key)
- return -ENOMEM;
- ceph_decode_copy(p, key->key, key->len);
- return 0;
-
-bad:
- dout("failed to decode crypto key\n");
- return -EINVAL;
-}
-
-int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
-{
- int inlen = strlen(inkey);
- int blen = inlen * 3 / 4;
- void *buf, *p;
- int ret;
-
- dout("crypto_key_unarmor %s\n", inkey);
- buf = kmalloc(blen, GFP_NOFS);
- if (!buf)
- return -ENOMEM;
- blen = ceph_unarmor(buf, inkey, inkey+inlen);
- if (blen < 0) {
- kfree(buf);
- return blen;
- }
-
- p = buf;
- ret = ceph_crypto_key_decode(key, &p, p + blen);
- kfree(buf);
- if (ret)
- return ret;
- dout("crypto_key_unarmor key %p type %d len %d\n", key,
- key->type, key->len);
- return 0;
-}
-
-
-
-#define AES_KEY_SIZE 16
-
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
-{
- return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
-}
-
-static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
-
-static int ceph_aes_encrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- struct scatterlist sg_in[2], sg_out[1];
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
- int ret;
- void *iv;
- int ivsize;
- size_t zero_padding = (0x10 - (src_len & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src_len + zero_padding;
-
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- sg_init_table(sg_in, 2);
- sg_set_buf(&sg_in[0], src, src_len);
- sg_set_buf(&sg_in[1], pad, zero_padding);
- sg_init_table(sg_out, 1);
- sg_set_buf(sg_out, dst, *dst_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
-
- memcpy(iv, aes_iv, ivsize);
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
- src_len + zero_padding);
- crypto_free_blkcipher(tfm);
- if (ret < 0)
- pr_err("ceph_aes_crypt failed %d\n", ret);
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
- return 0;
-}
-
-static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
- size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
-{
- struct scatterlist sg_in[3], sg_out[1];
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
- int ret;
- void *iv;
- int ivsize;
- size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
- char pad[16];
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- memset(pad, zero_padding, zero_padding);
-
- *dst_len = src1_len + src2_len + zero_padding;
-
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- sg_init_table(sg_in, 3);
- sg_set_buf(&sg_in[0], src1, src1_len);
- sg_set_buf(&sg_in[1], src2, src2_len);
- sg_set_buf(&sg_in[2], pad, zero_padding);
- sg_init_table(sg_out, 1);
- sg_set_buf(sg_out, dst, *dst_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
-
- memcpy(iv, aes_iv, ivsize);
- /*
- print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
- src1, src1_len, 1);
- print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
- src2, src2_len, 1);
- print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
- pad, zero_padding, 1);
- */
- ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
- src1_len + src2_len + zero_padding);
- crypto_free_blkcipher(tfm);
- if (ret < 0)
- pr_err("ceph_aes_crypt2 failed %d\n", ret);
- /*
- print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
- return 0;
-}
-
-static int ceph_aes_decrypt(const void *key, int key_len,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- struct scatterlist sg_in[1], sg_out[2];
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm };
- char pad[16];
- void *iv;
- int ivsize;
- int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- sg_init_table(sg_in, 1);
- sg_init_table(sg_out, 2);
- sg_set_buf(sg_in, src, src_len);
- sg_set_buf(&sg_out[0], dst, *dst_len);
- sg_set_buf(&sg_out[1], pad, sizeof(pad));
-
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
-
- memcpy(iv, aes_iv, ivsize);
-
- /*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- */
-
- ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
- crypto_free_blkcipher(tfm);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- return ret;
- }
-
- if (src_len <= *dst_len)
- last_byte = ((char *)dst)[src_len - 1];
- else
- last_byte = pad[src_len - *dst_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- *dst_len = src_len - last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
- /*
- print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
- dst, *dst_len, 1);
- */
- return 0;
-}
-
-static int ceph_aes_decrypt2(const void *key, int key_len,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- struct scatterlist sg_in[1], sg_out[3];
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm };
- char pad[16];
- void *iv;
- int ivsize;
- int ret;
- int last_byte;
-
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- sg_init_table(sg_in, 1);
- sg_set_buf(sg_in, src, src_len);
- sg_init_table(sg_out, 3);
- sg_set_buf(&sg_out[0], dst1, *dst1_len);
- sg_set_buf(&sg_out[1], dst2, *dst2_len);
- sg_set_buf(&sg_out[2], pad, sizeof(pad));
-
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
-
- memcpy(iv, aes_iv, ivsize);
-
- /*
- print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
- key, key_len, 1);
- print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
- src, src_len, 1);
- */
-
- ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
- crypto_free_blkcipher(tfm);
- if (ret < 0) {
- pr_err("ceph_aes_decrypt failed %d\n", ret);
- return ret;
- }
-
- if (src_len <= *dst1_len)
- last_byte = ((char *)dst1)[src_len - 1];
- else if (src_len <= *dst1_len + *dst2_len)
- last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
- else
- last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
- if (last_byte <= 16 && src_len >= last_byte) {
- src_len -= last_byte;
- } else {
- pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
- last_byte, (int)src_len);
- return -EPERM; /* bad padding */
- }
-
- if (src_len < *dst1_len) {
- *dst1_len = src_len;
- *dst2_len = 0;
- } else {
- *dst2_len = src_len - *dst1_len;
- }
- /*
- print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
- dst1, *dst1_len, 1);
- print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
- dst2, *dst2_len, 1);
- */
-
- return 0;
-}
-
-
-int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
-}
-
-int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len)
-{
- size_t t;
-
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst1_len + *dst2_len < src_len)
- return -ERANGE;
- t = min(*dst1_len, src_len);
- memcpy(dst1, src, t);
- *dst1_len = t;
- src += t;
- src_len -= t;
- if (src_len) {
- t = min(*dst2_len, src_len);
- memcpy(dst2, src, t);
- *dst2_len = t;
- }
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_decrypt2(secret->key, secret->len,
- dst1, dst1_len, dst2, dst2_len,
- src, src_len);
-
- default:
- return -EINVAL;
- }
-}
-
-int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src, size_t src_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src_len)
- return -ERANGE;
- memcpy(dst, src, src_len);
- *dst_len = src_len;
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt(secret->key, secret->len, dst,
- dst_len, src, src_len);
-
- default:
- return -EINVAL;
- }
-}
-
-int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len)
-{
- switch (secret->type) {
- case CEPH_CRYPTO_NONE:
- if (*dst_len < src1_len + src2_len)
- return -ERANGE;
- memcpy(dst, src1, src1_len);
- memcpy(dst + src1_len, src2, src2_len);
- *dst_len = src1_len + src2_len;
- return 0;
-
- case CEPH_CRYPTO_AES:
- return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
- src1, src1_len, src2, src2_len);
-
- default:
- return -EINVAL;
- }
-}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
deleted file mode 100644
index bdf38607323c..000000000000
--- a/fs/ceph/crypto.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef _FS_CEPH_CRYPTO_H
-#define _FS_CEPH_CRYPTO_H
-
-#include "types.h"
-#include "buffer.h"
-
-/*
- * cryptographic secret
- */
-struct ceph_crypto_key {
- int type;
- struct ceph_timespec created;
- int len;
- void *key;
-};
-
-static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
-{
- kfree(key->key);
-}
-
-extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
- void **p, void *end);
-extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
- void **p, void *end);
-extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
-
-/* crypto.c */
-extern int ceph_decrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-extern int ceph_encrypt(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src, size_t src_len);
-extern int ceph_decrypt2(struct ceph_crypto_key *secret,
- void *dst1, size_t *dst1_len,
- void *dst2, size_t *dst2_len,
- const void *src, size_t src_len);
-extern int ceph_encrypt2(struct ceph_crypto_key *secret,
- void *dst, size_t *dst_len,
- const void *src1, size_t src1_len,
- const void *src2, size_t src2_len);
-
-/* armor.c */
-extern int ceph_armor(char *dst, const char *src, const char *end);
-extern int ceph_unarmor(char *dst, const char *src, const char *end);
-
-#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/device.h>
#include <linux/slab.h>
@@ -7,143 +7,49 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
+
#include "super.h"
-#include "mds_client.h"
-#include "mon_client.h"
-#include "auth.h"
#ifdef CONFIG_DEBUG_FS
-/*
- * Implement /sys/kernel/debug/ceph fun
- *
- * /sys/kernel/debug/ceph/client* - an instance of the ceph client
- * .../osdmap - current osdmap
- * .../mdsmap - current mdsmap
- * .../monmap - current monmap
- * .../osdc - active osd requests
- * .../mdsc - active mds requests
- * .../monc - mon client state
- * .../dentry_lru - dump contents of dentry lru
- * .../caps - expose cap (reservation) stats
- * .../bdi - symlink to ../../bdi/something
- */
-
-static struct dentry *ceph_debugfs_dir;
-
-static int monmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
-
- if (client->monc.monmap == NULL)
- return 0;
-
- seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
- for (i = 0; i < client->monc.monmap->num_mon; i++) {
- struct ceph_entity_inst *inst =
- &client->monc.monmap->mon_inst[i];
-
- seq_printf(s, "\t%s%lld\t%s\n",
- ENTITY_NAME(inst->name),
- pr_addr(&inst->addr.in_addr));
- }
- return 0;
-}
+#include "mds_client.h"
static int mdsmap_show(struct seq_file *s, void *p)
{
int i;
- struct ceph_client *client = s->private;
+ struct ceph_fs_client *fsc = s->private;
- if (client->mdsc.mdsmap == NULL)
+ if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0;
- seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
- seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
seq_printf(s, "session_timeout %d\n",
- client->mdsc.mdsmap->m_session_timeout);
+ fsc->mdsc->mdsmap->m_session_timeout);
seq_printf(s, "session_autoclose %d\n",
- client->mdsc.mdsmap->m_session_autoclose);
- for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ fsc->mdsc->mdsmap->m_session_autoclose);
+ for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
struct ceph_entity_addr *addr =
- &client->mdsc.mdsmap->m_info[i].addr;
- int state = client->mdsc.mdsmap->m_info[i].state;
+ &fsc->mdsc->mdsmap->m_info[i].addr;
+ int state = fsc->mdsc->mdsmap->m_info[i].state;
- seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+ ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state));
}
return 0;
}
-static int osdmap_show(struct seq_file *s, void *p)
-{
- int i;
- struct ceph_client *client = s->private;
- struct rb_node *n;
-
- if (client->osdc.osdmap == NULL)
- return 0;
- seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
- seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL" : "",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL" : "");
- for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
- struct ceph_pg_pool_info *pool =
- rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
- pool->id, pool->v.pg_num, pool->pg_num_mask,
- pool->v.lpg_num, pool->lpg_num_mask);
- }
- for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
- struct ceph_entity_addr *addr =
- &client->osdc.osdmap->osd_addr[i];
- int state = client->osdc.osdmap->osd_state[i];
- char sb[64];
-
- seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
- i, pr_addr(&addr->in_addr),
- ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
- ceph_osdmap_state_str(sb, sizeof(sb), state));
- }
- return 0;
-}
-
-static int monc_show(struct seq_file *s, void *p)
-{
- struct ceph_client *client = s->private;
- struct ceph_mon_generic_request *req;
- struct ceph_mon_client *monc = &client->monc;
- struct rb_node *rp;
-
- mutex_lock(&monc->mutex);
-
- if (monc->have_mdsmap)
- seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
- if (monc->have_osdmap)
- seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
- if (monc->want_next_osdmap)
- seq_printf(s, "want next osdmap\n");
-
- for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
- __u16 op;
- req = rb_entry(rp, struct ceph_mon_generic_request, node);
- op = le16_to_cpu(req->request->hdr.type);
- if (op == CEPH_MSG_STATFS)
- seq_printf(s, "%lld statfs\n", req->tid);
- else
- seq_printf(s, "%lld unknown\n", req->tid);
- }
-
- mutex_unlock(&monc->mutex);
- return 0;
-}
-
+/*
+ * mdsc debugfs
+ */
static int mdsc_show(struct seq_file *s, void *p)
{
- struct ceph_client *client = s->private;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
return 0;
}
-static int osdc_show(struct seq_file *s, void *pp)
-{
- struct ceph_client *client = s->private;
- struct ceph_osd_client *osdc = &client->osdc;
- struct rb_node *p;
-
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- struct ceph_osd_request *req;
- struct ceph_osd_request_head *head;
- struct ceph_osd_op *op;
- int num_ops;
- int opcode, olen;
- int i;
-
- req = rb_entry(p, struct ceph_osd_request, r_node);
-
- seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1,
- le32_to_cpu(req->r_pgid.pool),
- le16_to_cpu(req->r_pgid.ps));
-
- head = req->r_request->front.iov_base;
- op = (void *)(head + 1);
-
- num_ops = le16_to_cpu(head->num_ops);
- olen = le32_to_cpu(head->object_len);
- seq_printf(s, "%.*s", olen,
- (const char *)(head->ops + num_ops));
-
- if (req->r_reassert_version.epoch)
- seq_printf(s, "\t%u'%llu",
- (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
- le64_to_cpu(req->r_reassert_version.version));
- else
- seq_printf(s, "\t");
-
- for (i = 0; i < num_ops; i++) {
- opcode = le16_to_cpu(op->op);
- seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
- op++;
- }
-
- seq_printf(s, "\n");
- }
- mutex_unlock(&osdc->request_mutex);
- return 0;
-}
-
static int caps_show(struct seq_file *s, void *p)
{
- struct ceph_client *client = s->private;
+ struct ceph_fs_client *fsc = s->private;
int total, avail, used, reserved, min;
- ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+ ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
"avail\t\t%d\n"
"used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
static int dentry_lru_show(struct seq_file *s, void *ptr)
{
- struct ceph_client *client = s->private;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = s->private;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_dentry_info *di;
spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
return 0;
}
-#define DEFINE_SHOW_FUNC(name) \
-static int name##_open(struct inode *inode, struct file *file) \
-{ \
- struct seq_file *sf; \
- int ret; \
- \
- ret = single_open(file, name, NULL); \
- sf = file->private_data; \
- sf->private = inode->i_private; \
- return ret; \
-} \
- \
-static const struct file_operations name##_fops = { \
- .open = name##_open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
-};
-
-DEFINE_SHOW_FUNC(monmap_show)
-DEFINE_SHOW_FUNC(mdsmap_show)
-DEFINE_SHOW_FUNC(osdmap_show)
-DEFINE_SHOW_FUNC(monc_show)
-DEFINE_SHOW_FUNC(mdsc_show)
-DEFINE_SHOW_FUNC(osdc_show)
-DEFINE_SHOW_FUNC(dentry_lru_show)
-DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
+CEPH_DEFINE_SHOW_FUNC(mdsc_show)
+CEPH_DEFINE_SHOW_FUNC(caps_show)
+CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
+
+/*
+ * debugfs
+ */
static int congestion_kb_set(void *data, u64 val)
{
- struct ceph_client *client = (struct ceph_client *)data;
-
- if (client)
- client->mount_args->congestion_kb = (int)val;
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ fsc->mount_options->congestion_kb = (int)val;
return 0;
}
static int congestion_kb_get(void *data, u64 *val)
{
- struct ceph_client *client = (struct ceph_client *)data;
-
- if (client)
- *val = (u64)client->mount_args->congestion_kb;
+ struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
+ *val = (u64)fsc->mount_options->congestion_kb;
return 0;
}
-
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
congestion_kb_set, "%llu\n");
-int __init ceph_debugfs_init(void)
-{
- ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
- if (!ceph_debugfs_dir)
- return -ENOMEM;
- return 0;
-}
-void ceph_debugfs_cleanup(void)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
- debugfs_remove(ceph_debugfs_dir);
+ dout("ceph_fs_debugfs_cleanup\n");
+ debugfs_remove(fsc->debugfs_bdi);
+ debugfs_remove(fsc->debugfs_congestion_kb);
+ debugfs_remove(fsc->debugfs_mdsmap);
+ debugfs_remove(fsc->debugfs_caps);
+ debugfs_remove(fsc->debugfs_mdsc);
+ debugfs_remove(fsc->debugfs_dentry_lru);
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- int ret = 0;
- char name[80];
-
- snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
- client->monc.auth->global_id);
+ char name[100];
+ int err = -ENOMEM;
- client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
- if (!client->debugfs_dir)
- goto out;
-
- client->monc.debugfs_file = debugfs_create_file("monc",
- 0600,
- client->debugfs_dir,
- client,
- &monc_show_fops);
- if (!client->monc.debugfs_file)
+ dout("ceph_fs_debugfs_init\n");
+ fsc->debugfs_congestion_kb =
+ debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &congestion_kb_fops);
+ if (!fsc->debugfs_congestion_kb)
goto out;
- client->mdsc.debugfs_file = debugfs_create_file("mdsc",
- 0600,
- client->debugfs_dir,
- client,
- &mdsc_show_fops);
- if (!client->mdsc.debugfs_file)
- goto out;
+ dout("a\n");
- client->osdc.debugfs_file = debugfs_create_file("osdc",
- 0600,
- client->debugfs_dir,
- client,
- &osdc_show_fops);
- if (!client->osdc.debugfs_file)
+ snprintf(name, sizeof(name), "../../bdi/%s",
+ dev_name(fsc->backing_dev_info.dev));
+ fsc->debugfs_bdi =
+ debugfs_create_symlink("bdi",
+ fsc->client->debugfs_dir,
+ name);
+ if (!fsc->debugfs_bdi)
goto out;
- client->debugfs_monmap = debugfs_create_file("monmap",
+ dout("b\n");
+ fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
0600,
- client->debugfs_dir,
- client,
- &monmap_show_fops);
- if (!client->debugfs_monmap)
- goto out;
-
- client->debugfs_mdsmap = debugfs_create_file("mdsmap",
- 0600,
- client->debugfs_dir,
- client,
+ fsc->client->debugfs_dir,
+ fsc,
&mdsmap_show_fops);
- if (!client->debugfs_mdsmap)
- goto out;
-
- client->debugfs_osdmap = debugfs_create_file("osdmap",
- 0600,
- client->debugfs_dir,
- client,
- &osdmap_show_fops);
- if (!client->debugfs_osdmap)
+ if (!fsc->debugfs_mdsmap)
goto out;
- client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
- 0600,
- client->debugfs_dir,
- client,
- &dentry_lru_show_fops);
- if (!client->debugfs_dentry_lru)
+ dout("ca\n");
+ fsc->debugfs_mdsc = debugfs_create_file("mdsc",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &mdsc_show_fops);
+ if (!fsc->debugfs_mdsc)
goto out;
- client->debugfs_caps = debugfs_create_file("caps",
+ dout("da\n");
+ fsc->debugfs_caps = debugfs_create_file("caps",
0400,
- client->debugfs_dir,
- client,
+ fsc->client->debugfs_dir,
+ fsc,
&caps_show_fops);
- if (!client->debugfs_caps)
+ if (!fsc->debugfs_caps)
goto out;
- client->debugfs_congestion_kb =
- debugfs_create_file("writeback_congestion_kb",
- 0600,
- client->debugfs_dir,
- client,
- &congestion_kb_fops);
- if (!client->debugfs_congestion_kb)
+ dout("ea\n");
+ fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0600,
+ fsc->client->debugfs_dir,
+ fsc,
+ &dentry_lru_show_fops);
+ if (!fsc->debugfs_dentry_lru)
goto out;
- sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
- client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
- name);
-
return 0;
out:
- ceph_debugfs_client_cleanup(client);
- return ret;
+ ceph_fs_debugfs_cleanup(fsc);
+ return err;
}
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
-{
- debugfs_remove(client->debugfs_bdi);
- debugfs_remove(client->debugfs_caps);
- debugfs_remove(client->debugfs_dentry_lru);
- debugfs_remove(client->debugfs_osdmap);
- debugfs_remove(client->debugfs_mdsmap);
- debugfs_remove(client->debugfs_monmap);
- debugfs_remove(client->osdc.debugfs_file);
- debugfs_remove(client->mdsc.debugfs_file);
- debugfs_remove(client->monc.debugfs_file);
- debugfs_remove(client->debugfs_congestion_kb);
- debugfs_remove(client->debugfs_dir);
-}
#else /* CONFIG_DEBUG_FS */
-int __init ceph_debugfs_init(void)
-{
- return 0;
-}
-
-void ceph_debugfs_cleanup(void)
-{
-}
-
-int ceph_debugfs_client_init(struct ceph_client *client)
+int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
return 0;
}
-void ceph_debugfs_client_cleanup(struct ceph_client *client)
+void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
{
}
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
deleted file mode 100644
index 3d25415afe63..000000000000
--- a/fs/ceph/decode.h
+++ /dev/null
@@ -1,196 +0,0 @@
-#ifndef __CEPH_DECODE_H
-#define __CEPH_DECODE_H
-
-#include <asm/unaligned.h>
-#include <linux/time.h>
-
-#include "types.h"
-
-/*
- * in all cases,
- * void **p pointer to position pointer
- * void *end pointer to end of buffer (last byte + 1)
- */
-
-static inline u64 ceph_decode_64(void **p)
-{
- u64 v = get_unaligned_le64(*p);
- *p += sizeof(u64);
- return v;
-}
-static inline u32 ceph_decode_32(void **p)
-{
- u32 v = get_unaligned_le32(*p);
- *p += sizeof(u32);
- return v;
-}
-static inline u16 ceph_decode_16(void **p)
-{
- u16 v = get_unaligned_le16(*p);
- *p += sizeof(u16);
- return v;
-}
-static inline u8 ceph_decode_8(void **p)
-{
- u8 v = *(u8 *)*p;
- (*p)++;
- return v;
-}
-static inline void ceph_decode_copy(void **p, void *pv, size_t n)
-{
- memcpy(pv, *p, n);
- *p += n;
-}
-
-/*
- * bounds check input.
- */
-#define ceph_decode_need(p, end, n, bad) \
- do { \
- if (unlikely(*(p) + (n) > (end))) \
- goto bad; \
- } while (0)
-
-#define ceph_decode_64_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u64), bad); \
- v = ceph_decode_64(p); \
- } while (0)
-#define ceph_decode_32_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u32), bad); \
- v = ceph_decode_32(p); \
- } while (0)
-#define ceph_decode_16_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u16), bad); \
- v = ceph_decode_16(p); \
- } while (0)
-#define ceph_decode_8_safe(p, end, v, bad) \
- do { \
- ceph_decode_need(p, end, sizeof(u8), bad); \
- v = ceph_decode_8(p); \
- } while (0)
-
-#define ceph_decode_copy_safe(p, end, pv, n, bad) \
- do { \
- ceph_decode_need(p, end, n, bad); \
- ceph_decode_copy(p, pv, n); \
- } while (0)
-
-/*
- * struct ceph_timespec <-> struct timespec
- */
-static inline void ceph_decode_timespec(struct timespec *ts,
- const struct ceph_timespec *tv)
-{
- ts->tv_sec = le32_to_cpu(tv->tv_sec);
- ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
-}
-static inline void ceph_encode_timespec(struct ceph_timespec *tv,
- const struct timespec *ts)
-{
- tv->tv_sec = cpu_to_le32(ts->tv_sec);
- tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
-}
-
-/*
- * sockaddr_storage <-> ceph_sockaddr
- */
-static inline void ceph_encode_addr(struct ceph_entity_addr *a)
-{
- __be16 ss_family = htons(a->in_addr.ss_family);
- a->in_addr.ss_family = *(__u16 *)&ss_family;
-}
-static inline void ceph_decode_addr(struct ceph_entity_addr *a)
-{
- __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
- a->in_addr.ss_family = ntohs(ss_family);
- WARN_ON(a->in_addr.ss_family == 512);
-}
-
-/*
- * encoders
- */
-static inline void ceph_encode_64(void **p, u64 v)
-{
- put_unaligned_le64(v, (__le64 *)*p);
- *p += sizeof(u64);
-}
-static inline void ceph_encode_32(void **p, u32 v)
-{
- put_unaligned_le32(v, (__le32 *)*p);
- *p += sizeof(u32);
-}
-static inline void ceph_encode_16(void **p, u16 v)
-{
- put_unaligned_le16(v, (__le16 *)*p);
- *p += sizeof(u16);
-}
-static inline void ceph_encode_8(void **p, u8 v)
-{
- *(u8 *)*p = v;
- (*p)++;
-}
-static inline void ceph_encode_copy(void **p, const void *s, int len)
-{
- memcpy(*p, s, len);
- *p += len;
-}
-
-/*
- * filepath, string encoders
- */
-static inline void ceph_encode_filepath(void **p, void *end,
- u64 ino, const char *path)
-{
- u32 len = path ? strlen(path) : 0;
- BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
- ceph_encode_8(p, 1);
- ceph_encode_64(p, ino);
- ceph_encode_32(p, len);
- if (len)
- memcpy(*p, path, len);
- *p += len;
-}
-
-static inline void ceph_encode_string(void **p, void *end,
- const char *s, u32 len)
-{
- BUG_ON(*p + sizeof(len) + len > end);
- ceph_encode_32(p, len);
- if (len)
- memcpy(*p, s, len);
- *p += len;
-}
-
-#define ceph_encode_need(p, end, n, bad) \
- do { \
- if (unlikely(*(p) + (n) > (end))) \
- goto bad; \
- } while (0)
-
-#define ceph_encode_64_safe(p, end, v, bad) \
- do { \
- ceph_encode_need(p, end, sizeof(u64), bad); \
- ceph_encode_64(p, v); \
- } while (0)
-#define ceph_encode_32_safe(p, end, v, bad) \
- do { \
- ceph_encode_need(p, end, sizeof(u32), bad); \
- ceph_encode_32(p, v); \
- } while (0)
-#define ceph_encode_16_safe(p, end, v, bad) \
- do { \
- ceph_encode_need(p, end, sizeof(u16), bad); \
- ceph_encode_16(p, v); \
- } while (0)
-
-#define ceph_encode_copy_safe(p, end, pv, n, bad) \
- do { \
- ceph_encode_need(p, end, n, bad); \
- ceph_encode_copy(p, pv, n); \
- } while (0)
-
-
-#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/spinlock.h>
#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include "super.h"
+#include "mds_client.h"
/*
* Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
*/
static int __dcache_readdir(struct file *filp,
void *dirent, filldir_t filldir)
- __releases(inode->i_lock)
- __acquires(inode->i_lock)
{
- struct inode *inode = filp->f_dentry->d_inode;
struct ceph_file_info *fi = filp->private_data;
struct dentry *parent = filp->f_dentry;
struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
atomic_inc(&dentry->d_count);
spin_unlock(&dcache_lock);
- spin_unlock(&inode->i_lock);
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
} else {
dput(last);
}
- last = NULL;
}
-
- spin_lock(&inode->i_lock);
- spin_lock(&dcache_lock);
-
last = dentry;
if (err < 0)
- goto out_unlock;
+ goto out;
- p = p->prev;
filp->f_pos++;
/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
- if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
- goto more;
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
- err = -EAGAIN;
+ if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
+ dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ err = -EAGAIN;
+ goto out;
+ }
+
+ spin_lock(&dcache_lock);
+ p = p->prev; /* advance to next dentry */
+ goto more;
out_unlock:
spin_unlock(&dcache_lock);
-
- if (last) {
- spin_unlock(&inode->i_lock);
+out:
+ if (last)
dput(last);
- spin_lock(&inode->i_lock);
- }
-
return err;
}
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct ceph_file_info *fi = filp->private_data;
struct inode *inode = filp->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned frag = fpos_frag(filp->f_pos);
int off = fpos_off(filp->f_pos);
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = client->mount_args->max_readdir;
- const int max_bytes = client->mount_args->max_readdir_bytes;
+ const int max_entries = fsc->mount_options->max_readdir;
+ const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* can we use the dcache? */
spin_lock(&inode->i_lock);
if ((filp->f_pos == 2 || fi->dentry) &&
- !ceph_test_opt(client, NOASYNCREADDIR) &&
+ !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ spin_unlock(&inode->i_lock);
err = __dcache_readdir(filp, dirent, filldir);
- if (err != -EAGAIN) {
- spin_unlock(&inode->i_lock);
+ if (err != -EAGAIN)
return err;
- }
+ } else {
+ spin_unlock(&inode->i_lock);
}
- spin_unlock(&inode->i_lock);
if (fi->dentry) {
err = note_last_dentry(fi, fi->dentry->d_name.name,
fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
- struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = dentry->d_parent->d_inode;
/* .snap dir? */
if (err == -ENOENT &&
- ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
strcmp(dentry->d_name.name,
- client->mount_args->snapdir_name) == 0) {
+ fsc->mount_options->snapdir_name) == 0) {
struct inode *inode = ceph_get_snapdir(parent);
dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int op;
int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
spin_lock(&dir->i_lock);
dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
- client->mount_args->snapdir_name,
+ fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
int mode, dev_t rdev)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
static int ceph_symlink(struct inode *dir, struct dentry *dentry,
const char *dest)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err = -EROFS;
int op;
@@ -758,8 +749,8 @@ out:
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
*/
static int ceph_unlink(struct inode *dir, struct dentry *dentry)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
struct ceph_mds_request *req;
int err = -EROFS;
@@ -854,8 +845,8 @@ out:
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
- if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_add_tail(&di->lru, &mdsc->dentry_lru);
mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
dn->d_name.len, dn->d_name.name, di->offset);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_move_tail(&di->lru, &mdsc->dentry_lru);
spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
+ mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_del_init(&di->lru);
mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 4480cb1c63e7..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/exportfs.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
#include "super.h"
+#include "mds_client.h"
/*
* NFS export support
@@ -42,32 +43,37 @@ struct ceph_nfs_confh {
static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
int connectable)
{
+ int type;
struct ceph_nfs_fh *fh = (void *)rawfh;
struct ceph_nfs_confh *cfh = (void *)rawfh;
struct dentry *parent = dentry->d_parent;
struct inode *inode = dentry->d_inode;
- int type;
+ int connected_handle_length = sizeof(*cfh)/4; /* in 4-byte units, like the u32 rawfh */
+ int handle_length = sizeof(*fh)/4; /* in 4-byte units, like the u32 rawfh */
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- if (*max_len >= sizeof(*cfh)) {
+ if (*max_len >= connected_handle_length) { /* room for the connectable handle */
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
cfh->parent_ino = ceph_ino(parent->d_inode);
cfh->parent_name_hash = parent->d_name.hash;
- *max_len = sizeof(*cfh);
+ *max_len = connected_handle_length; /* report the length actually used */
type = 2;
- } else if (*max_len > sizeof(*fh)) {
- if (connectable)
- return -ENOSPC;
+ } else if (*max_len >= handle_length) { /* room for the plain handle only */
+ if (connectable) {
+ *max_len = connected_handle_length; /* report the length we would need */
+ return 255; /* NOTE(review): presumably the "buffer too small" code expected by the caller - confirm */
+ }
dout("encode_fh %p\n", dentry);
fh->ino = ceph_ino(dentry->d_inode);
- *max_len = sizeof(*fh);
+ *max_len = handle_length; /* report the length actually used */
type = 1;
} else {
- return -ENOSPC;
+ *max_len = handle_length; /* report the minimum length we need */
+ return 255; /* NOTE(review): presumably the "buffer too small" code expected by the caller - confirm */
}
return type;
}
@@ -115,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
static struct dentry *__cfh_to_dentry(struct super_block *sb,
struct ceph_nfs_confh *cfh)
{
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c044a4f0457..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include <linux/module.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -38,8 +39,8 @@
static struct ceph_mds_request *
prepare_open_request(struct super_block *sb, int flags, int create_mode)
{
- struct ceph_client *client = ceph_sb_to_client(sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int want_auth = USE_ANY_MDS;
int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
int ceph_open(struct inode *inode, struct file *file)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
struct nameidata *nd, int mode,
int locked_dir)
{
- struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct file *file = nd->intent.open.file;
struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
}
/*
- * build a vector of user pages
- */
-static struct page **get_direct_page_vector(const char __user *data,
- int num_pages,
- loff_t off, size_t len)
-{
- struct page **pages;
- int rc;
-
- pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- down_read(&current->mm->mmap_sem);
- rc = get_user_pages(current, current->mm, (unsigned long)data,
- num_pages, 0, 0, pages, NULL);
- up_read(&current->mm->mmap_sem);
- if (rc < 0)
- goto fail;
- return pages;
-
-fail:
- kfree(pages);
- return ERR_PTR(rc);
-}
-
-static void put_page_vector(struct page **pages, int num_pages)
-{
- int i;
-
- for (i = 0; i < num_pages; i++)
- put_page(pages[i]);
- kfree(pages);
-}
-
-void ceph_release_page_vector(struct page **pages, int num_pages)
-{
- int i;
-
- for (i = 0; i < num_pages; i++)
- __free_pages(pages[i], 0);
- kfree(pages);
-}
-
-/*
- * allocate a vector new pages
- */
-static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
-{
- struct page **pages;
- int i;
-
- pages = kmalloc(sizeof(*pages) * num_pages, flags);
- if (!pages)
- return ERR_PTR(-ENOMEM);
- for (i = 0; i < num_pages; i++) {
- pages[i] = __page_cache_alloc(flags);
- if (pages[i] == NULL) {
- ceph_release_page_vector(pages, i);
- return ERR_PTR(-ENOMEM);
- }
- }
- return pages;
-}
-
-/*
- * copy user data into a page vector
- */
-static int copy_user_to_page_vector(struct page **pages,
- const char __user *data,
- loff_t off, size_t len)
-{
- int i = 0;
- int po = off & ~PAGE_CACHE_MASK;
- int left = len;
- int l, bad;
-
- while (left > 0) {
- l = min_t(int, PAGE_CACHE_SIZE-po, left);
- bad = copy_from_user(page_address(pages[i]) + po, data, l);
- if (bad == l)
- return -EFAULT;
- data += l - bad;
- left -= l - bad;
- po += l - bad;
- if (po == PAGE_CACHE_SIZE) {
- po = 0;
- i++;
- }
- }
- return len;
-}
-
-/*
- * copy user data from a page vector into a user pointer
- */
-static int copy_page_vector_to_user(struct page **pages, char __user *data,
- loff_t off, size_t len)
-{
- int i = 0;
- int po = off & ~PAGE_CACHE_MASK;
- int left = len;
- int l, bad;
-
- while (left > 0) {
- l = min_t(int, left, PAGE_CACHE_SIZE-po);
- bad = copy_to_user(data, page_address(pages[i]) + po, l);
- if (bad == l)
- return -EFAULT;
- data += l - bad;
- left -= l - bad;
- if (po) {
- po += l - bad;
- if (po == PAGE_CACHE_SIZE)
- po = 0;
- }
- i++;
- }
- return len;
-}
-
-/*
- * Zero an extent within a page vector. Offset is relative to the
- * start of the first page.
- */
-static void zero_page_vector_range(int off, int len, struct page **pages)
-{
- int i = off >> PAGE_CACHE_SHIFT;
-
- off &= ~PAGE_CACHE_MASK;
-
- dout("zero_page_vector_page %u~%u\n", off, len);
-
- /* leading partial page? */
- if (off) {
- int end = min((int)PAGE_CACHE_SIZE, off + len);
- dout("zeroing %d %p head from %d\n", i, pages[i],
- (int)off);
- zero_user_segment(pages[i], off, end);
- len -= (end - off);
- i++;
- }
- while (len >= PAGE_CACHE_SIZE) {
- dout("zeroing %d %p len=%d\n", i, pages[i], len);
- zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
- i++;
- }
- /* trailing partial page? */
- if (len) {
- dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
- zero_user_segment(pages[i], 0, len);
- }
-}
-
-
-/*
* Read a range of bytes striped over one or more objects. Iterate over
* objects we stripe over. (That's not atomic, but good enough for now.)
*
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
struct page **pages, int num_pages,
int *checkeof)
{
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len;
int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
more:
this_len = left;
- ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+ ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
ci->i_truncate_seq,
ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
if (read < pos - off) {
dout(" zero gap %llu to %llu\n", off + read, pos);
- zero_page_vector_range(page_off + read,
- pos - off - read, pages);
+ ceph_zero_page_vector_range(page_off + read,
+ pos - off - read, pages);
}
pos += ret;
read = pos - off;
@@ -495,8 +339,8 @@ more:
/* was original extent fully inside i_size? */
if (pos + left <= inode->i_size) {
dout("zero tail\n");
- zero_page_vector_range(page_off + read, len - read,
- pages);
+ ceph_zero_page_vector_range(page_off + read, len - read,
+ pages);
read = len;
goto out;
}
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
if (file->f_flags & O_DIRECT) {
- pages = get_direct_page_vector(data, num_pages, off, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, off, len);
/*
* flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
ret = striped_read(inode, off, len, pages, num_pages, checkeof);
if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = copy_page_vector_to_user(pages, data, off, ret);
+ ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
if (ret >= 0)
*poff = off + ret;
done:
if (file->f_flags & O_DIRECT)
- put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages);
else
ceph_release_page_vector(pages, num_pages);
dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
struct page **pages;
int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
*/
more:
len = left;
- req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), pos, &len,
CEPH_OSD_OP_WRITE, flags,
ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
num_pages = calc_pages_for(pos, len);
if (file->f_flags & O_DIRECT) {
- pages = get_direct_page_vector(data, num_pages, pos, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -673,7 +517,7 @@ more:
ret = PTR_ERR(pages);
goto out;
}
- ret = copy_user_to_page_vector(pages, data, pos, len);
+ ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
if (ret < 0) {
ceph_release_page_vector(pages, num_pages);
goto out;
@@ -689,7 +533,7 @@ more:
req->r_num_pages = num_pages;
req->r_inode = inode;
- ret = ceph_osdc_start_request(&client->osdc, req, false);
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
if (!ret) {
if (req->r_safe_callback) {
/*
@@ -697,15 +541,15 @@ more:
* start_request so that a tid has been assigned.
*/
spin_lock(&ci->i_unsafe_lock);
- list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+ list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
spin_unlock(&ci->i_unsafe_lock);
ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
}
- ret = ceph_osdc_wait_request(&client->osdc, req);
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
}
if (file->f_flags & O_DIRECT)
- put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages);
else if (file->f_flags & O_SYNC)
ceph_release_page_vector(pages, num_pages);
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_sb_to_client(inode->i_sb)->client->osdc;
loff_t endoff = pos + iov->iov_len;
int want, got = 0;
int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -13,7 +13,8 @@
#include <linux/pagevec.h>
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+#include <linux/ceph/decode.h>
/*
* Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+ ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
}
/* it may be better to set st_size in getattr instead? */
- if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
inode->i_size = ci->i_rbytes;
break;
default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
struct inode *in = NULL;
struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
- struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
int i = 0;
int err = 0;
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
*/
if (rinfo->head->is_dentry && !req->r_aborted &&
(rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
- client->mount_args->snapdir_name,
+ fsc->mount_options->snapdir_name,
req->r_dentry->d_name.len))) {
/*
* lookup link rename : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *parent_inode = dentry->d_parent->d_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
@@ -1728,8 +1729,8 @@ out:
*/
int ceph_do_getattr(struct inode *inode, int mask)
{
- struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int err;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
#include <linux/in.h>
-#include "ioctl.h"
#include "super.h"
-#include "ceph_debug.h"
+#include "mds_client.h"
+#include <linux/ceph/ceph_debug.h>
+
+#include "ioctl.h"
/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file->f_dentry->d_inode;
struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
}
/*
+ * Set a layout policy on a directory inode. Everything created in the
+ * tree rooted at this inode inherits this layout, unless a subdirectory
+ * has its own layout policy. The policy is not applied retroactively
+ * to items that already exist.
+ */
+static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err, i;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ /* copy the layout from userspace (-EFAULT on failure) and validate it */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ if ((l.object_size & ~PAGE_MASK) || /* object size not a whole number of pages */
+ (l.stripe_unit & ~PAGE_MASK) || /* stripe unit not a whole number of pages */
+ !l.stripe_unit || /* stripe unit must be non-zero */
+ (l.object_size &&
+ (unsigned)l.object_size % (unsigned)l.stripe_unit)) /* object size not a multiple of the stripe unit */
+ return -EINVAL;
+
+ /* a data_pool > 0 must be one of the mdsmap's data pools, else -EINVAL */
+ if (l.data_pool > 0) {
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+ }
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
+ USE_AUTH_MDS);
+
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ /* fill in the requested layout; each field is sent as little-endian 32-bit */
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool =
+ cpu_to_le32(l.data_pool);
+ req->r_args.setlayout.layout.fl_pg_preferred =
+ cpu_to_le32(l.preferred_osd);
+
+ err = ceph_mdsc_do_request(mdsc, inode, req);
+ ceph_mdsc_put_request(req);
+ return err; /* result of the MDS request */
+}
+
+/*
* Return object name, size/offset information, and location (OSD
* number, network address) for a given file offset.
*/
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_ioctl_dataloc dl;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc =
+ &ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen;
u64 tmp;
struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case CEPH_IOC_SET_LAYOUT:
return ceph_ioctl_set_layout(file, (void __user *)arg);
+ case CEPH_IOC_SET_LAYOUT_POLICY:
+ return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
+
case CEPH_IOC_GET_DATALOC:
return ceph_ioctl_get_dataloc(file, (void __user *)arg);
case CEPH_IOC_LAZYIO:
return ceph_ioctl_lazyio(file);
}
+
return -ENOTTY;
}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
#include <linux/ioctl.h>
#include <linux/types.h>
-#define CEPH_IOCTL_MAGIC 0x97
+#define CEPH_IOCTL_MAGIC 0x98
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
struct ceph_ioctl_layout)
#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
+ struct ceph_ioctl_layout)
/*
* Extract identity, address of the OSD and object storing a given
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/file.h>
#include <linux/namei.h>
#include "super.h"
#include "mds_client.h"
-#include "pagelist.h"
+#include <linux/ceph/pagelist.h>
/**
* Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
* Encode the flock and fcntl locks for the given inode into the pagelist.
* Format is: #fcntl locks, sequential fcntl locks, #flock locks,
* sequential flock locks.
- * Must be called with BLK already held, and the lock numbers should have
- * been gathered under the same lock holding window.
+ * Must be called with lock_flocks() already held.
+ * If we encounter more of a specific lock type than expected,
+ * we fail with -ENOSPC.
*/
int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
struct file_lock *lock;
struct ceph_filelock cephlock;
int err = 0;
+ int seen_fcntl = 0;
+ int seen_flock = 0;
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_POSIX) {
+ ++seen_fcntl;
+ if (seen_fcntl > num_fcntl_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
err = lock_to_ceph_filelock(lock, &cephlock);
if (err)
goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
goto fail;
for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
if (lock->fl_flags & FL_FLOCK) {
+ ++seen_flock;
+ if (seen_flock > num_flock_locks) {
+ err = -ENOSPC;
+ goto fail;
+ }
err = lock_to_ceph_filelock(lock, &cephlock);
if (err)
goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/smp_lock.h>
-#include "mds_client.h"
-#include "mon_client.h"
#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-#include "pagelist.h"
+#include "mds_client.h"
+
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/pagelist.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
/*
* A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) {
if (s->s_authorizer)
- s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
- s->s_mdsc->client->monc.auth, s->s_authorizer);
+ s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
+ s->s_mdsc->fsc->client->monc.auth,
+ s->s_authorizer);
kfree(s);
}
}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_seq = 0;
mutex_init(&s->s_mutex);
- ceph_con_init(mdsc->client->msgr, &s->s_con);
+ ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
s->s_con.private = s;
s->s_con.ops = &mds_con_ops;
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else if (req->r_dentry) {
struct inode *dir = req->r_dentry->d_parent->d_inode;
- if (dir->i_sb != mdsc->client->sb) {
+ if (dir->i_sb != mdsc->fsc->sb) {
/* not this fs! */
inode = req->r_dentry->d_inode;
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
__ceph_remove_cap(cap);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
- &ceph_sb_to_client(inode->i_sb)->mdsc;
+ ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg, *partial = NULL;
struct ceph_mds_cap_release *head;
int err = -ENOMEM;
- int extra = mdsc->client->mount_args->cap_release_safety;
+ int extra = mdsc->fsc->mount_options->cap_release_safety;
int num;
dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */
mutex_lock(&req->r_fill_mutex);
- err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+ err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
if (recon_state->flock) {
int num_fcntl_locks, num_flock_locks;
-
- lock_kernel();
- ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
- rec.v2.flock_len = (2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
-
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_encode_locks(inode, pagelist,
- num_fcntl_locks,
- num_flock_locks);
- unlock_kernel();
+ struct ceph_pagelist_cursor trunc_point;
+
+ ceph_pagelist_set_cursor(pagelist, &trunc_point);
+ do {
+ lock_flocks();
+ ceph_count_locks(inode, &num_fcntl_locks,
+ &num_flock_locks);
+ rec.v2.flock_len = (2*sizeof(u32) +
+ (num_fcntl_locks+num_flock_locks) *
+ sizeof(struct ceph_filelock));
+ unlock_flocks();
+
+ /* pre-alloc pagelist */
+ ceph_pagelist_truncate(pagelist, &trunc_point);
+ err = ceph_pagelist_append(pagelist, &rec, reclen);
+ if (!err)
+ err = ceph_pagelist_reserve(pagelist,
+ rec.v2.flock_len);
+
+ /* encode locks */
+ if (!err) {
+ lock_flocks();
+ err = ceph_encode_locks(inode,
+ pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ unlock_flocks();
+ }
+ } while (err == -ENOSPC);
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
struct ceph_inode_info *ci;
struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
schedule_delayed(mdsc);
}
+int ceph_mdsc_init(struct ceph_fs_client *fsc)
-int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
{
- mdsc->client = client;
+ struct ceph_mds_client *mdsc;
+
+ mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
+ if (!mdsc)
+ return -ENOMEM;
+ mdsc->fsc = fsc;
+ fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
INIT_LIST_HEAD(&mdsc->dentry_lru);
ceph_caps_init(mdsc);
- ceph_adjust_min_caps(mdsc, client->min_caps);
+ ceph_adjust_min_caps(mdsc, fsc->min_caps);
return 0;
}
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
static void wait_requests(struct ceph_mds_client *mdsc)
{
struct ceph_mds_request *req;
- struct ceph_client *client = mdsc->client;
+ struct ceph_fs_client *fsc = mdsc->fsc;
mutex_lock(&mdsc->mutex);
if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
dout("wait_requests waiting for requests\n");
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
- client->mount_args->mount_timeout * HZ);
+ fsc->client->options->mount_timeout * HZ);
/* tear down remaining requests */
mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush;
- if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
int i, n = 0;
- if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return true;
mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
struct ceph_mds_session *session;
int i;
- struct ceph_client *client = mdsc->client;
- unsigned long timeout = client->mount_args->mount_timeout * HZ;
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ unsigned long timeout = fsc->client->options->mount_timeout * HZ;
dout("close_sessions\n");
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("stopped\n");
}
-void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
ceph_caps_finalize(mdsc);
}
+void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
+{
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+
+ ceph_mdsc_stop(mdsc);
+ fsc->mdsc = NULL;
+ kfree(mdsc);
+}
+
/*
* handle mds map update.
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
ceph_decode_copy(&p, &fsid, sizeof(fsid));
- if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+ if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
return;
epoch = ceph_decode_32(&p);
maplen = ceph_decode_32(&p);
dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */
- ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+ ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
} else {
mdsc->mdsmap = newmap; /* first mds map */
}
- mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+ mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
__wake_requests(mdsc, &mdsc->waiting_for_map);
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
int ret = 0;
if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
}
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
if (ac->ops->invalidate_authorizer)
ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
- return ceph_monc_validate_auth(&mdsc->client->monc);
+ return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
}
static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
.peer_reset = peer_reset,
};
-
-
-
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
#include <linux/rbtree.h>
#include <linux/spinlock.h>
-#include "types.h"
-#include "messenger.h"
-#include "mdsmap.h"
+#include <linux/ceph/types.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/mdsmap.h>
/*
* Some lock dependencies:
@@ -26,7 +26,7 @@
*
*/
-struct ceph_client;
+struct ceph_fs_client;
struct ceph_cap;
/*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
* mds client state
*/
struct ceph_mds_client {
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
struct mutex mutex; /* all nested structures */
struct ceph_mdsmap *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
int caps_avail_count; /* unused, unreserved */
int caps_min_count; /* keep at least this many
(unreserved) */
-
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_file;
-#endif
-
spinlock_t dentry_lru_lock;
struct list_head dentry_lru;
int num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
struct ceph_msg *msg, int mds);
-extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
- struct ceph_client *client);
+extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/bug.h>
#include <linux/err.h>
@@ -6,9 +6,9 @@
#include <linux/slab.h>
#include <linux/types.h>
-#include "mdsmap.h"
-#include "messenger.h"
-#include "decode.h"
+#include <linux/ceph/mdsmap.h>
+#include <linux/ceph/messenger.h>
+#include <linux/ceph/decode.h>
#include "super.h"
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
}
dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
- i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+ i+1, n, global_id, mds, inc,
+ ceph_pr_addr(&addr.in_addr),
ceph_mds_state_name(state));
if (mds >= 0 && mds < m->m_max_mds && state > 0) {
m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
deleted file mode 100644
index 4c5cb0880bba..000000000000
--- a/fs/ceph/mdsmap.h
+++ /dev/null
@@ -1,62 +0,0 @@
-#ifndef _FS_CEPH_MDSMAP_H
-#define _FS_CEPH_MDSMAP_H
-
-#include "types.h"
-
-/*
- * mds map - describe servers in the mds cluster.
- *
- * we limit fields to those the client actually xcares about
- */
-struct ceph_mds_info {
- u64 global_id;
- struct ceph_entity_addr addr;
- s32 state;
- int num_export_targets;
- bool laggy;
- u32 *export_targets;
-};
-
-struct ceph_mdsmap {
- u32 m_epoch, m_client_epoch, m_last_failure;
- u32 m_root;
- u32 m_session_timeout; /* seconds */
- u32 m_session_autoclose; /* seconds */
- u64 m_max_file_size;
- u32 m_max_mds; /* size of m_addr, m_state arrays */
- struct ceph_mds_info *m_info;
-
- /* which object pools file data can be stored in */
- int m_num_data_pg_pools;
- u32 *m_data_pg_pools;
- u32 m_cas_pg_pool;
-};
-
-static inline struct ceph_entity_addr *
-ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
-{
- if (w >= m->m_max_mds)
- return NULL;
- return &m->m_info[w].addr;
-}
-
-static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
-{
- BUG_ON(w < 0);
- if (w >= m->m_max_mds)
- return CEPH_MDS_STATE_DNE;
- return m->m_info[w].state;
-}
-
-static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
-{
- if (w >= 0 && w < m->m_max_mds)
- return m->m_info[w].laggy;
- return false;
-}
-
-extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
-extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
-
-#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
deleted file mode 100644
index 2502d76fcec1..000000000000
--- a/fs/ceph/messenger.c
+++ /dev/null
@@ -1,2277 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/crc32c.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/inet.h>
-#include <linux/kthread.h>
-#include <linux/net.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/string.h>
-#include <net/tcp.h>
-
-#include "super.h"
-#include "messenger.h"
-#include "decode.h"
-#include "pagelist.h"
-
-/*
- * Ceph uses the messenger to exchange ceph_msg messages with other
- * hosts in the system. The messenger provides ordered and reliable
- * delivery. We tolerate TCP disconnects by reconnecting (with
- * exponential backoff) in the case of a fault (disconnection, bad
- * crc, protocol error). Acks allow sent messages to be discarded by
- * the sender.
- */
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-
-#ifdef CONFIG_LOCKDEP
-static struct lock_class_key socket_class;
-#endif
-
-
-static void queue_con(struct ceph_connection *con);
-static void con_work(struct work_struct *);
-static void ceph_fault(struct ceph_connection *con);
-
-/*
- * nicely render a sockaddr as a string.
- */
-#define MAX_ADDR_STR 20
-#define MAX_ADDR_STR_LEN 60
-static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
-static DEFINE_SPINLOCK(addr_str_lock);
-static int last_addr_str;
-
-const char *pr_addr(const struct sockaddr_storage *ss)
-{
- int i;
- char *s;
- struct sockaddr_in *in4 = (void *)ss;
- struct sockaddr_in6 *in6 = (void *)ss;
-
- spin_lock(&addr_str_lock);
- i = last_addr_str++;
- if (last_addr_str == MAX_ADDR_STR)
- last_addr_str = 0;
- spin_unlock(&addr_str_lock);
- s = addr_str[i];
-
- switch (ss->ss_family) {
- case AF_INET:
- snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
- (unsigned int)ntohs(in4->sin_port));
- break;
-
- case AF_INET6:
- snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
- (unsigned int)ntohs(in6->sin6_port));
- break;
-
- default:
- sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
- }
-
- return s;
-}
-
-static void encode_my_addr(struct ceph_messenger *msgr)
-{
- memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_addr(&msgr->my_enc_addr);
-}
-
-/*
- * work queue for all reading and writing to/from the socket.
- */
-struct workqueue_struct *ceph_msgr_wq;
-
-int __init ceph_msgr_init(void)
-{
- ceph_msgr_wq = create_workqueue("ceph-msgr");
- if (IS_ERR(ceph_msgr_wq)) {
- int ret = PTR_ERR(ceph_msgr_wq);
- pr_err("msgr_init failed to create workqueue: %d\n", ret);
- ceph_msgr_wq = NULL;
- return ret;
- }
- return 0;
-}
-
-void ceph_msgr_exit(void)
-{
- destroy_workqueue(ceph_msgr_wq);
-}
-
-void ceph_msgr_flush(void)
-{
- flush_workqueue(ceph_msgr_wq);
-}
-
-
-/*
- * socket callback functions
- */
-
-/* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
-{
- struct ceph_connection *con =
- (struct ceph_connection *)sk->sk_user_data;
- if (sk->sk_state != TCP_CLOSE_WAIT) {
- dout("ceph_data_ready on %p state = %lu, queueing work\n",
- con, con->state);
- queue_con(con);
- }
-}
-
-/* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
-{
- struct ceph_connection *con =
- (struct ceph_connection *)sk->sk_user_data;
-
- /* only queue to workqueue if there is data we want to write. */
- if (test_bit(WRITE_PENDING, &con->state)) {
- dout("ceph_write_space %p queueing write work\n", con);
- queue_con(con);
- } else {
- dout("ceph_write_space %p nothing to write\n", con);
- }
-
- /* since we have our own write_space, clear the SOCK_NOSPACE flag */
- clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
-
-/* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
-{
- struct ceph_connection *con =
- (struct ceph_connection *)sk->sk_user_data;
-
- dout("ceph_state_change %p state = %lu sk_state = %u\n",
- con, con->state, sk->sk_state);
-
- if (test_bit(CLOSED, &con->state))
- return;
-
- switch (sk->sk_state) {
- case TCP_CLOSE:
- dout("ceph_state_change TCP_CLOSE\n");
- case TCP_CLOSE_WAIT:
- dout("ceph_state_change TCP_CLOSE_WAIT\n");
- if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
- if (test_bit(CONNECTING, &con->state))
- con->error_msg = "connection failed";
- else
- con->error_msg = "socket closed";
- queue_con(con);
- }
- break;
- case TCP_ESTABLISHED:
- dout("ceph_state_change TCP_ESTABLISHED\n");
- queue_con(con);
- break;
- }
-}
-
-/*
- * set up socket callbacks
- */
-static void set_sock_callbacks(struct socket *sock,
- struct ceph_connection *con)
-{
- struct sock *sk = sock->sk;
- sk->sk_user_data = (void *)con;
- sk->sk_data_ready = ceph_data_ready;
- sk->sk_write_space = ceph_write_space;
- sk->sk_state_change = ceph_state_change;
-}
-
-
-/*
- * socket helpers
- */
-
-/*
- * initiate connection to a remote socket.
- */
-static struct socket *ceph_tcp_connect(struct ceph_connection *con)
-{
- struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
- struct socket *sock;
- int ret;
-
- BUG_ON(con->sock);
- ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
- IPPROTO_TCP, &sock);
- if (ret)
- return ERR_PTR(ret);
- con->sock = sock;
- sock->sk->sk_allocation = GFP_NOFS;
-
-#ifdef CONFIG_LOCKDEP
- lockdep_set_class(&sock->sk->sk_lock, &socket_class);
-#endif
-
- set_sock_callbacks(sock, con);
-
- dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-
- ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
- O_NONBLOCK);
- if (ret == -EINPROGRESS) {
- dout("connect %s EINPROGRESS sk_state = %u\n",
- pr_addr(&con->peer_addr.in_addr),
- sock->sk->sk_state);
- ret = 0;
- }
- if (ret < 0) {
- pr_err("connect %s error %d\n",
- pr_addr(&con->peer_addr.in_addr), ret);
- sock_release(sock);
- con->sock = NULL;
- con->error_msg = "connect error";
- }
-
- if (ret < 0)
- return ERR_PTR(ret);
- return sock;
-}
-
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
- struct kvec iov = {buf, len};
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
- return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
-}
-
-/*
- * write something. @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
- size_t kvlen, size_t len, int more)
-{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-
- if (more)
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
-
- return kernel_sendmsg(sock, &msg, iov, kvlen, len);
-}
-
-
-/*
- * Shutdown/close the socket for the given connection.
- */
-static int con_close_socket(struct ceph_connection *con)
-{
- int rc;
-
- dout("con_close_socket on %p sock %p\n", con, con->sock);
- if (!con->sock)
- return 0;
- set_bit(SOCK_CLOSED, &con->state);
- rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
- sock_release(con->sock);
- con->sock = NULL;
- clear_bit(SOCK_CLOSED, &con->state);
- return rc;
-}
-
-/*
- * Reset a connection. Discard all incoming and outgoing messages
- * and clear *_seq state.
- */
-static void ceph_msg_remove(struct ceph_msg *msg)
-{
- list_del_init(&msg->list_head);
- ceph_msg_put(msg);
-}
-static void ceph_msg_remove_list(struct list_head *head)
-{
- while (!list_empty(head)) {
- struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
- list_head);
- ceph_msg_remove(msg);
- }
-}
-
-static void reset_connection(struct ceph_connection *con)
-{
- /* reset connection, out_queue, msg_ and connect_seq */
- /* discard existing out_queue and msg_seq */
- ceph_msg_remove_list(&con->out_queue);
- ceph_msg_remove_list(&con->out_sent);
-
- if (con->in_msg) {
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
- con->connect_seq = 0;
- con->out_seq = 0;
- if (con->out_msg) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
- con->out_keepalive_pending = false;
- con->in_seq = 0;
- con->in_seq_acked = 0;
-}
-
-/*
- * mark a peer down. drop any open connections.
- */
-void ceph_con_close(struct ceph_connection *con)
-{
- dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
- set_bit(CLOSED, &con->state); /* in case there's queued work */
- clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
- clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
- clear_bit(KEEPALIVE_PENDING, &con->state);
- clear_bit(WRITE_PENDING, &con->state);
- mutex_lock(&con->mutex);
- reset_connection(con);
- con->peer_global_seq = 0;
- cancel_delayed_work(&con->work);
- mutex_unlock(&con->mutex);
- queue_con(con);
-}
-
-/*
- * Reopen a closed connection, with a new peer address.
- */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
-{
- dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
- set_bit(OPENING, &con->state);
- clear_bit(CLOSED, &con->state);
- memcpy(&con->peer_addr, addr, sizeof(*addr));
- con->delay = 0; /* reset backoff memory */
- queue_con(con);
-}
-
-/*
- * return true if this connection ever successfully opened
- */
-bool ceph_con_opened(struct ceph_connection *con)
-{
- return con->connect_seq > 0;
-}
-
-/*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
- dout("con_get %p nref = %d -> %d\n", con,
- atomic_read(&con->nref), atomic_read(&con->nref) + 1);
- if (atomic_inc_not_zero(&con->nref))
- return con;
- return NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
- dout("con_put %p nref = %d -> %d\n", con,
- atomic_read(&con->nref), atomic_read(&con->nref) - 1);
- BUG_ON(atomic_read(&con->nref) == 0);
- if (atomic_dec_and_test(&con->nref)) {
- BUG_ON(con->sock);
- kfree(con);
- }
-}
-
-/*
- * initialize a new connection.
- */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
-{
- dout("con_init %p\n", con);
- memset(con, 0, sizeof(*con));
- atomic_set(&con->nref, 1);
- con->msgr = msgr;
- mutex_init(&con->mutex);
- INIT_LIST_HEAD(&con->out_queue);
- INIT_LIST_HEAD(&con->out_sent);
- INIT_DELAYED_WORK(&con->work, con_work);
-}
-
-
-/*
- * We maintain a global counter to order connection attempts. Get
- * a unique seq greater than @gt.
- */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
-{
- u32 ret;
-
- spin_lock(&msgr->global_seq_lock);
- if (msgr->global_seq < gt)
- msgr->global_seq = gt;
- ret = ++msgr->global_seq;
- spin_unlock(&msgr->global_seq_lock);
- return ret;
-}
-
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off. Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con, int v)
-{
- struct ceph_msg *m = con->out_msg;
-
- dout("prepare_write_message_footer %p\n", con);
- con->out_kvec_is_msg = true;
- con->out_kvec[v].iov_base = &m->footer;
- con->out_kvec[v].iov_len = sizeof(m->footer);
- con->out_kvec_bytes += sizeof(m->footer);
- con->out_kvec_left++;
- con->out_more = m->more_to_follow;
- con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- int v = 0;
-
- con->out_kvec_bytes = 0;
- con->out_kvec_is_msg = true;
- con->out_msg_done = false;
-
- /* Sneak an ack in there first? If we can get it into the same
- * TCP packet that's a good thing. */
- if (con->in_seq > con->in_seq_acked) {
- con->in_seq_acked = con->in_seq;
- con->out_kvec[v].iov_base = &tag_ack;
- con->out_kvec[v++].iov_len = 1;
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con->out_kvec[v].iov_base = &con->out_temp_ack;
- con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
- con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
- }
-
- m = list_first_entry(&con->out_queue,
- struct ceph_msg, list_head);
- con->out_msg = m;
- if (test_bit(LOSSYTX, &con->state)) {
- list_del_init(&m->list_head);
- } else {
- /* put message on sent list */
- ceph_msg_get(m);
- list_move_tail(&m->list_head, &con->out_sent);
- }
-
- /*
- * only assign outgoing seq # if we haven't sent this message
- * yet. if it is requeued, resend with it's original seq.
- */
- if (m->needs_out_seq) {
- m->hdr.seq = cpu_to_le64(++con->out_seq);
- m->needs_out_seq = false;
- }
-
- dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
- m, con->out_seq, le16_to_cpu(m->hdr.type),
- le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
- le32_to_cpu(m->hdr.data_len),
- m->nr_pages);
- BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
-
- /* tag + hdr + front + middle */
- con->out_kvec[v].iov_base = &tag_msg;
- con->out_kvec[v++].iov_len = 1;
- con->out_kvec[v].iov_base = &m->hdr;
- con->out_kvec[v++].iov_len = sizeof(m->hdr);
- con->out_kvec[v++] = m->front;
- if (m->middle)
- con->out_kvec[v++] = m->middle->vec;
- con->out_kvec_left = v;
- con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
- (m->middle ? m->middle->vec.iov_len : 0);
- con->out_kvec_cur = con->out_kvec;
-
- /* fill in crc (except data pages), footer */
- con->out_msg->hdr.crc =
- cpu_to_le32(crc32c(0, (void *)&m->hdr,
- sizeof(m->hdr) - sizeof(m->hdr.crc)));
- con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
- con->out_msg->footer.front_crc =
- cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
- if (m->middle)
- con->out_msg->footer.middle_crc =
- cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
- m->middle->vec.iov_len));
- else
- con->out_msg->footer.middle_crc = 0;
- con->out_msg->footer.data_crc = 0;
- dout("prepare_write_message front_crc %u data_crc %u\n",
- le32_to_cpu(con->out_msg->footer.front_crc),
- le32_to_cpu(con->out_msg->footer.middle_crc));
-
- /* is there a data payload? */
- if (le32_to_cpu(m->hdr.data_len) > 0) {
- /* initialize page iterator */
- con->out_msg_pos.page = 0;
- con->out_msg_pos.page_pos =
- le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
- con->out_msg_pos.data_pos = 0;
- con->out_msg_pos.did_page_crc = 0;
- con->out_more = 1; /* data + footer will follow */
- } else {
- /* no, queue up footer too and be done */
- prepare_write_message_footer(con, v);
- }
-
- set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
- dout("prepare_write_ack %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con->out_kvec[0].iov_base = &tag_ack;
- con->out_kvec[0].iov_len = 1;
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con->out_kvec[1].iov_base = &con->out_temp_ack;
- con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
- con->out_kvec_left = 2;
- con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
- con->out_kvec_cur = con->out_kvec;
- con->out_more = 1; /* more will follow.. eventually.. */
- set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
- dout("prepare_write_keepalive %p\n", con);
- con->out_kvec[0].iov_base = &tag_keepalive;
- con->out_kvec[0].iov_len = 1;
- con->out_kvec_left = 1;
- con->out_kvec_bytes = 1;
- con->out_kvec_cur = con->out_kvec;
- set_bit(WRITE_PENDING, &con->state);
-}
-
-/*
- * Connection negotiation.
- */
-
-static void prepare_connect_authorizer(struct ceph_connection *con)
-{
- void *auth_buf;
- int auth_len = 0;
- int auth_protocol = 0;
-
- mutex_unlock(&con->mutex);
- if (con->ops->get_authorizer)
- con->ops->get_authorizer(con, &auth_buf, &auth_len,
- &auth_protocol, &con->auth_reply_buf,
- &con->auth_reply_buf_len,
- con->auth_retry);
- mutex_lock(&con->mutex);
-
- con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
- con->out_connect.authorizer_len = cpu_to_le32(auth_len);
-
- con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
- con->out_kvec[con->out_kvec_left].iov_len = auth_len;
- con->out_kvec_left++;
- con->out_kvec_bytes += auth_len;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_messenger *msgr,
- struct ceph_connection *con)
-{
- int len = strlen(CEPH_BANNER);
-
- con->out_kvec[0].iov_base = CEPH_BANNER;
- con->out_kvec[0].iov_len = len;
- con->out_kvec[1].iov_base = &msgr->my_enc_addr;
- con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
- con->out_kvec_left = 2;
- con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
- con->out_kvec_cur = con->out_kvec;
- con->out_more = 0;
- set_bit(WRITE_PENDING, &con->state);
-}
-
-static void prepare_write_connect(struct ceph_messenger *msgr,
- struct ceph_connection *con,
- int after_banner)
-{
- unsigned global_seq = get_global_seq(con->msgr, 0);
- int proto;
-
- switch (con->peer_name.type) {
- case CEPH_ENTITY_TYPE_MON:
- proto = CEPH_MONC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_OSD:
- proto = CEPH_OSDC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_MDS:
- proto = CEPH_MDSC_PROTOCOL;
- break;
- default:
- BUG();
- }
-
- dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
- con->connect_seq, global_seq, proto);
-
- con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
- con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
- con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
- con->out_connect.global_seq = cpu_to_le32(global_seq);
- con->out_connect.protocol_version = cpu_to_le32(proto);
- con->out_connect.flags = 0;
-
- if (!after_banner) {
- con->out_kvec_left = 0;
- con->out_kvec_bytes = 0;
- }
- con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
- con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
- con->out_kvec_left++;
- con->out_kvec_bytes += sizeof(con->out_connect);
- con->out_kvec_cur = con->out_kvec;
- con->out_more = 0;
- set_bit(WRITE_PENDING, &con->state);
-
- prepare_connect_authorizer(con);
-}
-
-
-/*
- * write as much of pending kvecs to the socket as we can.
- * 1 -> done
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
- int ret;
-
- dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
- while (con->out_kvec_bytes > 0) {
- ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
- con->out_kvec_left, con->out_kvec_bytes,
- con->out_more);
- if (ret <= 0)
- goto out;
- con->out_kvec_bytes -= ret;
- if (con->out_kvec_bytes == 0)
- break; /* done */
- while (ret > 0) {
- if (ret >= con->out_kvec_cur->iov_len) {
- ret -= con->out_kvec_cur->iov_len;
- con->out_kvec_cur++;
- con->out_kvec_left--;
- } else {
- con->out_kvec_cur->iov_len -= ret;
- con->out_kvec_cur->iov_base += ret;
- ret = 0;
- break;
- }
- }
- }
- con->out_kvec_left = 0;
- con->out_kvec_is_msg = false;
- ret = 1;
-out:
- dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
- con->out_kvec_bytes, con->out_kvec_left, ret);
- return ret; /* done! */
-}
-
-/*
- * Write as much message data payload as we can. If we finish, queue
- * up the footer.
- * 1 -> done, footer is now queued in out_kvec[].
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_msg_pages(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->out_msg;
- unsigned data_len = le32_to_cpu(msg->hdr.data_len);
- size_t len;
- int crc = con->msgr->nocrc;
- int ret;
-
- dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
- con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
- con->out_msg_pos.page_pos);
-
- while (con->out_msg_pos.page < con->out_msg->nr_pages) {
- struct page *page = NULL;
- void *kaddr = NULL;
-
- /*
- * if we are calculating the data crc (the default), we need
- * to map the page. if our pages[] has been revoked, use the
- * zero page.
- */
- if (msg->pages) {
- page = msg->pages[con->out_msg_pos.page];
- if (crc)
- kaddr = kmap(page);
- } else if (msg->pagelist) {
- page = list_first_entry(&msg->pagelist->head,
- struct page, lru);
- if (crc)
- kaddr = kmap(page);
- } else {
- page = con->msgr->zero_page;
- if (crc)
- kaddr = page_address(con->msgr->zero_page);
- }
- len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
- (int)(data_len - con->out_msg_pos.data_pos));
- if (crc && !con->out_msg_pos.did_page_crc) {
- void *base = kaddr + con->out_msg_pos.page_pos;
- u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
-
- BUG_ON(kaddr == NULL);
- con->out_msg->footer.data_crc =
- cpu_to_le32(crc32c(tmpcrc, base, len));
- con->out_msg_pos.did_page_crc = 1;
- }
-
- ret = kernel_sendpage(con->sock, page,
- con->out_msg_pos.page_pos, len,
- MSG_DONTWAIT | MSG_NOSIGNAL |
- MSG_MORE);
-
- if (crc && (msg->pages || msg->pagelist))
- kunmap(page);
-
- if (ret <= 0)
- goto out;
-
- con->out_msg_pos.data_pos += ret;
- con->out_msg_pos.page_pos += ret;
- if (ret == len) {
- con->out_msg_pos.page_pos = 0;
- con->out_msg_pos.page++;
- con->out_msg_pos.did_page_crc = 0;
- if (msg->pagelist)
- list_move_tail(&page->lru,
- &msg->pagelist->head);
- }
- }
-
- dout("write_partial_msg_pages %p msg %p done\n", con, msg);
-
- /* prepare and queue up footer, too */
- if (!crc)
- con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
- con->out_kvec_bytes = 0;
- con->out_kvec_left = 0;
- con->out_kvec_cur = con->out_kvec;
- prepare_write_message_footer(con, 0);
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
- int ret;
-
- while (con->out_skip > 0) {
- struct kvec iov = {
- .iov_base = page_address(con->msgr->zero_page),
- .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
- };
-
- ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
- if (ret <= 0)
- goto out;
- con->out_skip -= ret;
- }
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
- dout("prepare_read_banner %p\n", con);
- con->in_base_pos = 0;
-}
-
-/* Rewind the input position before reading a connect reply. */
-static void prepare_read_connect(struct ceph_connection *con)
-{
-	con->in_base_pos = 0;
-	dout("prepare_read_connect %p\n", con);
-}
-
-/* Rewind the input position before reading an ack. */
-static void prepare_read_ack(struct ceph_connection *con)
-{
-	con->in_base_pos = 0;
-	dout("prepare_read_ack %p\n", con);
-}
-
-/*
- * Get ready to read the next tag byte: rewind the input position and
- * set in_tag to CEPH_MSGR_TAG_READY, the value try_read() tests before
- * fetching a new tag.
- */
-static void prepare_read_tag(struct ceph_connection *con)
-{
-	con->in_tag = CEPH_MSGR_TAG_READY;
-	con->in_base_pos = 0;
-	dout("prepare_read_tag %p\n", con);
-}
-
-/*
- * Get ready to read a message: no message may be in flight, the input
- * position is rewound and the running CRCs of all three sections are
- * cleared.  Always returns 0.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-	dout("prepare_read_message %p\n", con);
-	BUG_ON(con->in_msg);
-
-	con->in_base_pos = 0;
-	con->in_data_crc = 0;
-	con->in_middle_crc = 0;
-	con->in_front_crc = 0;
-	return 0;
-}
-
-
-/*
- * Read the next @size-byte field of a multi-field unit into @object.
- *
- * @to is the running end offset of the fields requested so far; it is
- * advanced by @size.  con->in_base_pos records how far into the unit we
- * have actually read, so a call that was interrupted resumes at the right
- * offset inside @object, and a field that is already complete is skipped.
- *
- * Returns 1 when the field is complete, or the non-positive result of the
- * receive that could not make progress.
- */
-static int read_partial(struct ceph_connection *con,
-			int *to, int size, void *object)
-{
-	*to += size;
-
-	for (;;) {
-		int remaining = *to - con->in_base_pos;
-		int got;
-
-		if (remaining <= 0)
-			return 1;
-
-		got = ceph_tcp_recvmsg(con->sock, object + (size - remaining),
-				       remaining);
-		if (got <= 0)
-			return got;
-		con->in_base_pos += got;
-	}
-}
-
-
-/*
- * Read all or part of the peer's side of the initial handshake: the
- * banner string, the address the peer reports for itself, and the address
- * the peer sees us as.
- *
- * Returns 1 when all three pieces are in, or the non-positive result of
- * the read that could not complete.
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-	int to = 0;
-	int ret;
-
-	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
-	/* peer's banner */
-	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
-	if (ret <= 0)
-		return ret;
-
-	/* who the peer says it is */
-	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
-			   &con->actual_peer_addr);
-	if (ret <= 0)
-		return ret;
-
-	/* who the peer thinks we are */
-	return read_partial(con, &to, sizeof(con->peer_addr_for_me),
-			    &con->peer_addr_for_me);
-}
-
-/*
- * Read all or part of the connect reply: the fixed-size reply structure
- * followed by authorizer_len bytes of authorizer reply.
- *
- * NOTE(review): authorizer_len comes straight off the wire and is not
- * checked here against the capacity of con->auth_reply_buf -- confirm
- * that the buffer is sized for whatever the peer may announce.
- *
- * Returns 1 when the whole reply is in, or the non-positive result of the
- * read that could not complete.
- */
-static int read_partial_connect(struct ceph_connection *con)
-{
-	int to = 0;
-	int ret;
-
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
-	if (ret <= 0)
-		return ret;
-
-	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
-			   con->auth_reply_buf);
-	if (ret <= 0)
-		return ret;
-
-	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-	     con, (int)con->in_reply.tag,
-	     le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
-	return ret;
-}
-
-/*
- * Check that the banner we received matches CEPH_BANNER.
- *
- * Returns 0 on a match; otherwise logs, sets con->error_msg and
- * returns -1.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-	if (!memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER)))
-		return 0;
-
-	pr_err("connect to %s got bad banner\n",
-	       pr_addr(&con->peer_addr.in_addr));
-	con->error_msg = "protocol error, bad banner";
-	return -1;
-}
-
-/*
- * True if @ss holds an all-zero IPv4 or IPv6 address.  Any other address
- * family is reported as not blank.
- */
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
-	if (ss->ss_family == AF_INET) {
-		struct sockaddr_in *in4 = (struct sockaddr_in *)ss;
-
-		return in4->sin_addr.s_addr == 0;
-	}
-
-	if (ss->ss_family == AF_INET6) {
-		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)ss;
-		int i;
-
-		for (i = 0; i < 4; i++)
-			if (in6->sin6_addr.s6_addr32[i] != 0)
-				return false;
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Return the port stored in @ss in host byte order, or 0 if the address
- * family is neither AF_INET nor AF_INET6.
- */
-static int addr_port(struct sockaddr_storage *ss)
-{
-	if (ss->ss_family == AF_INET)
-		return ntohs(((struct sockaddr_in *)ss)->sin_port);
-	if (ss->ss_family == AF_INET6)
-		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
-	return 0;
-}
-
-/*
- * Store port @p (host byte order) into @ss according to its address
- * family.  Addresses that are neither AF_INET nor AF_INET6 are left
- * untouched.
- */
-static void addr_set_port(struct sockaddr_storage *ss, int p)
-{
-	switch (ss->ss_family) {
-	case AF_INET:
-		((struct sockaddr_in *)ss)->sin_port = htons(p);
-		break;	/* do not fall through into the IPv6 store */
-	case AF_INET6:
-		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
-		break;
-	}
-}
-
-/*
- * Parse a comma-separated ip[:port] list into an addr array.  Use the
- * default monitor port if a port isn't specified.  An address may be
- * wrapped in [] (needed to attach a port to an IPv6 address).
- *
- * @c, @end:   the text to parse is [c, end); it need not be terminated
- * @addr:      output array with room for @max_count entries
- * @max_count: capacity of @addr
- * @count:     if non-NULL, receives the number of addresses parsed
- *
- * Returns 0 on success or -EINVAL on any malformed input, including a
- * port of 0 or above 65535, more than @max_count addresses, or a
- * trailing comma.
- */
-int ceph_parse_ips(const char *c, const char *end,
-		   struct ceph_entity_addr *addr,
-		   int max_count, int *count)
-{
-	int i;
-	const char *p = c;
-
-	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
-	for (i = 0; i < max_count; i++) {
-		const char *ipend;
-		struct sockaddr_storage *ss = &addr[i].in_addr;
-		struct sockaddr_in *in4 = (void *)ss;
-		struct sockaddr_in6 *in6 = (void *)ss;
-		int port;
-		char delim = ',';
-
-		/* never look at *p without first checking p < end */
-		if (p < end && *p == '[') {
-			delim = ']';
-			p++;
-		}
-
-		memset(ss, 0, sizeof(*ss));
-		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-			     delim, &ipend))
-			ss->ss_family = AF_INET;
-		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-				  delim, &ipend))
-			ss->ss_family = AF_INET6;
-		else
-			goto bad;
-		p = ipend;
-
-		if (delim == ']') {
-			if (p >= end || *p != ']') {
-				dout("missing matching ']'\n");
-				goto bad;
-			}
-			p++;
-		}
-
-		/* port? */
-		if (p < end && *p == ':') {
-			port = 0;
-			p++;
-			while (p < end && *p >= '0' && *p <= '9') {
-				port = (port * 10) + (*p - '0');
-				/*
-				 * Reject as soon as the value is out of
-				 * range so a long digit string cannot
-				 * overflow the accumulator.
-				 */
-				if (port > 65535)
-					goto bad;
-				p++;
-			}
-			if (port == 0)
-				goto bad;
-		} else {
-			port = CEPH_MON_PORT;
-		}
-
-		addr_set_port(ss, port);
-
-		dout("parse_ips got %s\n", pr_addr(ss));
-
-		if (p == end)
-			break;
-		if (*p != ',')
-			goto bad;
-		p++;
-	}
-
-	/*
-	 * Falling out of the loop without hitting the break means the last
-	 * slot was followed by a ',': either there are too many addresses or
-	 * the list ends in a comma.  Both are errors, and i + 1 would
-	 * overstate the count.
-	 */
-	if (i == max_count || p != end)
-		goto bad;
-
-	if (count)
-		*count = i + 1;
-	return 0;
-
-bad:
-	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
-	return -EINVAL;
-}
-
-/*
- * Digest the banner exchange: validate the banner string, decode the two
- * addresses the peer sent, make sure we reached the peer we intended,
- * and adopt the address the peer saw us as if we do not know our own yet.
- *
- * On success moves the connection to NEGOTIATING, arms the connect-reply
- * read and returns 0; returns -1 with con->error_msg set on a bad banner
- * or the wrong peer.
- */
-static int process_banner(struct ceph_connection *con)
-{
-	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
-
-	dout("process_banner on %p\n", con);
-
-	if (verify_hello(con) < 0)
-		return -1;
-
-	ceph_decode_addr(&con->actual_peer_addr);
-	ceph_decode_addr(&con->peer_addr_for_me);
-
-	/*
-	 * Make sure the other end is who we wanted.  note that the other
-	 * end may not yet know their ip address, so if it's 0.0.0.0, give
-	 * them the benefit of the doubt.
-	 */
-	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
-		   sizeof(con->peer_addr)) != 0) {
-		bool benefit_of_doubt =
-			addr_is_blank(&con->actual_peer_addr.in_addr) &&
-			con->actual_peer_addr.nonce == con->peer_addr.nonce;
-
-		if (!benefit_of_doubt) {
-			pr_warning("wrong peer, want %s/%d, got %s/%d\n",
-				   pr_addr(&con->peer_addr.in_addr),
-				   (int)le32_to_cpu(con->peer_addr.nonce),
-				   pr_addr(&con->actual_peer_addr.in_addr),
-				   (int)le32_to_cpu(con->actual_peer_addr.nonce));
-			con->error_msg = "wrong peer at address";
-			return -1;
-		}
-	}
-
-	/*
-	 * did we learn our address?  keep whatever port we already had.
-	 */
-	if (addr_is_blank(&my_addr->in_addr)) {
-		int port = addr_port(&my_addr->in_addr);
-
-		memcpy(&my_addr->in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(&my_addr->in_addr, port);
-		encode_my_addr(con->msgr);
-		dout("process_banner learned my addr is %s\n",
-		     pr_addr(&con->msgr->inst.addr.in_addr));
-	}
-
-	set_bit(NEGOTIATING, &con->state);
-	prepare_read_connect(con);
-	return 0;
-}
-
-/*
- * Give up on a connection after a protocol-level mismatch: reset it, mark
- * it CLOSED, and tell the owner through the optional bad_proto op.
- *
- * con->mutex is released around the callback and re-acquired before
- * returning, so the function both starts and ends with the mutex held.
- */
-static void fail_protocol(struct ceph_connection *con)
-{
- reset_connection(con);
- set_bit(CLOSED, &con->state); /* in case there's queued work */
-
- mutex_unlock(&con->mutex);
- if (con->ops->bad_proto)
- con->ops->bad_proto(con);
- mutex_lock(&con->mutex);
-}
-
-/*
- * Act on the connect reply that read_partial_connect() stored in
- * con->in_reply.
- *
- * Depending on the reply tag this either fails the connection (feature or
- * protocol version mismatch, repeated authorizer rejection, WAIT or an
- * unknown tag), re-sends the connect request with updated state
- * (BADAUTHORIZER, RESETSESSION, RETRY_SESSION, RETRY_GLOBAL), or completes
- * the handshake (READY).
- *
- * Everything the peer tells us is taken from con->in_reply, the only
- * structure the reply is read into.
- *
- * Returns 0 if the connection may proceed, -1 with con->error_msg set
- * otherwise.
- */
-static int process_connect(struct ceph_connection *con)
-{
-	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-	u64 req_feat = CEPH_FEATURE_REQUIRED;
-	u64 server_feat = le64_to_cpu(con->in_reply.features);
-
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-	switch (con->in_reply.tag) {
-	case CEPH_MSGR_TAG_FEATURES:
-		pr_err("%s%lld %s feature set mismatch,"
-		       " my %llx < server's %llx, missing %llx\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       sup_feat, server_feat, server_feat & ~sup_feat);
-		con->error_msg = "missing required protocol features";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADPROTOVER:
-		pr_err("%s%lld %s protocol version mismatch,"
-		       " my %d != server's %d\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
-		con->error_msg = "protocol version mismatch";
-		fail_protocol(con);
-		return -1;
-
-	case CEPH_MSGR_TAG_BADAUTHORIZER:
-		/* allow exactly one retry before giving up */
-		con->auth_retry++;
-		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-		     con->auth_retry);
-		if (con->auth_retry == 2) {
-			con->error_msg = "connect authorization failure";
-			reset_connection(con);
-			set_bit(CLOSED, &con->state);
-			return -1;
-		}
-		con->auth_retry = 1;
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RESETSESSION:
-		/*
-		 * If we connected with a large connect_seq but the peer
-		 * has no record of a session with us (no connection, or
-		 * connect_seq == 0), they will send RESETSESSION to indicate
-		 * that they must have reset their session, and may have
-		 * dropped messages.
-		 */
-		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_reply.connect_seq));
-		pr_err("%s%lld %s connection reset\n",
-		       ENTITY_NAME(con->peer_name),
-		       pr_addr(&con->peer_addr.in_addr));
-		reset_connection(con);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-
-		/* Tell ceph about it (callback runs without con->mutex). */
-		mutex_unlock(&con->mutex);
-		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
-		if (con->ops->peer_reset)
-			con->ops->peer_reset(con);
-		mutex_lock(&con->mutex);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_SESSION:
-		/*
-		 * If we sent a smaller connect_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_reply.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_GLOBAL:
-		/*
-		 * If we sent a smaller global_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.global_seq));
-		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_reply.global_seq));
-		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_READY:
-		if (req_feat & ~server_feat) {
-			pr_err("%s%lld %s protocol feature mismatch,"
-			       " my required %llx > server's %llx, need %llx\n",
-			       ENTITY_NAME(con->peer_name),
-			       pr_addr(&con->peer_addr.in_addr),
-			       req_feat, server_feat, req_feat & ~server_feat);
-			con->error_msg = "missing required protocol features";
-			fail_protocol(con);
-			return -1;
-		}
-		clear_bit(CONNECTING, &con->state);
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
-		con->peer_features = server_feat;
-		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
-
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYTX, &con->state);
-
-		prepare_read_tag(con);
-		break;
-
-	case CEPH_MSGR_TAG_WAIT:
-		/*
-		 * If there is a connection race (we are opening
-		 * connections to each other), one of us may just have
-		 * to WAIT.  This shouldn't happen if we are the
-		 * client.
-		 */
-		pr_err("process_connect peer connecting WAIT\n");
-		/* fall through - treated like any other unexpected tag */
-
-	default:
-		pr_err("connect protocol error, will retry\n");
-		con->error_msg = "protocol error, garbage tag during connect";
-		return -1;
-	}
-	return 0;
-}
-
-
-/*
- * Read (part of) an ack into con->in_temp_ack.  Returns 1 when the ack is
- * complete, or the non-positive result of the read that stalled.
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-	int end = 0;
-	int ret;
-
-	ret = read_partial(con, &end, sizeof(con->in_temp_ack),
-			   &con->in_temp_ack);
-	return ret;
-}
-
-
-/*
- * The peer acked everything up to and including the sequence number in
- * con->in_temp_ack: walk the sent list from the front and drop every
- * message it covers, then go back to waiting for the next tag.
- */
-static void process_ack(struct ceph_connection *con)
-{
-	u64 acked = le64_to_cpu(con->in_temp_ack);
-
-	for (;;) {
-		struct ceph_msg *m;
-		u64 seq;
-
-		if (list_empty(&con->out_sent))
-			break;
-
-		m = list_first_entry(&con->out_sent, struct ceph_msg,
-				     list_head);
-		seq = le64_to_cpu(m->hdr.seq);
-		if (seq > acked)
-			break;	/* first message not yet acked */
-
-		dout("got ack for seq %llu type %d at %p\n", seq,
-		     le16_to_cpu(m->hdr.type), m);
-		ceph_msg_remove(m);
-	}
-	prepare_read_tag(con);
-}
-
-
-
-
-/*
- * Read (more of) one kvec-backed message section until it holds @sec_len
- * bytes.  section->iov_len tracks how much has arrived so far.  When the
- * final byte arrives, the CRC of the whole section is stored in *@crc.
- *
- * Returns 1 when the section is complete (including when nothing was left
- * to read), or the non-positive result of the receive that stalled.
- */
-static int read_partial_message_section(struct ceph_connection *con,
-					struct kvec *section,
-					unsigned int sec_len, u32 *crc)
-{
-	BUG_ON(!section);
-
-	while (section->iov_len < sec_len) {
-		char *dest;
-		int want;
-		int got;
-
-		BUG_ON(section->iov_base == NULL);
-		want = sec_len - section->iov_len;
-		dest = (char *)section->iov_base + section->iov_len;
-
-		got = ceph_tcp_recvmsg(con->sock, dest, want);
-		if (got <= 0)
-			return got;
-
-		section->iov_len += got;
-		if (section->iov_len == sec_len)
-			*crc = crc32c(0, section->iov_base,
-				      section->iov_len);
-	}
-
-	return 1;
-}
-
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip);
-/*
- * read (part of) a message.
- *
- * The function is re-entered until the whole message has arrived; progress
- * is kept in con->in_base_pos (header and footer), in the iov_len of the
- * front/middle sections and in con->in_msg_pos (page data).
- *
- * Steps: header (CRC-checked), length sanity checks, sequence number
- * check, message allocation, then front, middle, page data and footer,
- * and finally the CRC comparison.  The data CRC is computed and verified
- * unless the messenger's "nocrc" option is set or the sender flagged the
- * footer with CEPH_MSG_FOOTER_NOCRC.
- *
- * Returns 1 when a complete, valid message is in con->in_msg; 0 when the
- * message is being skipped (old seq or the allocator asked to skip) or the
- * socket has no more data; -EBADMSG on a CRC or sequence error; -EIO on
- * an oversized section; -ENOMEM if no message could be allocated; or a
- * negative receive error.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->in_msg;
-	void *p;
-	int ret;
-	int to, left;
-	unsigned front_len, middle_len, data_len, data_off;
-	/* "nocrc" disables the data CRC, so the check is on when it is clear */
-	int datacrc = !con->msgr->nocrc;
-	int skip;
-	u64 seq;
-
-	dout("read_partial_message con %p msg %p\n", con, m);
-
-	/* header */
-	while (con->in_base_pos < sizeof(con->in_hdr)) {
-		left = sizeof(con->in_hdr) - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)&con->in_hdr + con->in_base_pos,
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-		if (con->in_base_pos == sizeof(con->in_hdr)) {
-			/* the crc covers everything but the crc field */
-			u32 crc = crc32c(0, (void *)&con->in_hdr,
-				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
-			if (crc != le32_to_cpu(con->in_hdr.crc)) {
-				pr_err("read_partial_message bad hdr "
-				       " crc %u != expected %u\n",
-				       crc, le32_to_cpu(con->in_hdr.crc));
-				return -EBADMSG;
-			}
-		}
-	}
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-	if (data_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-	data_off = le16_to_cpu(con->in_hdr.data_off);
-
-	/* verify seq# */
-	seq = le64_to_cpu(con->in_hdr.seq);
-	if ((s64)seq - (s64)con->in_seq < 1) {
-		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
-			ENTITY_NAME(con->peer_name),
-			pr_addr(&con->peer_addr.in_addr),
-			seq, con->in_seq + 1);
-		/* a negative in_base_pos makes try_read() discard the body */
-		con->in_base_pos = -front_len - middle_len - data_len -
-			sizeof(m->footer);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
-		return 0;
-	} else if ((s64)seq - (s64)con->in_seq > 1) {
-		pr_err("read_partial_message bad seq %lld expected %lld\n",
-		       seq, con->in_seq + 1);
-		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADMSG;
-	}
-
-	/* allocate message? */
-	if (!con->in_msg) {
-		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-		     con->in_hdr.front_len, con->in_hdr.data_len);
-		skip = 0;
-		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
-		if (skip) {
-			/* skip this message */
-			dout("alloc_msg said skip message\n");
-			BUG_ON(con->in_msg);
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof(m->footer);
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			con->in_seq++;
-			return 0;
-		}
-		if (!con->in_msg) {
-			con->error_msg =
-				"error allocating memory for incoming message";
-			return -ENOMEM;
-		}
-		m = con->in_msg;
-		m->front.iov_len = 0;    /* haven't read it yet */
-		if (m->middle)
-			m->middle->vec.iov_len = 0;
-
-		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-		con->in_msg_pos.data_pos = 0;
-	}
-
-	/* front */
-	ret = read_partial_message_section(con, &m->front, front_len,
-					   &con->in_front_crc);
-	if (ret <= 0)
-		return ret;
-
-	/* middle */
-	if (m->middle) {
-		ret = read_partial_message_section(con, &m->middle->vec,
-						   middle_len,
-						   &con->in_middle_crc);
-		if (ret <= 0)
-			return ret;
-	}
-
-	/* (page) data */
-	while (con->in_msg_pos.data_pos < data_len) {
-		left = min((int)(data_len - con->in_msg_pos.data_pos),
-			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
-		BUG_ON(m->pages == NULL);
-		p = kmap(m->pages[con->in_msg_pos.page]);
-		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
-				       left);
-		if (ret > 0 && datacrc)
-			con->in_data_crc =
-				crc32c(con->in_data_crc,
-				       p + con->in_msg_pos.page_pos, ret);
-		kunmap(m->pages[con->in_msg_pos.page]);
-		if (ret <= 0)
-			return ret;
-		con->in_msg_pos.data_pos += ret;
-		con->in_msg_pos.page_pos += ret;
-		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
-			con->in_msg_pos.page_pos = 0;
-			con->in_msg_pos.page++;
-		}
-	}
-
-	/* footer; in_base_pos keeps counting on from the end of the header */
-	to = sizeof(m->hdr) + sizeof(m->footer);
-	while (con->in_base_pos < to) {
-		left = to - con->in_base_pos;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
-				       (con->in_base_pos - sizeof(m->hdr)),
-				       left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-	     m, front_len, m->footer.front_crc, middle_len,
-	     m->footer.middle_crc, data_len, m->footer.data_crc);
-
-	/* crc ok? */
-	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-		pr_err("read_partial_message %p front crc %u != exp. %u\n",
-		       m, con->in_front_crc,
-		       le32_to_cpu(m->footer.front_crc));
-		return -EBADMSG;
-	}
-	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-		pr_err("read_partial_message %p middle crc %u != exp %u\n",
-		       m, con->in_middle_crc,
-		       le32_to_cpu(m->footer.middle_crc));
-		return -EBADMSG;
-	}
-	if (datacrc &&
-	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-		return -EBADMSG;
-	}
-
-	return 1; /* done! */
-}
-
-/*
- * Process message. This happens in the worker thread. The callback should
- * be careful not to do anything that waits on other incoming messages or it
- * may deadlock.
- *
- * The message is detached from the connection (con->in_msg is cleared) and
- * in_seq is advanced while con->mutex is still held; the mutex is then
- * dropped for the duration of the dispatch callback and re-taken before
- * the connection is set up to read the next tag.
- */
-static void process_message(struct ceph_connection *con)
-{
- struct ceph_msg *msg;
-
- msg = con->in_msg;
- con->in_msg = NULL;
-
- /* if first message, set peer_name */
- if (con->peer_name.type == 0)
- con->peer_name = msg->hdr.src;
-
- con->in_seq++;
- mutex_unlock(&con->mutex);
-
- dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
- msg, le64_to_cpu(msg->hdr.seq),
- ENTITY_NAME(msg->hdr.src),
- le16_to_cpu(msg->hdr.type),
- ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
- le32_to_cpu(msg->hdr.front_len),
- le32_to_cpu(msg->hdr.data_len),
- con->in_front_crc, con->in_middle_crc, con->in_data_crc);
- con->ops->dispatch(con, msg);
-
- mutex_lock(&con->mutex);
- prepare_read_tag(con);
-}
-
-
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- *
- * Opens the socket first if there is none, then drains, in order: pending
- * skip bytes, queued kvecs, and the pages of the current message.  Once
- * those are out (and we are past CONNECTING) the next queued message, a
- * pending ack or a pending keepalive is prepared and the loop repeats.
- * WRITE_PENDING is cleared when nothing is left to write.
- *
- * Returns -1 only when the socket could not be opened.  Every other exit
- * goes through "done", which resets the result to 0, so errors from the
- * write helpers are not reported to the caller.
- */
-static int try_write(struct ceph_connection *con)
-{
-	struct ceph_messenger *msgr = con->msgr;
-	int ret = 1;
-
-	dout("try_write start %p state %lu nref %d\n", con, con->state,
-	     atomic_read(&con->nref));
-
-more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-
-	/* open the socket first? */
-	if (con->sock == NULL) {
-		/*
-		 * if we were STANDBY and are reconnecting _this_
-		 * connection, bump connect_seq now.  Always bump
-		 * global_seq.
-		 */
-		if (test_and_clear_bit(STANDBY, &con->state))
-			con->connect_seq++;
-
-		prepare_write_banner(msgr, con);
-		prepare_write_connect(msgr, con, 1);
-		prepare_read_banner(con);
-		set_bit(CONNECTING, &con->state);
-		clear_bit(NEGOTIATING, &con->state);
-
-		BUG_ON(con->in_msg);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %lu\n",
-		     con, con->state);
-		con->sock = ceph_tcp_connect(con);
-		if (IS_ERR(con->sock)) {
-			con->sock = NULL;
-			con->error_msg = "connect error";
-			ret = -1;
-			goto out;
-		}
-	}
-
-more_kvec:
-	/* kvec data queued? */
-	if (con->out_skip) {
-		ret = write_partial_skip(con);
-		/* log before bailing; this used to sit behind the goto */
-		if (ret < 0)
-			dout("try_write write_partial_skip err %d\n", ret);
-		if (ret <= 0)
-			goto done;
-	}
-	if (con->out_kvec_left) {
-		ret = write_partial_kvec(con);
-		if (ret <= 0)
-			goto done;
-	}
-
-	/* msg pages? */
-	if (con->out_msg) {
-		if (con->out_msg_done) {
-			ceph_msg_put(con->out_msg);
-			con->out_msg = NULL;   /* we're done with this one */
-			goto do_next;
-		}
-
-		ret = write_partial_msg_pages(con);
-		if (ret == 1)
-			goto more_kvec;  /* we need to send the footer, too! */
-		if (ret == 0)
-			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_msg_pages err %d\n",
-			     ret);
-			goto done;
-		}
-	}
-
-do_next:
-	if (!test_bit(CONNECTING, &con->state)) {
-		/* is anything else pending? */
-		if (!list_empty(&con->out_queue)) {
-			prepare_write_message(con);
-			goto more;
-		}
-		if (con->in_seq > con->in_seq_acked) {
-			prepare_write_ack(con);
-			goto more;
-		}
-		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
-	}
-
-	/* Nothing to do! */
-	clear_bit(WRITE_PENDING, &con->state);
-	dout("try_write nothing else to write.\n");
-done:
-	ret = 0;
-out:
-	dout("try_write done on %p\n", con);
-	return ret;
-}
-
-
-
-/*
- * Read what we can from the socket.
- *
- * While CONNECTING this reads and processes the banner (unless already
- * NEGOTIATING) and then the connect reply. Afterwards it loops over:
- * discarding bytes of a skipped message (negative in_base_pos), reading
- * the next tag byte, and reading/processing a message or an ack.
- *
- * Returns 0 when there is no socket, the connection is STANDBY, or no
- * further progress can be made right now; -1 on a handshake failure or an
- * unknown tag; -EIO when read_partial_message() reports -EBADMSG or -EIO.
- * Other non-positive results from the read helpers end up at "done" and
- * are returned as 0.
- */
-static int try_read(struct ceph_connection *con)
-{
- int ret = -1;
-
- if (!con->sock)
- return 0;
-
- if (test_bit(STANDBY, &con->state))
- return 0;
-
- dout("try_read start on %p\n", con);
-
-more:
- dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
- con->in_base_pos);
- if (test_bit(CONNECTING, &con->state)) {
- if (!test_bit(NEGOTIATING, &con->state)) {
- dout("try_read connecting\n");
- ret = read_partial_banner(con);
- if (ret <= 0)
- goto done;
- if (process_banner(con) < 0) {
- ret = -1;
- goto out;
- }
- }
- ret = read_partial_connect(con);
- if (ret <= 0)
- goto done;
- if (process_connect(con) < 0) {
- ret = -1;
- goto out;
- }
- goto more;
- }
-
- if (con->in_base_pos < 0) {
- /*
- * skipping + discarding content.
- *
- * FIXME: there must be a better way to do this!
- */
- static char buf[1024]; /* scratch sink; contents are never used */
- int skip = min(1024, -con->in_base_pos);
- dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
- ret = ceph_tcp_recvmsg(con->sock, buf, skip);
- if (ret <= 0)
- goto done;
- con->in_base_pos += ret;
- if (con->in_base_pos)
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY) {
- /*
- * what's next?
- */
- ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
- if (ret <= 0)
- goto done;
- dout("try_read got tag %d\n", (int)con->in_tag);
- switch (con->in_tag) {
- case CEPH_MSGR_TAG_MSG:
- prepare_read_message(con);
- break;
- case CEPH_MSGR_TAG_ACK:
- prepare_read_ack(con);
- break;
- case CEPH_MSGR_TAG_CLOSE:
- set_bit(CLOSED, &con->state); /* fixme */
- goto done;
- default:
- goto bad_tag;
- }
- }
- if (con->in_tag == CEPH_MSGR_TAG_MSG) {
- ret = read_partial_message(con);
- if (ret <= 0) {
- switch (ret) {
- case -EBADMSG:
- con->error_msg = "bad crc";
- ret = -EIO;
- goto out;
- case -EIO:
- con->error_msg = "io error";
- goto out;
- default:
- goto done;
- }
- }
- /* in_tag back at READY here means no message is left to dispatch */
- if (con->in_tag == CEPH_MSGR_TAG_READY)
- goto more;
- process_message(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_ACK) {
- ret = read_partial_ack(con);
- if (ret <= 0)
- goto done;
- process_ack(con);
- goto more;
- }
-
-done:
- ret = 0;
-out:
- dout("try_read done on %p\n", con);
- return ret;
-
-bad_tag:
- pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
- con->error_msg = "protocol error, garbage tag";
- ret = -1;
- goto out;
-}
-
-
-/*
- * Atomically queue work on a connection.  Bump @con reference to
- * avoid races with connection teardown.
- *
- * There is some trickery going on with QUEUED and BUSY because we
- * only want a _single_ thread operating on each connection at any
- * point in time, but we want to use all available CPUs.
- *
- * The worker thread only proceeds if it can atomically set BUSY.  It
- * clears QUEUED and does its thing.  When it thinks it's done, it
- * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
- * (tries again to set BUSY).
- *
- * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
- * try to queue work.  If that fails (work is already queued, or BUSY)
- * we give up (work also already being done or is queued) but leave QUEUED
- * set so that the worker thread will loop if necessary.
- *
- * The reference taken here is kept only when the work item was actually
- * queued; in every other case it is dropped again before returning.
- */
-static void queue_con(struct ceph_connection *con)
-{
-	if (test_bit(DEAD, &con->state)) {
-		dout("queue_con %p ignoring: DEAD\n",
-		     con);
-		return;
-	}
-
-	if (!con->ops->get(con)) {
-		dout("queue_con %p ref count 0\n", con);
-		return;
-	}
-
-	set_bit(QUEUED, &con->state);
-
-	if (test_bit(BUSY, &con->state)) {
-		dout("queue_con %p - already BUSY\n", con);
-		con->ops->put(con);
-		return;
-	}
-
-	if (queue_work(ceph_msgr_wq, &con->work.work)) {
-		dout("queue_con %p\n", con);
-		return;
-	}
-
-	dout("queue_con %p - already queued\n", con);
-	con->ops->put(con);
-}
-
-/*
- * Do some work on a connection. Drop a connection ref when we're done.
- *
- * Only one worker may operate on a connection at a time: the worker
- * claims BUSY (and gives up if it is already set), clears QUEUED, and
- * does the work under con->mutex. After clearing BUSY it rechecks QUEUED
- * and loops if more work was requested in the meantime -- unless it has
- * just faulted and the connection is not being reopened, in which case
- * QUEUED is simply cleared.
- */
-static void con_work(struct work_struct *work)
-{
- struct ceph_connection *con = container_of(work, struct ceph_connection,
- work.work);
- int backoff = 0; /* set once this invocation has hit the fault path */
-
-more:
- if (test_and_set_bit(BUSY, &con->state) != 0) {
- dout("con_work %p BUSY already set\n", con);
- goto out;
- }
- dout("con_work %p start, clearing QUEUED\n", con);
- clear_bit(QUEUED, &con->state);
-
- mutex_lock(&con->mutex);
-
- if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
- dout("con_work CLOSED\n");
- con_close_socket(con);
- goto done;
- }
- if (test_and_clear_bit(OPENING, &con->state)) {
- /* reopen w/ new peer */
- dout("con_work OPENING\n");
- con_close_socket(con);
- }
-
- /* short-circuit: a closed socket or a read error skips the later steps */
- if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
- try_read(con) < 0 ||
- try_write(con) < 0) {
- /* ceph_fault() takes con->mutex itself, so drop it first */
- mutex_unlock(&con->mutex);
- backoff = 1;
- ceph_fault(con); /* error/fault path */
- goto done_unlocked;
- }
-
-done:
- mutex_unlock(&con->mutex);
-
-done_unlocked:
- clear_bit(BUSY, &con->state);
- dout("con->state=%lu\n", con->state);
- if (test_bit(QUEUED, &con->state)) {
- if (!backoff || test_bit(OPENING, &con->state)) {
- dout("con_work %p QUEUED reset, looping\n", con);
- goto more;
- }
- dout("con_work %p QUEUED reset, but just faulted\n", con);
- clear_bit(QUEUED, &con->state);
- }
- dout("con_work %p done\n", con);
-
-out:
- con->ops->put(con);
-}
-
-
-/*
- * Generic error/fault handler. A retry mechanism is used with
- * exponential backoff
- *
- * Must be called without con->mutex held; the mutex is taken here.
- *
- * For a LOSSYTX connection nothing is torn down or retried; only the
- * callbacks at the end run. Otherwise, unless the connection is already
- * CLOSED, the socket is closed, a partially read message is dropped and
- * all sent-but-unacked messages are moved back to the out queue. With
- * nothing left to send the connection goes to STANDBY; otherwise the work
- * item is re-queued after a delay that starts at BASE_DELAY_INTERVAL and
- * doubles while it is below MAX_DELAY_INTERVAL.
- */
-static void ceph_fault(struct ceph_connection *con)
-{
- pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
- pr_addr(&con->peer_addr.in_addr), con->error_msg);
- dout("fault %p state %lu to peer %s\n",
- con, con->state, pr_addr(&con->peer_addr.in_addr));
-
- if (test_bit(LOSSYTX, &con->state)) {
- dout("fault on LOSSYTX channel\n");
- goto out;
- }
-
- mutex_lock(&con->mutex);
- if (test_bit(CLOSED, &con->state))
- goto out_unlock;
-
- con_close_socket(con);
-
- if (con->in_msg) {
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
- /* Requeue anything that hasn't been acked */
- list_splice_init(&con->out_sent, &con->out_queue);
-
- /* If there are no messages in the queue, place the connection
- * in a STANDBY state (i.e., don't try to reconnect just yet). */
- if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
- dout("fault setting STANDBY\n");
- set_bit(STANDBY, &con->state);
- } else {
- /* retry after a delay. */
- if (con->delay == 0)
- con->delay = BASE_DELAY_INTERVAL;
- else if (con->delay < MAX_DELAY_INTERVAL)
- con->delay *= 2;
- dout("fault queueing %p delay %lu\n", con, con->delay);
- /* the queued work owns a ref; drop it if nothing was queued */
- con->ops->get(con);
- if (queue_delayed_work(ceph_msgr_wq, &con->work,
- round_jiffies_relative(con->delay)) == 0)
- con->ops->put(con);
- }
-
-out_unlock:
- mutex_unlock(&con->mutex);
-out:
- /*
- * in case we faulted due to authentication, invalidate our
- * current tickets so that we can get new ones.
- */
- if (con->auth_retry && con->ops->invalidate_authorizer) {
- dout("calling invalidate_authorizer()\n");
- con->ops->invalidate_authorizer(con);
- }
-
- if (con->ops->fault)
- con->ops->fault(con);
-}
-
-
-
-/*
- * Create a new messenger instance.
- *
- * @myaddr, if non-NULL, is copied in as our own address; the address type
- * is then cleared and a random nonce is chosen.  A zeroed page is
- * allocated and kept mapped for the lifetime of the messenger.
- *
- * Returns the new messenger, or ERR_PTR(-ENOMEM) if either allocation
- * fails.
- */
-struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
-{
-	struct ceph_messenger *msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
-
-	if (!msgr)
-		goto fail;
-
-	spin_lock_init(&msgr->global_seq_lock);
-
-	/* the zero page is needed if a request is "canceled" while the message
-	 * is being written over the socket */
-	msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
-	if (!msgr->zero_page)
-		goto fail_free;
-	kmap(msgr->zero_page);
-
-	if (myaddr)
-		msgr->inst.addr = *myaddr;
-
-	/* select a random nonce */
-	msgr->inst.addr.type = 0;
-	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
-	encode_my_addr(msgr);
-
-	dout("messenger_create %p\n", msgr);
-	return msgr;
-
-fail_free:
-	kfree(msgr);
-fail:
-	return ERR_PTR(-ENOMEM);
-}
-
-/*
- * Tear down a messenger: unmap and free its zero page, then free the
- * messenger itself.
- */
-void ceph_messenger_destroy(struct ceph_messenger *msgr)
-{
-	struct page *zero = msgr->zero_page;
-
-	dout("destroy %p\n", msgr);
-
-	kunmap(zero);
-	__free_page(zero);
-	kfree(msgr);
-
-	dout("destroyed messenger %p\n", msgr);
-}
-
-/*
- * Queue up an outgoing message on the given connection.
- *
- * The message reference passed in is consumed: on a CLOSED connection the
- * message is dropped immediately, otherwise it is appended to the out
- * queue.  The message must not already be on a list, and its front length
- * must agree with the header.
- */
-void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int was_pending;
-
-	if (test_bit(CLOSED, &con->state)) {
-		dout("con_send %p closed, dropping %p\n", con, msg);
-		ceph_msg_put(msg);
-		return;
-	}
-
-	/* stamp the message with our own name as the source */
-	msg->hdr.src = con->msgr->inst.name;
-
-	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
-
-	msg->needs_out_seq = true;
-
-	/* queue */
-	mutex_lock(&con->mutex);
-	BUG_ON(!list_empty(&msg->list_head));
-	list_add_tail(&msg->list_head, &con->out_queue);
-	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
-	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
-	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
-	     le32_to_cpu(msg->hdr.front_len),
-	     le32_to_cpu(msg->hdr.middle_len),
-	     le32_to_cpu(msg->hdr.data_len));
-	mutex_unlock(&con->mutex);
-
-	/* if there wasn't anything waiting to send before, queue
-	 * new work */
-	was_pending = test_and_set_bit(WRITE_PENDING, &con->state);
-	if (!was_pending)
-		queue_con(con);
-}
-
-/*
- * Revoke a message that was previously queued for send.
- *
- * If the message is still on one of the connection's lists it is unlinked
- * and the list's reference is dropped.  If it is the message currently
- * being written, it is detached from the connection and, when its kvecs
- * are what is queued, the bytes not yet sent are turned into skip bytes;
- * that reference is dropped as well.
- *
- * In both cases the sequence number is cleared *before* the reference is
- * dropped: the put may release the last reference, after which the
- * message must not be touched.
- */
-void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	mutex_lock(&con->mutex);
-	if (!list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p - was on queue\n", con, msg);
-		list_del_init(&msg->list_head);
-		msg->hdr.seq = 0;
-		ceph_msg_put(msg);
-	}
-	if (con->out_msg == msg) {
-		dout("con_revoke %p msg %p - was sending\n", con, msg);
-		con->out_msg = NULL;
-		if (con->out_kvec_is_msg) {
-			con->out_skip = con->out_kvec_bytes;
-			con->out_kvec_is_msg = false;
-		}
-		msg->hdr.seq = 0;
-		ceph_msg_put(msg);
-	}
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Revoke a message that we may be reading data into.
- *
- * If @msg is the message currently being received, drop it and arrange
- * for the rest of it to be skipped: in_base_pos is lowered by the total
- * wire size of the message (header, front, middle, data, footer) and
- * in_seq is advanced as though the message had been received.
- * Otherwise this is a no-op.
- */
-void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	unsigned front_len, middle_len, data_len;
-
-	mutex_lock(&con->mutex);
-
-	if (!con->in_msg || con->in_msg != msg) {
-		dout("con_revoke_pages %p msg %p pages %p no-op\n",
-		     con, con->in_msg, msg);
-		goto unlock;
-	}
-
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-
-	/* skip rest of message */
-	dout("con_revoke_pages %p msg %p revoked\n", con, msg);
-	con->in_base_pos = con->in_base_pos -
-		sizeof(struct ceph_msg_header) -
-		front_len -
-		middle_len -
-		data_len -
-		sizeof(struct ceph_msg_footer);
-	ceph_msg_put(con->in_msg);
-	con->in_msg = NULL;
-	con->in_tag = CEPH_MSGR_TAG_READY;
-	con->in_seq++;
-
-unlock:
-	mutex_unlock(&con->mutex);
-}
-
-/*
- * Queue a keepalive byte to ensure the tcp connection is alive.
- *
- * Work is queued only if neither a keepalive nor any other write was
- * already pending.
- */
-void ceph_con_keepalive(struct ceph_connection *con)
-{
-	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state))
-		return;		/* a keepalive is already on its way */
-	if (test_and_set_bit(WRITE_PENDING, &con->state))
-		return;		/* the writer is already scheduled */
-	queue_con(con);
-}
-
-
-/*
- * construct a new message with given type, size
- * the new msg has a ref count of 1.
- *
- * @type:      message type, stored little-endian in the header
- * @front_len: size of the front section; buffers larger than a page are
- *             vmalloc'ed, smaller ones kmalloc'ed, and 0 means no buffer
- * @flags:     allocation flags used for the message and its front
- *
- * Returns the message, or NULL if memory could not be allocated.
- */
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-{
-	struct ceph_msg *m;
-
-	m = kmalloc(sizeof(*m), flags);
-	if (m == NULL)
-		goto out;
-	kref_init(&m->kref);
-	INIT_LIST_HEAD(&m->list_head);
-
-	m->hdr.tid = 0;
-	m->hdr.type = cpu_to_le16(type);
-	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
-	m->hdr.version = 0;
-	m->hdr.front_len = cpu_to_le32(front_len);
-	m->hdr.middle_len = 0;
-	m->hdr.data_len = 0;
-	m->hdr.data_off = 0;
-	m->hdr.reserved = 0;
-	m->footer.front_crc = 0;
-	m->footer.middle_crc = 0;
-	m->footer.data_crc = 0;
-	m->footer.flags = 0;
-	m->front_max = front_len;
-	m->front_is_vmalloc = false;
-	m->more_to_follow = false;
-	m->pool = NULL;
-
-	/*
-	 * The message comes from kmalloc(), so nothing is zeroed for us.
-	 * Set up the middle and data fields before anything can fail: the
-	 * error path below releases the message through ceph_msg_put(),
-	 * which must not be handed a message with uninitialized pointers.
-	 */
-
-	/* middle */
-	m->middle = NULL;
-
-	/* data */
-	m->nr_pages = 0;
-	m->pages = NULL;
-	m->pagelist = NULL;
-
-	/* front */
-	if (front_len) {
-		if (front_len > PAGE_CACHE_SIZE) {
-			m->front.iov_base = __vmalloc(front_len, flags,
-						      PAGE_KERNEL);
-			m->front_is_vmalloc = true;
-		} else {
-			m->front.iov_base = kmalloc(front_len, flags);
-		}
-		if (m->front.iov_base == NULL) {
-			pr_err("msg_new can't allocate %d bytes\n",
-			     front_len);
-			goto out2;
-		}
-	} else {
-		m->front.iov_base = NULL;
-	}
-	m->front.iov_len = front_len;
-
-	dout("ceph_msg_new %p front %d\n", m, front_len);
-	return m;
-
-out2:
-	ceph_msg_put(m);
-out:
-	pr_err("msg_new can't create type %d front %d\n", type, front_len);
-	return NULL;
-}
-
-/*
- * Allocate the "middle" portion of a message when the header announces
- * one and alloc_msg did not already provide it.  Doing this separately
- * lets us read a small fixed-size per-type header into the front and
- * then fail gracefully (i.e., report the error to the caller based on
- * info in the front) when the middle is too large.
- *
- * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
- * Calling this with a zero middle_len or an existing middle is a BUG.
- */
-static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int msg_type = le16_to_cpu(msg->hdr.type);
-	int len = le32_to_cpu(msg->hdr.middle_len);
-
-	dout("alloc_middle %p type %d %s middle_len %d\n", msg, msg_type,
-	     ceph_msg_type_name(msg_type), len);
-	BUG_ON(!len);
-	BUG_ON(msg->middle);
-
-	msg->middle = ceph_buffer_new(len, GFP_NOFS);
-	return msg->middle ? 0 : -ENOMEM;
-}
-
-/*
- * Generic message allocator, for incoming messages.
- *
- * When the connection provides an alloc_msg callback it is invoked with
- * con->mutex dropped; a NULL result or a set *skip makes this function
- * return NULL.  Without a callback, *skip is cleared and a plain
- * message sized for the announced front is allocated.  In both cases
- * the received header is copied into the message and a middle buffer
- * is added if the header announces one and none is attached yet.
- */
-static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				       struct ceph_msg_header *hdr,
-				       int *skip)
-{
-	/* sample the header fields up front, before the mutex is dropped */
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	int middle_len = le32_to_cpu(hdr->middle_len);
-	struct ceph_msg *msg;
-
-	if (con->ops->alloc_msg) {
-		mutex_unlock(&con->mutex);
-		msg = con->ops->alloc_msg(con, hdr, skip);
-		mutex_lock(&con->mutex);
-		if (!msg || *skip)
-			return NULL;
-	} else {
-		*skip = 0;
-		msg = ceph_msg_new(type, front_len, GFP_NOFS);
-		if (!msg) {
-			pr_err("unable to allocate msg type %d len %d\n",
-			       type, front_len);
-			return NULL;
-		}
-	}
-	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-
-	if (middle_len && !msg->middle &&
-	    ceph_alloc_middle(con, msg) < 0) {
-		ceph_msg_put(msg);
-		return NULL;
-	}
-
-	return msg;
-}
-
-
-/*
- * Free a generically kmalloc'd message: release the front buffer with
- * the call matching how it was allocated (see front_is_vmalloc), then
- * the message structure itself.
- */
-void ceph_msg_kfree(struct ceph_msg *m)
-{
-	void *front = m->front.iov_base;
-
-	dout("msg_kfree %p\n", m);
-	if (!m->front_is_vmalloc)
-		kfree(front);
-	else
-		vfree(front);
-	kfree(m);
-}
-
-/*
- * kref release callback for a message (see ceph_msg_put()).
- *
- * Drops the middle buffer and the pagelist if present, forgets the
- * page array (which the message does not own), and then either hands
- * the message back to its msgpool or frees it.
- */
-void ceph_msg_last_put(struct kref *kref)
-{
-	struct ceph_msg *msg = container_of(kref, struct ceph_msg, kref);
-
-	dout("ceph_msg_put last one on %p\n", msg);
-	WARN_ON(!list_empty(&msg->list_head));
-
-	/* drop middle, data, if any */
-	if (msg->middle != NULL) {
-		ceph_buffer_put(msg->middle);
-		msg->middle = NULL;
-	}
-	msg->nr_pages = 0;
-	msg->pages = NULL;
-
-	if (msg->pagelist != NULL) {
-		ceph_pagelist_release(msg->pagelist);
-		kfree(msg->pagelist);
-		msg->pagelist = NULL;
-	}
-
-	if (!msg->pool) {
-		ceph_msg_kfree(msg);
-		return;
-	}
-	ceph_msgpool_put(msg->pool, msg);
-}
-
-/*
- * Dump a message at debug level: a one-line summary followed by hex
- * dumps of the header, the front, the middle (if any) and the footer.
- */
-void ceph_msg_dump(struct ceph_msg *msg)
-{
-	struct kvec *front = &msg->front;
-	struct ceph_buffer *middle = msg->middle;
-
-	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
-		 msg->front_max, msg->nr_pages);
-	print_hex_dump(KERN_DEBUG, "header: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->hdr, sizeof(msg->hdr), true);
-	print_hex_dump(KERN_DEBUG, " front: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       front->iov_base, front->iov_len, true);
-	if (middle)
-		print_hex_dump(KERN_DEBUG, "middle: ",
-			       DUMP_PREFIX_OFFSET, 16, 1,
-			       middle->vec.iov_base,
-			       middle->vec.iov_len, true);
-	print_hex_dump(KERN_DEBUG, "footer: ",
-		       DUMP_PREFIX_OFFSET, 16, 1,
-		       &msg->footer, sizeof(msg->footer), true);
-}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
deleted file mode 100644
index 76fbc957bc13..000000000000
--- a/fs/ceph/messenger.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __FS_CEPH_MESSENGER_H
-#define __FS_CEPH_MESSENGER_H
-
-#include <linux/kref.h>
-#include <linux/mutex.h>
-#include <linux/net.h>
-#include <linux/radix-tree.h>
-#include <linux/uio.h>
-#include <linux/version.h>
-#include <linux/workqueue.h>
-
-#include "types.h"
-#include "buffer.h"
-
-struct ceph_msg;
-struct ceph_connection;
-
-extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
-
-/*
- * Ceph defines these callbacks for handling connection events.
- * The owner of a connection installs a table of these in
- * ceph_connection.ops.
- */
-struct ceph_connection_operations {
- /* presumably take/drop a reference on the connection's owner --
- * confirm against the implementations */
- struct ceph_connection *(*get)(struct ceph_connection *);
- void (*put)(struct ceph_connection *);
-
- /* handle an incoming message. */
- void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
-
- /* authorize an outgoing connection */
- int (*get_authorizer) (struct ceph_connection *con,
- void **buf, int *len, int *proto,
- void **reply_buf, int *reply_len, int force_new);
- int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
- int (*invalidate_authorizer)(struct ceph_connection *con);
-
- /* protocol version mismatch */
- void (*bad_proto) (struct ceph_connection *con);
-
- /* there was some error on the socket (disconnect, whatever) */
- void (*fault) (struct ceph_connection *con);
-
- /* a remote host has terminated a message exchange session, and messages
- * we sent (or they tried to send us) may be lost. */
- void (*peer_reset) (struct ceph_connection *con);
-
- /* optional: provide the message an incoming header should be read
- * into. Called by ceph_alloc_msg() with con->mutex dropped; a NULL
- * return or a set *skip makes ceph_alloc_msg() return NULL. */
- struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip);
-};
-
-/*
- * use format string %s%d
- *
- * Expands to TWO printf arguments: the entity type name and the entity
- * number.
- * NOTE(review): the second argument is the result of le64_to_cpu(), so
- * a 64-bit conversion may be required instead of %d -- confirm.
- */
-#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
-
-/*
- * Per-client messenger state; connections refer to it through
- * ceph_connection.msgr.
- */
-struct ceph_messenger {
- struct ceph_entity_inst inst; /* my name+address */
- struct ceph_entity_addr my_enc_addr;
- struct page *zero_page; /* used in certain error cases */
-
- bool nocrc;
-
- /*
- * the global_seq counts connections I (attempt to) initiate
- * in order to disambiguate certain connect race conditions.
- */
- u32 global_seq;
- spinlock_t global_seq_lock;
-};
-
-/*
- * a single message. it contains a header (src, dest, message type, etc.),
- * footer (crc values, mainly), a "front" message body, and possibly a
- * data payload (stored in some number of pages).
- *
- * Messages are reference counted through kref; use ceph_msg_get() and
- * ceph_msg_put(). The final put runs ceph_msg_last_put().
- */
-struct ceph_msg {
- struct ceph_msg_header hdr; /* header */
- struct ceph_msg_footer footer; /* footer */
- struct kvec front; /* unaligned blobs of message */
- struct ceph_buffer *middle; /* optional; dropped on the final put */
- struct page **pages; /* data payload. NOT OWNER. */
- unsigned nr_pages; /* size of page array */
- struct ceph_pagelist *pagelist; /* instead of pages */
- struct list_head list_head; /* expected to be empty on the final put */
- struct kref kref;
- bool front_is_vmalloc; /* front came from __vmalloc(), not kmalloc() */
- bool more_to_follow;
- bool needs_out_seq;
- int front_max; /* front size requested at allocation */
-
- struct ceph_msgpool *pool; /* if set, the final put returns the msg here */
-};
-
-/*
- * Position within a message's data payload; ceph_connection keeps one
- * for the outgoing message (out_msg_pos) and one for the incoming
- * message (in_msg_pos).
- */
-struct ceph_msg_pos {
- int page, page_pos; /* which page; offset in page */
- int data_pos; /* offset in data payload */
- int did_page_crc; /* true if we've calculated crc for current page */
-};
-
-/* ceph connection fault delay defaults, for exponential backoff
- * (in jiffies: half a second up to five minutes) */
-#define BASE_DELAY_INTERVAL (HZ/2)
-#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
-
-/*
- * ceph_connection state bit flags
- *
- * QUEUED and BUSY are used together to ensure that only a single
- * thread is currently opening, reading or writing data to the socket.
- *
- * The values are bit numbers within ceph_connection.state and are used
- * with the bit operations (e.g. test_and_set_bit() in
- * ceph_con_keepalive()), not as masks. Bits 7, 9 and 12 are not
- * assigned here.
- */
-#define LOSSYTX 0 /* we can close channel or drop messages on errors */
-#define CONNECTING 1
-#define NEGOTIATING 2
-#define KEEPALIVE_PENDING 3
-#define WRITE_PENDING 4 /* we have data ready to send */
-#define QUEUED 5 /* there is work queued on this connection */
-#define BUSY 6 /* work is being done */
-#define STANDBY 8 /* no outgoing messages, socket closed. we keep
- * the ceph_connection around to maintain shared
- * state with the peer. */
-#define CLOSED 10 /* we've closed the connection */
-#define SOCK_CLOSED 11 /* socket state changed to closed */
-#define OPENING 13 /* open connection w/ (possibly new) peer */
-#define DEAD 14 /* dead, about to kfree */
-
-/*
- * A single connection with another host.
- *
- * We maintain a queue of outgoing messages, and some session state to
- * ensure that we can preserve the lossless, ordered delivery of
- * messages in the case of a TCP disconnect.
- */
-struct ceph_connection {
- void *private; /* owner's context (the mon client stores its
- * ceph_mon_client here) */
- atomic_t nref;
-
- const struct ceph_connection_operations *ops; /* owner's callbacks */
-
- struct ceph_messenger *msgr;
- struct socket *sock;
- unsigned long state; /* connection state (see flags above) */
- const char *error_msg; /* error message, if any */
-
- struct ceph_entity_addr peer_addr; /* peer address */
- struct ceph_entity_name peer_name; /* peer name */
- struct ceph_entity_addr peer_addr_for_me;
- unsigned peer_features;
- u32 connect_seq; /* identify the most recent connection
- attempt for this connection, client */
- u32 peer_global_seq; /* peer's global seq for this connection */
-
- int auth_retry; /* true if we need a newer authorizer */
- void *auth_reply_buf; /* where to put the authorizer reply */
- int auth_reply_buf_len;
-
- struct mutex mutex; /* held by the revoke helpers while they touch
- * the in/out message state */
-
- /* out queue */
- struct list_head out_queue;
- struct list_head out_sent; /* sending or sent but unacked */
- u64 out_seq; /* last message queued for send */
- bool out_keepalive_pending;
-
- u64 in_seq, in_seq_acked; /* last message received, acked */
-
- /* connection negotiation temps */
- char in_banner[CEPH_BANNER_MAX_LEN];
- union {
- struct { /* outgoing connection */
- struct ceph_msg_connect out_connect;
- struct ceph_msg_connect_reply in_reply;
- };
- struct { /* incoming */
- struct ceph_msg_connect in_connect;
- struct ceph_msg_connect_reply out_reply;
- };
- };
- struct ceph_entity_addr actual_peer_addr;
-
- /* message out temps */
- struct ceph_msg *out_msg; /* sending message (== tail of
- out_sent) */
- bool out_msg_done;
- struct ceph_msg_pos out_msg_pos;
-
- struct kvec out_kvec[8], /* sending header/footer data */
- *out_kvec_cur;
- int out_kvec_left; /* kvec's left in out_kvec */
- int out_skip; /* skip this many bytes */
- int out_kvec_bytes; /* total bytes left */
- bool out_kvec_is_msg; /* kvec refers to out_msg */
- int out_more; /* there is more data after the kvecs */
- __le64 out_temp_ack; /* for writing an ack */
-
- /* message in temps */
- struct ceph_msg_header in_hdr; /* header of the incoming message */
- struct ceph_msg *in_msg; /* message being read into, if any */
- struct ceph_msg_pos in_msg_pos;
- u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
-
- char in_tag; /* protocol control byte */
- int in_base_pos; /* bytes read; ceph_con_revoke_message() lowers
- * it by a message's wire size to skip the rest */
- __le64 in_temp_ack; /* for reading an ack */
-
- struct delayed_work work; /* send|recv work */
- unsigned long delay; /* current delay interval */
-};
-
-
-/* address helpers */
-extern const char *pr_addr(const struct sockaddr_storage *ss);
-extern int ceph_parse_ips(const char *c, const char *end,
- struct ceph_entity_addr *addr,
- int max_count, int *count);
-
-
-/* messenger module setup/teardown */
-extern int ceph_msgr_init(void);
-extern void ceph_msgr_exit(void);
-extern void ceph_msgr_flush(void);
-
-extern struct ceph_messenger *ceph_messenger_create(
- struct ceph_entity_addr *myaddr);
-extern void ceph_messenger_destroy(struct ceph_messenger *);
-
-/* connection life cycle and message submission */
-extern void ceph_con_init(struct ceph_messenger *msgr,
- struct ceph_connection *con);
-extern void ceph_con_open(struct ceph_connection *con,
- struct ceph_entity_addr *addr);
-extern bool ceph_con_opened(struct ceph_connection *con);
-extern void ceph_con_close(struct ceph_connection *con);
-extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
-/* revoke an outgoing message */
-extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-/* revoke a message that we may be reading data into */
-extern void ceph_con_revoke_message(struct ceph_connection *con,
- struct ceph_msg *msg);
-/* queue a keepalive byte to ensure the tcp connection is alive */
-extern void ceph_con_keepalive(struct ceph_connection *con);
-extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
-extern void ceph_con_put(struct ceph_connection *con);
-
-/* allocate a message with a ref count of 1; returns NULL on failure */
-extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-/* free a message that does not belong to a msgpool */
-extern void ceph_msg_kfree(struct ceph_msg *m);
-
-/* Take an additional reference on @msg; returns @msg for call chaining. */
-static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
-{
- kref_get(&msg->kref);
- return msg;
-}
-/* kref release callback; runs when the last reference is dropped */
-extern void ceph_msg_last_put(struct kref *kref);
-/* Drop a reference on @msg; the last put releases it via ceph_msg_last_put(). */
-static inline void ceph_msg_put(struct ceph_msg *msg)
-{
- kref_put(&msg->kref, ceph_msg_last_put);
-}
-
-extern void ceph_msg_dump(struct ceph_msg *msg);
-
-#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
deleted file mode 100644
index b2a5a3e4a671..000000000000
--- a/fs/ceph/mon_client.c
+++ /dev/null
@@ -1,1018 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/sched.h>
-
-#include "mon_client.h"
-#include "super.h"
-#include "auth.h"
-#include "decode.h"
-
-/*
- * Interact with Ceph monitor cluster. Handle requests for new map
- * versions, and periodically resend as needed. Also implement
- * statfs() and umount().
- *
- * A small cluster of Ceph "monitors" are responsible for managing critical
- * cluster configuration and state information. An odd number (e.g., 3, 5)
- * of cmon daemons use a modified version of the Paxos part-time parliament
- * algorithm to manage the MDS map (mds cluster membership), OSD map, and
- * list of clients who have mounted the file system.
- *
- * We maintain an open, active session with a monitor at all times in order to
- * receive timely MDSMap updates. We periodically send a keepalive byte on the
- * TCP socket to ensure we detect a failure. If the connection does break, we
- * randomly hunt for a new monitor. Once the connection is reestablished, we
- * resend any outstanding requests.
- */
-
-/*
- * Connection callbacks for the monitor connection; installed on
- * monc->con by ceph_monc_open_session(). This is only a forward
- * declaration -- presumably the table is filled in further down.
- */
-static const struct ceph_connection_operations mon_con_ops;
-
-/* forward declaration; called from delayed_work() */
-static int __validate_auth(struct ceph_mon_client *monc);
-
-/*
- * Decode a monmap blob (e.g., during mount).
- *
- * The blob holds a 32-bit length, a 16-bit version, the fsid, the
- * epoch, the monitor count and one instance record per monitor.
- *
- * Returns a newly allocated monmap that the caller must kfree(), or an
- * ERR_PTR: -EINVAL for a short or oversized blob, -ENOMEM if the
- * allocation fails.
- */
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
-{
-	struct ceph_monmap *monmap = NULL;
-	struct ceph_fsid fsid;
-	u32 blob_len, epoch, num_mon;
-	u16 version;
-	int err = -EINVAL;
-	int idx;
-
-	ceph_decode_32_safe(&p, end, blob_len, bad);
-	ceph_decode_need(&p, end, blob_len, bad);
-
-	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
-
-	ceph_decode_16_safe(&p, end, version, bad);
-
-	/* fsid, epoch and monitor count are fixed size */
-	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(&p);
-
-	num_mon = ceph_decode_32(&p);
-	ceph_decode_need(&p, end, num_mon*sizeof(monmap->mon_inst[0]), bad);
-
-	if (num_mon >= CEPH_MAX_MON)
-		goto bad;
-	monmap = kmalloc(sizeof(*monmap) +
-			 sizeof(monmap->mon_inst[0])*num_mon, GFP_NOFS);
-	if (!monmap)
-		return ERR_PTR(-ENOMEM);
-	monmap->fsid = fsid;
-	monmap->epoch = epoch;
-	monmap->num_mon = num_mon;
-	ceph_decode_copy(&p, monmap->mon_inst,
-			 num_mon*sizeof(monmap->mon_inst[0]));
-	for (idx = 0; idx < num_mon; idx++)
-		ceph_decode_addr(&monmap->mon_inst[idx].addr);
-
-	dout("monmap_decode epoch %d, num_mon %d\n", monmap->epoch,
-	     monmap->num_mon);
-	for (idx = 0; idx < monmap->num_mon; idx++)
-		dout("monmap_decode mon%d is %s\n", idx,
-		     pr_addr(&monmap->mon_inst[idx].addr.in_addr));
-	return monmap;
-
-bad:
-	dout("monmap_decode failed with %d\n", err);
-	kfree(monmap);
-	return ERR_PTR(err);
-}
-
-/*
- * Return 1 if *addr is byte-for-byte equal to the address of one of the
- * monitors in the monmap, 0 otherwise.
- */
-int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
-{
-	int found = 0;
-	int i;
-
-	for (i = 0; !found && i < m->num_mon; i++)
-		found = memcmp(addr, &m->mon_inst[i].addr,
-			       sizeof(*addr)) == 0;
-	return found;
-}
-
-/*
- * Send the auth request already encoded in monc->m_auth.
- *
- * @len is the number of valid bytes in the message front.  Any copy of
- * m_auth still known to the connection is revoked first, and an extra
- * reference is taken for the send so that monc keeps its own.
- */
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-	struct ceph_msg *auth_msg = monc->m_auth;
-
-	monc->pending_auth = 1;
-	auth_msg->front.iov_len = len;
-	auth_msg->hdr.front_len = cpu_to_le32(len);
-	ceph_con_revoke(monc->con, auth_msg);
-	ceph_msg_get(auth_msg);  /* keep our ref */
-	ceph_con_send(monc->con, auth_msg);
-}
-
-/*
- * Close monitor session, if any: revoke a pending auth message, close
- * the connection, forget the current monitor and reset the auth state.
- * Does nothing when no connection has been set up yet.
- */
-static void __close_session(struct ceph_mon_client *monc)
-{
-	if (!monc->con)
-		return;
-
-	dout("__close_session closing mon%d\n", monc->cur_mon);
-	ceph_con_revoke(monc->con, monc->m_auth);
-	ceph_con_close(monc->con);
-	monc->cur_mon = -1;
-	monc->pending_auth = 0;
-	ceph_auth_reset(monc->auth);
-}
-
-/*
- * Open a session with a (new) monitor.
- *
- * If no monitor is currently selected (cur_mon < 0), pick one at random
- * from the monmap, reset the subscription state, open the connection
- * and start the authentication handshake.  If a monitor is already
- * selected this only logs that fact.
- *
- * Returns 0 on success, or the negative value returned by
- * ceph_auth_build_hello() if the hello could not be built.  In that
- * case nothing is sent; cur_mon stays set, so a later
- * __close_session()/__open_session() cycle starts over.
- */
-static int __open_session(struct ceph_mon_client *monc)
-{
-	/* unsigned, so the modulo below can never see a negative value */
-	unsigned char r;
-	int ret;
-
-	if (monc->cur_mon >= 0) {
-		dout("open_session mon%d already open\n", monc->cur_mon);
-		return 0;
-	}
-
-	get_random_bytes(&r, 1);
-	monc->cur_mon = r % monc->monmap->num_mon;
-	dout("open_session num=%d r=%d -> mon%d\n",
-	     monc->monmap->num_mon, r, monc->cur_mon);
-	monc->sub_sent = 0;
-	monc->sub_renew_after = jiffies;  /* i.e., expired */
-	/* collapse "requested" (2) back to "wanted" (1) for the new session */
-	monc->want_next_osdmap = !!monc->want_next_osdmap;
-
-	dout("open_session mon%d opening\n", monc->cur_mon);
-	monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
-	monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
-	ceph_con_open(monc->con,
-		      &monc->monmap->mon_inst[monc->cur_mon].addr);
-
-	/* initiate authentication handshake */
-	ret = ceph_auth_build_hello(monc->auth,
-				    monc->m_auth->front.iov_base,
-				    monc->m_auth->front_max);
-	if (ret < 0) {
-		/* never pass an error code on as a message length */
-		pr_err("error %d building auth hello for mon%d\n",
-		       ret, monc->cur_mon);
-		return ret;
-	}
-	__send_prepared_auth_request(monc, ret);
-	return 0;
-}
-
-/*
- * Return true once jiffies has reached the subscription renewal time.
- */
-static bool __sub_expired(struct ceph_mon_client *monc)
-{
-	if (time_after_eq(jiffies, monc->sub_renew_after))
-		return true;
-	return false;
-}
-
-/*
- * Reschedule delayed work timer: after 10 seconds while there is no
- * current monitor or the subscription has expired, after 20 seconds
- * otherwise.
- */
-static void __schedule_delayed(struct ceph_mon_client *monc)
-{
-	unsigned delay = 20 * HZ;
-
-	if (monc->cur_mon < 0 || __sub_expired(monc))
-		delay = 10 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
-}
-
-/*
- * Send subscribe request for mdsmap and/or osdmap.
- *
- * A request goes out when the subscription has expired and none is in
- * flight, or when the next osdmap is wanted but not requested yet
- * (want_next_osdmap == 1).  The message carries a count followed by
- * (name, item) pairs: optionally a one-time "osdmap" entry, then
- * "mdsmap" and "monmap".
- */
-static void __send_subscribe(struct ceph_mon_client *monc)
-{
-	struct ceph_mon_subscribe_item *item;
-	struct ceph_msg *msg;
-	void *p, *end;
-
-	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
-	     (unsigned)monc->sub_sent, __sub_expired(monc),
-	     monc->want_next_osdmap);
-
-	/* nothing to renew and no osdmap to ask for */
-	if (!(__sub_expired(monc) && !monc->sub_sent) &&
-	    monc->want_next_osdmap != 1)
-		return;
-
-	msg = monc->m_subscribe;
-	p = msg->front.iov_base;
-	end = p + msg->front_max;
-
-	dout("__send_subscribe to 'mdsmap' %u+\n",
-	     (unsigned)monc->have_mdsmap);
-	if (!monc->want_next_osdmap) {
-		ceph_encode_32(&p, 2);
-	} else {
-		dout("__send_subscribe to 'osdmap' %u\n",
-		     (unsigned)monc->have_osdmap);
-		ceph_encode_32(&p, 3);
-		ceph_encode_string(&p, end, "osdmap", 6);
-		item = p;
-		item->have = cpu_to_le64(monc->have_osdmap);
-		item->onetime = 1;
-		p += sizeof(*item);
-		monc->want_next_osdmap = 2;  /* requested */
-	}
-
-	ceph_encode_string(&p, end, "mdsmap", 6);
-	item = p;
-	item->have = cpu_to_le64(monc->have_mdsmap);
-	item->onetime = 0;
-	p += sizeof(*item);
-
-	ceph_encode_string(&p, end, "monmap", 6);
-	item = p;
-	item->have = 0;
-	item->onetime = 0;
-	p += sizeof(*item);
-
-	msg->front.iov_len = p - msg->front.iov_base;
-	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-	ceph_con_revoke(monc->con, msg);
-	ceph_con_send(monc->con, ceph_msg_get(msg));
-
-	monc->sub_sent = jiffies | 1;  /* never 0 */
-}
-
-/*
- * Handle a subscribe ack from the monitor.
- *
- * Ends the hunt for a monitor if one was in progress, schedules the
- * renewal at half of the granted duration after the request was sent,
- * and marks the request as no longer outstanding.  A message too short
- * to hold the ack is reported and dumped.
- */
-static void handle_subscribe_ack(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	struct ceph_mon_subscribe_ack *ack = msg->front.iov_base;
-	unsigned seconds;
-
-	if (msg->front.iov_len < sizeof(*ack)) {
-		pr_err("got corrupt subscribe-ack msg\n");
-		ceph_msg_dump(msg);
-		return;
-	}
-	seconds = le32_to_cpu(ack->duration);
-
-	mutex_lock(&monc->mutex);
-	if (monc->hunting) {
-		pr_info("mon%d %s session established\n",
-			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
-		monc->hunting = false;
-	}
-	dout("handle_subscribe_ack after %d seconds\n", seconds);
-	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
-	monc->sub_sent = 0;
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * Keep track of which maps we have.
- *
- * Record @got as the mdsmap version we now hold.  Always returns 0.
- */
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
-{
-	struct mutex *lock = &monc->mutex;
-
-	mutex_lock(lock);
-	monc->have_mdsmap = got;
-	mutex_unlock(lock);
-
-	return 0;
-}
-
-/*
- * Record @got as the osdmap version we now hold and clear the pending
- * "want next osdmap" state.  Always returns 0.
- */
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
-{
-	struct mutex *lock = &monc->mutex;
-
-	mutex_lock(lock);
-	monc->have_osdmap = got;
-	monc->want_next_osdmap = 0;
-	mutex_unlock(lock);
-
-	return 0;
-}
-
-/*
- * Register interest in the next osdmap.
- *
- * want_next_osdmap moves from 0 (not wanted) to 1 (wanted); a subscribe
- * is attempted unless the map has already been requested (2).
- */
-void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
-{
-	dout("request_next_osdmap have %u\n", monc->have_osdmap);
-
-	mutex_lock(&monc->mutex);
-	if (monc->want_next_osdmap == 0)
-		monc->want_next_osdmap = 1;
-	if (monc->want_next_osdmap <= 1)
-		__send_subscribe(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * Open the monitor session.
- *
- * The connection object is allocated and wired up to the monitor
- * callbacks on first use; afterwards a session is opened and the
- * delayed work is scheduled.
- *
- * Returns 0, or -ENOMEM if the connection cannot be allocated.
- */
-int ceph_monc_open_session(struct ceph_mon_client *monc)
-{
-	struct ceph_connection *con = monc->con;
-
-	if (con == NULL) {
-		con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
-		monc->con = con;
-		if (con == NULL)
-			return -ENOMEM;
-		ceph_con_init(monc->client->msgr, con);
-		con->private = monc;
-		con->ops = &mon_con_ops;
-	}
-
-	mutex_lock(&monc->mutex);
-	__open_session(monc);
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-	return 0;
-}
-
-/*
- * Handle a monmap message from the monitor.
- *
- * The map is decoded from the message front and, if its fsid passes
- * ceph_check_fsid(), installed in place of the current one, which is
- * then freed.  On a decode error or fsid mismatch the current map is
- * kept.  Waiters on client->auth_wq are woken in every case.
- */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc,
-				 struct ceph_msg *msg)
-{
-	struct ceph_client *client = monc->client;
-	struct ceph_monmap *monmap = NULL, *old;
-	void *p, *end;
-
-	mutex_lock(&monc->mutex);
-
-	/* sample the current map only once we hold the mutex */
-	old = monc->monmap;
-
-	dout("handle_monmap\n");
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	monmap = ceph_monmap_decode(p, end);
-	if (IS_ERR(monmap)) {
-		pr_err("problem decoding monmap, %d\n",
-		       (int)PTR_ERR(monmap));
-		goto out;
-	}
-
-	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
-		kfree(monmap);
-		goto out;
-	}
-
-	client->monc.monmap = monmap;
-	kfree(old);
-
-out:
-	mutex_unlock(&monc->mutex);
-	wake_up_all(&client->auth_wq);
-}
-
-/*
- * generic requests (e.g., statfs, poolop)
- *
- * Find the pending generic request with the given tid in the rbtree of
- * outstanding requests.  Returns NULL if there is none.
- */
-static struct ceph_mon_generic_request *__lookup_generic_req(
-	struct ceph_mon_client *monc, u64 tid)
-{
-	struct rb_node *node = monc->generic_request_tree.rb_node;
-
-	while (node) {
-		struct ceph_mon_generic_request *cur =
-			rb_entry(node, struct ceph_mon_generic_request, node);
-
-		if (tid == cur->tid)
-			return cur;
-		node = tid < cur->tid ? node->rb_left : node->rb_right;
-	}
-	return NULL;
-}
-
-/*
- * Insert @new into the rbtree of outstanding generic requests, ordered
- * by tid.  Inserting a tid that is already present is a BUG.
- */
-static void __insert_generic_request(struct ceph_mon_client *monc,
-			    struct ceph_mon_generic_request *new)
-{
-	struct rb_node **link = &monc->generic_request_tree.rb_node;
-	struct rb_node *parent = NULL;
-
-	while (*link) {
-		struct ceph_mon_generic_request *cur;
-
-		parent = *link;
-		cur = rb_entry(parent, struct ceph_mon_generic_request, node);
-		if (new->tid == cur->tid)
-			BUG();
-		link = new->tid < cur->tid ? &parent->rb_left :
-					     &parent->rb_right;
-	}
-
-	rb_link_node(&new->node, parent, link);
-	rb_insert_color(&new->node, &monc->generic_request_tree);
-}
-
-/*
- * kref release callback for a generic request: drop the references on
- * the reply and request messages, if they were allocated, and free the
- * request itself.
- */
-static void release_generic_request(struct kref *kref)
-{
-	struct ceph_mon_generic_request *req;
-
-	req = container_of(kref, struct ceph_mon_generic_request, kref);
-
-	if (req->reply != NULL)
-		ceph_msg_put(req->reply);
-	if (req->request != NULL)
-		ceph_msg_put(req->request);
-
-	kfree(req);
-}
-
-/* Drop a reference on @req; the last put frees it and its messages. */
-static void put_generic_request(struct ceph_mon_generic_request *req)
-{
-	struct kref *ref = &req->kref;
-
-	kref_put(ref, release_generic_request);
-}
-
-/* Take an additional reference on @req. */
-static void get_generic_request(struct ceph_mon_generic_request *req)
-{
-	struct kref *ref = &req->kref;
-
-	kref_get(ref);
-}
-
-/*
- * Find the preallocated reply message for an incoming generic reply.
- *
- * Looks up the pending request by the tid in @hdr.  Returns a new
- * reference on its reply message, or NULL with *skip set to 1 when no
- * such request is pending.
- */
-static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
-					 struct ceph_msg_header *hdr,
-					 int *skip)
-{
-	struct ceph_mon_client *monc = con->private;
-	struct ceph_mon_generic_request *req;
-	struct ceph_msg *reply;
-	u64 tid = le64_to_cpu(hdr->tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		dout("get_generic_reply %lld got %p\n", tid, req->reply);
-		/*
-		 * we don't need to track the connection reading into
-		 * this reply because we only have one open connection
-		 * at a time, ever.
-		 */
-		reply = ceph_msg_get(req->reply);
-	} else {
-		dout("get_generic_reply %lld dne\n", tid);
-		*skip = 1;
-		reply = NULL;
-	}
-	mutex_unlock(&monc->mutex);
-	return reply;
-}
-
-/*
- * Register @req under a fresh tid, send its request message and wait
- * (interruptibly) for completion.  The request is removed from the
- * tree again before returning.
- *
- * Returns the error from the wait if it was interrupted, otherwise
- * req->result.
- */
-static int do_generic_request(struct ceph_mon_client *monc,
-			      struct ceph_mon_generic_request *req)
-{
-	int wait_err;
-
-	/* register request */
-	mutex_lock(&monc->mutex);
-	req->tid = ++monc->last_tid;
-	req->request->hdr.tid = cpu_to_le64(req->tid);
-	__insert_generic_request(monc, req);
-	monc->num_generic_requests++;
-	ceph_con_send(monc->con, ceph_msg_get(req->request));
-	mutex_unlock(&monc->mutex);
-
-	wait_err = wait_for_completion_interruptible(&req->completion);
-
-	/* unregister request */
-	mutex_lock(&monc->mutex);
-	rb_erase(&req->node, &monc->generic_request_tree);
-	monc->num_generic_requests--;
-	mutex_unlock(&monc->mutex);
-
-	return wait_err ? wait_err : req->result;
-}
-
-/*
- * statfs
- *
- * Handle a statfs reply: copy the result into the waiting request's
- * buffer and complete the request.  A reply whose front is not exactly
- * one ceph_mon_statfs_reply is reported and dumped; a reply for an
- * unknown tid is ignored.
- */
-static void handle_statfs_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-	struct ceph_mon_generic_request *req;
-	u64 tid = le64_to_cpu(msg->hdr.tid);
-
-	if (msg->front.iov_len != sizeof(*reply)) {
-		pr_err("corrupt generic reply, tid %llu\n", tid);
-		ceph_msg_dump(msg);
-		return;
-	}
-	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (req) {
-		*(struct ceph_statfs *)req->buf = reply->st;
-		req->result = 0;
-		/* hold the request across the completion below */
-		get_generic_request(req);
-	}
-	mutex_unlock(&monc->mutex);
-
-	if (!req)
-		return;
-	complete_all(&req->completion);
-	put_generic_request(req);
-}
-
-/*
- * Do a synchronous statfs().
- *
- * Builds a statfs request plus a preallocated reply message, sends the
- * request and waits for the result, which the reply handler stores in
- * @buf.
- *
- * Returns 0 on success, -ENOMEM if an allocation fails, or the error
- * from do_generic_request().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_statfs *h;
-	int err = -ENOMEM;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (req == NULL)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	req->buf_len = sizeof(*buf);
-	init_completion(&req->completion);
-
-	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
-	if (req->request == NULL)
-		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
-	if (req->reply == NULL)
-		goto out;
-
-	/* fill out request */
-	h = req->request->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-
-	err = do_generic_request(monc, req);
-
-out:
-	/* drops our ref; frees the request and any messages attached */
-	kref_put(&req->kref, release_generic_request);
-	return err;
-}
-
-/*
- * pool ops
- *
- * Extract the payload of a pool op reply.
- *
- * @src points at a little-endian 32-bit length followed by that many
- * payload bytes; @src_len is the number of bytes available at @src.
- * The payload must be exactly @dst_len bytes long and is copied to
- * @dst.
- *
- * Returns 0 on success, or -EINVAL if either length does not match.
- */
-static int get_poolop_reply_buf(const char *src, size_t src_len,
-				char *dst, size_t dst_len)
-{
-	__le32 wire_len;
-
-	if (src_len != sizeof(wire_len) + dst_len)
-		return -EINVAL;
-
-	/*
-	 * src sits at an arbitrary offset inside the message front, so
-	 * copy the length out instead of dereferencing a cast pointer.
-	 */
-	memcpy(&wire_len, src, sizeof(wire_len));
-	if (le32_to_cpu(wire_len) != dst_len)
-		return -EINVAL;
-
-	memcpy(dst, src + sizeof(wire_len), dst_len);
-	return 0;
-}
-
-/*
- * Handle a pool op reply: if the waiting request expects a payload,
- * copy it into the request's buffer, then record the reply code and
- * complete the request.  A short front or a payload of the wrong size
- * is reported and dumped; a reply for an unknown tid is ignored.
- */
-static void handle_poolop_reply(struct ceph_mon_client *monc,
-				struct ceph_msg *msg)
-{
-	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
-	struct ceph_mon_generic_request *req;
-	u64 tid = le64_to_cpu(msg->hdr.tid);
-
-	if (msg->front.iov_len < sizeof(*reply))
-		goto bad;
-	dout("handle_poolop_reply %p tid %llu\n", msg, tid);
-
-	mutex_lock(&monc->mutex);
-	req = __lookup_generic_req(monc, tid);
-	if (!req) {
-		mutex_unlock(&monc->mutex);
-		return;
-	}
-	if (req->buf_len &&
-	    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
-				 msg->front.iov_len - sizeof(*reply),
-				 req->buf, req->buf_len) < 0) {
-		mutex_unlock(&monc->mutex);
-		goto bad;
-	}
-	req->result = le32_to_cpu(reply->reply_code);
-	/* hold the request across the completion below */
-	get_generic_request(req);
-	mutex_unlock(&monc->mutex);
-
-	complete(&req->completion);
-	put_generic_request(req);
-	return;
-
-bad:
-	pr_err("corrupt generic reply, tid %llu\n", tid);
-	ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous pool op.
- *
- * Sends pool operation @op for @pool (and @snapid) to the monitor and
- * waits for the reply.  When @len is non-zero the reply payload must be
- * exactly @len bytes and is stored in @buf by the reply handler.
- *
- * Returns -ENOMEM if an allocation fails, otherwise the result of
- * do_generic_request().
- */
-int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
-			u32 pool, u64 snapid,
-			char *buf, int len)
-{
-	struct ceph_mon_generic_request *req;
-	struct ceph_mon_poolop *h;
-	int err = -ENOMEM;
-
-	req = kzalloc(sizeof(*req), GFP_NOFS);
-	if (req == NULL)
-		return -ENOMEM;
-
-	kref_init(&req->kref);
-	req->buf = buf;
-	req->buf_len = len;
-	init_completion(&req->completion);
-
-	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
-	if (req->request == NULL)
-		goto out;
-	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
-	if (req->reply == NULL)
-		goto out;
-
-	/* fill out request */
-	req->request->hdr.version = cpu_to_le16(2);
-	h = req->request->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	h->fsid = monc->monmap->fsid;
-	h->pool = cpu_to_le32(pool);
-	h->op = cpu_to_le32(op);
-	h->auid = 0;
-	h->snapid = cpu_to_le64(snapid);
-	h->name_len = 0;
-
-	err = do_generic_request(monc, req);
-
-out:
-	/* drops our ref; frees the request and any messages attached */
-	kref_put(&req->kref, release_generic_request);
-	return err;
-}
-
-/*
- * Ask the monitor to create an unmanaged snapshot id in @pool.  The
- * reply payload, sizeof(*snapid) bytes, is stored in *snapid.
- * Returns the result of ceph_monc_do_poolop().
- */
-int ceph_monc_create_snapid(struct ceph_mon_client *monc,
-			    u32 pool, u64 *snapid)
-{
-	char *reply_buf = (char *)snapid;
-
-	return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
-				   pool, 0, reply_buf, sizeof(*snapid));
-}
-
-/*
- * Ask the monitor to delete unmanaged snapshot id @snapid from @pool.
- * No reply payload is expected.  Returns the result of
- * ceph_monc_do_poolop().
- *
- * NOTE(review): this previously sent POOL_OP_CREATE_UNMANAGED_SNAP.
- * POOL_OP_DELETE_UNMANAGED_SNAP is assumed to be defined next to the
- * create opcode -- confirm against the pool op definitions.
- */
-int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
-			    u32 pool, u64 snapid)
-{
-	return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
-				   pool, snapid, NULL, 0);
-}
-
-/*
- * Resend pending generic requests: walk the tree in tid order, revoke
- * each request message from the connection and queue it again with a
- * fresh reference.
- */
-static void __resend_generic_request(struct ceph_mon_client *monc)
-{
-	struct rb_node *node = rb_first(&monc->generic_request_tree);
-
-	while (node) {
-		struct ceph_mon_generic_request *req =
-			rb_entry(node, struct ceph_mon_generic_request, node);
-
-		ceph_con_revoke(monc->con, req->request);
-		ceph_con_send(monc->con, ceph_msg_get(req->request));
-		node = rb_next(node);
-	}
-}
-
-/*
- * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
- * renew/retry subscription as needed (in case it is timing out, or we
- * got an ENOMEM).  And keep the monitor connection alive.
- *
- * While hunting, the current session is closed and a new one opened.
- * The work always reschedules itself.
- */
-static void delayed_work(struct work_struct *work)
-{
-	struct ceph_mon_client *monc =
-		container_of(work, struct ceph_mon_client, delayed_work.work);
-
-	dout("monc delayed_work\n");
-	mutex_lock(&monc->mutex);
-	if (!monc->hunting) {
-		ceph_con_keepalive(monc->con);
-
-		__validate_auth(monc);
-
-		if (monc->auth->ops->is_authenticated(monc->auth))
-			__send_subscribe(monc);
-	} else {
-		/* continue hunting */
-		__close_session(monc);
-		__open_session(monc);
-	}
-	__schedule_delayed(monc);
-	mutex_unlock(&monc->mutex);
-}
-
-/*
- * On startup, we build a temporary monmap populated with the IPs
- * provided by mount(2).
- *
- * Each address from the mount args becomes monitor number i with a
- * zeroed nonce.  The address array in the mount args is freed once it
- * has been copied.
- *
- * Returns 0, or -ENOMEM if the monmap cannot be allocated.
- */
-static int build_initial_monmap(struct ceph_mon_client *monc)
-{
-	struct ceph_mount_args *args = monc->client->mount_args;
-	struct ceph_entity_addr *mon_addr = args->mon_addr;
-	struct ceph_monmap *map;
-	int num_mon = args->num_mon;
-	int i;
-
-	/* build initial monmap */
-	map = kzalloc(sizeof(*monc->monmap) +
-		      num_mon*sizeof(monc->monmap->mon_inst[0]),
-		      GFP_KERNEL);
-	monc->monmap = map;
-	if (map == NULL)
-		return -ENOMEM;
-
-	for (i = 0; i < num_mon; i++) {
-		map->mon_inst[i].addr = mon_addr[i];
-		map->mon_inst[i].addr.nonce = 0;
-		map->mon_inst[i].name.type = CEPH_ENTITY_TYPE_MON;
-		map->mon_inst[i].name.num = cpu_to_le64(i);
-	}
-	map->num_mon = num_mon;
-	monc->have_fsid = false;
-
-	/* release addr memory */
-	kfree(args->mon_addr);
-	args->mon_addr = NULL;
-	args->num_mon = 0;
-	return 0;
-}
-
-/*
- * Initialize the monitor client embedded in @cl.
- *
- * Builds the initial monmap from the mount arguments, sets up the auth
- * state, preallocates the subscribe and auth messages and resets the
- * session state.  The connection itself is created later by
- * ceph_monc_open_session().
- *
- * Returns 0 on success.  On failure everything set up so far is
- * released again and a negative error is returned: the error from
- * build_initial_monmap() or ceph_auth_init(), or -ENOMEM if a message
- * cannot be allocated.
- */
-int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
-{
-	int err = 0;
-
-	dout("init\n");
-	memset(monc, 0, sizeof(*monc));
-	monc->client = cl;
-	monc->monmap = NULL;
-	mutex_init(&monc->mutex);
-
-	err = build_initial_monmap(monc);
-	if (err)
-		goto out;
-
-	monc->con = NULL;
-
-	/* authentication */
-	monc->auth = ceph_auth_init(cl->mount_args->name,
-				    cl->mount_args->secret);
-	if (IS_ERR(monc->auth)) {
-		/* do not leak the monmap built above */
-		err = PTR_ERR(monc->auth);
-		goto out_monmap;
-	}
-	monc->auth->want_keys =
-		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
-		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-
-	/* msgs */
-	err = -ENOMEM;
-	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-				     sizeof(struct ceph_mon_subscribe_ack),
-				     GFP_NOFS);
-	if (!monc->m_subscribe_ack)
-		goto out_auth;
-
-	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-	if (!monc->m_subscribe)
-		goto out_subscribe_ack;
-
-	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-	if (!monc->m_auth_reply)
-		goto out_subscribe;
-
-	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
-	monc->pending_auth = 0;
-	if (!monc->m_auth)
-		goto out_auth_reply;
-
-	monc->cur_mon = -1;
-	monc->hunting = true;
-	monc->sub_renew_after = jiffies;
-	monc->sub_sent = 0;
-
-	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	monc->generic_request_tree = RB_ROOT;
-	monc->num_generic_requests = 0;
-	monc->last_tid = 0;
-
-	monc->have_mdsmap = 0;
-	monc->have_osdmap = 0;
-	monc->want_next_osdmap = 1;
-	return 0;
-
-	/* unwind in reverse order of setup */
-out_auth_reply:
-	ceph_msg_put(monc->m_auth_reply);
-out_subscribe:
-	ceph_msg_put(monc->m_subscribe);
-out_subscribe_ack:
-	ceph_msg_put(monc->m_subscribe_ack);
-out_auth:
-	ceph_auth_destroy(monc->auth);
-out_monmap:
-	kfree(monc->monmap);
-out:
-	return err;
-}
-
-void ceph_monc_stop(struct ceph_mon_client *monc)
-{
- dout("stop\n");
- cancel_delayed_work_sync(&monc->delayed_work);
-
- mutex_lock(&monc->mutex);
- __close_session(monc);
- if (monc->con) {
- monc->con->private = NULL;
- monc->con->ops->put(monc->con);
- monc->con = NULL;
- }
- mutex_unlock(&monc->mutex);
-
- ceph_auth_destroy(monc->auth);
-
- ceph_msg_put(monc->m_auth);
- ceph_msg_put(monc->m_auth_reply);
- ceph_msg_put(monc->m_subscribe);
- ceph_msg_put(monc->m_subscribe_ack);
-
- kfree(monc->monmap);
-}
-
-static void handle_auth_reply(struct ceph_mon_client *monc,
- struct ceph_msg *msg)
-{
- int ret;
- int was_auth = 0;
-
- mutex_lock(&monc->mutex);
- if (monc->auth->ops)
- was_auth = monc->auth->ops->is_authenticated(monc->auth);
- monc->pending_auth = 0;
- ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
- msg->front.iov_len,
- monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
- if (ret < 0) {
- monc->client->auth_err = ret;
- wake_up_all(&monc->client->auth_wq);
- } else if (ret > 0) {
- __send_prepared_auth_request(monc, ret);
- } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
- dout("authenticated, starting session\n");
-
- monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
- monc->client->msgr->inst.name.num =
- cpu_to_le64(monc->auth->global_id);
-
- __send_subscribe(monc);
- __resend_generic_request(monc);
- }
- mutex_unlock(&monc->mutex);
-}
-
-static int __validate_auth(struct ceph_mon_client *monc)
-{
- int ret;
-
- if (monc->pending_auth)
- return 0;
-
- ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
- if (ret <= 0)
- return ret; /* either an error, or no need to authenticate */
- __send_prepared_auth_request(monc, ret);
- return 0;
-}
-
-int ceph_monc_validate_auth(struct ceph_mon_client *monc)
-{
- int ret;
-
- mutex_lock(&monc->mutex);
- ret = __validate_auth(monc);
- mutex_unlock(&monc->mutex);
- return ret;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
- struct ceph_mon_client *monc = con->private;
- int type = le16_to_cpu(msg->hdr.type);
-
- if (!monc)
- return;
-
- switch (type) {
- case CEPH_MSG_AUTH_REPLY:
- handle_auth_reply(monc, msg);
- break;
-
- case CEPH_MSG_MON_SUBSCRIBE_ACK:
- handle_subscribe_ack(monc, msg);
- break;
-
- case CEPH_MSG_STATFS_REPLY:
- handle_statfs_reply(monc, msg);
- break;
-
- case CEPH_MSG_POOLOP_REPLY:
- handle_poolop_reply(monc, msg);
- break;
-
- case CEPH_MSG_MON_MAP:
- ceph_monc_handle_map(monc, msg);
- break;
-
- case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(&monc->client->mdsc, msg);
- break;
-
- case CEPH_MSG_OSD_MAP:
- ceph_osdc_handle_map(&monc->client->osdc, msg);
- break;
-
- default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
- }
- ceph_msg_put(msg);
-}
-
-/*
- * Allocate memory for incoming message
- */
-static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
-{
- struct ceph_mon_client *monc = con->private;
- int type = le16_to_cpu(hdr->type);
- int front_len = le32_to_cpu(hdr->front_len);
- struct ceph_msg *m = NULL;
-
- *skip = 0;
-
- switch (type) {
- case CEPH_MSG_MON_SUBSCRIBE_ACK:
- m = ceph_msg_get(monc->m_subscribe_ack);
- break;
- case CEPH_MSG_POOLOP_REPLY:
- case CEPH_MSG_STATFS_REPLY:
- return get_generic_reply(con, hdr, skip);
- case CEPH_MSG_AUTH_REPLY:
- m = ceph_msg_get(monc->m_auth_reply);
- break;
- case CEPH_MSG_MON_MAP:
- case CEPH_MSG_MDS_MAP:
- case CEPH_MSG_OSD_MAP:
- m = ceph_msg_new(type, front_len, GFP_NOFS);
- break;
- }
-
- if (!m) {
- pr_info("alloc_msg unknown type %d\n", type);
- *skip = 1;
- }
- return m;
-}
-
-/*
- * If the monitor connection resets, pick a new monitor and resubmit
- * any pending requests.
- */
-static void mon_fault(struct ceph_connection *con)
-{
- struct ceph_mon_client *monc = con->private;
-
- if (!monc)
- return;
-
- dout("mon_fault\n");
- mutex_lock(&monc->mutex);
- if (!con->private)
- goto out;
-
- if (monc->con && !monc->hunting)
- pr_info("mon%d %s session lost, "
- "hunting for new mon\n", monc->cur_mon,
- pr_addr(&monc->con->peer_addr.in_addr));
-
- __close_session(monc);
- if (!monc->hunting) {
- /* start hunting */
- monc->hunting = true;
- __open_session(monc);
- } else {
- /* already hunting, let's wait a bit */
- __schedule_delayed(monc);
- }
-out:
- mutex_unlock(&monc->mutex);
-}
-
-static const struct ceph_connection_operations mon_con_ops = {
- .get = ceph_con_get,
- .put = ceph_con_put,
- .dispatch = dispatch,
- .fault = mon_fault,
- .alloc_msg = mon_alloc_msg,
-};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
deleted file mode 100644
index 8e396f2c0963..000000000000
--- a/fs/ceph/mon_client.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef _FS_CEPH_MON_CLIENT_H
-#define _FS_CEPH_MON_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/rbtree.h>
-
-#include "messenger.h"
-
-struct ceph_client;
-struct ceph_mount_args;
-struct ceph_auth_client;
-
-/*
- * The monitor map enumerates the set of all monitors.
- */
-struct ceph_monmap {
- struct ceph_fsid fsid;
- u32 epoch;
- u32 num_mon;
- struct ceph_entity_inst mon_inst[0];
-};
-
-struct ceph_mon_client;
-struct ceph_mon_generic_request;
-
-
-/*
- * Generic mechanism for resending monitor requests.
- */
-typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
- int newmon);
-
-/* a pending monitor request */
-struct ceph_mon_request {
- struct ceph_mon_client *monc;
- struct delayed_work delayed_work;
- unsigned long delay;
- ceph_monc_request_func_t do_request;
-};
-
-/*
- * ceph_mon_generic_request is being used for the statfs and poolop requests
- * which are bening done a bit differently because we need to get data back
- * to the caller
- */
-struct ceph_mon_generic_request {
- struct kref kref;
- u64 tid;
- struct rb_node node;
- int result;
- void *buf;
- int buf_len;
- struct completion completion;
- struct ceph_msg *request; /* original request */
- struct ceph_msg *reply; /* and reply */
-};
-
-struct ceph_mon_client {
- struct ceph_client *client;
- struct ceph_monmap *monmap;
-
- struct mutex mutex;
- struct delayed_work delayed_work;
-
- struct ceph_auth_client *auth;
- struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
- int pending_auth;
-
- bool hunting;
- int cur_mon; /* last monitor i contacted */
- unsigned long sub_sent, sub_renew_after;
- struct ceph_connection *con;
- bool have_fsid;
-
- /* pending generic requests */
- struct rb_root generic_request_tree;
- int num_generic_requests;
- u64 last_tid;
-
- /* mds/osd map */
- int want_next_osdmap; /* 1 = want, 2 = want+asked */
- u32 have_osdmap, have_mdsmap;
-
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_file;
-#endif
-};
-
-extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
-extern int ceph_monmap_contains(struct ceph_monmap *m,
- struct ceph_entity_addr *addr);
-
-extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
-extern void ceph_monc_stop(struct ceph_mon_client *monc);
-
-/*
- * The model here is to indicate that we need a new map of at least
- * epoch @want, and also call in when we receive a map. We will
- * periodically rerequest the map from the monitor cluster until we
- * get what we want.
- */
-extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
-extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
-
-extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
-
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
- struct ceph_statfs *buf);
-
-extern int ceph_monc_open_session(struct ceph_mon_client *monc);
-
-extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 *snapid);
-
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 snapid);
-
-#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
deleted file mode 100644
index dd65a6438131..000000000000
--- a/fs/ceph/msgpool.c
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-
-#include "msgpool.h"
-
-static void *alloc_fn(gfp_t gfp_mask, void *arg)
-{
- struct ceph_msgpool *pool = arg;
- void *p;
-
- p = ceph_msg_new(0, pool->front_len, gfp_mask);
- if (!p)
- pr_err("msgpool %s alloc failed\n", pool->name);
- return p;
-}
-
-static void free_fn(void *element, void *arg)
-{
- ceph_msg_put(element);
-}
-
-int ceph_msgpool_init(struct ceph_msgpool *pool,
- int front_len, int size, bool blocking, const char *name)
-{
- pool->front_len = front_len;
- pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
- if (!pool->pool)
- return -ENOMEM;
- pool->name = name;
- return 0;
-}
-
-void ceph_msgpool_destroy(struct ceph_msgpool *pool)
-{
- mempool_destroy(pool->pool);
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
- int front_len)
-{
- if (front_len > pool->front_len) {
- pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
- pool->name, front_len, pool->front_len);
- WARN_ON(1);
-
- /* try to alloc a fresh message */
- return ceph_msg_new(0, front_len, GFP_NOFS);
- }
-
- return mempool_alloc(pool->pool, GFP_NOFS);
-}
-
-void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
-{
- /* reset msg front_len; user may have changed it */
- msg->front.iov_len = pool->front_len;
- msg->hdr.front_len = cpu_to_le32(pool->front_len);
-
- kref_init(&msg->kref); /* retake single ref */
-}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
deleted file mode 100644
index a362605f9368..000000000000
--- a/fs/ceph/msgpool.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FS_CEPH_MSGPOOL
-#define _FS_CEPH_MSGPOOL
-
-#include <linux/mempool.h>
-#include "messenger.h"
-
-/*
- * we use memory pools for preallocating messages we may receive, to
- * avoid unexpected OOM conditions.
- */
-struct ceph_msgpool {
- const char *name;
- mempool_t *pool;
- int front_len; /* preallocated payload size */
-};
-
-extern int ceph_msgpool_init(struct ceph_msgpool *pool,
- int front_len, int size, bool blocking,
- const char *name);
-extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
- int front_len);
-extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
-
-#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
deleted file mode 100644
index 680d3d648cac..000000000000
--- a/fs/ceph/msgr.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CEPH_MSGR_H
-#define CEPH_MSGR_H
-
-/*
- * Data types for message passing layer used by Ceph.
- */
-
-#define CEPH_MON_PORT 6789 /* default monitor port */
-
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST 6789
-#define CEPH_PORT_START 6800 /* non-monitors start here */
-#define CEPH_PORT_LAST 6900
-
-/*
- * tcp connection banner. include a protocol version. and adjust
- * whenever the wire protocol changes. try to keep this string length
- * constant.
- */
-#define CEPH_BANNER "ceph v027"
-#define CEPH_BANNER_MAX_LEN 30
-
-
-/*
- * Rollover-safe type and comparator for 32-bit sequence numbers.
- * Comparator returns -1, 0, or 1.
- */
-typedef __u32 ceph_seq_t;
-
-static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
-{
- return (__s32)a - (__s32)b;
-}
-
-
-/*
- * entity_name -- logical name for a process participating in the
- * network, e.g. 'mds0' or 'osd3'.
- */
-struct ceph_entity_name {
- __u8 type; /* CEPH_ENTITY_TYPE_* */
- __le64 num;
-} __attribute__ ((packed));
-
-#define CEPH_ENTITY_TYPE_MON 0x01
-#define CEPH_ENTITY_TYPE_MDS 0x02
-#define CEPH_ENTITY_TYPE_OSD 0x04
-#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_AUTH 0x20
-
-#define CEPH_ENTITY_TYPE_ANY 0xFF
-
-extern const char *ceph_entity_type_name(int type);
-
-/*
- * entity_addr -- network address
- */
-struct ceph_entity_addr {
- __le32 type;
- __le32 nonce; /* unique id for process (e.g. pid) */
- struct sockaddr_storage in_addr;
-} __attribute__ ((packed));
-
-struct ceph_entity_inst {
- struct ceph_entity_name name;
- struct ceph_entity_addr addr;
-} __attribute__ ((packed));
-
-
-/* used by message exchange protocol */
-#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
-#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
-#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
- incoming connection */
-#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
- with higher cseq */
-#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
- with higher gseq */
-#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
-#define CEPH_MSGR_TAG_MSG 7 /* message */
-#define CEPH_MSGR_TAG_ACK 8 /* message ack */
-#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
-#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
-#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
-
-
-/*
- * connection negotiation
- */
-struct ceph_msg_connect {
- __le64 features; /* supported feature bits */
- __le32 host_type; /* CEPH_ENTITY_TYPE_* */
- __le32 global_seq; /* count connections initiated by this host */
- __le32 connect_seq; /* count connections initiated in this session */
- __le32 protocol_version;
- __le32 authorizer_protocol;
- __le32 authorizer_len;
- __u8 flags; /* CEPH_MSG_CONNECT_* */
-} __attribute__ ((packed));
-
-struct ceph_msg_connect_reply {
- __u8 tag;
- __le64 features; /* feature bits for this session */
- __le32 global_seq;
- __le32 connect_seq;
- __le32 protocol_version;
- __le32 authorizer_len;
- __u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
-
-
-/*
- * message header
- */
-struct ceph_msg_header_old {
- __le64 seq; /* message seq# for this session */
- __le64 tid; /* transaction id */
- __le16 type; /* message type */
- __le16 priority; /* priority. higher value == higher priority */
- __le16 version; /* version of message encoding */
-
- __le32 front_len; /* bytes in main payload */
- __le32 middle_len;/* bytes in middle payload */
- __le32 data_len; /* bytes of data payload */
- __le16 data_off; /* sender: include full offset;
- receiver: mask against ~PAGE_MASK */
-
- struct ceph_entity_inst src, orig_src;
- __le32 reserved;
- __le32 crc; /* header crc32c */
-} __attribute__ ((packed));
-
-struct ceph_msg_header {
- __le64 seq; /* message seq# for this session */
- __le64 tid; /* transaction id */
- __le16 type; /* message type */
- __le16 priority; /* priority. higher value == higher priority */
- __le16 version; /* version of message encoding */
-
- __le32 front_len; /* bytes in main payload */
- __le32 middle_len;/* bytes in middle payload */
- __le32 data_len; /* bytes of data payload */
- __le16 data_off; /* sender: include full offset;
- receiver: mask against ~PAGE_MASK */
-
- struct ceph_entity_name src;
- __le32 reserved;
- __le32 crc; /* header crc32c */
-} __attribute__ ((packed));
-
-#define CEPH_MSG_PRIO_LOW 64
-#define CEPH_MSG_PRIO_DEFAULT 127
-#define CEPH_MSG_PRIO_HIGH 196
-#define CEPH_MSG_PRIO_HIGHEST 255
-
-/*
- * follows data payload
- */
-struct ceph_msg_footer {
- __le32 front_crc, middle_crc, data_crc;
- __u8 flags;
-} __attribute__ ((packed));
-
-#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
-#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
-
-
-#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
deleted file mode 100644
index dfced1dacbcd..000000000000
--- a/fs/ceph/osd_client.c
+++ /dev/null
@@ -1,1539 +0,0 @@
-#include "ceph_debug.h"
-
-#include <linux/err.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-
-#include "super.h"
-#include "osd_client.h"
-#include "messenger.h"
-#include "decode.h"
-#include "auth.h"
-
-#define OSD_OP_FRONT_LEN 4096
-#define OSD_OPREPLY_FRONT_LEN 512
-
-static const struct ceph_connection_operations osd_con_ops;
-static int __kick_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *kickosd);
-
-static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
-
-/*
- * Implement client access to distributed object storage cluster.
- *
- * All data objects are stored within a cluster/cloud of OSDs, or
- * "object storage devices." (Note that Ceph OSDs have _nothing_ to
- * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
- * remote daemons serving up and coordinating consistent and safe
- * access to storage.
- *
- * Cluster membership and the mapping of data objects onto storage devices
- * are described by the osd map.
- *
- * We keep track of pending OSD requests (read, write), resubmit
- * requests to different OSDs when the cluster topology/data layout
- * change, or retry the affected requests when the communications
- * channel with an OSD is reset.
- */
-
-/*
- * calculate the mapping of a file extent onto an object, and fill out the
- * request accordingly. shorten extent as necessary if it crosses an
- * object boundary.
- *
- * fill osd op in request message.
- */
-static void calc_layout(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
- struct ceph_osd_op *op = (void *)(reqhead + 1);
- u64 orig_len = *plen;
- u64 objoff, objlen; /* extent in object */
- u64 bno;
-
- reqhead->snapid = cpu_to_le64(vino.snap);
-
- /* object extent? */
- ceph_calc_file_object_mapping(layout, off, plen, &bno,
- &objoff, &objlen);
- if (*plen < orig_len)
- dout(" skipping last %llu, final file extent %llu~%llu\n",
- orig_len - *plen, off, *plen);
-
- sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
- req->r_oid_len = strlen(req->r_oid);
-
- op->extent.offset = cpu_to_le64(objoff);
- op->extent.length = cpu_to_le64(objlen);
- req->r_num_pages = calc_pages_for(off, *plen);
-
- dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
- req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
-}
-
-/*
- * requests
- */
-void ceph_osdc_release_request(struct kref *kref)
-{
- struct ceph_osd_request *req = container_of(kref,
- struct ceph_osd_request,
- r_kref);
-
- if (req->r_request)
- ceph_msg_put(req->r_request);
- if (req->r_reply)
- ceph_msg_put(req->r_reply);
- if (req->r_con_filling_msg) {
- dout("release_request revoking pages %p from con %p\n",
- req->r_pages, req->r_con_filling_msg);
- ceph_con_revoke_message(req->r_con_filling_msg,
- req->r_reply);
- ceph_con_put(req->r_con_filling_msg);
- }
- if (req->r_own_pages)
- ceph_release_page_vector(req->r_pages,
- req->r_num_pages);
- ceph_put_snap_context(req->r_snapc);
- if (req->r_mempool)
- mempool_free(req, req->r_osdc->req_mempool);
- else
- kfree(req);
-}
-
-/*
- * build new request AND message, calculate layout, and adjust file
- * extent as needed.
- *
- * if the file was recently truncated, we include information about its
- * old and new size so that the object can be updated appropriately. (we
- * avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
- */
-struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
- struct ceph_file_layout *layout,
- struct ceph_vino vino,
- u64 off, u64 *plen,
- int opcode, int flags,
- struct ceph_snap_context *snapc,
- int do_sync,
- u32 truncate_seq,
- u64 truncate_size,
- struct timespec *mtime,
- bool use_mempool, int num_reply)
-{
- struct ceph_osd_request *req;
- struct ceph_msg *msg;
- struct ceph_osd_request_head *head;
- struct ceph_osd_op *op;
- void *p;
- int num_op = 1 + do_sync;
- size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
- int i;
-
- if (use_mempool) {
- req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
- memset(req, 0, sizeof(*req));
- } else {
- req = kzalloc(sizeof(*req), GFP_NOFS);
- }
- if (req == NULL)
- return NULL;
-
- req->r_osdc = osdc;
- req->r_mempool = use_mempool;
- kref_init(&req->r_kref);
- init_completion(&req->r_completion);
- init_completion(&req->r_safe_completion);
- INIT_LIST_HEAD(&req->r_unsafe_item);
- req->r_flags = flags;
-
- WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
- /* create reply message */
- if (use_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
- else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
- req->r_reply = msg;
-
- /* create request message; allow space for oid */
- msg_size += 40;
- if (snapc)
- msg_size += sizeof(u64) * snapc->num_snaps;
- if (use_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
- else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
- if (!msg) {
- ceph_osdc_put_request(req);
- return NULL;
- }
- msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
- memset(msg->front.iov_base, 0, msg->front.iov_len);
- head = msg->front.iov_base;
- op = (void *)(head + 1);
- p = (void *)(op + num_op);
-
- req->r_request = msg;
- req->r_snapc = ceph_get_snap_context(snapc);
-
- head->client_inc = cpu_to_le32(1); /* always, for now. */
- head->flags = cpu_to_le32(flags);
- if (flags & CEPH_OSD_FLAG_WRITE)
- ceph_encode_timespec(&head->mtime, mtime);
- head->num_ops = cpu_to_le16(num_op);
- op->op = cpu_to_le16(opcode);
-
- /* calculate max write size */
- calc_layout(osdc, vino, layout, off, plen, req);
- req->r_file_layout = *layout; /* keep a copy */
-
- if (flags & CEPH_OSD_FLAG_WRITE) {
- req->r_request->hdr.data_off = cpu_to_le16(off);
- req->r_request->hdr.data_len = cpu_to_le32(*plen);
- op->payload_len = cpu_to_le32(*plen);
- }
- op->extent.truncate_size = cpu_to_le64(truncate_size);
- op->extent.truncate_seq = cpu_to_le32(truncate_seq);
-
- /* fill in oid */
- head->object_len = cpu_to_le32(req->r_oid_len);
- memcpy(p, req->r_oid, req->r_oid_len);
- p += req->r_oid_len;
-
- if (do_sync) {
- op++;
- op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
- }
- if (snapc) {
- head->snap_seq = cpu_to_le64(snapc->seq);
- head->num_snaps = cpu_to_le32(snapc->num_snaps);
- for (i = 0; i < snapc->num_snaps; i++) {
- put_unaligned_le64(snapc->snaps[i], p);
- p += sizeof(u64);
- }
- }
-
- BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
- msg_size = p - msg->front.iov_base;
- msg->front.iov_len = msg_size;
- msg->hdr.front_len = cpu_to_le32(msg_size);
- return req;
-}
-
-/*
- * We keep osd requests in an rbtree, sorted by ->r_tid.
- */
-static void __insert_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *new)
-{
- struct rb_node **p = &osdc->requests.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd_request *req = NULL;
-
- while (*p) {
- parent = *p;
- req = rb_entry(parent, struct ceph_osd_request, r_node);
- if (new->r_tid < req->r_tid)
- p = &(*p)->rb_left;
- else if (new->r_tid > req->r_tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->r_node, parent, p);
- rb_insert_color(&new->r_node, &osdc->requests);
-}
-
-static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
- u64 tid)
-{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid)
- n = n->rb_left;
- else if (tid > req->r_tid)
- n = n->rb_right;
- else
- return req;
- }
- return NULL;
-}
-
-static struct ceph_osd_request *
-__lookup_request_ge(struct ceph_osd_client *osdc,
- u64 tid)
-{
- struct ceph_osd_request *req;
- struct rb_node *n = osdc->requests.rb_node;
-
- while (n) {
- req = rb_entry(n, struct ceph_osd_request, r_node);
- if (tid < req->r_tid) {
- if (!n->rb_left)
- return req;
- n = n->rb_left;
- } else if (tid > req->r_tid) {
- n = n->rb_right;
- } else {
- return req;
- }
- }
- return NULL;
-}
-
-
-/*
- * If the osd connection drops, we need to resubmit all requests.
- */
-static void osd_reset(struct ceph_connection *con)
-{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
-
- if (!osd)
- return;
- dout("osd_reset osd%d\n", osd->o_osd);
- osdc = osd->o_osdc;
- down_read(&osdc->map_sem);
- kick_requests(osdc, osd);
- up_read(&osdc->map_sem);
-}
-
-/*
- * Track open sessions with osds.
- */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
-{
- struct ceph_osd *osd;
-
- osd = kzalloc(sizeof(*osd), GFP_NOFS);
- if (!osd)
- return NULL;
-
- atomic_set(&osd->o_ref, 1);
- osd->o_osdc = osdc;
- INIT_LIST_HEAD(&osd->o_requests);
- INIT_LIST_HEAD(&osd->o_osd_lru);
- osd->o_incarnation = 1;
-
- ceph_con_init(osdc->client->msgr, &osd->o_con);
- osd->o_con.private = osd;
- osd->o_con.ops = &osd_con_ops;
- osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
-
- INIT_LIST_HEAD(&osd->o_keepalive_item);
- return osd;
-}
-
-static struct ceph_osd *get_osd(struct ceph_osd *osd)
-{
- if (atomic_inc_not_zero(&osd->o_ref)) {
- dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
- atomic_read(&osd->o_ref));
- return osd;
- } else {
- dout("get_osd %p FAIL\n", osd);
- return NULL;
- }
-}
-
-static void put_osd(struct ceph_osd *osd)
-{
- dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
- atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref)) {
- struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
-
- if (osd->o_authorizer)
- ac->ops->destroy_authorizer(ac, osd->o_authorizer);
- kfree(osd);
- }
-}
-
-/*
- * remove an osd from our map
- */
-static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- dout("__remove_osd %p\n", osd);
- BUG_ON(!list_empty(&osd->o_requests));
- rb_erase(&osd->o_node, &osdc->osds);
- list_del_init(&osd->o_osd_lru);
- ceph_con_close(&osd->o_con);
- put_osd(osd);
-}
-
-static void __move_osd_to_lru(struct ceph_osd_client *osdc,
- struct ceph_osd *osd)
-{
- dout("__move_osd_to_lru %p\n", osd);
- BUG_ON(!list_empty(&osd->o_osd_lru));
- list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
- osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
-}
-
-static void __remove_osd_from_lru(struct ceph_osd *osd)
-{
- dout("__remove_osd_from_lru %p\n", osd);
- if (!list_empty(&osd->o_osd_lru))
- list_del_init(&osd->o_osd_lru);
-}
-
-static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
-{
- struct ceph_osd *osd, *nosd;
-
- dout("__remove_old_osds %p\n", osdc);
- mutex_lock(&osdc->request_mutex);
- list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
- if (!remove_all && time_before(jiffies, osd->lru_ttl))
- break;
- __remove_osd(osdc, osd);
- }
- mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * reset osd connect
- */
-static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
-{
- struct ceph_osd_request *req;
- int ret = 0;
-
- dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
- if (list_empty(&osd->o_requests)) {
- __remove_osd(osdc, osd);
- } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
- &osd->o_con.peer_addr,
- sizeof(osd->o_con.peer_addr)) == 0 &&
- !ceph_con_opened(&osd->o_con)) {
- dout(" osd addr hasn't changed and connection never opened,"
- " letting msgr retry");
- /* touch each r_stamp for handle_timeout()'s benfit */
- list_for_each_entry(req, &osd->o_requests, r_osd_item)
- req->r_stamp = jiffies;
- ret = -EAGAIN;
- } else {
- ceph_con_close(&osd->o_con);
- ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
- osd->o_incarnation++;
- }
- return ret;
-}
-
-static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
-{
- struct rb_node **p = &osdc->osds.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_osd *osd = NULL;
-
- while (*p) {
- parent = *p;
- osd = rb_entry(parent, struct ceph_osd, o_node);
- if (new->o_osd < osd->o_osd)
- p = &(*p)->rb_left;
- else if (new->o_osd > osd->o_osd)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&new->o_node, parent, p);
- rb_insert_color(&new->o_node, &osdc->osds);
-}
-
-static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
-{
- struct ceph_osd *osd;
- struct rb_node *n = osdc->osds.rb_node;
-
- while (n) {
- osd = rb_entry(n, struct ceph_osd, o_node);
- if (o < osd->o_osd)
- n = n->rb_left;
- else if (o > osd->o_osd)
- n = n->rb_right;
- else
- return osd;
- }
- return NULL;
-}
-
-static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
-{
- schedule_delayed_work(&osdc->timeout_work,
- osdc->client->mount_args->osd_keepalive_timeout * HZ);
-}
-
-static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
-{
- cancel_delayed_work(&osdc->timeout_work);
-}
-
-/*
- * Register request, assign tid. If this is the first request, set up
- * the timeout event.
- */
-static void register_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- mutex_lock(&osdc->request_mutex);
- req->r_tid = ++osdc->last_tid;
- req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
- INIT_LIST_HEAD(&req->r_req_lru_item);
-
- dout("register_request %p tid %lld\n", req, req->r_tid);
- __insert_request(osdc, req);
- ceph_osdc_get_request(req);
- osdc->num_requests++;
-
- if (osdc->num_requests == 1) {
- dout(" first request, scheduling timeout\n");
- __schedule_osd_timeout(osdc);
- }
- mutex_unlock(&osdc->request_mutex);
-}
-
-/*
- * called under osdc->request_mutex
- */
-static void __unregister_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- dout("__unregister_request %p tid %lld\n", req, req->r_tid);
- rb_erase(&req->r_node, &osdc->requests);
- osdc->num_requests--;
-
- if (req->r_osd) {
- /* make sure the original request isn't in flight. */
- ceph_con_revoke(&req->r_osd->o_con, req->r_request);
-
- list_del_init(&req->r_osd_item);
- if (list_empty(&req->r_osd->o_requests))
- __move_osd_to_lru(osdc, req->r_osd);
- req->r_osd = NULL;
- }
-
- ceph_osdc_put_request(req);
-
- list_del_init(&req->r_req_lru_item);
- if (osdc->num_requests == 0) {
- dout(" no requests, canceling timeout\n");
- __cancel_osd_timeout(osdc);
- }
-}
-
-/*
- * Cancel a previously queued request message
- */
-static void __cancel_request(struct ceph_osd_request *req)
-{
- if (req->r_sent) {
- ceph_con_revoke(&req->r_osd->o_con, req->r_request);
- req->r_sent = 0;
- }
- list_del_init(&req->r_req_lru_item);
-}
-
-/*
- * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
- * (as needed), and set the request r_osd appropriately. If there is
- * no up osd, set r_osd to NULL.
- *
- * Return 0 if unchanged, 1 if changed, or negative on error.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static int __map_osds(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
- struct ceph_pg pgid;
- int acting[CEPH_PG_MAX_SIZE];
- int o = -1, num = 0;
- int err;
-
- dout("map_osds %p tid %lld\n", req, req->r_tid);
- err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
- &req->r_file_layout, osdc->osdmap);
- if (err)
- return err;
- pgid = reqhead->layout.ol_pgid;
- req->r_pgid = pgid;
-
- err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
- if (err > 0) {
- o = acting[0];
- num = err;
- }
-
- if ((req->r_osd && req->r_osd->o_osd == o &&
- req->r_sent >= req->r_osd->o_incarnation &&
- req->r_num_pg_osds == num &&
- memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1))
- return 0; /* no change */
-
- dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
- req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
- req->r_osd ? req->r_osd->o_osd : -1);
-
- /* record full pg acting set */
- memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
- req->r_num_pg_osds = num;
-
- if (req->r_osd) {
- __cancel_request(req);
- list_del_init(&req->r_osd_item);
- req->r_osd = NULL;
- }
-
- req->r_osd = __lookup_osd(osdc, o);
- if (!req->r_osd && o >= 0) {
- err = -ENOMEM;
- req->r_osd = create_osd(osdc);
- if (!req->r_osd)
- goto out;
-
- dout("map_osds osd %p is osd%d\n", req->r_osd, o);
- req->r_osd->o_osd = o;
- req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
- __insert_osd(osdc, req->r_osd);
-
- ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
- }
-
- if (req->r_osd) {
- __remove_osd_from_lru(req->r_osd);
- list_add(&req->r_osd_item, &req->r_osd->o_requests);
- }
- err = 1; /* osd or pg changed */
-
-out:
- return err;
-}
-
-/*
- * caller should hold map_sem (for read) and request_mutex
- */
-static int __send_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- struct ceph_osd_request_head *reqhead;
- int err;
-
- err = __map_osds(osdc, req);
- if (err < 0)
- return err;
- if (req->r_osd == NULL) {
- dout("send_request %p no up osds in pg\n", req);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
- return 0;
- }
-
- dout("send_request %p tid %llu to osd%d flags %d\n",
- req, req->r_tid, req->r_osd->o_osd, req->r_flags);
-
- reqhead = req->r_request->front.iov_base;
- reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
- reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
- reqhead->reassert_version = req->r_reassert_version;
-
- req->r_stamp = jiffies;
- list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
-
- ceph_msg_get(req->r_request); /* send consumes a ref */
- ceph_con_send(&req->r_osd->o_con, req->r_request);
- req->r_sent = req->r_osd->o_incarnation;
- return 0;
-}
-
-/*
- * Timeout callback, called every N seconds when 1 or more osd
- * requests has been active for more than N seconds. When this
- * happens, we ping all OSDs with requests who have timed out to
- * ensure any communications channel reset is detected. Reset the
- * request timeouts another N seconds in the future as we go.
- * Reschedule the timeout event another N seconds in future (unless
- * there are no open requests).
- */
-static void handle_timeout(struct work_struct *work)
-{
- struct ceph_osd_client *osdc =
- container_of(work, struct ceph_osd_client, timeout_work.work);
- struct ceph_osd_request *req, *last_req = NULL;
- struct ceph_osd *osd;
- unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
- unsigned long keepalive =
- osdc->client->mount_args->osd_keepalive_timeout * HZ;
- unsigned long last_stamp = 0;
- struct rb_node *p;
- struct list_head slow_osds;
-
- dout("timeout\n");
- down_read(&osdc->map_sem);
-
- ceph_monc_request_next_osdmap(&osdc->client->monc);
-
- mutex_lock(&osdc->request_mutex);
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
-
- if (req->r_resend) {
- int err;
-
- dout("osdc resending prev failed %lld\n", req->r_tid);
- err = __send_request(osdc, req);
- if (err)
- dout("osdc failed again on %lld\n", req->r_tid);
- else
- req->r_resend = false;
- continue;
- }
- }
-
- /*
- * reset osds that appear to be _really_ unresponsive. this
- * is a failsafe measure.. we really shouldn't be getting to
- * this point if the system is working properly. the monitors
- * should mark the osd as failed and we should find out about
- * it from an updated osd map.
- */
- while (timeout && !list_empty(&osdc->req_lru)) {
- req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
- r_req_lru_item);
-
- if (time_before(jiffies, req->r_stamp + timeout))
- break;
-
- BUG_ON(req == last_req && req->r_stamp == last_stamp);
- last_req = req;
- last_stamp = req->r_stamp;
-
- osd = req->r_osd;
- BUG_ON(!osd);
- pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
- req->r_tid, osd->o_osd);
- __kick_requests(osdc, osd);
- }
-
- /*
- * ping osds that are a bit slow. this ensures that if there
- * is a break in the TCP connection we will notice, and reopen
- * a connection with that osd (from the fault callback).
- */
- INIT_LIST_HEAD(&slow_osds);
- list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
- if (time_before(jiffies, req->r_stamp + keepalive))
- break;
-
- osd = req->r_osd;
- BUG_ON(!osd);
- dout(" tid %llu is slow, will send keepalive on osd%d\n",
- req->r_tid, osd->o_osd);
- list_move_tail(&osd->o_keepalive_item, &slow_osds);
- }
- while (!list_empty(&slow_osds)) {
- osd = list_entry(slow_osds.next, struct ceph_osd,
- o_keepalive_item);
- list_del_init(&osd->o_keepalive_item);
- ceph_con_keepalive(&osd->o_con);
- }
-
- __schedule_osd_timeout(osdc);
- mutex_unlock(&osdc->request_mutex);
-
- up_read(&osdc->map_sem);
-}
-
-static void handle_osds_timeout(struct work_struct *work)
-{
- struct ceph_osd_client *osdc =
- container_of(work, struct ceph_osd_client,
- osds_timeout_work.work);
- unsigned long delay =
- osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
-
- dout("osds timeout\n");
- down_read(&osdc->map_sem);
- remove_old_osds(osdc, 0);
- up_read(&osdc->map_sem);
-
- schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(delay));
-}
-
-/*
- * handle osd op reply. either call the callback if it is specified,
- * or do the completion to wake up the waiting thread.
- */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
- struct ceph_connection *con)
-{
- struct ceph_osd_reply_head *rhead = msg->front.iov_base;
- struct ceph_osd_request *req;
- u64 tid;
- int numops, object_len, flags;
- s32 result;
-
- tid = le64_to_cpu(msg->hdr.tid);
- if (msg->front.iov_len < sizeof(*rhead))
- goto bad;
- numops = le32_to_cpu(rhead->num_ops);
- object_len = le32_to_cpu(rhead->object_len);
- result = le32_to_cpu(rhead->result);
- if (msg->front.iov_len != sizeof(*rhead) + object_len +
- numops * sizeof(struct ceph_osd_op))
- goto bad;
- dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
-
- /* lookup */
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (req == NULL) {
- dout("handle_reply tid %llu dne\n", tid);
- mutex_unlock(&osdc->request_mutex);
- return;
- }
- ceph_osdc_get_request(req);
- flags = le32_to_cpu(rhead->flags);
-
- /*
- * if this connection filled our message, drop our reference now, to
- * avoid a (safe but slower) revoke later.
- */
- if (req->r_con_filling_msg == con && req->r_reply == msg) {
- dout(" dropping con_filling_msg ref %p\n", con);
- req->r_con_filling_msg = NULL;
- ceph_con_put(con);
- }
-
- if (!req->r_got_reply) {
- unsigned bytes;
-
- req->r_result = le32_to_cpu(rhead->result);
- bytes = le32_to_cpu(msg->hdr.data_len);
- dout("handle_reply result %d bytes %d\n", req->r_result,
- bytes);
- if (req->r_result == 0)
- req->r_result = bytes;
-
- /* in case this is a write and we need to replay, */
- req->r_reassert_version = rhead->reassert_version;
-
- req->r_got_reply = 1;
- } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
- dout("handle_reply tid %llu dup ack\n", tid);
- mutex_unlock(&osdc->request_mutex);
- goto done;
- }
-
- dout("handle_reply tid %llu flags %d\n", tid, flags);
-
- /* either this is a read, or we got the safe response */
- if (result < 0 ||
- (flags & CEPH_OSD_FLAG_ONDISK) ||
- ((flags & CEPH_OSD_FLAG_WRITE) == 0))
- __unregister_request(osdc, req);
-
- mutex_unlock(&osdc->request_mutex);
-
- if (req->r_callback)
- req->r_callback(req, msg);
- else
- complete_all(&req->r_completion);
-
- if (flags & CEPH_OSD_FLAG_ONDISK) {
- if (req->r_safe_callback)
- req->r_safe_callback(req, msg);
- complete_all(&req->r_safe_completion); /* fsync waiter */
- }
-
-done:
- ceph_osdc_put_request(req);
- return;
-
-bad:
- pr_err("corrupt osd_op_reply got %d %d expected %d\n",
- (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
- (int)sizeof(*rhead));
- ceph_msg_dump(msg);
-}
-
-
-static int __kick_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *kickosd)
-{
- struct ceph_osd_request *req;
- struct rb_node *p, *n;
- int needmap = 0;
- int err;
-
- dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
- if (kickosd) {
- err = __reset_osd(osdc, kickosd);
- if (err == -EAGAIN)
- return 1;
- } else {
- for (p = rb_first(&osdc->osds); p; p = n) {
- struct ceph_osd *osd =
- rb_entry(p, struct ceph_osd, o_node);
-
- n = rb_next(p);
- if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
- memcmp(&osd->o_con.peer_addr,
- ceph_osd_addr(osdc->osdmap,
- osd->o_osd),
- sizeof(struct ceph_entity_addr)) != 0)
- __reset_osd(osdc, osd);
- }
- }
-
- for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
- req = rb_entry(p, struct ceph_osd_request, r_node);
-
- if (req->r_resend) {
- dout(" r_resend set on tid %llu\n", req->r_tid);
- __cancel_request(req);
- goto kick;
- }
- if (req->r_osd && kickosd == req->r_osd) {
- __cancel_request(req);
- goto kick;
- }
-
- err = __map_osds(osdc, req);
- if (err == 0)
- continue; /* no change */
- if (err < 0) {
- /*
- * FIXME: really, we should set the request
- * error and fail if this isn't a 'nofail'
- * request, but that's a fair bit more
- * complicated to do. So retry!
- */
- dout(" setting r_resend on %llu\n", req->r_tid);
- req->r_resend = true;
- continue;
- }
- if (req->r_osd == NULL) {
- dout("tid %llu maps to no valid osd\n", req->r_tid);
- needmap++; /* request a newer map */
- continue;
- }
-
-kick:
- dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
- req->r_osd ? req->r_osd->o_osd : -1);
- req->r_flags |= CEPH_OSD_FLAG_RETRY;
- err = __send_request(osdc, req);
- if (err) {
- dout(" setting r_resend on %llu\n", req->r_tid);
- req->r_resend = true;
- }
- }
-
- return needmap;
-}
-
-/*
- * Resubmit osd requests whose osd or osd address has changed. Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
- struct ceph_osd *kickosd)
-{
- int needmap;
-
- mutex_lock(&osdc->request_mutex);
- needmap = __kick_requests(osdc, kickosd);
- mutex_unlock(&osdc->request_mutex);
-
- if (needmap) {
- dout("%d requests for down osds, need new map\n", needmap);
- ceph_monc_request_next_osdmap(&osdc->client->monc);
- }
-
-}
-/*
- * Process updated osd map.
- *
- * The message contains any number of incremental and full maps, normally
- * indicating some sort of topology change in the cluster. Kick requests
- * off to different OSDs as needed.
- */
-void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
-{
- void *p, *end, *next;
- u32 nr_maps, maplen;
- u32 epoch;
- struct ceph_osdmap *newmap = NULL, *oldmap;
- int err;
- struct ceph_fsid fsid;
-
- dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
- p = msg->front.iov_base;
- end = p + msg->front.iov_len;
-
- /* verify fsid */
- ceph_decode_need(&p, end, sizeof(fsid), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- if (ceph_check_fsid(osdc->client, &fsid) < 0)
- return;
-
- down_write(&osdc->map_sem);
-
- /* incremental maps */
- ceph_decode_32_safe(&p, end, nr_maps, bad);
- dout(" %d inc maps\n", nr_maps);
- while (nr_maps > 0) {
- ceph_decode_need(&p, end, 2*sizeof(u32), bad);
- epoch = ceph_decode_32(&p);
- maplen = ceph_decode_32(&p);
- ceph_decode_need(&p, end, maplen, bad);
- next = p + maplen;
- if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
- dout("applying incremental map %u len %d\n",
- epoch, maplen);
- newmap = osdmap_apply_incremental(&p, next,
- osdc->osdmap,
- osdc->client->msgr);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
- goto bad;
- }
- BUG_ON(!newmap);
- if (newmap != osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = newmap;
- }
- } else {
- dout("ignoring incremental map %u len %d\n",
- epoch, maplen);
- }
- p = next;
- nr_maps--;
- }
- if (newmap)
- goto done;
-
- /* full maps */
- ceph_decode_32_safe(&p, end, nr_maps, bad);
- dout(" %d full maps\n", nr_maps);
- while (nr_maps) {
- ceph_decode_need(&p, end, 2*sizeof(u32), bad);
- epoch = ceph_decode_32(&p);
- maplen = ceph_decode_32(&p);
- ceph_decode_need(&p, end, maplen, bad);
- if (nr_maps > 1) {
- dout("skipping non-latest full map %u len %d\n",
- epoch, maplen);
- } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
- dout("skipping full map %u len %d, "
- "older than our %u\n", epoch, maplen,
- osdc->osdmap->epoch);
- } else {
- dout("taking full map %u len %d\n", epoch, maplen);
- newmap = osdmap_decode(&p, p+maplen);
- if (IS_ERR(newmap)) {
- err = PTR_ERR(newmap);
- goto bad;
- }
- BUG_ON(!newmap);
- oldmap = osdc->osdmap;
- osdc->osdmap = newmap;
- if (oldmap)
- ceph_osdmap_destroy(oldmap);
- }
- p += maplen;
- nr_maps--;
- }
-
-done:
- downgrade_write(&osdc->map_sem);
- ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
- if (newmap)
- kick_requests(osdc, NULL);
- up_read(&osdc->map_sem);
- wake_up_all(&osdc->client->auth_wq);
- return;
-
-bad:
- pr_err("osdc handle_map corrupt msg\n");
- ceph_msg_dump(msg);
- up_write(&osdc->map_sem);
- return;
-}
-
-/*
- * Register request, send initial attempt.
- */
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
-{
- int rc = 0;
-
- req->r_request->pages = req->r_pages;
- req->r_request->nr_pages = req->r_num_pages;
-
- register_request(osdc, req);
-
- down_read(&osdc->map_sem);
- mutex_lock(&osdc->request_mutex);
- /*
- * a racing kick_requests() may have sent the message for us
- * while we dropped request_mutex above, so only send now if
- * the request still han't been touched yet.
- */
- if (req->r_sent == 0) {
- rc = __send_request(osdc, req);
- if (rc) {
- if (nofail) {
- dout("osdc_start_request failed send, "
- " marking %lld\n", req->r_tid);
- req->r_resend = true;
- rc = 0;
- } else {
- __unregister_request(osdc, req);
- }
- }
- }
- mutex_unlock(&osdc->request_mutex);
- up_read(&osdc->map_sem);
- return rc;
-}
-
-/*
- * wait for a request to complete
- */
-int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req)
-{
- int rc;
-
- rc = wait_for_completion_interruptible(&req->r_completion);
- if (rc < 0) {
- mutex_lock(&osdc->request_mutex);
- __cancel_request(req);
- __unregister_request(osdc, req);
- mutex_unlock(&osdc->request_mutex);
- dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
- return rc;
- }
-
- dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
- return req->r_result;
-}
-
-/*
- * sync - wait for all in-flight requests to flush. avoid starvation.
- */
-void ceph_osdc_sync(struct ceph_osd_client *osdc)
-{
- struct ceph_osd_request *req;
- u64 last_tid, next_tid = 0;
-
- mutex_lock(&osdc->request_mutex);
- last_tid = osdc->last_tid;
- while (1) {
- req = __lookup_request_ge(osdc, next_tid);
- if (!req)
- break;
- if (req->r_tid > last_tid)
- break;
-
- next_tid = req->r_tid + 1;
- if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
- continue;
-
- ceph_osdc_get_request(req);
- mutex_unlock(&osdc->request_mutex);
- dout("sync waiting on tid %llu (last is %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- mutex_lock(&osdc->request_mutex);
- ceph_osdc_put_request(req);
- }
- mutex_unlock(&osdc->request_mutex);
- dout("sync done (thru tid %llu)\n", last_tid);
-}
-
-/*
- * init, shutdown
- */
-int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
-{
- int err;
-
- dout("init\n");
- osdc->client = client;
- osdc->osdmap = NULL;
- init_rwsem(&osdc->map_sem);
- init_completion(&osdc->map_waiters);
- osdc->last_requested_map = 0;
- mutex_init(&osdc->request_mutex);
- osdc->last_tid = 0;
- osdc->osds = RB_ROOT;
- INIT_LIST_HEAD(&osdc->osd_lru);
- osdc->requests = RB_ROOT;
- INIT_LIST_HEAD(&osdc->req_lru);
- osdc->num_requests = 0;
- INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
- INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
-
- schedule_delayed_work(&osdc->osds_timeout_work,
- round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
-
- err = -ENOMEM;
- osdc->req_mempool = mempool_create_kmalloc_pool(10,
- sizeof(struct ceph_osd_request));
- if (!osdc->req_mempool)
- goto out;
-
- err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
- "osd_op");
- if (err < 0)
- goto out_mempool;
- err = ceph_msgpool_init(&osdc->msgpool_op_reply,
- OSD_OPREPLY_FRONT_LEN, 10, true,
- "osd_op_reply");
- if (err < 0)
- goto out_msgpool;
- return 0;
-
-out_msgpool:
- ceph_msgpool_destroy(&osdc->msgpool_op);
-out_mempool:
- mempool_destroy(osdc->req_mempool);
-out:
- return err;
-}
-
-void ceph_osdc_stop(struct ceph_osd_client *osdc)
-{
- cancel_delayed_work_sync(&osdc->timeout_work);
- cancel_delayed_work_sync(&osdc->osds_timeout_work);
- if (osdc->osdmap) {
- ceph_osdmap_destroy(osdc->osdmap);
- osdc->osdmap = NULL;
- }
- remove_old_osds(osdc, 1);
- mempool_destroy(osdc->req_mempool);
- ceph_msgpool_destroy(&osdc->msgpool_op);
- ceph_msgpool_destroy(&osdc->msgpool_op_reply);
-}
-
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, 0, truncate_seq, truncate_size, NULL,
- false, 1);
- if (!req)
- return -ENOMEM;
-
- /* it may be a short read due to an object boundary */
- req->r_pages = pages;
-
- dout("readpages final extent is %llu~%llu (%d pages)\n",
- off, *plen, req->r_num_pages);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
- struct page **pages, int num_pages,
- int flags, int do_sync, bool nofail)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- BUG_ON(vino.snap != CEPH_NOSNAP);
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
- CEPH_OSD_OP_WRITE,
- flags | CEPH_OSD_FLAG_ONDISK |
- CEPH_OSD_FLAG_WRITE,
- snapc, do_sync,
- truncate_seq, truncate_size, mtime,
- nofail, 1);
- if (!req)
- return -ENOMEM;
-
- /* it may be a short write due to an object boundary */
- req->r_pages = pages;
- dout("writepages %llu~%llu (%d pages)\n", off, len,
- req->r_num_pages);
-
- rc = ceph_osdc_start_request(osdc, req, nofail);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-
-/*
- * handle incoming message
- */
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
-{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
- int type = le16_to_cpu(msg->hdr.type);
-
- if (!osd)
- goto out;
- osdc = osd->o_osdc;
-
- switch (type) {
- case CEPH_MSG_OSD_MAP:
- ceph_osdc_handle_map(osdc, msg);
- break;
- case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg, con);
- break;
-
- default:
- pr_err("received unknown message type %d %s\n", type,
- ceph_msg_type_name(type));
- }
-out:
- ceph_msg_put(msg);
-}
-
-/*
- * lookup and return message for incoming reply. set up reply message
- * pages.
- */
-static struct ceph_msg *get_reply(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
-{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc = osd->o_osdc;
- struct ceph_msg *m;
- struct ceph_osd_request *req;
- int front = le32_to_cpu(hdr->front_len);
- int data_len = le32_to_cpu(hdr->data_len);
- u64 tid;
-
- tid = le64_to_cpu(hdr->tid);
- mutex_lock(&osdc->request_mutex);
- req = __lookup_request(osdc, tid);
- if (!req) {
- *skip = 1;
- m = NULL;
- pr_info("get_reply unknown tid %llu from osd%d\n", tid,
- osd->o_osd);
- goto out;
- }
-
- if (req->r_con_filling_msg) {
- dout("get_reply revoking msg %p from old con %p\n",
- req->r_reply, req->r_con_filling_msg);
- ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
- ceph_con_put(req->r_con_filling_msg);
- req->r_con_filling_msg = NULL;
- }
-
- if (front > req->r_reply->front.iov_len) {
- pr_warning("get_reply front %d > preallocated %d\n",
- front, (int)req->r_reply->front.iov_len);
- m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
- if (!m)
- goto out;
- ceph_msg_put(req->r_reply);
- req->r_reply = m;
- }
- m = ceph_msg_get(req->r_reply);
-
- if (data_len > 0) {
- unsigned data_off = le16_to_cpu(hdr->data_off);
- int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
- if (unlikely(req->r_num_pages < want)) {
- pr_warning("tid %lld reply %d > expected %d pages\n",
- tid, want, m->nr_pages);
- *skip = 1;
- ceph_msg_put(m);
- m = NULL;
- goto out;
- }
- m->pages = req->r_pages;
- m->nr_pages = req->r_num_pages;
- }
- *skip = 0;
- req->r_con_filling_msg = ceph_con_get(con);
- dout("get_reply tid %lld %p\n", tid, m);
-
-out:
- mutex_unlock(&osdc->request_mutex);
- return m;
-
-}
-
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
-{
- struct ceph_osd *osd = con->private;
- int type = le16_to_cpu(hdr->type);
- int front = le32_to_cpu(hdr->front_len);
-
- switch (type) {
- case CEPH_MSG_OSD_MAP:
- return ceph_msg_new(type, front, GFP_NOFS);
- case CEPH_MSG_OSD_OPREPLY:
- return get_reply(con, hdr, skip);
- default:
- pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
- osd->o_osd);
- *skip = 1;
- return NULL;
- }
-}
-
-/*
- * Wrappers to refcount containing ceph_osd struct
- */
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
-{
- struct ceph_osd *osd = con->private;
- if (get_osd(osd))
- return con;
- return NULL;
-}
-
-static void put_osd_con(struct ceph_connection *con)
-{
- struct ceph_osd *osd = con->private;
- put_osd(osd);
-}
-
-/*
- * authentication
- */
-static int get_authorizer(struct ceph_connection *con,
- void **buf, int *len, int *proto,
- void **reply_buf, int *reply_len, int force_new)
-{
- struct ceph_osd *o = con->private;
- struct ceph_osd_client *osdc = o->o_osdc;
- struct ceph_auth_client *ac = osdc->client->monc.auth;
- int ret = 0;
-
- if (force_new && o->o_authorizer) {
- ac->ops->destroy_authorizer(ac, o->o_authorizer);
- o->o_authorizer = NULL;
- }
- if (o->o_authorizer == NULL) {
- ret = ac->ops->create_authorizer(
- ac, CEPH_ENTITY_TYPE_OSD,
- &o->o_authorizer,
- &o->o_authorizer_buf,
- &o->o_authorizer_buf_len,
- &o->o_authorizer_reply_buf,
- &o->o_authorizer_reply_buf_len);
- if (ret)
- return ret;
- }
-
- *proto = ac->protocol;
- *buf = o->o_authorizer_buf;
- *len = o->o_authorizer_buf_len;
- *reply_buf = o->o_authorizer_reply_buf;
- *reply_len = o->o_authorizer_reply_buf_len;
- return 0;
-}
-
-
-static int verify_authorizer_reply(struct ceph_connection *con, int len)
-{
- struct ceph_osd *o = con->private;
- struct ceph_osd_client *osdc = o->o_osdc;
- struct ceph_auth_client *ac = osdc->client->monc.auth;
-
- return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
-}
-
-static int invalidate_authorizer(struct ceph_connection *con)
-{
- struct ceph_osd *o = con->private;
- struct ceph_osd_client *osdc = o->o_osdc;
- struct ceph_auth_client *ac = osdc->client->monc.auth;
-
- if (ac->ops->invalidate_authorizer)
- ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
-
- return ceph_monc_validate_auth(&osdc->client->monc);
-}
-
-static const struct ceph_connection_operations osd_con_ops = {
- .get = get_osd_con,
- .put = put_osd_con,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .alloc_msg = alloc_msg,
- .fault = osd_reset,
-};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
deleted file mode 100644
index ce776989ef6a..000000000000
--- a/fs/ceph/osd_client.h
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifndef _FS_CEPH_OSD_CLIENT_H
-#define _FS_CEPH_OSD_CLIENT_H
-
-#include <linux/completion.h>
-#include <linux/kref.h>
-#include <linux/mempool.h>
-#include <linux/rbtree.h>
-
-#include "types.h"
-#include "osdmap.h"
-#include "messenger.h"
-
-struct ceph_msg;
-struct ceph_snap_context;
-struct ceph_osd_request;
-struct ceph_osd_client;
-struct ceph_authorizer;
-
-/*
- * completion callback for async writepages
- */
-typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
- struct ceph_msg *);
-
-/* a given osd we're communicating with */
-struct ceph_osd {
- atomic_t o_ref;
- struct ceph_osd_client *o_osdc;
- int o_osd;
- int o_incarnation;
- struct rb_node o_node;
- struct ceph_connection o_con;
- struct list_head o_requests;
- struct list_head o_osd_lru;
- struct ceph_authorizer *o_authorizer;
- void *o_authorizer_buf, *o_authorizer_reply_buf;
- size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
- unsigned long lru_ttl;
- int o_marked_for_keepalive;
- struct list_head o_keepalive_item;
-};
-
-/* an in-flight request */
-struct ceph_osd_request {
- u64 r_tid; /* unique for this client */
- struct rb_node r_node;
- struct list_head r_req_lru_item;
- struct list_head r_osd_item;
- struct ceph_osd *r_osd;
- struct ceph_pg r_pgid;
- int r_pg_osds[CEPH_PG_MAX_SIZE];
- int r_num_pg_osds;
-
- struct ceph_connection *r_con_filling_msg;
-
- struct ceph_msg *r_request, *r_reply;
- int r_result;
- int r_flags; /* any additional flags for the osd */
- u32 r_sent; /* >0 if r_request is sending/sent */
- int r_got_reply;
-
- struct ceph_osd_client *r_osdc;
- struct kref r_kref;
- bool r_mempool;
- struct completion r_completion, r_safe_completion;
- ceph_osdc_callback_t r_callback, r_safe_callback;
- struct ceph_eversion r_reassert_version;
- struct list_head r_unsafe_item;
-
- struct inode *r_inode; /* for use by callbacks */
-
- char r_oid[40]; /* object name */
- int r_oid_len;
- unsigned long r_stamp; /* send OR check time */
- bool r_resend; /* msg send failed, needs retry */
-
- struct ceph_file_layout r_file_layout;
- struct ceph_snap_context *r_snapc; /* snap context for writes */
- unsigned r_num_pages; /* size of page array (follows) */
- struct page **r_pages; /* pages for data payload */
- int r_pages_from_pool;
- int r_own_pages; /* if true, i own page list */
-};
-
-struct ceph_osd_client {
- struct ceph_client *client;
-
- struct ceph_osdmap *osdmap; /* current map */
- struct rw_semaphore map_sem;
- struct completion map_waiters;
- u64 last_requested_map;
-
- struct mutex request_mutex;
- struct rb_root osds; /* osds */
- struct list_head osd_lru; /* idle osds */
- u64 timeout_tid; /* tid of timeout triggering rq */
- u64 last_tid; /* tid of last request */
- struct rb_root requests; /* pending requests */
- struct list_head req_lru; /* pending requests lru */
- int num_requests;
- struct delayed_work timeout_work;
- struct delayed_work osds_timeout_work;
-#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_file;
-#endif
-
- mempool_t *req_mempool;
-
- struct ceph_msgpool msgpool_op;
- struct ceph_msgpool msgpool_op_reply;
-};
-
-extern int ceph_osdc_init(struct ceph_osd_client *osdc,
- struct ceph_client *client);
-extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
-
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
-extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
-
-extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
- struct ceph_file_layout *layout,
- struct ceph_vino vino,
- u64 offset, u64 *len, int op, int flags,
- struct ceph_snap_context *snapc,
- int do_sync, u32 truncate_seq,
- u64 truncate_size,
- struct timespec *mtime,
- bool use_mempool, int num_reply);
-
-static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
-{
- kref_get(&req->r_kref);
-}
-extern void ceph_osdc_release_request(struct kref *kref);
-static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
-{
- kref_put(&req->r_kref, ceph_osdc_release_request);
-}
-
-extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail);
-extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req);
-extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
-
-extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int nr_pages);
-
-extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
- struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *sc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec *mtime,
- struct page **pages, int nr_pages,
- int flags, int do_sync, bool nofail);
-
-#endif
-
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
deleted file mode 100644
index e31f118f1392..000000000000
--- a/fs/ceph/osdmap.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-
-#include "ceph_debug.h"
-
-#include <linux/slab.h>
-#include <asm/div64.h>
-
-#include "super.h"
-#include "osdmap.h"
-#include "crush/hash.h"
-#include "crush/mapper.h"
-#include "decode.h"
-
-char *ceph_osdmap_state_str(char *str, int len, int state)
-{
- int flag = 0;
-
- if (!len)
- goto done;
-
- *str = '\0';
- if (state) {
- if (state & CEPH_OSD_EXISTS) {
- snprintf(str, len, "exists");
- flag = 1;
- }
- if (state & CEPH_OSD_UP) {
- snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
- "up");
- flag = 1;
- }
- } else {
- snprintf(str, len, "doesn't exist");
- }
-done:
- return str;
-}
-
-/* maps */
-
-static int calc_bits_of(unsigned t)
-{
- int b = 0;
- while (t) {
- t = t >> 1;
- b++;
- }
- return b;
-}
-
-/*
- * the foo_mask is the smallest value 2^n-1 that is >= foo.
- */
-static void calc_pg_masks(struct ceph_pg_pool_info *pi)
-{
- pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
- pi->pgp_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
- pi->lpg_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
- pi->lpgp_num_mask =
- (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
-}
-
-/*
- * decode crush map
- */
-static int crush_decode_uniform_bucket(void **p, void *end,
- struct crush_bucket_uniform *b)
-{
- dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
- ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
- b->item_weight = ceph_decode_32(p);
- return 0;
-bad:
- return -EINVAL;
-}
-
-static int crush_decode_list_bucket(void **p, void *end,
- struct crush_bucket_list *b)
-{
- int j;
- dout("crush_decode_list_bucket %p to %p\n", *p, end);
- b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->item_weights == NULL)
- return -ENOMEM;
- b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->sum_weights == NULL)
- return -ENOMEM;
- ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
- for (j = 0; j < b->h.size; j++) {
- b->item_weights[j] = ceph_decode_32(p);
- b->sum_weights[j] = ceph_decode_32(p);
- }
- return 0;
-bad:
- return -EINVAL;
-}
-
-static int crush_decode_tree_bucket(void **p, void *end,
- struct crush_bucket_tree *b)
-{
- int j;
- dout("crush_decode_tree_bucket %p to %p\n", *p, end);
- ceph_decode_32_safe(p, end, b->num_nodes, bad);
- b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
- if (b->node_weights == NULL)
- return -ENOMEM;
- ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
- for (j = 0; j < b->num_nodes; j++)
- b->node_weights[j] = ceph_decode_32(p);
- return 0;
-bad:
- return -EINVAL;
-}
-
-static int crush_decode_straw_bucket(void **p, void *end,
- struct crush_bucket_straw *b)
-{
- int j;
- dout("crush_decode_straw_bucket %p to %p\n", *p, end);
- b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->item_weights == NULL)
- return -ENOMEM;
- b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
- if (b->straws == NULL)
- return -ENOMEM;
- ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
- for (j = 0; j < b->h.size; j++) {
- b->item_weights[j] = ceph_decode_32(p);
- b->straws[j] = ceph_decode_32(p);
- }
- return 0;
-bad:
- return -EINVAL;
-}
-
-static struct crush_map *crush_decode(void *pbyval, void *end)
-{
- struct crush_map *c;
- int err = -EINVAL;
- int i, j;
- void **p = &pbyval;
- void *start = pbyval;
- u32 magic;
-
- dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
- c = kzalloc(sizeof(*c), GFP_NOFS);
- if (c == NULL)
- return ERR_PTR(-ENOMEM);
-
- ceph_decode_need(p, end, 4*sizeof(u32), bad);
- magic = ceph_decode_32(p);
- if (magic != CRUSH_MAGIC) {
- pr_err("crush_decode magic %x != current %x\n",
- (unsigned)magic, (unsigned)CRUSH_MAGIC);
- goto bad;
- }
- c->max_buckets = ceph_decode_32(p);
- c->max_rules = ceph_decode_32(p);
- c->max_devices = ceph_decode_32(p);
-
- c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
- if (c->device_parents == NULL)
- goto badmem;
- c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
- if (c->bucket_parents == NULL)
- goto badmem;
-
- c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
- if (c->buckets == NULL)
- goto badmem;
- c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
- if (c->rules == NULL)
- goto badmem;
-
- /* buckets */
- for (i = 0; i < c->max_buckets; i++) {
- int size = 0;
- u32 alg;
- struct crush_bucket *b;
-
- ceph_decode_32_safe(p, end, alg, bad);
- if (alg == 0) {
- c->buckets[i] = NULL;
- continue;
- }
- dout("crush_decode bucket %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
-
- switch (alg) {
- case CRUSH_BUCKET_UNIFORM:
- size = sizeof(struct crush_bucket_uniform);
- break;
- case CRUSH_BUCKET_LIST:
- size = sizeof(struct crush_bucket_list);
- break;
- case CRUSH_BUCKET_TREE:
- size = sizeof(struct crush_bucket_tree);
- break;
- case CRUSH_BUCKET_STRAW:
- size = sizeof(struct crush_bucket_straw);
- break;
- default:
- err = -EINVAL;
- goto bad;
- }
- BUG_ON(size == 0);
- b = c->buckets[i] = kzalloc(size, GFP_NOFS);
- if (b == NULL)
- goto badmem;
-
- ceph_decode_need(p, end, 4*sizeof(u32), bad);
- b->id = ceph_decode_32(p);
- b->type = ceph_decode_16(p);
- b->alg = ceph_decode_8(p);
- b->hash = ceph_decode_8(p);
- b->weight = ceph_decode_32(p);
- b->size = ceph_decode_32(p);
-
- dout("crush_decode bucket size %d off %x %p to %p\n",
- b->size, (int)(*p-start), *p, end);
-
- b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
- if (b->items == NULL)
- goto badmem;
- b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
- if (b->perm == NULL)
- goto badmem;
- b->perm_n = 0;
-
- ceph_decode_need(p, end, b->size*sizeof(u32), bad);
- for (j = 0; j < b->size; j++)
- b->items[j] = ceph_decode_32(p);
-
- switch (b->alg) {
- case CRUSH_BUCKET_UNIFORM:
- err = crush_decode_uniform_bucket(p, end,
- (struct crush_bucket_uniform *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_LIST:
- err = crush_decode_list_bucket(p, end,
- (struct crush_bucket_list *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_TREE:
- err = crush_decode_tree_bucket(p, end,
- (struct crush_bucket_tree *)b);
- if (err < 0)
- goto bad;
- break;
- case CRUSH_BUCKET_STRAW:
- err = crush_decode_straw_bucket(p, end,
- (struct crush_bucket_straw *)b);
- if (err < 0)
- goto bad;
- break;
- }
- }
-
- /* rules */
- dout("rule vec is %p\n", c->rules);
- for (i = 0; i < c->max_rules; i++) {
- u32 yes;
- struct crush_rule *r;
-
- ceph_decode_32_safe(p, end, yes, bad);
- if (!yes) {
- dout("crush_decode NO rule %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
- c->rules[i] = NULL;
- continue;
- }
-
- dout("crush_decode rule %d off %x %p to %p\n",
- i, (int)(*p-start), *p, end);
-
- /* len */
- ceph_decode_32_safe(p, end, yes, bad);
-#if BITS_PER_LONG == 32
- err = -EINVAL;
- if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
- goto bad;
-#endif
- r = c->rules[i] = kmalloc(sizeof(*r) +
- yes*sizeof(struct crush_rule_step),
- GFP_NOFS);
- if (r == NULL)
- goto badmem;
- dout(" rule %d is at %p\n", i, r);
- r->len = yes;
- ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
- ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
- for (j = 0; j < r->len; j++) {
- r->steps[j].op = ceph_decode_32(p);
- r->steps[j].arg1 = ceph_decode_32(p);
- r->steps[j].arg2 = ceph_decode_32(p);
- }
- }
-
- /* ignore trailing name maps. */
-
- dout("crush_decode success\n");
- return c;
-
-badmem:
- err = -ENOMEM;
-bad:
- dout("crush_decode fail %d\n", err);
- crush_destroy(c);
- return ERR_PTR(err);
-}
-
-/*
- * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
- */
-static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
-{
- u64 a = *(u64 *)&l;
- u64 b = *(u64 *)&r;
-
- if (a < b)
- return -1;
- if (a > b)
- return 1;
- return 0;
-}
-
-static int __insert_pg_mapping(struct ceph_pg_mapping *new,
- struct rb_root *root)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_mapping *pg = NULL;
- int c;
-
- while (*p) {
- parent = *p;
- pg = rb_entry(parent, struct ceph_pg_mapping, node);
- c = pgid_cmp(new->pgid, pg->pgid);
- if (c < 0)
- p = &(*p)->rb_left;
- else if (c > 0)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
- return 0;
-}
-
-static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
- struct ceph_pg pgid)
-{
- struct rb_node *n = root->rb_node;
- struct ceph_pg_mapping *pg;
- int c;
-
- while (n) {
- pg = rb_entry(n, struct ceph_pg_mapping, node);
- c = pgid_cmp(pgid, pg->pgid);
- if (c < 0)
- n = n->rb_left;
- else if (c > 0)
- n = n->rb_right;
- else
- return pg;
- }
- return NULL;
-}
-
-/*
- * rbtree of pg pool info
- */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_pool_info *pi = NULL;
-
- while (*p) {
- parent = *p;
- pi = rb_entry(parent, struct ceph_pg_pool_info, node);
- if (new->id < pi->id)
- p = &(*p)->rb_left;
- else if (new->id > pi->id)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
- return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
-{
- struct ceph_pg_pool_info *pi;
- struct rb_node *n = root->rb_node;
-
- while (n) {
- pi = rb_entry(n, struct ceph_pg_pool_info, node);
- if (id < pi->id)
- n = n->rb_left;
- else if (id > pi->id)
- n = n->rb_right;
- else
- return pi;
- }
- return NULL;
-}
-
-static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
-{
- rb_erase(&pi->node, root);
- kfree(pi->name);
- kfree(pi);
-}
-
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
-{
- unsigned n, m;
-
- ceph_decode_copy(p, &pi->v, sizeof(pi->v));
- calc_pg_masks(pi);
-
- /* num_snaps * snap_info_t */
- n = le32_to_cpu(pi->v.num_snaps);
- while (n--) {
- ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
- sizeof(struct ceph_timespec), bad);
- *p += sizeof(u64) + /* key */
- 1 + sizeof(u64) + /* u8, snapid */
- sizeof(struct ceph_timespec);
- m = ceph_decode_32(p); /* snap name */
- *p += m;
- }
-
- *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
- return 0;
-
-bad:
- return -EINVAL;
-}
-
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
-{
- struct ceph_pg_pool_info *pi;
- u32 num, len, pool;
-
- ceph_decode_32_safe(p, end, num, bad);
- dout(" %d pool names\n", num);
- while (num--) {
- ceph_decode_32_safe(p, end, pool, bad);
- ceph_decode_32_safe(p, end, len, bad);
- dout(" pool %d len %d\n", pool, len);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
- if (pi) {
- kfree(pi->name);
- pi->name = kmalloc(len + 1, GFP_NOFS);
- if (pi->name) {
- memcpy(pi->name, *p, len);
- pi->name[len] = '\0';
- dout(" name is %s\n", pi->name);
- }
- }
- *p += len;
- }
- return 0;
-
-bad:
- return -EINVAL;
-}
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
- dout("osdmap_destroy %p\n", map);
- if (map->crush)
- crush_destroy(map->crush);
- while (!RB_EMPTY_ROOT(&map->pg_temp)) {
- struct ceph_pg_mapping *pg =
- rb_entry(rb_first(&map->pg_temp),
- struct ceph_pg_mapping, node);
- rb_erase(&pg->node, &map->pg_temp);
- kfree(pg);
- }
- while (!RB_EMPTY_ROOT(&map->pg_pools)) {
- struct ceph_pg_pool_info *pi =
- rb_entry(rb_first(&map->pg_pools),
- struct ceph_pg_pool_info, node);
- __remove_pg_pool(&map->pg_pools, pi);
- }
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->osd_addr);
- kfree(map);
-}
-
-/*
- * adjust max osd value. reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
- u8 *state;
- struct ceph_entity_addr *addr;
- u32 *weight;
-
- state = kcalloc(max, sizeof(*state), GFP_NOFS);
- addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
- weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
- if (state == NULL || addr == NULL || weight == NULL) {
- kfree(state);
- kfree(addr);
- kfree(weight);
- return -ENOMEM;
- }
-
- /* copy old? */
- if (map->osd_state) {
- memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
- memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
- memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
- kfree(map->osd_state);
- kfree(map->osd_addr);
- kfree(map->osd_weight);
- }
-
- map->osd_state = state;
- map->osd_weight = weight;
- map->osd_addr = addr;
- map->max_osd = max;
- return 0;
-}
-
-/*
- * decode a full map.
- */
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
-{
- struct ceph_osdmap *map;
- u16 version;
- u32 len, max, i;
- u8 ev;
- int err = -EINVAL;
- void *start = *p;
- struct ceph_pg_pool_info *pi;
-
- dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
-
- map = kzalloc(sizeof(*map), GFP_NOFS);
- if (map == NULL)
- return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
-
- ceph_decode_16_safe(p, end, version, bad);
- if (version > CEPH_OSDMAP_VERSION) {
- pr_warning("got unknown v %d > %d of osdmap\n", version,
- CEPH_OSDMAP_VERSION);
- goto bad;
- }
-
- ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
- ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
- map->epoch = ceph_decode_32(p);
- ceph_decode_copy(p, &map->created, sizeof(map->created));
- ceph_decode_copy(p, &map->modified, sizeof(map->modified));
-
- ceph_decode_32_safe(p, end, max, bad);
- while (max--) {
- ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
- pi = kzalloc(sizeof(*pi), GFP_NOFS);
- if (!pi)
- goto bad;
- pi->id = ceph_decode_32(p);
- ev = ceph_decode_8(p); /* encoding version */
- if (ev > CEPH_PG_POOL_VERSION) {
- pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
- ev, CEPH_PG_POOL_VERSION);
- kfree(pi);
- goto bad;
- }
- err = __decode_pool(p, end, pi);
- if (err < 0)
- goto bad;
- __insert_pg_pool(&map->pg_pools, pi);
- }
-
- if (version >= 5 && __decode_pool_names(p, end, map) < 0)
- goto bad;
-
- ceph_decode_32_safe(p, end, map->pool_max, bad);
-
- ceph_decode_32_safe(p, end, map->flags, bad);
-
- max = ceph_decode_32(p);
-
- /* (re)alloc osd arrays */
- err = osdmap_set_max_osd(map, max);
- if (err < 0)
- goto bad;
- dout("osdmap_decode max_osd = %d\n", map->max_osd);
-
- /* osds */
- err = -EINVAL;
- ceph_decode_need(p, end, 3*sizeof(u32) +
- map->max_osd*(1 + sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), bad);
- *p += 4; /* skip length field (should match max) */
- ceph_decode_copy(p, map->osd_state, map->max_osd);
-
- *p += 4; /* skip length field (should match max) */
- for (i = 0; i < map->max_osd; i++)
- map->osd_weight[i] = ceph_decode_32(p);
-
- *p += 4; /* skip length field (should match max) */
- ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
- for (i = 0; i < map->max_osd; i++)
- ceph_decode_addr(&map->osd_addr[i]);
-
- /* pg_temp */
- ceph_decode_32_safe(p, end, len, bad);
- for (i = 0; i < len; i++) {
- int n, j;
- struct ceph_pg pgid;
- struct ceph_pg_mapping *pg;
-
- ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
- ceph_decode_copy(p, &pgid, sizeof(pgid));
- n = ceph_decode_32(p);
- ceph_decode_need(p, end, n * sizeof(u32), bad);
- err = -ENOMEM;
- pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
- if (!pg)
- goto bad;
- pg->pgid = pgid;
- pg->len = n;
- for (j = 0; j < n; j++)
- pg->osds[j] = ceph_decode_32(p);
-
- err = __insert_pg_mapping(pg, &map->pg_temp);
- if (err)
- goto bad;
- dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
- }
-
- /* crush */
- ceph_decode_32_safe(p, end, len, bad);
- dout("osdmap_decode crush len %d from off 0x%x\n", len,
- (int)(*p - start));
- ceph_decode_need(p, end, len, bad);
- map->crush = crush_decode(*p, end);
- *p += len;
- if (IS_ERR(map->crush)) {
- err = PTR_ERR(map->crush);
- map->crush = NULL;
- goto bad;
- }
-
- /* ignore the rest of the map */
- *p = end;
-
- dout("osdmap_decode done %p %p\n", *p, end);
- return map;
-
-bad:
- dout("osdmap_decode fail\n");
- ceph_osdmap_destroy(map);
- return ERR_PTR(err);
-}
-
-/*
- * decode and apply an incremental map update.
- */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr)
-{
- struct crush_map *newcrush = NULL;
- struct ceph_fsid fsid;
- u32 epoch = 0;
- struct ceph_timespec modified;
- u32 len, pool;
- __s32 new_pool_max, new_flags, max;
- void *start = *p;
- int err = -EINVAL;
- u16 version;
- struct rb_node *rbp;
-
- ceph_decode_16_safe(p, end, version, bad);
- if (version > CEPH_OSDMAP_INC_VERSION) {
- pr_warning("got unknown v %d > %d of inc osdmap\n", version,
- CEPH_OSDMAP_INC_VERSION);
- goto bad;
- }
-
- ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
- bad);
- ceph_decode_copy(p, &fsid, sizeof(fsid));
- epoch = ceph_decode_32(p);
- BUG_ON(epoch != map->epoch+1);
- ceph_decode_copy(p, &modified, sizeof(modified));
- new_pool_max = ceph_decode_32(p);
- new_flags = ceph_decode_32(p);
-
- /* full map? */
- ceph_decode_32_safe(p, end, len, bad);
- if (len > 0) {
- dout("apply_incremental full map len %d, %p to %p\n",
- len, *p, end);
- return osdmap_decode(p, min(*p+len, end));
- }
-
- /* new crush? */
- ceph_decode_32_safe(p, end, len, bad);
- if (len > 0) {
- dout("apply_incremental new crush map len %d, %p to %p\n",
- len, *p, end);
- newcrush = crush_decode(*p, min(*p+len, end));
- if (IS_ERR(newcrush))
- return ERR_CAST(newcrush);
- *p += len;
- }
-
- /* new flags? */
- if (new_flags >= 0)
- map->flags = new_flags;
- if (new_pool_max >= 0)
- map->pool_max = new_pool_max;
-
- ceph_decode_need(p, end, 5*sizeof(u32), bad);
-
- /* new max? */
- max = ceph_decode_32(p);
- if (max >= 0) {
- err = osdmap_set_max_osd(map, max);
- if (err < 0)
- goto bad;
- }
-
- map->epoch++;
- map->modified = map->modified;
- if (newcrush) {
- if (map->crush)
- crush_destroy(map->crush);
- map->crush = newcrush;
- newcrush = NULL;
- }
-
- /* new_pool */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- __u8 ev;
- struct ceph_pg_pool_info *pi;
-
- ceph_decode_32_safe(p, end, pool, bad);
- ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
- ev = ceph_decode_8(p); /* encoding version */
- if (ev > CEPH_PG_POOL_VERSION) {
- pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
- ev, CEPH_PG_POOL_VERSION);
- goto bad;
- }
- pi = __lookup_pg_pool(&map->pg_pools, pool);
- if (!pi) {
- pi = kzalloc(sizeof(*pi), GFP_NOFS);
- if (!pi) {
- err = -ENOMEM;
- goto bad;
- }
- pi->id = pool;
- __insert_pg_pool(&map->pg_pools, pi);
- }
- err = __decode_pool(p, end, pi);
- if (err < 0)
- goto bad;
- }
- if (version >= 5 && __decode_pool_names(p, end, map) < 0)
- goto bad;
-
- /* old_pool */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- struct ceph_pg_pool_info *pi;
-
- ceph_decode_32_safe(p, end, pool, bad);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
- if (pi)
- __remove_pg_pool(&map->pg_pools, pi);
- }
-
- /* new_up */
- err = -EINVAL;
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd;
- struct ceph_entity_addr addr;
- ceph_decode_32_safe(p, end, osd, bad);
- ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
- ceph_decode_addr(&addr);
- pr_info("osd%d up\n", osd);
- BUG_ON(osd >= map->max_osd);
- map->osd_state[osd] |= CEPH_OSD_UP;
- map->osd_addr[osd] = addr;
- }
-
- /* new_down */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd;
- ceph_decode_32_safe(p, end, osd, bad);
- (*p)++; /* clean flag */
- pr_info("osd%d down\n", osd);
- if (osd < map->max_osd)
- map->osd_state[osd] &= ~CEPH_OSD_UP;
- }
-
- /* new_weight */
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- u32 osd, off;
- ceph_decode_need(p, end, sizeof(u32)*2, bad);
- osd = ceph_decode_32(p);
- off = ceph_decode_32(p);
- pr_info("osd%d weight 0x%x %s\n", osd, off,
- off == CEPH_OSD_IN ? "(in)" :
- (off == CEPH_OSD_OUT ? "(out)" : ""));
- if (osd < map->max_osd)
- map->osd_weight[osd] = off;
- }
-
- /* new_pg_temp */
- rbp = rb_first(&map->pg_temp);
- ceph_decode_32_safe(p, end, len, bad);
- while (len--) {
- struct ceph_pg_mapping *pg;
- int j;
- struct ceph_pg pgid;
- u32 pglen;
- ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
- ceph_decode_copy(p, &pgid, sizeof(pgid));
- pglen = ceph_decode_32(p);
-
- /* remove any? */
- while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
- node)->pgid, pgid) <= 0) {
- struct ceph_pg_mapping *cur =
- rb_entry(rbp, struct ceph_pg_mapping, node);
-
- rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
- rb_erase(&cur->node, &map->pg_temp);
- kfree(cur);
- }
-
- if (pglen) {
- /* insert */
- ceph_decode_need(p, end, pglen*sizeof(u32), bad);
- pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
- if (!pg) {
- err = -ENOMEM;
- goto bad;
- }
- pg->pgid = pgid;
- pg->len = pglen;
- for (j = 0; j < pglen; j++)
- pg->osds[j] = ceph_decode_32(p);
- err = __insert_pg_mapping(pg, &map->pg_temp);
- if (err) {
- kfree(pg);
- goto bad;
- }
- dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
- pglen);
- }
- }
- while (rbp) {
- struct ceph_pg_mapping *cur =
- rb_entry(rbp, struct ceph_pg_mapping, node);
-
- rbp = rb_next(rbp);
- dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
- rb_erase(&cur->node, &map->pg_temp);
- kfree(cur);
- }
-
- /* ignore the rest */
- *p = end;
- return map;
-
-bad:
- pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
- epoch, (int)(*p - start), *p, start, end);
- print_hex_dump(KERN_DEBUG, "osdmap: ",
- DUMP_PREFIX_OFFSET, 16, 1,
- start, end - start, true);
- if (newcrush)
- crush_destroy(newcrush);
- return ERR_PTR(err);
-}
-
-
-
-
-/*
- * calculate file layout from given offset, length.
- * fill in correct oid, logical length, and object extent
- * offset, length.
- *
- * for now, we write only a single su, until we can
- * pass a stride back to the caller.
- */
-void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u64 *ono,
- u64 *oxoff, u64 *oxlen)
-{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
- u32 bl, stripeno, stripepos, objsetno;
- u32 su_per_object;
- u64 t, su_offset;
-
- dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
- osize, su);
- su_per_object = osize / su;
- dout("osize %u / su %u = su_per_object %u\n", osize, su,
- su_per_object);
-
- BUG_ON((su & ~PAGE_MASK) != 0);
- /* bl = *off / su; */
- t = off;
- do_div(t, su);
- bl = t;
- dout("off %llu / su %u = bl %u\n", off, su, bl);
-
- stripeno = bl / sc;
- stripepos = bl % sc;
- objsetno = stripeno / su_per_object;
-
- *ono = objsetno * sc + stripepos;
- dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
-
- /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
- t = off;
- su_offset = do_div(t, su);
- *oxoff = su_offset + (stripeno % su_per_object) * su;
-
- /*
- * Calculate the length of the extent being written to the selected
- * object. This is the minimum of the full length requested (plen) or
- * the remainder of the current stripe being written to.
- */
- *oxlen = min_t(u64, *plen, su - su_offset);
- *plen = *oxlen;
-
- dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-}
-
-/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
- */
-int ceph_calc_object_layout(struct ceph_object_layout *ol,
- const char *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap)
-{
- unsigned num, num_mask;
- struct ceph_pg pgid;
- s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
- int poolid = le32_to_cpu(fl->fl_pg_pool);
- struct ceph_pg_pool_info *pool;
- unsigned ps;
-
- BUG_ON(!osdmap);
-
- pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
- if (!pool)
- return -EIO;
- ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
- if (preferred >= 0) {
- ps += preferred;
- num = le32_to_cpu(pool->v.lpg_num);
- num_mask = pool->lpg_num_mask;
- } else {
- num = le32_to_cpu(pool->v.pg_num);
- num_mask = pool->pg_num_mask;
- }
-
- pgid.ps = cpu_to_le16(ps);
- pgid.preferred = cpu_to_le16(preferred);
- pgid.pool = fl->fl_pg_pool;
- if (preferred >= 0)
- dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
- (int)preferred);
- else
- dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
-
- ol->ol_pgid = pgid;
- ol->ol_stripe_unit = fl->fl_object_stripe_unit;
- return 0;
-}
-
-/*
- * Calculate raw osd vector for the given pgid. Return pointer to osd
- * array, or NULL on failure.
- */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *osds, int *num)
-{
- struct ceph_pg_mapping *pg;
- struct ceph_pg_pool_info *pool;
- int ruleno;
- unsigned poolid, ps, pps;
- int preferred;
-
- /* pg_temp? */
- pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
- if (pg) {
- *num = pg->len;
- return pg->osds;
- }
-
- /* crush */
- poolid = le32_to_cpu(pgid.pool);
- ps = le16_to_cpu(pgid.ps);
- preferred = (s16)le16_to_cpu(pgid.preferred);
-
- /* don't forcefeed bad device ids to crush */
- if (preferred >= osdmap->max_osd ||
- preferred >= osdmap->crush->max_devices)
- preferred = -1;
-
- pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
- if (!pool)
- return NULL;
- ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
- pool->v.type, pool->v.size);
- if (ruleno < 0) {
- pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
- poolid, pool->v.crush_ruleset, pool->v.type,
- pool->v.size);
- return NULL;
- }
-
- if (preferred >= 0)
- pps = ceph_stable_mod(ps,
- le32_to_cpu(pool->v.lpgp_num),
- pool->lpgp_num_mask);
- else
- pps = ceph_stable_mod(ps,
- le32_to_cpu(pool->v.pgp_num),
- pool->pgp_num_mask);
- pps += poolid;
- *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
- min_t(int, pool->v.size, *num),
- preferred, osdmap->osd_weight);
- return osds;
-}
-
-/*
- * Return acting set for given pgid.
- */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *acting)
-{
- int rawosds[CEPH_PG_MAX_SIZE], *osds;
- int i, o, num = CEPH_PG_MAX_SIZE;
-
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
-
- /* primary is first up osd */
- o = 0;
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i]))
- acting[o++] = osds[i];
- return o;
-}
-
-/*
- * Return primary osd for given pgid, or -1 if none.
- */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
-{
- int rawosds[CEPH_PG_MAX_SIZE], *osds;
- int i, num = CEPH_PG_MAX_SIZE;
-
- osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
- if (!osds)
- return -1;
-
- /* primary is first up osd */
- for (i = 0; i < num; i++)
- if (ceph_osd_is_up(osdmap, osds[i]))
- return osds[i];
- return -1;
-}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
deleted file mode 100644
index 970b547e510d..000000000000
--- a/fs/ceph/osdmap.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef _FS_CEPH_OSDMAP_H
-#define _FS_CEPH_OSDMAP_H
-
-#include <linux/rbtree.h>
-#include "types.h"
-#include "ceph_fs.h"
-#include "crush/crush.h"
-
-/*
- * The osd map describes the current membership of the osd cluster and
- * specifies the mapping of objects to placement groups and placement
- * groups to (sets of) osds. That is, it completely specifies the
- * (desired) distribution of all data objects in the system at some
- * point in time.
- *
- * Each map version is identified by an epoch, which increases monotonically.
- *
- * The map can be updated either via an incremental map (diff) describing
- * the change between two successive epochs, or as a fully encoded map.
- */
-struct ceph_pg_pool_info {
- struct rb_node node;
- int id;
- struct ceph_pg_pool v;
- int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
- char *name;
-};
-
-struct ceph_pg_mapping {
- struct rb_node node;
- struct ceph_pg pgid;
- int len;
- int osds[];
-};
-
-struct ceph_osdmap {
- struct ceph_fsid fsid;
- u32 epoch;
- u32 mkfs_epoch;
- struct ceph_timespec created, modified;
-
- u32 flags; /* CEPH_OSDMAP_* */
-
- u32 max_osd; /* size of osd_state, _offload, _addr arrays */
- u8 *osd_state; /* CEPH_OSD_* */
- u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
- struct ceph_entity_addr *osd_addr;
-
- struct rb_root pg_temp;
- struct rb_root pg_pools;
- u32 pool_max;
-
- /* the CRUSH map specifies the mapping of placement groups to
- * the list of osds that store+replicate them. */
- struct crush_map *crush;
-};
-
-/*
- * file layout helpers
- */
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
- ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
- ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_preferred(l) \
- ((__s32)le32_to_cpu((l).fl_pg_preferred))
-#define ceph_file_layout_pg_pool(l) \
- ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_stripe_unit) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_object_size) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-
-static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
-{
- return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
-}
-
-static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
-{
- return map && (map->flags & flag);
-}
-
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
-
-static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
- int osd)
-{
- if (osd >= map->max_osd)
- return NULL;
- return &map->osd_addr[osd];
-}
-
-extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
-extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
- struct ceph_osdmap *map,
- struct ceph_messenger *msgr);
-extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
-
-/* calculate mapping of a file extent to an object */
-extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u64 *bno, u64 *oxoff, u64 *oxlen);
-
-/* calculate mapping of object to a placement group */
-extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
- const char *oid,
- struct ceph_file_layout *fl,
- struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
- int *acting);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
- struct ceph_pg pgid);
-
-#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-
-#include "pagelist.h"
-
-static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
-{
- struct page *page = list_entry(pl->head.prev, struct page,
- lru);
- kunmap(page);
-}
-
-int ceph_pagelist_release(struct ceph_pagelist *pl)
-{
- if (pl->mapped_tail)
- ceph_pagelist_unmap_tail(pl);
-
- while (!list_empty(&pl->head)) {
- struct page *page = list_first_entry(&pl->head, struct page,
- lru);
- list_del(&page->lru);
- __free_page(page);
- }
- return 0;
-}
-
-static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
-{
- struct page *page = __page_cache_alloc(GFP_NOFS);
- if (!page)
- return -ENOMEM;
- pl->room += PAGE_SIZE;
- list_add_tail(&page->lru, &pl->head);
- if (pl->mapped_tail)
- ceph_pagelist_unmap_tail(pl);
- pl->mapped_tail = kmap(page);
- return 0;
-}
-
-int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
-{
- while (pl->room < len) {
- size_t bit = pl->room;
- int ret;
-
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
- buf, bit);
- pl->length += bit;
- pl->room -= bit;
- buf += bit;
- len -= bit;
- ret = ceph_pagelist_addpage(pl);
- if (ret)
- return ret;
- }
-
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
- pl->length += len;
- pl->room -= len;
- return 0;
-}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
deleted file mode 100644
index e8a4187e1087..000000000000
--- a/fs/ceph/pagelist.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __FS_CEPH_PAGELIST_H
-#define __FS_CEPH_PAGELIST_H
-
-#include <linux/list.h>
-
-struct ceph_pagelist {
- struct list_head head;
- void *mapped_tail;
- size_t length;
- size_t room;
-};
-
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
- INIT_LIST_HEAD(&pl->head);
- pl->mapped_tail = NULL;
- pl->length = 0;
- pl->room = 0;
-}
-extern int ceph_pagelist_release(struct ceph_pagelist *pl);
-
-extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
-
-static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
-{
- __le64 ev = cpu_to_le64(v);
- return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
-{
- __le32 ev = cpu_to_le32(v);
- return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
-{
- __le16 ev = cpu_to_le16(v);
- return ceph_pagelist_append(pl, &ev, sizeof(ev));
-}
-static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
-{
- return ceph_pagelist_append(pl, &v, 1);
-}
-static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
- char *s, size_t len)
-{
- int ret = ceph_pagelist_encode_32(pl, len);
- if (ret)
- return ret;
- if (len)
- return ceph_pagelist_append(pl, s, len);
- return 0;
-}
-
-#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
deleted file mode 100644
index 6d5247f2e81b..000000000000
--- a/fs/ceph/rados.h
+++ /dev/null
@@ -1,405 +0,0 @@
-#ifndef CEPH_RADOS_H
-#define CEPH_RADOS_H
-
-/*
- * Data types for the Ceph distributed object storage layer RADOS
- * (Reliable Autonomic Distributed Object Store).
- */
-
-#include "msgr.h"
-
-/*
- * osdmap encoding versions
- */
-#define CEPH_OSDMAP_INC_VERSION 5
-#define CEPH_OSDMAP_INC_VERSION_EXT 5
-#define CEPH_OSDMAP_VERSION 5
-#define CEPH_OSDMAP_VERSION_EXT 5
-
-/*
- * fs id
- */
-struct ceph_fsid {
- unsigned char fsid[16];
-};
-
-static inline int ceph_fsid_compare(const struct ceph_fsid *a,
- const struct ceph_fsid *b)
-{
- return memcmp(a, b, sizeof(*a));
-}
-
-/*
- * ino, object, etc.
- */
-typedef __le64 ceph_snapid_t;
-#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
-#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
-#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
-
-struct ceph_timespec {
- __le32 tv_sec;
- __le32 tv_nsec;
-} __attribute__ ((packed));
-
-
-/*
- * object layout - how objects are mapped into PGs
- */
-#define CEPH_OBJECT_LAYOUT_HASH 1
-#define CEPH_OBJECT_LAYOUT_LINEAR 2
-#define CEPH_OBJECT_LAYOUT_HASHINO 3
-
-/*
- * pg layout -- how PGs are mapped onto (sets of) OSDs
- */
-#define CEPH_PG_LAYOUT_CRUSH 0
-#define CEPH_PG_LAYOUT_HASH 1
-#define CEPH_PG_LAYOUT_LINEAR 2
-#define CEPH_PG_LAYOUT_HYBRID 3
-
-#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
-
-/*
- * placement group.
- * we encode this into one __le64.
- */
-struct ceph_pg {
- __le16 preferred; /* preferred primary osd */
- __le16 ps; /* placement seed */
- __le32 pool; /* object pool */
-} __attribute__ ((packed));
-
-/*
- * pg_pool is a set of pgs storing a pool of objects
- *
- * pg_num -- base number of pseudorandomly placed pgs
- *
- * pgp_num -- effective number when calculating pg placement. this
- * is used for pg_num increases. new pgs result in data being "split"
- * into new pgs. for this to proceed smoothly, new pgs are intiially
- * colocated with their parents; that is, pgp_num doesn't increase
- * until the new pgs have successfully split. only _then_ are the new
- * pgs placed independently.
- *
- * lpg_num -- localized pg count (per device). replicas are randomly
- * selected.
- *
- * lpgp_num -- as above.
- */
-#define CEPH_PG_TYPE_REP 1
-#define CEPH_PG_TYPE_RAID4 2
-#define CEPH_PG_POOL_VERSION 2
-struct ceph_pg_pool {
- __u8 type; /* CEPH_PG_TYPE_* */
- __u8 size; /* number of osds in each pg */
- __u8 crush_ruleset; /* crush placement rule */
- __u8 object_hash; /* hash mapping object name to ps */
- __le32 pg_num, pgp_num; /* number of pg's */
- __le32 lpg_num, lpgp_num; /* number of localized pg's */
- __le32 last_change; /* most recent epoch changed */
- __le64 snap_seq; /* seq for per-pool snapshot */
- __le32 snap_epoch; /* epoch of last snap */
- __le32 num_snaps;
- __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
- __le64 auid; /* who owns the pg */
-} __attribute__ ((packed));
-
-/*
- * stable_mod func is used to control number of placement groups.
- * similar to straight-up modulo, but produces a stable mapping as b
- * increases over time. b is the number of bins, and bmask is the
- * containing power of 2 minus 1.
- *
- * b <= bmask and bmask=(2**n)-1
- * e.g., b=12 -> bmask=15, b=123 -> bmask=127
- */
-static inline int ceph_stable_mod(int x, int b, int bmask)
-{
- if ((x & bmask) < b)
- return x & bmask;
- else
- return x & (bmask >> 1);
-}
-
-/*
- * object layout - how a given object should be stored.
- */
-struct ceph_object_layout {
- struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
- __le32 ol_stripe_unit; /* for per-object parity, if any */
-} __attribute__ ((packed));
-
-/*
- * compound epoch+version, used by storage layer to serialize mutations
- */
-struct ceph_eversion {
- __le32 epoch;
- __le64 version;
-} __attribute__ ((packed));
-
-/*
- * osd map bits
- */
-
-/* status bits */
-#define CEPH_OSD_EXISTS 1
-#define CEPH_OSD_UP 2
-
-/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
-#define CEPH_OSD_IN 0x10000
-#define CEPH_OSD_OUT 0
-
-
-/*
- * osd map flag bits
- */
-#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
-#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
-#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
-#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
-#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
-
-/*
- * osd ops
- */
-#define CEPH_OSD_OP_MODE 0xf000
-#define CEPH_OSD_OP_MODE_RD 0x1000
-#define CEPH_OSD_OP_MODE_WR 0x2000
-#define CEPH_OSD_OP_MODE_RMW 0x3000
-#define CEPH_OSD_OP_MODE_SUB 0x4000
-
-#define CEPH_OSD_OP_TYPE 0x0f00
-#define CEPH_OSD_OP_TYPE_LOCK 0x0100
-#define CEPH_OSD_OP_TYPE_DATA 0x0200
-#define CEPH_OSD_OP_TYPE_ATTR 0x0300
-#define CEPH_OSD_OP_TYPE_EXEC 0x0400
-#define CEPH_OSD_OP_TYPE_PG 0x0500
-
-enum {
- /** data **/
- /* read */
- CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
-
- /* fancy read */
- CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
-
- /* write */
- CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
- CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
- CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
- CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
- CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
-
- /* fancy write */
- CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
- CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
- CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
- CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
-
- CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
- CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
- CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
-
- CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
- CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
-
- /** attrs **/
- /* read */
- CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
- CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
-
- /* write */
- CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
- CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
- CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
- CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
-
- /** subop **/
- CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
- CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
- CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
- CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
- CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
-
- /** lock **/
- CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
- CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
- CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
- CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
- CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
- CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
-
- /** exec **/
- CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
-
- /** pg **/
- CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
-};
-
-static inline int ceph_osd_op_type_lock(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
-}
-static inline int ceph_osd_op_type_data(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
-}
-static inline int ceph_osd_op_type_attr(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
-}
-static inline int ceph_osd_op_type_exec(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
-}
-static inline int ceph_osd_op_type_pg(int op)
-{
- return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
-}
-
-static inline int ceph_osd_op_mode_subop(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
-}
-static inline int ceph_osd_op_mode_read(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
-}
-static inline int ceph_osd_op_mode_modify(int op)
-{
- return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
-}
-
-/*
- * note that the following tmap stuff is also defined in the ceph librados.h
- * any modification here needs to be updated there
- */
-#define CEPH_OSD_TMAP_HDR 'h'
-#define CEPH_OSD_TMAP_SET 's'
-#define CEPH_OSD_TMAP_RM 'r'
-
-extern const char *ceph_osd_op_name(int op);
-
-
-/*
- * osd op flags
- *
- * An op may be READ, WRITE, or READ|WRITE.
- */
-enum {
- CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
- CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
- CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
- CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
- CEPH_OSD_FLAG_READ = 16, /* op may read */
- CEPH_OSD_FLAG_WRITE = 32, /* op may write */
- CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
- CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
- CEPH_OSD_FLAG_BALANCE_READS = 256,
- CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
- CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
- CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
- CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
-};
-
-enum {
- CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
-};
-
-#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
-
-/* xattr comparison */
-enum {
- CEPH_OSD_CMPXATTR_OP_NOP = 0,
- CEPH_OSD_CMPXATTR_OP_EQ = 1,
- CEPH_OSD_CMPXATTR_OP_NE = 2,
- CEPH_OSD_CMPXATTR_OP_GT = 3,
- CEPH_OSD_CMPXATTR_OP_GTE = 4,
- CEPH_OSD_CMPXATTR_OP_LT = 5,
- CEPH_OSD_CMPXATTR_OP_LTE = 6
-};
-
-enum {
- CEPH_OSD_CMPXATTR_MODE_STRING = 1,
- CEPH_OSD_CMPXATTR_MODE_U64 = 2
-};
-
-/*
- * an individual object operation. each may be accompanied by some data
- * payload
- */
-struct ceph_osd_op {
- __le16 op; /* CEPH_OSD_OP_* */
- __le32 flags; /* CEPH_OSD_FLAG_* */
- union {
- struct {
- __le64 offset, length;
- __le64 truncate_size;
- __le32 truncate_seq;
- } __attribute__ ((packed)) extent;
- struct {
- __le32 name_len;
- __le32 value_len;
- __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
- __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
- } __attribute__ ((packed)) xattr;
- struct {
- __u8 class_len;
- __u8 method_len;
- __u8 argc;
- __le32 indata_len;
- } __attribute__ ((packed)) cls;
- struct {
- __le64 cookie, count;
- } __attribute__ ((packed)) pgls;
- struct {
- __le64 snapid;
- } __attribute__ ((packed)) snap;
- };
- __le32 payload_len;
-} __attribute__ ((packed));
-
-/*
- * osd request message header. each request may include multiple
- * ceph_osd_op object operations.
- */
-struct ceph_osd_request_head {
- __le32 client_inc; /* client incarnation */
- struct ceph_object_layout layout; /* pgid */
- __le32 osdmap_epoch; /* client's osdmap epoch */
-
- __le32 flags;
-
- struct ceph_timespec mtime; /* for mutations only */
- struct ceph_eversion reassert_version; /* if we are replaying op */
-
- __le32 object_len; /* length of object name */
-
- __le64 snapid; /* snapid to read */
- __le64 snap_seq; /* writer's snap context */
- __le32 num_snaps;
-
- __le16 num_ops;
- struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
-} __attribute__ ((packed));
-
-struct ceph_osd_reply_head {
- __le32 client_inc; /* client incarnation */
- __le32 flags;
- struct ceph_object_layout layout;
- __le32 osdmap_epoch;
- struct ceph_eversion reassert_version; /* for replaying uncommitted */
-
- __le32 result; /* result code */
-
- __le32 object_len; /* length of object name */
- __le32 num_ops;
- struct ceph_osd_op ops[0]; /* ops[], object */
-} __attribute__ ((packed));
-
-
-#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/sort.h>
#include <linux/slab.h>
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
/*
* Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg)
{
- struct super_block *sb = mdsc->client->sb;
+ struct super_block *sb = mdsc->fsc->sb;
int mds = session->s_mds;
u64 split;
int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
/*
- * Ceph string constants
+ * Ceph fs string constants
*/
-#include "types.h"
+#include <linux/module.h>
+#include <linux/ceph/types.h>
-const char *ceph_entity_type_name(int type)
-{
- switch (type) {
- case CEPH_ENTITY_TYPE_MDS: return "mds";
- case CEPH_ENTITY_TYPE_OSD: return "osd";
- case CEPH_ENTITY_TYPE_MON: return "mon";
- case CEPH_ENTITY_TYPE_CLIENT: return "client";
- case CEPH_ENTITY_TYPE_AUTH: return "auth";
- default: return "unknown";
- }
-}
-
-const char *ceph_osd_op_name(int op)
-{
- switch (op) {
- case CEPH_OSD_OP_READ: return "read";
- case CEPH_OSD_OP_STAT: return "stat";
-
- case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
-
- case CEPH_OSD_OP_WRITE: return "write";
- case CEPH_OSD_OP_DELETE: return "delete";
- case CEPH_OSD_OP_TRUNCATE: return "truncate";
- case CEPH_OSD_OP_ZERO: return "zero";
- case CEPH_OSD_OP_WRITEFULL: return "writefull";
- case CEPH_OSD_OP_ROLLBACK: return "rollback";
-
- case CEPH_OSD_OP_APPEND: return "append";
- case CEPH_OSD_OP_STARTSYNC: return "startsync";
- case CEPH_OSD_OP_SETTRUNC: return "settrunc";
- case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
-
- case CEPH_OSD_OP_TMAPUP: return "tmapup";
- case CEPH_OSD_OP_TMAPGET: return "tmapget";
- case CEPH_OSD_OP_TMAPPUT: return "tmapput";
-
- case CEPH_OSD_OP_GETXATTR: return "getxattr";
- case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
- case CEPH_OSD_OP_SETXATTR: return "setxattr";
- case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
- case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
- case CEPH_OSD_OP_RMXATTR: return "rmxattr";
- case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
-
- case CEPH_OSD_OP_PULL: return "pull";
- case CEPH_OSD_OP_PUSH: return "push";
- case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
- case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
- case CEPH_OSD_OP_SCRUB: return "scrub";
-
- case CEPH_OSD_OP_WRLOCK: return "wrlock";
- case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
- case CEPH_OSD_OP_RDLOCK: return "rdlock";
- case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
- case CEPH_OSD_OP_UPLOCK: return "uplock";
- case CEPH_OSD_OP_DNLOCK: return "dnlock";
-
- case CEPH_OSD_OP_CALL: return "call";
-
- case CEPH_OSD_OP_PGLS: return "pgls";
- }
- return "???";
-}
const char *ceph_mds_state_name(int s)
{
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
}
return "???";
}
-
-const char *ceph_pool_op_name(int op)
-{
- switch (op) {
- case POOL_OP_CREATE: return "create";
- case POOL_OP_DELETE: return "delete";
- case POOL_OP_AUID_CHANGE: return "auid change";
- case POOL_OP_CREATE_SNAP: return "create snap";
- case POOL_OP_DELETE_SNAP: return "delete snap";
- case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
- case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
- }
- return "???";
-}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
#include <linux/ctype.h>
@@ -15,10 +15,13 @@
#include <linux/statfs.h>
#include <linux/string.h>
-#include "decode.h"
#include "super.h"
-#include "mon_client.h"
-#include "auth.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/auth.h>
+#include <linux/ceph/debugfs.h>
/*
* Ceph superblock operations
@@ -26,36 +29,22 @@
* Handle the basics of mounting, unmounting.
*/
-
-/*
- * find filename portion of a path (/foo/bar/baz -> baz)
- */
-const char *ceph_file_part(const char *s, int len)
-{
- const char *e = s + len;
-
- while (e != s && *(e-1) != '/')
- e--;
- return e;
-}
-
-
/*
* super ops
*/
static void ceph_put_super(struct super_block *s)
{
- struct ceph_client *client = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dout("put_super\n");
- ceph_mdsc_close_sessions(&client->mdsc);
+ ceph_mdsc_close_sessions(fsc->mdsc);
/*
* ensure we release the bdi before put_anon_super releases
* the device name.
*/
- if (s->s_bdi == &client->backing_dev_info) {
- bdi_unregister(&client->backing_dev_info);
+ if (s->s_bdi == &fsc->backing_dev_info) {
+ bdi_unregister(&fsc->backing_dev_info);
s->s_bdi = NULL;
}
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
- struct ceph_monmap *monmap = client->monc.monmap;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_monmap *monmap = fsc->client->monc.monmap;
struct ceph_statfs st;
u64 fsid;
int err;
dout("statfs\n");
- err = ceph_monc_do_statfs(&client->monc, &st);
+ err = ceph_monc_do_statfs(&fsc->client->monc, &st);
if (err < 0)
return err;
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
static int ceph_sync_fs(struct super_block *sb, int wait)
{
- struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
if (!wait) {
dout("sync_fs (non-blocking)\n");
- ceph_flush_dirty_caps(&client->mdsc);
+ ceph_flush_dirty_caps(fsc->mdsc);
dout("sync_fs (non-blocking) done\n");
return 0;
}
dout("sync_fs (blocking)\n");
- ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
- ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
+ ceph_osdc_sync(&fsc->client->osdc);
+ ceph_mdsc_sync(fsc->mdsc);
dout("sync_fs (blocking) done\n");
return 0;
}
-static int default_congestion_kb(void)
-{
- int congestion_kb;
-
- /*
- * Copied from NFS
- *
- * congestion size, scale with available memory.
- *
- * 64MB: 8192k
- * 128MB: 11585k
- * 256MB: 16384k
- * 512MB: 23170k
- * 1GB: 32768k
- * 2GB: 46340k
- * 4GB: 65536k
- * 8GB: 92681k
- * 16GB: 131072k
- *
- * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M
- */
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
- if (congestion_kb > 256*1024)
- congestion_kb = 256*1024;
-
- return congestion_kb;
-}
-
-/**
- * ceph_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- */
-static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
- struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
- struct ceph_mount_args *args = client->mount_args;
-
- if (args->flags & CEPH_OPT_FSID)
- seq_printf(m, ",fsid=%pU", &args->fsid);
- if (args->flags & CEPH_OPT_NOSHARE)
- seq_puts(m, ",noshare");
- if (args->flags & CEPH_OPT_DIRSTAT)
- seq_puts(m, ",dirstat");
- if ((args->flags & CEPH_OPT_RBYTES) == 0)
- seq_puts(m, ",norbytes");
- if (args->flags & CEPH_OPT_NOCRC)
- seq_puts(m, ",nocrc");
- if (args->flags & CEPH_OPT_NOASYNCREADDIR)
- seq_puts(m, ",noasyncreaddir");
-
- if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
- seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
- if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
- seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
- if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
- if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
- seq_printf(m, ",osdkeepalivetimeout=%d",
- args->osd_keepalive_timeout);
- if (args->wsize)
- seq_printf(m, ",wsize=%d", args->wsize);
- if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
- seq_printf(m, ",rsize=%d", args->rsize);
- if (args->congestion_kb != default_congestion_kb())
- seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
- if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_min=%d",
- args->caps_wanted_delay_min);
- if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_max=%d",
- args->caps_wanted_delay_max);
- if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
- seq_printf(m, ",cap_release_safety=%d",
- args->cap_release_safety);
- if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
- seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
- if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
- seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
- if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", args->snapdir_name);
- if (args->name)
- seq_printf(m, ",name=%s", args->name);
- if (args->secret)
- seq_puts(m, ",secret=<hidden>");
- return 0;
-}
-
-/*
- * caches
- */
-struct kmem_cache *ceph_inode_cachep;
-struct kmem_cache *ceph_cap_cachep;
-struct kmem_cache *ceph_dentry_cachep;
-struct kmem_cache *ceph_file_cachep;
-
-static void ceph_inode_init_once(void *foo)
-{
- struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
-}
-
-static int __init init_caches(void)
-{
- ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
- sizeof(struct ceph_inode_info),
- __alignof__(struct ceph_inode_info),
- (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
- ceph_inode_init_once);
- if (ceph_inode_cachep == NULL)
- return -ENOMEM;
-
- ceph_cap_cachep = KMEM_CACHE(ceph_cap,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_cachep == NULL)
- goto bad_cap;
-
- ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_dentry_cachep == NULL)
- goto bad_dentry;
-
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_file_cachep == NULL)
- goto bad_file;
-
- return 0;
-
-bad_file:
- kmem_cache_destroy(ceph_dentry_cachep);
-bad_dentry:
- kmem_cache_destroy(ceph_cap_cachep);
-bad_cap:
- kmem_cache_destroy(ceph_inode_cachep);
- return -ENOMEM;
-}
-
-static void destroy_caches(void)
-{
- kmem_cache_destroy(ceph_inode_cachep);
- kmem_cache_destroy(ceph_cap_cachep);
- kmem_cache_destroy(ceph_dentry_cachep);
- kmem_cache_destroy(ceph_file_cachep);
-}
-
-
-/*
- * ceph_umount_begin - initiate forced umount. Tear down down the
- * mount, skipping steps that may hang while waiting for server(s).
- */
-static void ceph_umount_begin(struct super_block *sb)
-{
- struct ceph_client *client = ceph_sb_to_client(sb);
-
- dout("ceph_umount_begin - starting forced umount\n");
- if (!client)
- return;
- client->mount_state = CEPH_MOUNT_SHUTDOWN;
- return;
-}
-
-static const struct super_operations ceph_super_ops = {
- .alloc_inode = ceph_alloc_inode,
- .destroy_inode = ceph_destroy_inode,
- .write_inode = ceph_write_inode,
- .sync_fs = ceph_sync_fs,
- .put_super = ceph_put_super,
- .show_options = ceph_show_options,
- .statfs = ceph_statfs,
- .umount_begin = ceph_umount_begin,
-};
-
-
-const char *ceph_msg_type_name(int type)
-{
- switch (type) {
- case CEPH_MSG_SHUTDOWN: return "shutdown";
- case CEPH_MSG_PING: return "ping";
- case CEPH_MSG_AUTH: return "auth";
- case CEPH_MSG_AUTH_REPLY: return "auth_reply";
- case CEPH_MSG_MON_MAP: return "mon_map";
- case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
- case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
- case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
- case CEPH_MSG_STATFS: return "statfs";
- case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
- case CEPH_MSG_MDS_MAP: return "mds_map";
- case CEPH_MSG_CLIENT_SESSION: return "client_session";
- case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
- case CEPH_MSG_CLIENT_REQUEST: return "client_request";
- case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
- case CEPH_MSG_CLIENT_REPLY: return "client_reply";
- case CEPH_MSG_CLIENT_CAPS: return "client_caps";
- case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
- case CEPH_MSG_CLIENT_SNAP: return "client_snap";
- case CEPH_MSG_CLIENT_LEASE: return "client_lease";
- case CEPH_MSG_OSD_MAP: return "osd_map";
- case CEPH_MSG_OSD_OP: return "osd_op";
- case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
- default: return "unknown";
- }
-}
-
-
/*
* mount options
*/
enum {
Opt_wsize,
Opt_rsize,
- Opt_osdtimeout,
- Opt_osdkeepalivetimeout,
- Opt_mount_timeout,
- Opt_osd_idle_ttl,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
Opt_congestion_kb,
Opt_last_int,
/* int args above */
- Opt_fsid,
Opt_snapdirname,
- Opt_name,
- Opt_secret,
Opt_last_string,
/* string args above */
- Opt_ip,
- Opt_noshare,
Opt_dirstat,
Opt_nodirstat,
Opt_rbytes,
Opt_norbytes,
- Opt_nocrc,
Opt_noasyncreaddir,
};
-static match_table_t arg_tokens = {
+static match_table_t fsopt_tokens = {
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
- {Opt_osdtimeout, "osdtimeout=%d"},
- {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
- {Opt_mount_timeout, "mount_timeout=%d"},
- {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
- {Opt_fsid, "fsid=%s"},
{Opt_snapdirname, "snapdirname=%s"},
- {Opt_name, "name=%s"},
- {Opt_secret, "secret=%s"},
/* string args above */
- {Opt_ip, "ip=%s"},
- {Opt_noshare, "noshare"},
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
{Opt_rbytes, "rbytes"},
{Opt_norbytes, "norbytes"},
- {Opt_nocrc, "nocrc"},
{Opt_noasyncreaddir, "noasyncreaddir"},
{-1, NULL}
};
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+static int parse_fsopt_token(char *c, void *private)
{
- int i = 0;
- char tmp[3];
- int err = -EINVAL;
- int d;
-
- dout("parse_fsid '%s'\n", str);
- tmp[2] = 0;
- while (*str && i < 16) {
- if (ispunct(*str)) {
- str++;
- continue;
+ struct ceph_mount_options *fsopt = private;
+ substring_t argstr[MAX_OPT_ARGS];
+ int token, intval, ret;
+
+ token = match_token((char *)c, fsopt_tokens, argstr);
+ if (token < 0)
+ return -EINVAL;
+
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ return ret;
}
- if (!isxdigit(str[0]) || !isxdigit(str[1]))
- break;
- tmp[0] = str[0];
- tmp[1] = str[1];
- if (sscanf(tmp, "%x", &d) < 1)
- break;
- fsid->fsid[i] = d & 0xff;
- i++;
- str += 2;
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
}
- if (i == 16)
- err = 0;
- dout("parse_fsid ret %d got fsid %pU", err, fsid);
- return err;
+ switch (token) {
+ case Opt_snapdirname:
+ kfree(fsopt->snapdir_name);
+ fsopt->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->snapdir_name)
+ return -ENOMEM;
+ break;
+
+ /* misc */
+ case Opt_wsize:
+ fsopt->wsize = intval;
+ break;
+ case Opt_rsize:
+ fsopt->rsize = intval;
+ break;
+ case Opt_caps_wanted_delay_min:
+ fsopt->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ fsopt->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ fsopt->max_readdir = intval;
+ break;
+ case Opt_readdir_max_bytes:
+ fsopt->max_readdir_bytes = intval;
+ break;
+ case Opt_congestion_kb:
+ fsopt->congestion_kb = intval;
+ break;
+ case Opt_dirstat:
+ fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
+ break;
+ case Opt_noasyncreaddir:
+ fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
+ break;
+ default:
+ BUG_ON(token);
+ }
+ return 0;
}
-static struct ceph_mount_args *parse_mount_args(int flags, char *options,
- const char *dev_name,
- const char **path)
+static void destroy_mount_options(struct ceph_mount_options *args)
{
- struct ceph_mount_args *args;
- const char *c;
- int err = -ENOMEM;
- substring_t argstr[MAX_OPT_ARGS];
+ dout("destroy_mount_options %p\n", args);
+ kfree(args->snapdir_name);
+ kfree(args);
+}
- args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args)
- return ERR_PTR(-ENOMEM);
- args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
- GFP_KERNEL);
- if (!args->mon_addr)
- goto out;
+static int strcmp_null(const char *s1, const char *s2)
+{
+ if (!s1 && !s2)
+ return 0;
+ if (s1 && !s2)
+ return -1;
+ if (!s1 && s2)
+ return 1;
+ return strcmp(s1, s2);
+}
- dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
-
- /* start with defaults */
- args->sb_flags = flags;
- args->flags = CEPH_OPT_DEFAULT;
- args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
- args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
- args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
- args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
- args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
- args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
- args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
- args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
- args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
- args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
- args->congestion_kb = default_congestion_kb();
-
- /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
- err = -EINVAL;
- if (!dev_name)
- goto out;
- *path = strstr(dev_name, ":/");
- if (*path == NULL) {
- pr_err("device name is missing path (no :/ in %s)\n",
- dev_name);
- goto out;
- }
+static int compare_mount_options(struct ceph_mount_options *new_fsopt,
+ struct ceph_options *new_opt,
+ struct ceph_fs_client *fsc)
+{
+ struct ceph_mount_options *fsopt1 = new_fsopt;
+ struct ceph_mount_options *fsopt2 = fsc->mount_options;
+ int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+ int ret;
- /* get mon ip(s) */
- err = ceph_parse_ips(dev_name, *path, args->mon_addr,
- CEPH_MAX_MON, &args->num_mon);
- if (err < 0)
- goto out;
+ ret = memcmp(fsopt1, fsopt2, ofs);
+ if (ret)
+ return ret;
+
+ ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
+ if (ret)
+ return ret;
+
+ return ceph_compare_options(new_opt, fsc->client);
+}
+
+static int parse_mount_options(struct ceph_mount_options **pfsopt,
+ struct ceph_options **popt,
+ int flags, char *options,
+ const char *dev_name,
+ const char **path)
+{
+ struct ceph_mount_options *fsopt;
+ const char *dev_name_end;
+ int err = -ENOMEM;
+
+ fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
+ if (!fsopt)
+ return -ENOMEM;
+
+ dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
+
+ fsopt->sb_flags = flags;
+ fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
+
+ fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+ fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
+ fsopt->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
+ dev_name_end = *path;
+ dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
/* path on server */
*path += 2;
dout("server path '%s'\n", *path);
- /* parse mount options */
- while ((c = strsep(&options, ",")) != NULL) {
- int token, intval, ret;
- if (!*c)
- continue;
- err = -EINVAL;
- token = match_token((char *)c, arg_tokens, argstr);
- if (token < 0) {
- pr_err("bad mount option at '%s'\n", c);
- goto out;
- }
- if (token < Opt_last_int) {
- ret = match_int(&argstr[0], &intval);
- if (ret < 0) {
- pr_err("bad mount option arg (not int) "
- "at '%s'\n", c);
- continue;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
- } else {
- dout("got token %d\n", token);
- }
- switch (token) {
- case Opt_ip:
- err = ceph_parse_ips(argstr[0].from,
- argstr[0].to,
- &args->my_addr,
- 1, NULL);
- if (err < 0)
- goto out;
- args->flags |= CEPH_OPT_MYIP;
- break;
-
- case Opt_fsid:
- err = parse_fsid(argstr[0].from, &args->fsid);
- if (err == 0)
- args->flags |= CEPH_OPT_FSID;
- break;
- case Opt_snapdirname:
- kfree(args->snapdir_name);
- args->snapdir_name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
- case Opt_name:
- args->name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
- case Opt_secret:
- args->secret = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- break;
-
- /* misc */
- case Opt_wsize:
- args->wsize = intval;
- break;
- case Opt_rsize:
- args->rsize = intval;
- break;
- case Opt_osdtimeout:
- args->osd_timeout = intval;
- break;
- case Opt_osdkeepalivetimeout:
- args->osd_keepalive_timeout = intval;
- break;
- case Opt_osd_idle_ttl:
- args->osd_idle_ttl = intval;
- break;
- case Opt_mount_timeout:
- args->mount_timeout = intval;
- break;
- case Opt_caps_wanted_delay_min:
- args->caps_wanted_delay_min = intval;
- break;
- case Opt_caps_wanted_delay_max:
- args->caps_wanted_delay_max = intval;
- break;
- case Opt_readdir_max_entries:
- args->max_readdir = intval;
- break;
- case Opt_readdir_max_bytes:
- args->max_readdir_bytes = intval;
- break;
- case Opt_congestion_kb:
- args->congestion_kb = intval;
- break;
-
- case Opt_noshare:
- args->flags |= CEPH_OPT_NOSHARE;
- break;
-
- case Opt_dirstat:
- args->flags |= CEPH_OPT_DIRSTAT;
- break;
- case Opt_nodirstat:
- args->flags &= ~CEPH_OPT_DIRSTAT;
- break;
- case Opt_rbytes:
- args->flags |= CEPH_OPT_RBYTES;
- break;
- case Opt_norbytes:
- args->flags &= ~CEPH_OPT_RBYTES;
- break;
- case Opt_nocrc:
- args->flags |= CEPH_OPT_NOCRC;
- break;
- case Opt_noasyncreaddir:
- args->flags |= CEPH_OPT_NOASYNCREADDIR;
- break;
-
- default:
- BUG_ON(token);
- }
- }
- return args;
+ err = ceph_parse_options(popt, options, dev_name, dev_name_end,
+ parse_fsopt_token, (void *)fsopt);
+ if (err)
+ goto out;
+
+ /* success */
+ *pfsopt = fsopt;
+ return 0;
out:
- kfree(args->mon_addr);
- kfree(args);
- return ERR_PTR(err);
+ destroy_mount_options(fsopt);
+ return err;
}
-static void destroy_mount_args(struct ceph_mount_args *args)
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
{
- dout("destroy_mount_args %p\n", args);
- kfree(args->snapdir_name);
- args->snapdir_name = NULL;
- kfree(args->name);
- args->name = NULL;
- kfree(args->secret);
- args->secret = NULL;
- kfree(args);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_mount_options *fsopt = fsc->mount_options;
+ struct ceph_options *opt = fsc->client->options;
+
+ if (opt->flags & CEPH_OPT_FSID)
+ seq_printf(m, ",fsid=%pU", &opt->fsid);
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, ",noshare");
+ if (opt->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, ",nocrc");
+
+ if (opt->name)
+ seq_printf(m, ",name=%s", opt->name);
+ if (opt->secret)
+ seq_puts(m, ",secret=<hidden>");
+
+ if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+ seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
+ if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
+ if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+ seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
+ if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, ",osdkeepalivetimeout=%d",
+ opt->osd_keepalive_timeout);
+
+ if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
+ seq_puts(m, ",norbytes");
+ if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+
+ if (fsopt->wsize)
+ seq_printf(m, ",wsize=%d", fsopt->wsize);
+ if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+ seq_printf(m, ",rsize=%d", fsopt->rsize);
+ if (fsopt->congestion_kb != default_congestion_kb())
+ seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_min=%d",
+ fsopt->caps_wanted_delay_min);
+ if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_max=%d",
+ fsopt->caps_wanted_delay_max);
+ if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+ seq_printf(m, ",cap_release_safety=%d",
+ fsopt->cap_release_safety);
+ if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+ seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+ if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+ seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+ if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ return 0;
}
/*
- * create a fresh client instance
+ * handle any mon messages the standard library doesn't understand.
+ * return error if we don't either.
*/
-static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
{
- struct ceph_client *client;
+ struct ceph_fs_client *fsc = client->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(fsc->mdsc, msg);
+ return 0;
+
+ default:
+ return -1;
+ }
+}
+
+/*
+ * create a new fs client
+ */
+struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
+ struct ceph_options *opt)
+{
+ struct ceph_fs_client *fsc;
int err = -ENOMEM;
- client = kzalloc(sizeof(*client), GFP_KERNEL);
- if (client == NULL)
+ fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
+ if (!fsc)
return ERR_PTR(-ENOMEM);
- mutex_init(&client->mount_mutex);
-
- init_waitqueue_head(&client->auth_wq);
+ fsc->client = ceph_create_client(opt, fsc);
+ if (IS_ERR(fsc->client)) {
+ err = PTR_ERR(fsc->client);
+ goto fail;
+ }
+ fsc->client->extra_mon_dispatch = extra_mon_dispatch;
+ fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+ fsc->client->monc.want_mdsmap = 1;
- client->sb = NULL;
- client->mount_state = CEPH_MOUNT_MOUNTING;
- client->mount_args = args;
+ fsc->mount_options = fsopt;
- client->msgr = NULL;
+ fsc->sb = NULL;
+ fsc->mount_state = CEPH_MOUNT_MOUNTING;
- client->auth_err = 0;
- atomic_long_set(&client->writeback_count, 0);
+ atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&client->backing_dev_info);
+ err = bdi_init(&fsc->backing_dev_info);
if (err < 0)
- goto fail;
+ goto fail_client;
err = -ENOMEM;
- client->wb_wq = create_workqueue("ceph-writeback");
- if (client->wb_wq == NULL)
+ fsc->wb_wq = create_workqueue("ceph-writeback");
+ if (fsc->wb_wq == NULL)
goto fail_bdi;
- client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
- if (client->pg_inv_wq == NULL)
+ fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
- client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
- if (client->trunc_wq == NULL)
+ fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ if (fsc->trunc_wq == NULL)
goto fail_pg_inv_wq;
/* set up mempools */
err = -ENOMEM;
- client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
- client->mount_args->wsize >> PAGE_CACHE_SHIFT);
- if (!client->wb_pagevec_pool)
+ fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
+ if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq;
/* caps */
- client->min_caps = args->max_readdir;
+ fsc->min_caps = fsopt->max_readdir;
+
+ return fsc;
- /* subsystems */
- err = ceph_monc_init(&client->monc, client);
- if (err < 0)
- goto fail_mempool;
- err = ceph_osdc_init(&client->osdc, client);
- if (err < 0)
- goto fail_monc;
- err = ceph_mdsc_init(&client->mdsc, client);
- if (err < 0)
- goto fail_osdc;
- return client;
-
-fail_osdc:
- ceph_osdc_stop(&client->osdc);
-fail_monc:
- ceph_monc_stop(&client->monc);
-fail_mempool:
- mempool_destroy(client->wb_pagevec_pool);
fail_trunc_wq:
- destroy_workqueue(client->trunc_wq);
+ destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq:
- destroy_workqueue(client->pg_inv_wq);
+ destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
- destroy_workqueue(client->wb_wq);
+ destroy_workqueue(fsc->wb_wq);
fail_bdi:
- bdi_destroy(&client->backing_dev_info);
+ bdi_destroy(&fsc->backing_dev_info);
+fail_client:
+ ceph_destroy_client(fsc->client);
fail:
- kfree(client);
+ kfree(fsc);
return ERR_PTR(err);
}
-static void ceph_destroy_client(struct ceph_client *client)
+void destroy_fs_client(struct ceph_fs_client *fsc)
{
- dout("destroy_client %p\n", client);
+ dout("destroy_fs_client %p\n", fsc);
- /* unmount */
- ceph_mdsc_stop(&client->mdsc);
- ceph_osdc_stop(&client->osdc);
+ destroy_workqueue(fsc->wb_wq);
+ destroy_workqueue(fsc->pg_inv_wq);
+ destroy_workqueue(fsc->trunc_wq);
- /*
- * make sure mds and osd connections close out before destroying
- * the auth module, which is needed to free those connections'
- * ceph_authorizers.
- */
- ceph_msgr_flush();
-
- ceph_monc_stop(&client->monc);
+ bdi_destroy(&fsc->backing_dev_info);
- ceph_debugfs_client_cleanup(client);
- destroy_workqueue(client->wb_wq);
- destroy_workqueue(client->pg_inv_wq);
- destroy_workqueue(client->trunc_wq);
+ mempool_destroy(fsc->wb_pagevec_pool);
- bdi_destroy(&client->backing_dev_info);
+ destroy_mount_options(fsc->mount_options);
- if (client->msgr)
- ceph_messenger_destroy(client->msgr);
- mempool_destroy(client->wb_pagevec_pool);
+ ceph_fs_debugfs_cleanup(fsc);
- destroy_mount_args(client->mount_args);
+ ceph_destroy_client(fsc->client);
- kfree(client);
- dout("destroy_client %p done\n", client);
+ kfree(fsc);
+ dout("destroy_fs_client %p done\n", fsc);
}
/*
- * Initially learn our fsid, or verify an fsid matches.
+ * caches
*/
-int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
{
- if (client->have_fsid) {
- if (ceph_fsid_compare(&client->fsid, fsid)) {
- pr_err("bad fsid, had %pU got %pU",
- &client->fsid, fsid);
- return -1;
- }
- } else {
- pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
- fsid);
- memcpy(&client->fsid, fsid, sizeof(*fsid));
- ceph_debugfs_client_init(client);
- client->have_fsid = true;
- }
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ ceph_inode_init_once);
+ if (ceph_inode_cachep == NULL)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_cachep == NULL)
+ goto bad_cap;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_dentry_cachep == NULL)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_file_cachep == NULL)
+ goto bad_file;
+
return 0;
+
+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);
+ return -ENOMEM;
}
+static void destroy_caches(void)
+{
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+}
+
+
/*
- * true if we have the mon map (and have thus joined the cluster)
+ * ceph_umount_begin - initiate forced umount. Tear down the
+ * mount, skipping steps that may hang while waiting for server(s).
*/
-static int have_mon_and_osd_map(struct ceph_client *client)
+static void ceph_umount_begin(struct super_block *sb)
{
- return client->monc.monmap && client->monc.monmap->epoch &&
- client->osdc.osdmap && client->osdc.osdmap->epoch;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!fsc)
+ return;
+ fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ return;
}
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .sync_fs = ceph_sync_fs,
+ .put_super = ceph_put_super,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
/*
* Bootstrap mount by opening the root directory. Note the mount
* @started time from caller, and time out if this takes too long.
*/
-static struct dentry *open_root_dentry(struct ceph_client *client,
+static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
const char *path,
unsigned long started)
{
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req = NULL;
int err;
struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
req->r_ino1.ino = CEPH_INO_ROOT;
req->r_ino1.snap = CEPH_NOSNAP;
req->r_started = started;
- req->r_timeout = client->mount_args->mount_timeout * HZ;
+ req->r_timeout = fsc->client->options->mount_timeout * HZ;
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0) {
dout("open_root_inode success\n");
if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
- client->sb->s_root == NULL)
+ fsc->sb->s_root == NULL)
root = d_alloc_root(req->r_target_inode);
else
root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
return root;
}
+
+
+
/*
* mount: join the ceph cluster, and open root directory.
*/
-static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
const char *path)
{
- struct ceph_entity_addr *myaddr = NULL;
int err;
- unsigned long timeout = client->mount_args->mount_timeout * HZ;
unsigned long started = jiffies; /* note the start time */
struct dentry *root;
+ int first = 0; /* first vfsmount for this super_block */
dout("mount start\n");
- mutex_lock(&client->mount_mutex);
-
- /* initialize the messenger */
- if (client->msgr == NULL) {
- if (ceph_test_opt(client, MYIP))
- myaddr = &client->mount_args->my_addr;
- client->msgr = ceph_messenger_create(myaddr);
- if (IS_ERR(client->msgr)) {
- err = PTR_ERR(client->msgr);
- client->msgr = NULL;
- goto out;
- }
- client->msgr->nocrc = ceph_test_opt(client, NOCRC);
- }
+ mutex_lock(&fsc->client->mount_mutex);
- /* open session, and wait for mon, mds, and osd maps */
- err = ceph_monc_open_session(&client->monc);
+ err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
- while (!have_mon_and_osd_map(client)) {
- err = -EIO;
- if (timeout && time_after_eq(jiffies, started + timeout))
- goto out;
-
- /* wait */
- dout("mount waiting for mon_map\n");
- err = wait_event_interruptible_timeout(client->auth_wq,
- have_mon_and_osd_map(client) || (client->auth_err < 0),
- timeout);
- if (err == -EINTR || err == -ERESTARTSYS)
- goto out;
- if (client->auth_err < 0) {
- err = client->auth_err;
- goto out;
- }
- }
-
dout("mount opening root\n");
- root = open_root_dentry(client, "", started);
+ root = open_root_dentry(fsc, "", started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto out;
}
- if (client->sb->s_root)
+ if (fsc->sb->s_root) {
dput(root);
- else
- client->sb->s_root = root;
+ } else {
+ fsc->sb->s_root = root;
+ first = 1;
+
+ err = ceph_fs_debugfs_init(fsc);
+ if (err < 0)
+ goto fail;
+ }
if (path[0] == 0) {
dget(root);
} else {
dout("mount opening base mountpoint\n");
- root = open_root_dentry(client, path, started);
+ root = open_root_dentry(fsc, path, started);
if (IS_ERR(root)) {
err = PTR_ERR(root);
- dput(client->sb->s_root);
- client->sb->s_root = NULL;
- goto out;
+ goto fail;
}
}
mnt->mnt_root = root;
- mnt->mnt_sb = client->sb;
+ mnt->mnt_sb = fsc->sb;
- client->mount_state = CEPH_MOUNT_MOUNTED;
+ fsc->mount_state = CEPH_MOUNT_MOUNTED;
dout("mount success\n");
err = 0;
out:
- mutex_unlock(&client->mount_mutex);
+ mutex_unlock(&fsc->client->mount_mutex);
return err;
+
+fail:
+ if (first) {
+ dput(fsc->sb->s_root);
+ fsc->sb->s_root = NULL;
+ }
+ goto out;
}
static int ceph_set_super(struct super_block *s, void *data)
{
- struct ceph_client *client = data;
+ struct ceph_fs_client *fsc = data;
int ret;
dout("set_super %p data %p\n", s, data);
- s->s_flags = client->mount_args->sb_flags;
+ s->s_flags = fsc->mount_options->sb_flags;
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
- s->s_fs_info = client;
- client->sb = s;
+ s->s_fs_info = fsc;
+ fsc->sb = s;
s->s_op = &ceph_super_ops;
s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
fail:
s->s_fs_info = NULL;
- client->sb = NULL;
+ fsc->sb = NULL;
return ret;
}
@@ -926,30 +732,23 @@ fail:
*/
static int ceph_compare_super(struct super_block *sb, void *data)
{
- struct ceph_client *new = data;
- struct ceph_mount_args *args = new->mount_args;
- struct ceph_client *other = ceph_sb_to_client(sb);
- int i;
+ struct ceph_fs_client *new = data;
+ struct ceph_mount_options *fsopt = new->mount_options;
+ struct ceph_options *opt = new->client->options;
+ struct ceph_fs_client *other = ceph_sb_to_client(sb);
dout("ceph_compare_super %p\n", sb);
- if (args->flags & CEPH_OPT_FSID) {
- if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
- dout("fsid doesn't match\n");
- return 0;
- }
- } else {
- /* do we share (a) monitor? */
- for (i = 0; i < new->monc.monmap->num_mon; i++)
- if (ceph_monmap_contains(other->monc.monmap,
- &new->monc.monmap->mon_inst[i].addr))
- break;
- if (i == new->monc.monmap->num_mon) {
- dout("mon ip not part of monmap\n");
- return 0;
- }
- dout("mon ip matches existing sb %p\n", sb);
+
+ if (compare_mount_options(fsopt, opt, other)) {
+ dout("monitor(s)/mount options don't match\n");
+ return 0;
}
- if (args->sb_flags != other->mount_args->sb_flags) {
+ if ((opt->flags & CEPH_OPT_FSID) &&
+ ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ if (fsopt->sb_flags != other->mount_options->sb_flags) {
dout("flags differ\n");
return 0;
}
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb,
+ struct ceph_fs_client *fsc)
{
int err;
/* set ra_pages based on rsize mount option? */
- if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
- client->backing_dev_info.ra_pages =
- (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+ if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
+ fsc->backing_dev_info.ra_pages =
+ (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+ err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
atomic_long_inc_return(&bdi_seq));
if (!err)
- sb->s_bdi = &client->backing_dev_info;
+ sb->s_bdi = &fsc->backing_dev_info;
return err;
}
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
struct vfsmount *mnt)
{
struct super_block *sb;
- struct ceph_client *client;
+ struct ceph_fs_client *fsc;
int err;
int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
const char *path = NULL;
- struct ceph_mount_args *args;
+ struct ceph_mount_options *fsopt = NULL;
+ struct ceph_options *opt = NULL;
dout("ceph_get_sb\n");
- args = parse_mount_args(flags, data, dev_name, &path);
- if (IS_ERR(args)) {
- err = PTR_ERR(args);
+ err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
+ if (err < 0)
goto out_final;
- }
/* create client (which we may/may not use) */
- client = ceph_create_client(args);
- if (IS_ERR(client)) {
- err = PTR_ERR(client);
+ fsc = create_fs_client(fsopt, opt);
+ if (IS_ERR(fsc)) {
+ err = PTR_ERR(fsc);
+ kfree(fsopt);
+ kfree(opt);
goto out_final;
}
- if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+ err = ceph_mdsc_init(fsc);
+ if (err < 0)
+ goto out;
+
+ if (ceph_test_opt(fsc->client, NOSHARE))
compare_super = NULL;
- sb = sget(fs_type, compare_super, ceph_set_super, client);
+ sb = sget(fs_type, compare_super, ceph_set_super, fsc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
goto out;
}
- if (ceph_sb_to_client(sb) != client) {
- ceph_destroy_client(client);
- client = ceph_sb_to_client(sb);
- dout("get_sb got existing client %p\n", client);
+ if (ceph_sb_to_client(sb) != fsc) {
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
+ fsc = ceph_sb_to_client(sb);
+ dout("get_sb got existing client %p\n", fsc);
} else {
- dout("get_sb using new client %p\n", client);
- err = ceph_register_bdi(sb, client);
+ dout("get_sb using new client %p\n", fsc);
+ err = ceph_register_bdi(sb, fsc);
if (err < 0)
goto out_splat;
}
- err = ceph_mount(client, mnt, path);
+ err = ceph_mount(fsc, mnt, path);
if (err < 0)
goto out_splat;
dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
return 0;
out_splat:
- ceph_mdsc_close_sessions(&client->mdsc);
+ ceph_mdsc_close_sessions(fsc->mdsc);
deactivate_locked_super(sb);
goto out_final;
out:
- ceph_destroy_client(client);
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
out_final:
dout("ceph_get_sb fail %d\n", err);
return err;
@@ -1042,11 +849,12 @@ out_final:
static void ceph_kill_sb(struct super_block *s)
{
- struct ceph_client *client = ceph_sb_to_client(s);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(&client->mdsc);
+ ceph_mdsc_pre_umount(fsc->mdsc);
kill_anon_super(s); /* will call put_super after sb is r/o */
- ceph_destroy_client(client);
+ ceph_mdsc_destroy(fsc);
+ destroy_fs_client(fsc);
}
static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
static int __init init_ceph(void)
{
- int ret = 0;
-
- ret = ceph_debugfs_init();
- if (ret < 0)
- goto out;
-
- ret = ceph_msgr_init();
- if (ret < 0)
- goto out_debugfs;
-
- ret = init_caches();
+ int ret = init_caches();
if (ret)
- goto out_msgr;
+ goto out;
ret = register_filesystem(&ceph_fs_type);
if (ret)
goto out_icache;
- pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
- CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
- CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
- CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
+ pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
+
return 0;
out_icache:
destroy_caches();
-out_msgr:
- ceph_msgr_exit();
-out_debugfs:
- ceph_debugfs_cleanup();
out:
return ret;
}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type);
destroy_caches();
- ceph_msgr_exit();
- ceph_debugfs_cleanup();
}
module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
#ifndef _FS_CEPH_SUPER_H
#define _FS_CEPH_SUPER_H
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
#include <linux/writeback.h>
#include <linux/slab.h>
-#include "types.h"
-#include "messenger.h"
-#include "msgpool.h"
-#include "mon_client.h"
-#include "mds_client.h"
-#include "osd_client.h"
-#include "ceph_fs.h"
+#include <linux/ceph/libceph.h>
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
-/*
- * Supported features
- */
-#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
+#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
+#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
-/*
- * mount options
- */
-#define CEPH_OPT_FSID (1<<0)
-#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
-#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
-#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
-#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
-#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
-#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
-#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
+#define ceph_set_mount_opt(fsc, opt) \
+ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
+#define ceph_test_mount_opt(fsc, opt) \
+ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define ceph_set_opt(client, opt) \
- (client)->mount_args->flags |= CEPH_OPT_##opt;
-#define ceph_test_opt(client, opt) \
- (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+#define CEPH_MAX_READDIR_DEFAULT 1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-
-struct ceph_mount_args {
- int sb_flags;
+struct ceph_mount_options {
int flags;
- struct ceph_fsid fsid;
- struct ceph_entity_addr my_addr;
- int num_mon;
- struct ceph_entity_addr *mon_addr;
- int mount_timeout;
- int osd_idle_ttl;
- int osd_timeout;
- int osd_keepalive_timeout;
+ int sb_flags;
+
int wsize;
int rsize; /* max readahead */
int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
- char *snapdir_name; /* default ".snap" */
- char *name;
- char *secret;
-};
-/*
- * defaults
- */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
-#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
-#define CEPH_OSD_KEEPALIVE_DEFAULT 5
-#define CEPH_OSD_IDLE_TTL_DEFAULT 60
-#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
-#define CEPH_MAX_READDIR_DEFAULT 1024
-#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
-
-#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
-
-#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
-#define CEPH_AUTH_NAME_DEFAULT "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file. Delay a minimum amount of time, even if we send a cap
- * message for some other reason. Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
-
-/* mount state */
-enum {
- CEPH_MOUNT_MOUNTING,
- CEPH_MOUNT_MOUNTED,
- CEPH_MOUNT_UNMOUNTING,
- CEPH_MOUNT_UNMOUNTED,
- CEPH_MOUNT_SHUTDOWN,
-};
-
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
- BUG_ON(time_after(b, a));
- return (long)a - (long)b;
-}
-
-/*
- * per-filesystem client state
- *
- * possibly shared by multiple mount points, if they are
- * mounting the same ceph filesystem/cluster.
- */
-struct ceph_client {
- struct ceph_fsid fsid;
- bool have_fsid;
+ /*
+ * everything above this point can be memcmp'd; everything below
+ * is handled in compare_mount_options()
+ */
- struct mutex mount_mutex; /* serialize mount attempts */
- struct ceph_mount_args *mount_args;
+ char *snapdir_name; /* default ".snap" */
+};
+struct ceph_fs_client {
struct super_block *sb;
- unsigned long mount_state;
- wait_queue_head_t auth_wq;
-
- int auth_err;
+ struct ceph_mount_options *mount_options;
+ struct ceph_client *client;
+ unsigned long mount_state;
int min_caps; /* min caps i added */
- struct ceph_messenger *msgr; /* messenger instance */
- struct ceph_mon_client monc;
- struct ceph_mds_client mdsc;
- struct ceph_osd_client osdc;
+ struct ceph_mds_client *mdsc;
/* writeback */
mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
struct backing_dev_info backing_dev_info;
#ifdef CONFIG_DEBUG_FS
- struct dentry *debugfs_monmap;
- struct dentry *debugfs_mdsmap, *debugfs_osdmap;
- struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
+ struct dentry *debugfs_mdsc, *debugfs_mdsmap;
#endif
};
+
/*
* File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
int should_free_val;
};
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ u32 lease_gen, lease_shared_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ u64 time;
+ u64 offset;
+};
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
/*
* Ceph inode.
*/
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
-#define CEPH_I_NODELAY 4 /* do not delay cap release */
-#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
-#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
-
struct ceph_inode_info {
struct ceph_vino i_vino; /* ceph ino + snap */
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
return container_of(inode, struct ceph_inode_info, vfs_inode);
}
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+ ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+ ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+ if (!ino)
+ ino = 1;
+#endif
+ return ino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+
static inline void ceph_i_clear(struct inode *inode, unsigned mask)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
struct ceph_inode_info *ci = ceph_inode(inode);
bool r;
- smp_mb();
+ spin_lock(&inode->i_lock);
r = (ci->i_ceph_flags & mask) == mask;
+ spin_unlock(&inode->i_lock);
return r;
}
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
struct ceph_inode_frag *pfrag,
int *found);
-/*
- * Ceph dentry state
- */
-struct ceph_dentry_info {
- struct ceph_mds_session *lease_session;
- u32 lease_gen, lease_shared_gen;
- u32 lease_seq;
- unsigned long lease_renew_after, lease_renew_from;
- struct list_head lru;
- struct dentry *dentry;
- u64 time;
- u64 offset;
-};
-
static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
{
return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
return ((loff_t)frag << 32) | (loff_t)off;
}
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- * don't include snap in ino hash, at least for now.
- */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
-{
- ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
-#if BITS_PER_LONG == 32
- ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
- if (!ino)
- ino = 1;
-#endif
- return ino;
-}
-
static inline int ceph_set_ino_cb(struct inode *inode, void *data)
{
ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
-static inline struct ceph_vino ceph_vino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino;
-}
-
-/* for printf-style formatting */
-#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
-
-static inline u64 ceph_ino(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.ino;
-}
-static inline u64 ceph_snap(struct inode *inode)
-{
- return ceph_inode(inode)->i_vino.snap;
-}
-
-static inline int ceph_ino_compare(struct inode *inode, void *data)
-{
- struct ceph_vino *pvino = (struct ceph_vino *)data;
- struct ceph_inode_info *ci = ceph_inode(inode);
- return ci->i_vino.ino == pvino->ino &&
- ci->i_vino.snap == pvino->snap;
-}
-
-static inline struct inode *ceph_find_inode(struct super_block *sb,
- struct ceph_vino vino)
-{
- ino_t t = ceph_vino_to_ino(vino);
- return ilookup5(sb, t, ceph_ino_compare, &vino);
-}
-
-
/*
* caps helpers
*/
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
-extern void ceph_reservation_status(struct ceph_client *client,
+extern void ceph_reservation_status(struct ceph_fs_client *client,
int *total, int *avail, int *used,
int *reserved, int *min);
-static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
{
- return (struct ceph_client *)inode->i_sb->s_fs_info;
+ return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
}
-static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
{
- return (struct ceph_client *)sb->s_fs_info;
+ return (struct ceph_fs_client *)sb->s_fs_info;
}
@@ -617,51 +541,6 @@ struct ceph_file_info {
/*
- * snapshots
- */
-
-/*
- * A "snap context" is the set of existing snapshots when we
- * write data. It is used by the OSD to guide its COW behavior.
- *
- * The ceph_snap_context is refcounted, and attached to each dirty
- * page, indicating which context the dirty data belonged when it was
- * dirtied.
- */
-struct ceph_snap_context {
- atomic_t nref;
- u64 seq;
- int num_snaps;
- u64 snaps[];
-};
-
-static inline struct ceph_snap_context *
-ceph_get_snap_context(struct ceph_snap_context *sc)
-{
- /*
- printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)+1);
- */
- if (sc)
- atomic_inc(&sc->nref);
- return sc;
-}
-
-static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
-{
- if (!sc)
- return;
- /*
- printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
- atomic_read(&sc->nref)-1);
- */
- if (atomic_dec_and_test(&sc->nref)) {
- /*printk(" deleting snap_context %p\n", sc);*/
- kfree(sc);
- }
-}
-
-/*
* A "snap realm" describes a subset of the file hierarchy sharing
* the same set of snapshots that apply to it. The realms themselves
* are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
spinlock_t inodes_with_caps_lock;
};
-
-
-/*
- * calculate the number of pages a given length and offset map onto,
- * if we align the data.
- */
-static inline int calc_pages_for(u64 off, u64 len)
+static inline int default_congestion_kb(void)
{
- return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
- (off >> PAGE_CACHE_SHIFT);
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
}
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
ci_item)->writing;
}
-
-/* super.c */
-extern struct kmem_cache *ceph_inode_cachep;
-extern struct kmem_cache *ceph_cap_cachep;
-extern struct kmem_cache *ceph_dentry_cachep;
-extern struct kmem_cache *ceph_file_cachep;
-
-extern const char *ceph_msg_type_name(int type);
-extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-
/* inode.c */
extern const struct inode_operations ceph_file_iops;
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops;
+extern int ceph_copy_to_page_vector(struct page **pages,
+ const char *data,
+ loff_t off, size_t len);
+extern int ceph_copy_from_page_vector(struct page **pages,
+ char *data,
+ loff_t off, size_t len);
+extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern int ceph_open(struct inode *inode, struct file *file);
extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
struct nameidata *nd, int mode,
int locked_dir);
extern int ceph_release(struct inode *inode, struct file *filp);
-extern void ceph_release_page_vector(struct page **pages, int num_pages);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* export.c */
extern const struct export_operations ceph_export_ops;
-/* debugfs.c */
-extern int ceph_debugfs_init(void);
-extern void ceph_debugfs_cleanup(void);
-extern int ceph_debugfs_client_init(struct ceph_client *client);
-extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
-
/* locks.c */
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
return NULL;
}
+/* debugfs.c */
+extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
+extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
+
#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
deleted file mode 100644
index 28b35a005ec2..000000000000
--- a/fs/ceph/types.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _FS_CEPH_TYPES_H
-#define _FS_CEPH_TYPES_H
-
-/* needed before including ceph_fs.h */
-#include <linux/in.h>
-#include <linux/types.h>
-#include <linux/fcntl.h>
-#include <linux/string.h>
-
-#include "ceph_fs.h"
-#include "ceph_frag.h"
-#include "ceph_hash.h"
-
-/*
- * Identify inodes by both their ino AND snapshot id (a u64).
- */
-struct ceph_vino {
- u64 ino;
- u64 snap;
-};
-
-
-/* context for the caps reservation mechanism */
-struct ceph_cap_reservation {
- int count;
-};
-
-
-#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
-#include "ceph_debug.h"
+#include <linux/ceph/ceph_debug.h>
+
#include "super.h"
-#include "decode.h"
+#include "mds_client.h"
+
+#include <linux/ceph/decode.h>
#include <linux/xattr.h>
#include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct inode *parent_inode = dentry->d_parent->d_inode;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_client *mdsc = fsc->mdsc;
int err;
int i, nr_pages;
struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
/* preallocate memory for xattr name, value, index node */
err = -ENOMEM;
- newname = kmalloc(name_len + 1, GFP_NOFS);
+ newname = kmemdup(name, name_len + 1, GFP_NOFS);
if (!newname)
goto out;
- memcpy(newname, name, name_len + 1);
if (val_len) {
newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
- struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
- struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
struct inode *parent_inode = dentry->d_parent->d_inode;
struct ceph_mds_request *req;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..7e83b356cc9e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -232,7 +232,7 @@ static int
small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
void **request_buf)
{
- int rc = 0;
+ int rc;
rc = cifs_reconnect_tcon(tcon, smb_command);
if (rc)
@@ -250,7 +250,7 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
if (tcon != NULL)
cifs_stats_inc(&tcon->num_smbs_sent);
- return rc;
+ return 0;
}
int
@@ -281,16 +281,9 @@ small_smb_init_no_tc(const int smb_command, const int wct,
/* If the return code is zero, this function must fill in request_buf pointer */
static int
-smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
- void **request_buf /* returned */ ,
- void **response_buf /* returned */ )
+__smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf, void **response_buf)
{
- int rc = 0;
-
- rc = cifs_reconnect_tcon(tcon, smb_command);
- if (rc)
- return rc;
-
*request_buf = cifs_buf_get();
if (*request_buf == NULL) {
/* BB should we add a retry in here if not a writepage? */
@@ -309,7 +302,31 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
if (tcon != NULL)
cifs_stats_inc(&tcon->num_smbs_sent);
- return rc;
+ return 0;
+}
+
+/* If the return code is zero, this function must fill in request_buf pointer */
+static int
+smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf, void **response_buf)
+{
+ int rc;
+
+ rc = cifs_reconnect_tcon(tcon, smb_command);
+ if (rc)
+ return rc;
+
+ return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
+}
+
+static int
+smb_init_no_reconnect(int smb_command, int wct, struct cifsTconInfo *tcon,
+ void **request_buf, void **response_buf)
+{
+ if (tcon->ses->need_reconnect || tcon->need_reconnect)
+ return -EHOSTDOWN;
+
+ return __smb_init(smb_command, wct, tcon, request_buf, response_buf);
}
static int validate_t2(struct smb_t2_rsp *pSMB)
@@ -4534,8 +4551,8 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
cFYI(1, "In QFSUnixInfo");
QFSUnixRetry:
- rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+ (void **) &pSMB, (void **) &pSMBr);
if (rc)
return rc;
@@ -4604,8 +4621,8 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
cFYI(1, "In SETFSUnixInfo");
SETFSUnixRetry:
/* BB switch to small buf init to save memory */
- rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, tcon,
+ (void **) &pSMB, (void **) &pSMBr);
if (rc)
return rc;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 93f77d438d3c..53cce8cc2224 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -801,6 +801,8 @@ retry_iget5_locked:
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
+ if (S_ISREG(inode->i_mode))
+ inode->i_data.backing_dev_info = sb->s_bdi;
#ifdef CONFIG_CIFS_FSCACHE
/* initialize per-inode cache cookie pointer */
CIFS_I(inode)->fscache = NULL;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 03e59aa318eb..d0ad09d57789 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -599,69 +599,6 @@ static int do_smb_getmountuid(unsigned int fd, unsigned int cmd,
#define HIDPGETCONNLIST _IOR('H', 210, int)
#define HIDPGETCONNINFO _IOR('H', 211, int)
-#ifdef CONFIG_BLOCK
-struct raw32_config_request
-{
- compat_int_t raw_minor;
- __u64 block_major;
- __u64 block_minor;
-} __attribute__((packed));
-
-static int get_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
- int ret;
-
- if (!access_ok(VERIFY_READ, user_req, sizeof(struct raw32_config_request)))
- return -EFAULT;
-
- ret = __get_user(req->raw_minor, &user_req->raw_minor);
- ret |= __get_user(req->block_major, &user_req->block_major);
- ret |= __get_user(req->block_minor, &user_req->block_minor);
-
- return ret ? -EFAULT : 0;
-}
-
-static int set_raw32_request(struct raw_config_request *req, struct raw32_config_request __user *user_req)
-{
- int ret;
-
- if (!access_ok(VERIFY_WRITE, user_req, sizeof(struct raw32_config_request)))
- return -EFAULT;
-
- ret = __put_user(req->raw_minor, &user_req->raw_minor);
- ret |= __put_user(req->block_major, &user_req->block_major);
- ret |= __put_user(req->block_minor, &user_req->block_minor);
-
- return ret ? -EFAULT : 0;
-}
-
-static int raw_ioctl(unsigned fd, unsigned cmd,
- struct raw32_config_request __user *user_req)
-{
- int ret;
-
- switch (cmd) {
- case RAW_SETBIND:
- default: { /* RAW_GETBIND */
- struct raw_config_request req;
- mm_segment_t oldfs = get_fs();
-
- if ((ret = get_raw32_request(&req, user_req)))
- return ret;
-
- set_fs(KERNEL_DS);
- ret = sys_ioctl(fd,cmd,(unsigned long)&req);
- set_fs(oldfs);
-
- if ((!ret) && (cmd == RAW_GETBIND)) {
- ret = set_raw32_request(&req, user_req);
- }
- break;
- }
- }
- return ret;
-}
-#endif /* CONFIG_BLOCK */
struct serial_struct32 {
compat_int_t type;
@@ -1262,9 +1199,6 @@ COMPATIBLE_IOCTL(SOUND_MIXER_PRIVATE5)
COMPATIBLE_IOCTL(SOUND_MIXER_GETLEVELS)
COMPATIBLE_IOCTL(SOUND_MIXER_SETLEVELS)
COMPATIBLE_IOCTL(OSS_GETVERSION)
-/* Raw devices */
-COMPATIBLE_IOCTL(RAW_SETBIND)
-COMPATIBLE_IOCTL(RAW_GETBIND)
/* SMB ioctls which do not need any translations */
COMPATIBLE_IOCTL(SMB_IOC_NEWCONN)
/* Watchdog */
@@ -1523,10 +1457,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
case MTIOCGET32:
case MTIOCPOS32:
return mt_ioctl_trans(fd, cmd, argp);
- /* Raw devices */
- case RAW_SETBIND:
- case RAW_GETBIND:
- return raw_ioctl(fd, cmd, argp);
#endif
/* One SMB ioctl needs translations. */
#define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
diff --git a/fs/exec.c b/fs/exec.c
index 828dd2461d6b..6d2b6f936858 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2014,3 +2014,43 @@ fail_creds:
fail:
return;
}
+
+/*
+ * Core dumping helper functions. These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+ return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+ int ret = 1;
+
+ if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+ return 0;
+ } else {
+ char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+ if (!buf)
+ return 0;
+ while (off > 0) {
+ unsigned long n = off;
+
+ if (n > PAGE_SIZE)
+ n = PAGE_SIZE;
+ if (!dump_write(file, buf, n)) {
+ ret = 0;
+ break;
+ }
+ off -= n;
+ }
+ free_page((unsigned long)buf);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(dump_seek);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index eb7368ebd8cd..3eadd97324b1 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -54,6 +54,9 @@ struct page_collect {
unsigned nr_pages;
unsigned long length;
loff_t pg_first; /* keep 64bit also in 32-arches */
+ bool read_4_write; /* This means two things: that the read is sync
+ * And the pages should not be unlocked.
+ */
};
static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
@@ -71,6 +74,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
pcol->nr_pages = 0;
pcol->length = 0;
pcol->pg_first = -1;
+ pcol->read_4_write = false;
}
static void _pcol_reset(struct page_collect *pcol)
@@ -347,7 +351,8 @@ static int readpage_strip(void *data, struct page *page)
if (PageError(page))
ClearPageError(page);
- unlock_page(page);
+ if (!pcol->read_4_write)
+ unlock_page(page);
EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
" splitting\n", inode->i_ino, page->index);
@@ -428,6 +433,7 @@ static int _readpage(struct page *page, bool is_sync)
/* readpage_strip might call read_exec(,is_sync==false) at several
* places but not if we have a single page.
*/
+ pcol.read_4_write = is_sync;
ret = readpage_strip(&pcol, page);
if (ret) {
EXOFS_ERR("_readpage => %d\n", ret);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2e20bd771337..377768009106 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1842,8 +1842,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (le32_to_cpu(es->s_blocks_count) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ if (generic_check_addressable(sb->s_blocksize_bits,
+ le32_to_cpu(es->s_blocks_count))) {
ext3_msg(sb, KERN_ERR,
"error: filesystem is too large to mount safely");
if (sizeof(sector_t) < 8)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 24e7699f915d..8ecc1e590303 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2826,15 +2826,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* Test whether we have more sectors than will fit in sector_t,
* and whether the max offset is addressable by the page cache.
*/
- if ((ext4_blocks_count(es) >
- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
- (ext4_blocks_count(es) >
- (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+ ret = generic_check_addressable(sb->s_blocksize_bits,
+ ext4_blocks_count(es));
+ if (ret) {
ext4_msg(sb, KERN_ERR, "filesystem"
" too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
- ret = -EFBIG;
goto failed_mount;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5581122bd2c0..ab38fef1c9a1 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -72,22 +72,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
- /*
- * For inodes on standard filesystems, we use superblock's bdi. For
- * inodes on virtual filesystems, we want to use inode mapping's bdi
- * because they can possibly point to something useful (think about
- * block_dev filesystem).
- */
- if (sb->s_bdi && sb->s_bdi != &noop_backing_dev_info) {
- /* Some device inodes could play dirty tricks. Catch them... */
- WARN(bdi != sb->s_bdi && bdi_cap_writeback_dirty(bdi),
- "Dirtiable inode bdi %s != sb bdi %s\n",
- bdi->name, sb->s_bdi->name);
- return sb->s_bdi;
- }
- return bdi;
+ if (strcmp(sb->s_type->name, "bdev") == 0)
+ return inode->i_mapping->backing_dev_info;
+
+ return sb->s_bdi;
}
static void bdi_queue_work(struct backing_dev_info *bdi,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index d367af1514ef..cde755cca564 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1354,7 +1354,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
loff_t file_size;
unsigned int num;
unsigned int offset;
- size_t total_len;
+ size_t total_len = 0;
req = fuse_get_req(fc);
if (IS_ERR(req))
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
config GFS2_FS
tristate "GFS2 file system support"
- depends on EXPERIMENTAL && (64BIT || LBDAF)
+ depends on (64BIT || LBDAF)
select DLM if GFS2_FS_LOCKING_DLM
select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..6b24afb96aae 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
#include "glops.h"
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
- unsigned int from, unsigned int to)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+ unsigned int from, unsigned int to)
{
struct buffer_head *head = page_buffers(page);
unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
int alloc_required;
int error = 0;
- struct gfs2_alloc *al;
+ struct gfs2_alloc *al = NULL;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
rblocks += RES_STATFS + RES_QUOTA;
if (&ip->i_inode == sdp->sd_rindex)
rblocks += 2 * RES_STATFS;
+ if (alloc_required)
+ rblocks += gfs2_rg_blocks(al);
error = gfs2_trans_begin(sdp, rblocks,
PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
page_cache_release(page);
- /*
- * XXX(truncate): the call below should probably be replaced with
- * a call to the gfs2-specific truncate blocks helper to actually
- * release disk blocks..
- */
+ gfs2_trans_end(sdp);
if (pos + len > ip->i_inode.i_size)
- truncate_setsize(&ip->i_inode, ip->i_inode.i_size);
+ gfs2_trim_blocks(&ip->i_inode);
+ goto out_trans_fail;
+
out_endtrans:
gfs2_trans_end(sdp);
out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
page_cache_release(page);
if (copied) {
- if (inode->i_size < to) {
+ if (inode->i_size < to)
i_size_write(inode, to);
- ip->i_disksize = inode->i_size;
- }
gfs2_dinode_out(ip, di);
mark_inode_dirty(inode);
}
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
if (ret > 0) {
- if (inode->i_size > ip->i_disksize)
- ip->i_disksize = inode->i_size;
gfs2_dinode_out(ip, dibh->b_data);
mark_inode_dirty(inode);
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
* @ip: the inode
* @dibh: the dinode buffer
* @block: the block number that was allocated
- * @private: any locked page held by the caller process
+ * @page: The (optional) page. This is looked up if @page is NULL
*
* Returns: errno
*/
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
/**
* gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
* @ip: The GFS2 inode to unstuff
- * @unstuffer: the routine that handles unstuffing a non-zero length file
- * @private: private data for the unstuffer
+ * @page: The (optional) page. This is looked up if @page is NULL
*
* This routine unstuffs a dinode and returns it to a "normal" state such
* that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
if (error)
goto out;
- if (ip->i_disksize) {
+ if (i_size_read(&ip->i_inode)) {
/* Get a free block, fill it with the stuffed data,
and write it out to disk */
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
di = (struct gfs2_dinode *)dibh->b_data;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
- if (ip->i_disksize) {
+ if (i_size_read(&ip->i_inode)) {
*(__be64 *)(di + 1) = cpu_to_be64(block);
gfs2_add_inode_blocks(&ip->i_inode, 1);
di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
}
/**
- * do_grow - Make a file look bigger than it is
- * @ip: the inode
- * @size: the size to set the file to
- *
- * Called with an exclusive lock on @ip.
- *
- * Returns: errno
- */
-
-static int do_grow(struct gfs2_inode *ip, u64 size)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al;
- struct buffer_head *dibh;
- int error;
-
- al = gfs2_alloc_get(ip);
- if (!al)
- return -ENOMEM;
-
- error = gfs2_quota_lock_check(ip);
- if (error)
- goto out;
-
- al->al_requested = sdp->sd_max_height + RES_DATA;
-
- error = gfs2_inplace_reserve(ip);
- if (error)
- goto out_gunlock_q;
-
- error = gfs2_trans_begin(sdp,
- sdp->sd_max_height + al->al_rgd->rd_length +
- RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
- if (error)
- goto out_ipres;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_end_trans;
-
- if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
- if (gfs2_is_stuffed(ip)) {
- error = gfs2_unstuff_dinode(ip, NULL);
- if (error)
- goto out_brelse;
- }
- }
-
- ip->i_disksize = size;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
-
-out_brelse:
- brelse(dibh);
-out_end_trans:
- gfs2_trans_end(sdp);
-out_ipres:
- gfs2_inplace_release(ip);
-out_gunlock_q:
- gfs2_quota_unlock(ip);
-out:
- gfs2_alloc_put(ip);
- return error;
-}
-
-
-/**
* gfs2_block_truncate_page - Deal with zeroing out data for truncate
*
* This is partly borrowed from ext3.
*/
-static int gfs2_block_truncate_page(struct address_space *mapping)
+static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
{
struct inode *inode = mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- loff_t from = inode->i_size;
unsigned long index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
return err;
}
-static int trunc_start(struct gfs2_inode *ip, u64 size)
+static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct address_space *mapping = inode->i_mapping;
struct buffer_head *dibh;
int journaled = gfs2_is_jdata(ip);
int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
if (error)
goto out;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
if (gfs2_is_stuffed(ip)) {
- u64 dsize = size + sizeof(struct gfs2_dinode);
- ip->i_disksize = size;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- if (dsize > dibh->b_size)
- dsize = dibh->b_size;
- gfs2_buffer_clear_tail(dibh, dsize);
- error = 1;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
} else {
- if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
- error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
-
- if (!error) {
- ip->i_disksize = size;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
- ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
+ if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
+ error = gfs2_block_truncate_page(mapping, newsize);
+ if (error)
+ goto out_brelse;
}
+ ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
}
- brelse(dibh);
+ i_size_write(inode, newsize);
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ gfs2_dinode_out(ip, dibh->b_data);
+ truncate_pagecache(inode, oldsize, newsize);
+out_brelse:
+ brelse(dibh);
out:
gfs2_trans_end(sdp);
return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
if (error)
goto out;
- if (!ip->i_disksize) {
+ if (!i_size_read(&ip->i_inode)) {
ip->i_height = 0;
ip->i_goal = ip->i_no_addr;
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
/**
* do_shrink - make a file smaller
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * @inode: the inode
+ * @oldsize: the current inode size
+ * @newsize: the size to make the file
*
- * Called with an exclusive lock on @ip.
+ * Called with an exclusive lock on @inode. The @newsize must
+ * be equal to or smaller than the current inode size.
*
* Returns: errno
*/
-static int do_shrink(struct gfs2_inode *ip, u64 size)
+static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
int error;
- error = trunc_start(ip, size);
+ error = trunc_start(inode, oldsize, newsize);
if (error < 0)
return error;
- if (error > 0)
+ if (gfs2_is_stuffed(ip))
return 0;
- error = trunc_dealloc(ip, size);
- if (!error)
+ error = trunc_dealloc(ip, newsize);
+ if (error == 0)
error = trunc_end(ip);
return error;
}
-static int do_touch(struct gfs2_inode *ip, u64 size)
+void gfs2_trim_blocks(struct inode *inode)
{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 size = inode->i_size;
+ int ret;
+
+ ret = do_shrink(inode, size, size);
+ WARN_ON(ret != 0);
+}
+
+/**
+ * do_grow - Touch and update inode size
+ * @inode: The inode
+ * @size: The new size
+ *
+ * This function updates the timestamps on the inode and
+ * may also increase the size of the inode. This function
+ * must not be called with @size any smaller than the current
+ * inode size.
+ *
+ * Although it is not strictly required to unstuff files here,
+ * earlier versions of GFS2 have a bug in the stuffed file reading
+ * code which will result in a buffer overrun if the size is larger
+ * than the max stuffed file size. In order to prevent this from
+ * occurring, such files are unstuffed, but in other cases we can
+ * just update the inode size directly.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
+static int do_grow(struct inode *inode, u64 size)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh;
+ struct gfs2_alloc *al = NULL;
int error;
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (gfs2_is_stuffed(ip) &&
+ (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
+ al = gfs2_alloc_get(ip);
+ if (al == NULL)
+ return -ENOMEM;
+
+ error = gfs2_quota_lock_check(ip);
+ if (error)
+ goto do_grow_alloc_put;
+
+ al->al_requested = 1;
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto do_grow_qunlock;
+ }
+
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
if (error)
- return error;
+ goto do_grow_release;
- down_write(&ip->i_rw_mutex);
+ if (al) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ goto do_end_trans;
+ }
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
- goto do_touch_out;
+ goto do_end_trans;
+ i_size_write(inode, size);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
-do_touch_out:
- up_write(&ip->i_rw_mutex);
+do_end_trans:
gfs2_trans_end(sdp);
+do_grow_release:
+ if (al) {
+ gfs2_inplace_release(ip);
+do_grow_qunlock:
+ gfs2_quota_unlock(ip);
+do_grow_alloc_put:
+ gfs2_alloc_put(ip);
+ }
return error;
}
/**
- * gfs2_truncatei - make a file a given size
- * @ip: the inode
- * @size: the size to make the file
- * @truncator: function to truncate the last partial block
+ * gfs2_setattr_size - make a file a given size
+ * @inode: the inode
+ * @newsize: the size to make the file
*
- * The file size can grow, shrink, or stay the same size.
+ * The file size can grow, shrink, or stay the same size. This
+ * is called holding i_mutex and an exclusive glock on the inode
+ * in question.
*
* Returns: errno
*/
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+int gfs2_setattr_size(struct inode *inode, u64 newsize)
{
- int error;
+ int ret;
+ u64 oldsize;
- if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
- return -EINVAL;
+ BUG_ON(!S_ISREG(inode->i_mode));
- if (size > ip->i_disksize)
- error = do_grow(ip, size);
- else if (size < ip->i_disksize)
- error = do_shrink(ip, size);
- else
- /* update time stamps */
- error = do_touch(ip, size);
+ ret = inode_newsize_ok(inode, newsize);
+ if (ret)
+ return ret;
- return error;
+ oldsize = inode->i_size;
+ if (newsize >= oldsize)
+ return do_grow(inode, newsize);
+
+ return do_shrink(inode, oldsize, newsize);
}
int gfs2_truncatei_resume(struct gfs2_inode *ip)
{
int error;
- error = trunc_dealloc(ip, ip->i_disksize);
+ error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
if (!error)
error = trunc_end(ip);
return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
shift = sdp->sd_sb.sb_bsize_shift;
BUG_ON(gfs2_is_dir(ip));
- end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+ end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
lblock = offset >> shift;
lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
}
-int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
-int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
-int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
-
-int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
-int gfs2_truncatei_resume(struct gfs2_inode *ip);
-int gfs2_file_dealloc(struct gfs2_inode *ip);
-int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
- unsigned int len);
+extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_block_map(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh, int create);
+extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
+ u64 *dblock, unsigned *extlen);
+extern int gfs2_setattr_size(struct inode *inode, u64 size);
+extern void gfs2_trim_blocks(struct inode *inode);
+extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
+extern int gfs2_file_dealloc(struct gfs2_inode *ip);
+extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
ip = GFS2_I(inode);
}
- if (sdp->sd_args.ar_localcaching)
+ if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
goto valid;
had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+struct qstr gfs2_qdot __read_mostly;
+struct qstr gfs2_qdotdot __read_mostly;
+
typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
u64 leaf_no, void *data);
typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
- if (ip->i_disksize < offset + size)
- ip->i_disksize = offset + size;
+ if (ip->i_inode.i_size < offset + size)
+ i_size_write(&ip->i_inode, offset + size);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_dinode_out(ip, dibh->b_data);
@@ -225,8 +228,8 @@ out:
if (error)
return error;
- if (ip->i_disksize < offset + copied)
- ip->i_disksize = offset + copied;
+ if (ip->i_inode.i_size < offset + copied)
+ i_size_write(&ip->i_inode, offset + copied);
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
unsigned int o;
int copied = 0;
int error = 0;
+ u64 disksize = i_size_read(&ip->i_inode);
- if (offset >= ip->i_disksize)
+ if (offset >= disksize)
return 0;
- if (offset + size > ip->i_disksize)
- size = ip->i_disksize - offset;
+ if (offset + size > disksize)
+ size = disksize - offset;
if (!size)
return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
unsigned hsize = 1 << ip->i_depth;
unsigned index;
u64 ln;
- if (hsize * sizeof(u64) != ip->i_disksize) {
+ if (hsize * sizeof(u64) != i_size_read(inode)) {
gfs2_consist_inode(ip);
return ERR_PTR(-EIO);
}
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
for (x = sdp->sd_hash_ptrs; x--; lp++)
*lp = cpu_to_be64(bn);
- dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
+ i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
gfs2_add_inode_blocks(&dip->i_inode, 1);
dip->i_diskflags |= GFS2_DIF_EXHASH;
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
u64 *buf;
u64 *from, *to;
u64 block;
+ u64 disksize = i_size_read(&dip->i_inode);
int x;
int error = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_disksize) {
+ if (hsize * sizeof(u64) != disksize) {
gfs2_consist_inode(dip);
return -EIO;
}
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
if (!buf)
return -ENOMEM;
- for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
+ for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
error = gfs2_dir_read_data(dip, (char *)buf,
block * sdp->sd_hash_bsize,
sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
unsigned depth = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_disksize) {
+ if (hsize * sizeof(u64) != i_size_read(inode)) {
gfs2_consist_inode(dip);
return -EIO;
}
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
int error = 0;
hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != dip->i_disksize) {
+ if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
gfs2_consist_inode(dip);
return -EIO;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
struct gfs2_inode;
struct gfs2_inum;
-struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename);
-int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
- const struct gfs2_inode *ip);
-int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip, unsigned int type);
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
-int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
- filldir_t filldir);
-int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
- const struct gfs2_inode *nip, unsigned int new_type);
+extern struct inode *gfs2_dir_search(struct inode *dir,
+ const struct qstr *filename);
+extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
+ const struct gfs2_inode *ip);
+extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inode *ip, unsigned int type);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+ filldir_t filldir);
+extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ const struct gfs2_inode *nip, unsigned int new_type);
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
-int gfs2_diradd_alloc_required(struct inode *dir,
- const struct qstr *filename);
-int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
- struct buffer_head **bhp);
+extern int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename);
+extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp);
static inline u32 gfs2_disk_hash(const char *data, int len)
{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
memcpy(dent + 1, name->name, name->len);
}
+extern struct qstr gfs2_qdot;
+extern struct qstr gfs2_qdotdot;
+
#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
static struct dentry *gfs2_get_parent(struct dentry *child)
{
- struct qstr dotdot;
struct dentry *dentry;
- /*
- * XXX(hch): it would be a good idea to keep this around as a
- * static variable.
- */
- gfs2_str2qstr(&dotdot, "..");
-
- dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
+ dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
if (!IS_ERR(dentry))
dentry->d_op = &gfs2_dops;
return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 8fcfefb96077..a51079bd4af1 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
rblocks = RES_DINODE + ind_blocks;
if (gfs2_is_jdata(ip))
rblocks += data_blocks ? data_blocks : 1;
- if (ind_blocks || data_blocks)
+ if (ind_blocks || data_blocks) {
rblocks += RES_STATFS + RES_QUOTA;
+ rblocks += gfs2_rg_blocks(al);
+ }
ret = gfs2_trans_begin(sdp, rblocks, 0);
if (ret)
goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail;
if (!(file->f_flags & O_LARGEFILE) &&
- ip->i_disksize > MAX_NON_LFS) {
+ i_size_read(inode) > MAX_NON_LFS) {
error = -EOVERFLOW;
goto fail_gunlock;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
else
gfs2_glock_put_nolock(gl);
}
+ if (held1 && held2 && list_empty(&gl->gl_holders))
+ clear_bit(GLF_QUEUED, &gl->gl_flags);
gl->gl_state = new_state;
gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
insert_pt = &gh2->gh_list;
}
+ set_bit(GLF_QUEUED, &gl->gl_flags);
if (likely(insert_pt == NULL)) {
list_add_tail(&gh->gh_list, &gl->gl_holders);
if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
gfs2_glock_hold(gl);
holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
- if (time_before(now, holdtime))
- delay = holdtime - now;
- if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
- delay = gl->gl_ops->go_min_hold_time;
+ if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ delay = gl->gl_ops->go_min_hold_time;
+ }
spin_lock(&gl->gl_spin);
handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
spin_lock(&gl->gl_spin);
- if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+ if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0);
spin_unlock(&gl->gl_spin);
gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
*p++ = 'I';
if (test_bit(GLF_FROZEN, gflags))
*p++ = 'F';
+ if (test_bit(GLF_QUEUED, gflags))
+ *p++ = 'q';
*p = 0;
return buf;
}
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
}
#endif
- glock_workqueue = create_workqueue("glock_workqueue");
+ glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+ WQ_HIGHPRI | WQ_FREEZEABLE, 0);
if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
- gfs2_delete_workqueue = create_workqueue("delete_workqueue");
+ gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
+ WQ_FREEZEABLE, 0);
if (IS_ERR(gfs2_delete_workqueue)) {
destroy_workqueue(glock_workqueue);
return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
- * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
* @gl: the glock
* @state: the state we're requesting
* @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
const struct gfs2_inode *ip = gl->gl_object;
if (ip == NULL)
return 0;
- gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
IF2DT(ip->i_inode.i_mode), ip->i_flags,
(unsigned int)ip->i_diskflags,
- (unsigned long long)ip->i_inode.i_size,
- (unsigned long long)ip->i_disksize);
+ (unsigned long long)i_size_read(&ip->i_inode));
return 0;
}
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
[LM_TYPE_META] = &gfs2_meta_glops,
[LM_TYPE_INODE] = &gfs2_inode_glops,
[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
- [LM_TYPE_NONDISK] = &gfs2_trans_glops,
[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
[LM_TYPE_FLOCK] = &gfs2_flock_glops,
[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
GLF_REPLY_PENDING = 9,
GLF_INITIAL = 10,
GLF_FROZEN = 11,
+ GLF_QUEUED = 12,
};
struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
u64 i_no_formal_ino;
u64 i_generation;
u64 i_eattr;
- loff_t i_disksize;
unsigned long i_flags; /* GIF_... */
struct gfs2_glock *i_gl; /* Move into i_gh? */
struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
unsigned int ar_spectator:1; /* Don't get a journal */
- unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
- unsigned int ar_localcaching:1; /* Local caching */
unsigned int ar_debug:1; /* Oops on errors */
- unsigned int ar_upgrade:1; /* Upgrade ondisk format */
unsigned int ar_posix_acl:1; /* Enable posix acls */
unsigned int ar_quota:2; /* off/account/on */
unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
*/
struct lm_lockstruct {
- unsigned int ls_jid;
+ int ls_jid;
unsigned int ls_first;
unsigned int ls_first_done;
unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
struct list_head sd_rindex_mru_list;
struct gfs2_rgrpd *sd_rindex_forward;
unsigned int sd_rgrps;
+ unsigned int sd_max_rg_data;
/* Journal index stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
* to do that.
*/
ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
- ip->i_disksize = be64_to_cpu(str->di_size);
- i_size_write(&ip->i_inode, ip->i_disksize);
+ i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(ip->i_disksize);
+ str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
(unsigned long long)ip->i_no_formal_ino);
printk(KERN_INFO " no_addr = %llu\n",
(unsigned long long)ip->i_no_addr);
- printk(KERN_INFO " i_disksize = %llu\n",
- (unsigned long long)ip->i_disksize);
+ printk(KERN_INFO " i_size = %llu\n",
+ (unsigned long long)i_size_read(&ip->i_inode));
printk(KERN_INFO " blocks = %llu\n",
(unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
extern int gfs2_internal_read(struct gfs2_inode *ip,
struct file_ra_state *ra_state,
char *buf, loff_t *pos, unsigned size);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+ unsigned int from, unsigned int to);
extern void gfs2_set_aops(struct inode *inode);
static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
}
+static inline int gfs2_check_internal_file_size(struct inode *inode,
+ u64 minsize, u64 maxsize)
+{
+ u64 size = i_size_read(inode);
+ if (size < minsize || size > maxsize)
+ goto err;
+ if (size & ((1 << inode->i_blkbits) - 1))
+ goto err;
+ return 0;
+err:
+ gfs2_consist_inode(GFS2_I(inode));
+ return -EIO;
+}
extern void gfs2_set_iop(struct inode *inode);
extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
ret |= LM_OUT_CANCELED;
goto out;
case -EAGAIN: /* Try lock fails */
+ case -EDEADLK: /* Deadlock detected */
goto out;
- case -EINVAL: /* Invalid */
- case -ENOMEM: /* Out of memory */
+ case -ETIMEDOUT: /* Canceled due to timeout */
ret |= LM_OUT_ERROR;
goto out;
case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..d7eb1e209aa8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
#include "glock.h"
#include "quota.h"
#include "recovery.h"
+#include "dir.h"
static struct shrinker qd_shrinker = {
.shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
{
int error;
+ gfs2_str2qstr(&gfs2_qdot, ".");
+ gfs2_str2qstr(&gfs2_qdotdot, "..");
+
error = gfs2_sys_init();
if (error)
return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
error = -ENOMEM;
gfs_recovery_wq = alloc_workqueue("gfs_recovery",
- WQ_NON_REENTRANT | WQ_RESCUER, 0);
+ WQ_RESCUER | WQ_FREEZEABLE, 0);
if (!gfs_recovery_wq)
goto fail_wq;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..aeafc233dc89 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
#define DO 0
#define UNDO 1
-static const u32 gfs2_old_fs_formats[] = {
- 0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
- 0
-};
-
/**
* gfs2_tune_init - Fill a gfs2_tune structure with default values
* @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
{
- unsigned int x;
-
if (sb->sb_magic != GFS2_MAGIC ||
sb->sb_type != GFS2_METATYPE_SB) {
if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
sb->sb_multihost_format == GFS2_FORMAT_MULTI)
return 0;
- if (sb->sb_fs_format != GFS2_FORMAT_FS) {
- for (x = 0; gfs2_old_fs_formats[x]; x++)
- if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
- break;
+ fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
- if (!gfs2_old_fs_formats[x]) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_WARNING
- "GFS2: I don't know how to upgrade this FS\n");
- return -EINVAL;
- }
- }
-
- if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
- for (x = 0; gfs2_old_multihost_formats[x]; x++)
- if (gfs2_old_multihost_formats[x] ==
- sb->sb_multihost_format)
- break;
-
- if (!gfs2_old_multihost_formats[x]) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_WARNING
- "GFS2: I don't know how to upgrade this FS\n");
- return -EINVAL;
- }
- }
-
- if (!sdp->sd_args.ar_upgrade) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_INFO
- "GFS2: Use the \"upgrade\" mount option to upgrade "
- "the FS\n");
- printk(KERN_INFO "GFS2: See the manual for more details\n");
- return -EINVAL;
- }
-
- return 0;
+ return -EINVAL;
}
static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
prev_db = 0;
- for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
+ for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
bh.b_state = 0;
bh.b_blocknr = 0;
bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
if (!strcmp("lock_nolock", proto)) {
lm = &nolock_ops;
sdp->sd_args.ar_localflocks = 1;
- sdp->sd_args.ar_localcaching = 1;
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
} else if (!strcmp("lock_dlm", proto)) {
lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
static int wait_on_journal(struct gfs2_sbd *sdp)
{
- if (sdp->sd_args.ar_spectator)
- return 0;
if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
return 0;
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail_sb;
+ /*
+ * If user space has failed to join the cluster or some similar
+ * failure has occurred, then the journal id will contain a
+ * negative (error) number. This will then be returned to the
+ * caller (of the mount syscall). We do this even for spectator
+ * mounts (which just write a jid of 0 to indicate "ok" even though
+ * the jid is unused in the spectator case)
+ */
+ if (sdp->sd_lockstruct.ls_jid < 0) {
+ error = sdp->sd_lockstruct.ls_jid;
+ sdp->sd_lockstruct.ls_jid = 0;
+ goto fail_sb;
+ }
+
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..0534510200d5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/fiemap.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
#include <asm/uaccess.h>
#include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- al->al_rgd->rd_length +
+ gfs2_rg_blocks(al) +
2 * RES_DINODE + RES_STATFS +
RES_QUOTA, 0);
if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
ip = ghs[1].gh_gl->gl_object;
- ip->i_disksize = size;
i_size_write(inode, size);
error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
ip = ghs[1].gh_gl->gl_object;
ip->i_inode.i_nlink = 2;
- ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+ i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
ip->i_diskflags |= GFS2_DIF_JDATA;
ip->i_entries = 2;
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (!gfs2_assert_withdraw(sdp, !error)) {
struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
- struct qstr str;
- gfs2_str2qstr(&str, ".");
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+ gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
dent->de_inum = di->di_num; /* already GFS2 endian */
dent->de_type = cpu_to_be16(DT_DIR);
di->di_entries = cpu_to_be32(1);
- gfs2_str2qstr(&str, "..");
dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
- gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+ gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
gfs2_inum_out(dip, dent);
dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip)
{
- struct qstr dotname;
int error;
if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
if (error)
return error;
- gfs2_str2qstr(&dotname, ".");
- error = gfs2_dir_del(ip, &dotname);
+ error = gfs2_dir_del(ip, &gfs2_qdot);
if (error)
return error;
- gfs2_str2qstr(&dotname, "..");
- error = gfs2_dir_del(ip, &dotname);
+ error = gfs2_dir_del(ip, &gfs2_qdotdot);
if (error)
return error;
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
struct inode *dir = &to->i_inode;
struct super_block *sb = dir->i_sb;
struct inode *tmp;
- struct qstr dotdot;
int error = 0;
- gfs2_str2qstr(&dotdot, "..");
-
igrab(dir);
for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
break;
}
- tmp = gfs2_lookupi(dir, &dotdot, 1);
+ tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
if (IS_ERR(tmp)) {
error = PTR_ERR(tmp);
break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
+ struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
return 0;
}
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ return error;
if (odip != ndip) {
error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
al->al_requested = sdp->sd_max_dirres;
- error = gfs2_inplace_reserve(ndip);
+ error = gfs2_inplace_reserve_ri(ndip);
if (error)
goto out_gunlock_q;
error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- al->al_rgd->rd_length +
+ gfs2_rg_blocks(al) +
4 * RES_DINODE + 4 * RES_LEAF +
RES_STATFS + RES_QUOTA + 4, 0);
if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
}
if (dir_rename) {
- struct qstr name;
- gfs2_str2qstr(&name, "..");
-
error = gfs2_change_nlink(ndip, +1);
if (error)
goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
goto out_end_trans;
- error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR);
+ error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
if (error)
goto out_end_trans;
} else {
@@ -972,6 +964,7 @@ out_gunlock_r:
if (r_gh.gh_gl)
gfs2_glock_dq_uninit(&r_gh);
out:
+ gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
struct gfs2_holder i_gh;
struct buffer_head *dibh;
- unsigned int x;
+ unsigned int x, size;
char *buf;
int error;
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
return NULL;
}
- if (!ip->i_disksize) {
+ size = (unsigned int)i_size_read(&ip->i_inode);
+ if (size == 0) {
gfs2_consist_inode(ip);
buf = ERR_PTR(-EIO);
goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
goto out;
}
- x = ip->i_disksize + 1;
+ x = size + 1;
buf = kmalloc(x, GFP_NOFS);
if (!buf)
buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
return error;
}
-/*
- * XXX(truncate): the truncate_setsize calls should be moved to the end.
- */
-static int setattr_size(struct inode *inode, struct iattr *attr)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- int error;
-
- if (attr->ia_size != ip->i_disksize) {
- error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
- if (error)
- return error;
- truncate_setsize(inode, attr->ia_size);
- gfs2_trans_end(sdp);
- }
-
- error = gfs2_truncatei(ip, attr->ia_size);
- if (error && (inode->i_size != ip->i_disksize))
- i_size_write(inode, ip->i_disksize);
-
- return error;
-}
-
static int setattr_chown(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto out;
if (attr->ia_valid & ATTR_SIZE)
- error = setattr_size(inode, attr);
+ error = gfs2_setattr_size(inode, attr->ia_size);
else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
error = setattr_chown(inode, attr);
else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
return ret;
}
+/*
+ * Finish an empty write over the from..to range of @page.
+ *
+ * The steps run in a fixed order: new buffers in the range are zeroed, the
+ * page is flushed from the dcache and marked accessed, the range is handed
+ * to gfs2_page_add_databufs() unless the inode is in writeback mode, and
+ * finally the range is committed with block_commit_write().
+ */
+static void empty_write_end(struct page *page, unsigned from,
+			    unsigned to)
+{
+	struct inode *host = page->mapping->host;
+	struct gfs2_inode *gip = GFS2_I(host);
+
+	page_zero_new_buffers(page, from, to);
+	flush_dcache_page(page);
+	mark_page_accessed(page);
+
+	if (!gfs2_is_writeback(gip)) {
+		gfs2_page_add_databufs(gip, page, from, to);
+	}
+
+	block_commit_write(page, from, to);
+}
+
+
+/*
+ * Prepare and finish "empty" writes over the from..to range of @page,
+ * skipping buffers that are already mapped.
+ *
+ * If the page has no buffers attached, the whole range is prepared with
+ * block_prepare_write() and finished with empty_write_end() in one go.
+ * Otherwise the page's buffer ring is walked and every run of consecutive
+ * unmapped buffers is prepared and finished as its own sub-range.
+ *
+ * Returns 0 on success or the error reported by block_prepare_write().
+ */
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+ unsigned start, end, next;
+ /* 'head' is assigned below but not otherwise used in this function */
+ struct buffer_head *bh, *head;
+ int error;
+
+ if (!page_has_buffers(page)) {
+ error = block_prepare_write(page, from, to, gfs2_block_map);
+ if (unlikely(error))
+ return error;
+
+ empty_write_end(page, from, to);
+ return 0;
+ }
+
+ bh = head = page_buffers(page);
+ /* 'next' is the running end offset of the buffers walked so far;
+ * 'end' == 0 means "no unmapped run is pending" */
+ next = end = 0;
+ /* Step over buffers until the running offset reaches 'from' */
+ while (next < from) {
+ next += bh->b_size;
+ bh = bh->b_this_page;
+ }
+ start = next;
+ do {
+ next += bh->b_size;
+ if (buffer_mapped(bh)) {
+ /* A mapped buffer closes the pending unmapped run
+ * start..end, if there is one */
+ if (end) {
+ error = block_prepare_write(page, start, end,
+ gfs2_block_map);
+ if (unlikely(error))
+ return error;
+ empty_write_end(page, start, end);
+ end = 0;
+ }
+ /* The next candidate run begins after this buffer */
+ start = next;
+ }
+ else
+ /* Unmapped: extend the pending run to cover this buffer */
+ end = next;
+ bh = bh->b_this_page;
+ } while (next < to);
+
+ /* Handle an unmapped run that was still open when the walk ended */
+ if (end) {
+ error = block_prepare_write(page, start, end, gfs2_block_map);
+ if (unlikely(error))
+ return error;
+ empty_write_end(page, start, end);
+ }
+
+ return 0;
+}
+
+/*
+ * fallocate_chunk - process one chunk of a fallocate request
+ * @inode: inode the range belongs to
+ * @offset: byte offset of the start of the chunk
+ * @len: length of the chunk in bytes
+ * @mode: fallocate mode; with FALLOC_FL_KEEP_SIZE set, i_size is not changed
+ *
+ * Fetches the dinode buffer and adds it to the transaction, unstuffs the
+ * inode if it is stuffed, then calls write_empty_blocks() on each page
+ * cache page covering the chunk.  After each page, i_size is raised if the
+ * page end lies beyond it and FALLOC_FL_KEEP_SIZE is not set.  On success
+ * the dinode is written into its buffer and the inode is marked dirty.
+ *
+ * The dinode buffer is released on every path once it has been obtained,
+ * including the error paths.
+ *
+ * Returns 0 on success, -ENOMEM if a page cannot be grabbed, or the error
+ * from the failing helper.
+ */
+static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
+			   int mode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int error;
+	u64 start = offset >> PAGE_CACHE_SHIFT;
+	unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
+	u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t curr;
+	struct page *page;
+	unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
+	unsigned int from, to;
+
+	/* A chunk ending exactly on a page boundary covers the whole last page */
+	if (!end_offset)
+		end_offset = PAGE_CACHE_SIZE;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (unlikely(error))
+		return error;	/* no buffer was obtained, nothing to release */
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (unlikely(error))
+			goto out;
+	}
+
+	curr = start;
+	offset = start << PAGE_CACHE_SHIFT;
+	/* Only the first page starts part-way in; only the last ends early */
+	from = start_offset;
+	to = PAGE_CACHE_SIZE;
+	while (curr <= end) {
+		page = grab_cache_page_write_begin(inode->i_mapping, curr,
+						   AOP_FLAG_NOFS);
+		if (unlikely(!page)) {
+			error = -ENOMEM;
+			goto out;
+		}
+
+		if (curr == end)
+			to = end_offset;
+		error = write_empty_blocks(page, from, to);
+		if (!error && offset + to > inode->i_size &&
+		    !(mode & FALLOC_FL_KEEP_SIZE)) {
+			i_size_write(inode, offset + to);
+		}
+		/* Drop the page before acting on the error */
+		unlock_page(page);
+		page_cache_release(page);
+		if (error)
+			goto out;
+		curr++;
+		offset += PAGE_CACHE_SIZE;
+		from = 0;
+	}
+
+	gfs2_dinode_out(ip, dibh->b_data);
+	mark_inode_dirty(inode);
+
+out:
+	/* Reached on success and on every failure after dibh was obtained */
+	brelse(dibh);
+	return error;
+}
+
+/*
+ * calc_max_reserv - try to enlarge a reservation to fit the reserved rgrp
+ * @ip: inode whose allocation (ip->i_alloc) has a resource group attached
+ * @max: upper bound, in bytes, for the resulting length
+ * @len: updated with the new length in bytes when the reservation grows
+ * @data_blocks: in: current data block count; out: possibly larger count
+ * @ind_blocks: out: matching indirect block count
+ *
+ * Starts from the rgrp's rd_free_clone, sets aside blocks for indirect
+ * pointers and derives how many data blocks remain.  If that is not more
+ * than *data_blocks, nothing is changed.  The estimate is not the exact
+ * inverse of gfs2_write_calc_reserv(), so it may come out with fewer data
+ * blocks than that function would allow.
+ */
+static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int avail = ip->i_alloc->al_rgd->rd_free_clone;
+	unsigned int usable = avail - 3 * (sdp->sd_max_height - 1);
+	unsigned int level = usable;
+	loff_t bytes;
+
+	/* Subtract one layer of pointer blocks per tree level */
+	while (level > sdp->sd_diptrs) {
+		level = DIV_ROUND_UP(level, sdp->sd_inptrs);
+		usable -= level;
+	}
+
+	if (usable <= *data_blocks)
+		return;
+
+	*data_blocks = usable;
+	*ind_blocks = avail - usable;
+
+	bytes = ((loff_t)usable - 3) << sdp->sd_sb.sb_bsize_shift;
+	if (bytes > max) {
+		/* Clamp to the caller's limit and recompute the block counts */
+		*len = max;
+		gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
+	} else {
+		*len = bytes;
+	}
+}
+
+/*
+ * gfs2_fallocate - fallocate handler for GFS2 regular files
+ * @inode: inode to operate on
+ * @mode: fallocate mode flags, passed through to fallocate_chunk()
+ * @offset: start of the requested range in bytes
+ * @len: length of the requested range in bytes
+ *
+ * The range is widened to filesystem block boundaries and then processed
+ * in chunks.  Each chunk gets its own allocation structure, quota lock,
+ * in-place reservation and transaction, all of which are dropped again
+ * before the next chunk starts.
+ *
+ * Returns 0 on success or the error from the step that failed.
+ */
+static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
+ loff_t len)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+ loff_t bytes, max_bytes;
+ struct gfs2_alloc *al;
+ int error;
+ /* 'next' becomes the end of the range rounded up to a block boundary */
+ loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
+ next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
+
+ /* Round the start down to a block boundary */
+ offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
+ sdp->sd_sb.sb_bsize_shift;
+
+ len = next - offset;
+ /* Initial chunk size: half of sd_max_rg_data blocks, in bytes.
+ * NOTE(review): the zero check presumably guards against the
+ * multiplication wrapping to 0 - confirm the operand widths */
+ bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
+ if (!bytes)
+ bytes = UINT_MAX;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+ error = gfs2_glock_nq(&ip->i_gh);
+ if (unlikely(error))
+ goto out_uninit;
+
+ /* Nothing to allocate: leave with error still 0 from gfs2_glock_nq() */
+ if (!gfs2_write_alloc_required(ip, offset, len))
+ goto out_unlock;
+
+ while (len > 0) {
+ if (len < bytes)
+ bytes = len;
+ al = gfs2_alloc_get(ip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }
+
+ error = gfs2_quota_lock_check(ip);
+ if (error)
+ goto out_alloc_put;
+
+retry:
+ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
+
+ al->al_requested = data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip);
+ if (error) {
+ /* Out of space: halve the chunk and retry while it is
+ * still larger than one block */
+ if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
+ bytes >>= 1;
+ goto retry;
+ }
+ goto out_qunlock;
+ }
+ /* calc_max_reserv() may raise max_bytes and the block counts
+ * (bounded by the remaining len) */
+ max_bytes = bytes;
+ calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
+ al->al_requested = data_blocks + ind_blocks;
+
+ rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
+ RES_RG_HDR + gfs2_rg_blocks(al);
+ /* Journaled-data inodes also reserve for the data blocks */
+ if (gfs2_is_jdata(ip))
+ rblocks += data_blocks ? data_blocks : 1;
+
+ error = gfs2_trans_begin(sdp, rblocks,
+ PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+ if (error)
+ goto out_trans_fail;
+
+ error = fallocate_chunk(inode, offset, max_bytes, mode);
+ gfs2_trans_end(sdp);
+
+ if (error)
+ goto out_trans_fail;
+
+ /* Chunk done: advance and drop the per-chunk resources */
+ len -= max_bytes;
+ offset += max_bytes;
+ gfs2_inplace_release(ip);
+ gfs2_quota_unlock(ip);
+ gfs2_alloc_put(ip);
+ }
+ goto out_unlock;
+
+ /* Error unwind: each label releases one more layer than the next */
+out_trans_fail:
+ gfs2_inplace_release(ip);
+out_qunlock:
+ gfs2_quota_unlock(ip);
+out_alloc_put:
+ gfs2_alloc_put(ip);
+out_unlock:
+ gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+ gfs2_holder_uninit(&ip->i_gh);
+ return error;
+}
+
+
static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
.getxattr = gfs2_getxattr,
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
+ .fallocate = gfs2_fallocate,
.fiemap = gfs2_fiemap,
};
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
goto out;
size = loc + sizeof(struct gfs2_quota);
- if (size > inode->i_size) {
- ip->i_disksize = size;
+ if (size > inode->i_size)
i_size_write(inode, size);
- }
inode->i_mtime = inode->i_atime = CURRENT_TIME;
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
goto out_alloc;
if (nalloc)
- blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
+ blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
int gfs2_quota_init(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
- unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
+ u64 size = i_size_read(sdp->sd_qc_inode);
+ unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
unsigned int x, slot = 0;
unsigned int found = 0;
u64 dblock;
u32 extlen = 0;
int error;
- if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
- ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
- gfs2_consist_inode(ip);
+ if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
return -EIO;
- }
+
sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
error = gfs2_inplace_reserve(ip);
if (error)
goto out_alloc;
+ blocks += gfs2_rg_blocks(al);
}
error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
int ro = 0;
unsigned int pass;
int error;
+ int jlocked = 0;
- if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+ if (sdp->sd_args.ar_spectator ||
+ (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
jd->jd_jid);
-
+ jlocked = 1;
/* Acquire the journal lock so we can do recovery */
error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
jd->jd_jid, t);
}
- if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
- gfs2_glock_dq_uninit(&ji_gh);
-
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
- if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+ if (jlocked) {
+ gfs2_glock_dq_uninit(&ji_gh);
gfs2_glock_dq_uninit(&j_gh);
+ }
fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
fail_gunlock_tr:
gfs2_glock_dq_uninit(&t_gh);
fail_gunlock_ji:
- if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+ if (jlocked) {
gfs2_glock_dq_uninit(&ji_gh);
fail_gunlock_j:
gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..fb67f593f408 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
+ if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
- u64 rgrp_count = ip->i_disksize;
+ u64 rgrp_count = i_size_read(inode);
+ struct gfs2_rgrpd *rgd;
+ unsigned int max_data = 0;
int error;
do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
}
}
+ list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+ if (rgd->rd_data > max_data)
+ max_data = rgd->rd_data;
+ sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
}
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
struct file_ra_state ra_state;
+ struct gfs2_rgrpd *rgd;
+ unsigned int max_data = 0;
int error;
file_ra_state_init(&ra_state, inode->i_mapping);
for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
/* Ignore partials */
if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- ip->i_disksize)
+ i_size_read(inode))
break;
error = read_rindex_entry(ip, &ra_state);
if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
return error;
}
}
+ list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
+ if (rgd->rd_data > max_data)
+ max_data = rgd->rd_data;
+ sdp->sd_max_rg_data = max_data;
sdp->sd_rindex_uptodate = 1;
return 0;
@@ -1188,7 +1200,8 @@ out:
* Returns: errno
*/
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+ char *file, unsigned int line)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
return -EINVAL;
try_again:
- /* We need to hold the rindex unless the inode we're using is
- the rindex itself, in which case it's already held. */
- if (ip != GFS2_I(sdp->sd_rindex))
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */
- error = gfs2_ri_update_special(ip);
+ if (hold_rindex) {
+ /* We need to hold the rindex unless the inode we're using is
+ the rindex itself, in which case it's already held. */
+ if (ip != GFS2_I(sdp->sd_rindex))
+ error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+ else if (!sdp->sd_rgrps) /* We may not have the rindex read
+ in, so: */
+ error = gfs2_ri_update_special(ip);
+ }
if (error)
return error;
@@ -1215,7 +1231,7 @@ try_again:
try to free it, and try the allocation again. */
error = get_local_rgrp(ip, &unlinked, &last_unlinked);
if (error) {
- if (ip != GFS2_I(sdp->sd_rindex))
+ if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
gfs2_glock_dq_uninit(&al->al_ri_gh);
if (error != -EAGAIN)
return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
al->al_rgd = NULL;
if (al->al_rgd_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
- if (ip != GFS2_I(sdp->sd_rindex))
+ if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
gfs2_glock_dq_uninit(&al->al_ri_gh);
}
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct buffer_head *dibh;
struct gfs2_alloc *al = ip->i_alloc;
- struct gfs2_rgrpd *rgd = al->al_rgd;
+ struct gfs2_rgrpd *rgd;
u32 goal, blk;
u64 block;
int error;
+ /* Only happens if there is a bug in gfs2, return something distinctive
+ * to ensure that it is noticed.
+ */
+ if (al == NULL)
+ return -ECANCELED;
+
+ rgd = al->al_rgd;
+
if (rgrp_contains_block(rgd, ip->i_goal))
goal = ip->i_goal - rgd->rd_data0;
else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
ip->i_alloc = NULL;
}
-extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
- unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
+ char *file, unsigned int line);
#define gfs2_inplace_reserve(ip) \
-gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+ gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
+#define gfs2_inplace_reserve_ri(ip) \
+ gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
extern void gfs2_inplace_release(struct gfs2_inode *ip);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..047d1176096c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
{Opt_locktable, "locktable=%s"},
{Opt_hostdata, "hostdata=%s"},
{Opt_spectator, "spectator"},
+ {Opt_spectator, "norecovery"},
{Opt_ignore_local_fs, "ignore_local_fs"},
{Opt_localflocks, "localflocks"},
{Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
args->ar_spectator = 1;
break;
case Opt_ignore_local_fs:
- args->ar_ignore_local_fs = 1;
+ /* Retained for backwards compat only */
break;
case Opt_localflocks:
args->ar_localflocks = 1;
break;
case Opt_localcaching:
- args->ar_localcaching = 1;
+ /* Retained for backwards compat only */
break;
case Opt_debug:
if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
args->ar_debug = 0;
break;
case Opt_upgrade:
- args->ar_upgrade = 1;
+ /* Retained for backwards compat only */
break;
case Opt_acl:
args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
{
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ u64 size = i_size_read(jd->jd_inode);
- if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
- (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
- gfs2_consist_inode(ip);
+ if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
return -EIO;
- }
- jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
- if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) {
+ jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
+
+ if (gfs2_write_alloc_required(ip, 0, size)) {
gfs2_consist_inode(ip);
return -EIO;
}
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
/* Some flags must not be changed */
if (args_neq(&args, &sdp->sd_args, spectator) ||
- args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
args_neq(&args, &sdp->sd_args, localflocks) ||
- args_neq(&args, &sdp->sd_args, localcaching) ||
args_neq(&args, &sdp->sd_args, meta))
return -EINVAL;
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",hostdata=%s", args->ar_hostdata);
if (args->ar_spectator)
seq_printf(s, ",spectator");
- if (args->ar_ignore_local_fs)
- seq_printf(s, ",ignore_local_fs");
if (args->ar_localflocks)
seq_printf(s, ",localflocks");
- if (args->ar_localcaching)
- seq_printf(s, ",localcaching");
if (args->ar_debug)
seq_printf(s, ",debug");
- if (args->ar_upgrade)
- seq_printf(s, ",upgrade");
if (args->ar_posix_acl)
seq_printf(s, ",acl");
if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
if (gltype > LM_TYPE_JOURNAL)
return -EINVAL;
- glops = gfs2_glops_list[gltype];
+ if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
+ glops = &gfs2_trans_glops;
+ else
+ glops = gfs2_glops_list[gltype];
if (glops == NULL)
return -EINVAL;
if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
{
- return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+ return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
}
static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
- unsigned jid;
+ int jid;
int rv;
- rv = sscanf(buf, "%u", &jid);
+ rv = sscanf(buf, "%d", &jid);
if (rv != 1)
return -EINVAL;
spin_lock(&sdp->sd_jindex_spin);
rv = -EINVAL;
- if (sdp->sd_args.ar_spectator)
- goto out;
if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
goto out;
rv = -EBUSY;
- if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
+ if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
goto out;
+ rv = 0;
+ if (sdp->sd_args.ar_spectator && jid > 0)
+ rv = jid = -EINVAL;
sdp->sd_lockstruct.ls_jid = jid;
+ clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
smp_mb__after_clear_bit();
wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
- rv = 0;
out:
spin_unlock(&sdp->sd_jindex_spin);
return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
- add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
+ add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
if (gfs2_uuid_valid(uuid))
add_uevent_var(env, "UUID=%pUB", uuid);
return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
{(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
{(1UL << GLF_REPLY_PENDING), "r" }, \
{(1UL << GLF_INITIAL), "I" }, \
- {(1UL << GLF_FROZEN), "F" })
+ {(1UL << GLF_FROZEN), "F" }, \
+ {(1UL << GLF_QUEUED), "q" })
#ifndef NUMPTY
#define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
#define RES_JDATA 1
#define RES_DATA 1
#define RES_LEAF 1
+#define RES_RG_HDR 1
#define RES_RG_BIT 2
#define RES_EATTR 1
#define RES_STATFS 1
#define RES_QUOTA 2
+/*
+ * Number of blocks to reserve for the resource group attached to @al:
+ * the requested allocation plus one block for the rg header, or the whole
+ * length of the rg (rd_length) when the request is at least that large.
+ */
+static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
+{
+	const struct gfs2_rgrpd *rgd = al->al_rgd;
+
+	if (al->al_requested >= rgd->rd_length)
+		return rgd->rd_length;
+
+	return al->al_requested + 1;
+}
+
int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
unsigned int revokes);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
goto out_gunlock_q;
error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
- blks + al->al_rgd->rd_length +
+ blks + gfs2_rg_blocks(al) +
RES_DINODE + RES_STATFS + RES_QUOTA, 0);
if (error)
goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
- down(&tree->tree_lock);
+ mutex_lock(&tree->tree_lock);
return 0;
}
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
- up(&fd->tree->tree_lock);
+ mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
}
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
if (!tree)
return NULL;
- init_MUTEX(&tree->tree_lock);
+ mutex_init(&tree->tree_lock);
spin_lock_init(&tree->hash_lock);
/* Set the correct compare function */
tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
unsigned int depth;
//unsigned int map1_size, map_size;
- struct semaphore tree_lock;
+ struct mutex tree_lock;
unsigned int pages_per_bnode;
spinlock_t hash_lock;
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
fd->search_key = ptr;
fd->key = ptr + tree->max_key_len + 2;
dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
- down(&tree->tree_lock);
+ mutex_lock(&tree->tree_lock);
return 0;
}
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
hfs_bnode_put(fd->bnode);
kfree(fd->search_key);
dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
- up(&fd->tree->tree_lock);
+ mutex_unlock(&fd->tree->tree_lock);
fd->tree = NULL;
}
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
rec = (e + b) / 2;
len = hfs_brec_lenoff(bnode, rec, &off);
keylen = hfs_brec_keylen(bnode, rec);
+ if (keylen == 0) {
+ res = -EINVAL;
+ goto fail;
+ }
hfs_bnode_read(bnode, fd->key, off, keylen);
cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
if (rec != e && e >= 0) {
len = hfs_brec_lenoff(bnode, e, &off);
keylen = hfs_brec_keylen(bnode, e);
+ if (keylen == 0) {
+ res = -EINVAL;
+ goto fail;
+ }
hfs_bnode_read(bnode, fd->key, off, keylen);
}
done:
@@ -75,6 +83,7 @@ done:
fd->keylength = keylen;
fd->entryoffset = off + keylen;
fd->entrylength = len - keylen;
+fail:
return res;
}
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
len = hfs_brec_lenoff(bnode, fd->record, &off);
keylen = hfs_brec_keylen(bnode, fd->record);
+ if (keylen == 0) {
+ res = -EINVAL;
+ goto out;
+ }
fd->keyoffset = off;
fd->keylength = keylen;
fd->entryoffset = off + keylen;
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct page *page;
struct address_space *mapping;
__be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
return size;
dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
- mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
- mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+ mutex_lock(&sbi->alloc_mutex);
+ mapping = sbi->alloc_file->i_mapping;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
if (IS_ERR(page)) {
start = size;
@@ -150,16 +151,17 @@ done:
set_page_dirty(page);
kunmap(page);
*max = offset + (curr - pptr) * 32 + i - start;
- HFSPLUS_SB(sb).free_blocks -= *max;
+ sbi->free_blocks -= *max;
sb->s_dirt = 1;
dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
out:
- mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+ mutex_unlock(&sbi->alloc_mutex);
return start;
}
int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct page *page;
struct address_space *mapping;
__be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
/* are all of the bits in range? */
- if ((offset + count) > HFSPLUS_SB(sb).total_blocks)
+ if ((offset + count) > sbi->total_blocks)
return -2;
- mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
- mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
+ mutex_lock(&sbi->alloc_mutex);
+ mapping = sbi->alloc_file->i_mapping;
pnr = offset / PAGE_CACHE_BITS;
page = read_mapping_page(mapping, pnr, NULL);
pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
out:
set_page_dirty(page);
kunmap(page);
- HFSPLUS_SB(sb).free_blocks += len;
+ sbi->free_blocks += len;
sb->s_dirt = 1;
- mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
+ mutex_unlock(&sbi->alloc_mutex);
return 0;
}
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
if (!recoff)
return 0;
- if (node->tree->attributes & HFS_TREE_BIGKEYS)
- retval = hfs_bnode_read_u16(node, recoff) + 2;
- else
- retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1;
+
+ retval = hfs_bnode_read_u16(node, recoff) + 2;
+ if (retval > node->tree->max_key_len + 2) {
+ printk(KERN_ERR "hfs: keylen %d too large\n",
+ retval);
+ retval = 0;
+ }
}
return retval;
}
@@ -216,7 +219,7 @@ skip:
static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
{
struct hfs_btree *tree;
- struct hfs_bnode *node, *new_node;
+ struct hfs_bnode *node, *new_node, *next_node;
struct hfs_bnode_desc node_desc;
int num_recs, new_rec_off, new_off, old_rec_off;
int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
new_node->type = node->type;
new_node->height = node->height;
+ if (node->next)
+ next_node = hfs_bnode_find(tree, node->next);
+ else
+ next_node = NULL;
+
+ if (IS_ERR(next_node)) {
+ hfs_bnode_put(node);
+ hfs_bnode_put(new_node);
+ return next_node;
+ }
+
size = tree->node_size / 2 - node->num_recs * 2 - 14;
old_rec_off = tree->node_size - 4;
num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
/* panic? */
hfs_bnode_put(node);
hfs_bnode_put(new_node);
+ if (next_node)
+ hfs_bnode_put(next_node);
return ERR_PTR(-ENOSPC);
}
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
/* update next bnode header */
- if (new_node->next) {
- struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
+ if (next_node) {
next_node->prev = new_node->this;
hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
if (!tree)
return NULL;
- init_MUTEX(&tree->tree_lock);
+ mutex_init(&tree->tree_lock);
spin_lock_init(&tree->hash_lock);
tree->sb = sb;
tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
goto free_tree;
tree->inode = inode;
+ if (!HFSPLUS_I(tree->inode)->first_blocks) {
+ printk(KERN_ERR
+ "hfs: invalid btree extent records (0 size).\n");
+ goto free_inode;
+ }
+
mapping = tree->inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
- goto free_tree;
+ goto free_inode;
/* Load the header */
head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
tree->max_key_len = be16_to_cpu(head->max_key_len);
tree->depth = be16_to_cpu(head->depth);
- /* Set the correct compare function */
- if (id == HFSPLUS_EXT_CNID) {
+ /* Verify the tree and set the correct compare function */
+ switch (id) {
+ case HFSPLUS_EXT_CNID:
+ if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
+ printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
+ tree->max_key_len);
+ goto fail_page;
+ }
+ if (tree->attributes & HFS_TREE_VARIDXKEYS) {
+ printk(KERN_ERR "hfs: invalid extent btree flag\n");
+ goto fail_page;
+ }
+
tree->keycmp = hfsplus_ext_cmp_key;
- } else if (id == HFSPLUS_CAT_CNID) {
- if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) &&
+ break;
+ case HFSPLUS_CAT_CNID:
+ if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
+ printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
+ tree->max_key_len);
+ goto fail_page;
+ }
+ if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
+ printk(KERN_ERR "hfs: invalid catalog btree flag\n");
+ goto fail_page;
+ }
+
+ if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
(head->key_type == HFSPLUS_KEY_BINARY))
tree->keycmp = hfsplus_cat_bin_cmp_key;
else {
tree->keycmp = hfsplus_cat_case_cmp_key;
- HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD;
+ set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
}
- } else {
+ break;
+ default:
printk(KERN_ERR "hfs: unknown B*Tree requested\n");
goto fail_page;
}
+ if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
+ printk(KERN_ERR "hfs: invalid btree flag\n");
+ goto fail_page;
+ }
+
size = tree->node_size;
if (!is_power_of_2(size))
goto fail_page;
if (!tree->node_count)
goto fail_page;
+
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
return tree;
fail_page:
- tree->inode->i_mapping->a_ops = &hfsplus_aops;
page_cache_release(page);
- free_tree:
+ free_inode:
+ tree->inode->i_mapping->a_ops = &hfsplus_aops;
iput(tree->inode);
+ free_tree:
kfree(tree);
return NULL;
}
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
while (!tree->free_nodes) {
struct inode *inode = tree->inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
u32 count;
int res;
res = hfsplus_file_extend(inode);
if (res)
return ERR_PTR(res);
- HFSPLUS_I(inode).phys_size = inode->i_size =
- (loff_t)HFSPLUS_I(inode).alloc_blocks <<
- HFSPLUS_SB(tree->sb).alloc_blksz_shift;
- HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks <<
- HFSPLUS_SB(tree->sb).fs_shift;
+ hip->phys_size = inode->i_size =
+ (loff_t)hip->alloc_blocks <<
+ HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
+ hip->fs_blocks =
+ hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
inode_set_bytes(inode, inode->i_size);
count = inode->i_size >> tree->node_size_shift;
tree->free_nodes = count - tree->node_count;
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
key->key_len = cpu_to_be16(6 + ustrlen);
}
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
{
if (inode->i_flags & S_IMMUTABLE)
perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
perms->rootflags |= HFSPLUS_FLG_APPEND;
else
perms->rootflags &= ~HFSPLUS_FLG_APPEND;
- HFSPLUS_I(inode).rootflags = perms->rootflags;
- HFSPLUS_I(inode).userflags = perms->userflags;
+
+ perms->userflags = HFSPLUS_I(inode)->userflags;
perms->mode = cpu_to_be16(inode->i_mode);
perms->owner = cpu_to_be32(inode->i_uid);
perms->group = cpu_to_be32(inode->i_gid);
+
+ if (S_ISREG(inode->i_mode))
+ perms->dev = cpu_to_be32(inode->i_nlink);
+ else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
+ perms->dev = cpu_to_be32(inode->i_rdev);
+ else
+ perms->dev = 0;
}
static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+
if (S_ISDIR(inode->i_mode)) {
struct hfsplus_cat_folder *folder;
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
memset(folder, 0, sizeof(*folder));
folder->type = cpu_to_be16(HFSPLUS_FOLDER);
folder->id = cpu_to_be32(inode->i_ino);
- HFSPLUS_I(inode).create_date =
+ HFSPLUS_I(inode)->create_date =
folder->create_date =
folder->content_mod_date =
folder->attribute_mod_date =
folder->access_date = hfsp_now2mt();
- hfsplus_set_perms(inode, &folder->permissions);
- if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir)
+ hfsplus_cat_set_perms(inode, &folder->permissions);
+ if (inode == sbi->hidden_dir)
/* invisible and namelocked */
folder->user_info.frFlags = cpu_to_be16(0x5000);
return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
file->type = cpu_to_be16(HFSPLUS_FILE);
file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
file->id = cpu_to_be32(cnid);
- HFSPLUS_I(inode).create_date =
+ HFSPLUS_I(inode)->create_date =
file->create_date =
file->content_mod_date =
file->attribute_mod_date =
file->access_date = hfsp_now2mt();
if (cnid == inode->i_ino) {
- hfsplus_set_perms(inode, &file->permissions);
+ hfsplus_cat_set_perms(inode, &file->permissions);
if (S_ISLNK(inode->i_mode)) {
file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
} else {
- file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type);
- file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator);
+ file->user_info.fdType = cpu_to_be32(sbi->type);
+ file->user_info.fdCreator = cpu_to_be32(sbi->creator);
}
if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
file->user_info.fdFlags = cpu_to_be16(0x100);
- file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date;
- file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+ file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
+ file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
}
return sizeof(*file);
}
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
{
+ struct super_block *sb = dir->i_sb;
struct hfs_find_data fd;
- struct super_block *sb;
hfsplus_cat_entry entry;
int entry_size;
int err;
dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
- sb = dir->i_sb;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
{
- struct super_block *sb;
+ struct super_block *sb = dir->i_sb;
struct hfs_find_data fd;
struct hfsplus_fork_raw fork;
struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
u16 type;
dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
- sb = dir->i_sb;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (!str) {
int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
}
- list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) {
+ list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
struct hfsplus_readdir_data *rd =
list_entry(pos, struct hfsplus_readdir_data, list);
if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
struct inode *src_dir, struct qstr *src_name,
struct inode *dst_dir, struct qstr *dst_name)
{
- struct super_block *sb;
+ struct super_block *sb = src_dir->i_sb;
struct hfs_find_data src_fd, dst_fd;
hfsplus_cat_entry entry;
int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
dst_dir->i_ino, dst_name->name);
- sb = src_dir->i_sb;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
dst_fd = src_fd;
/* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..d236d85ec9d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &hfsplus_dentry_operations;
dentry->d_fsdata = NULL;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
again:
err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
cnid = be32_to_cpu(entry.file.id);
if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
- (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date ||
- entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) &&
- HFSPLUS_SB(sb).hidden_dir) {
+ (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
+ entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
+ HFSPLUS_SB(sb)->hidden_dir) {
struct qstr str;
char name[32];
@@ -86,7 +86,8 @@ again:
linkid = be32_to_cpu(entry.file.permissions.dev);
str.len = sprintf(name, "iNode%d", linkid);
str.name = name;
- hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str);
+ hfsplus_cat_build_key(sb, fd.search_key,
+ HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
goto again;
}
} else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (S_ISREG(inode->i_mode))
- HFSPLUS_I(inode).dev = linkid;
+ HFSPLUS_I(inode)->linkid = linkid;
out:
d_add(dentry, inode);
return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
if (filp->f_pos >= inode->i_size)
return 0;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
err = hfs_brec_find(&fd);
if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
err = -EIO;
goto out;
}
- if (HFSPLUS_SB(sb).hidden_dir &&
- HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id))
+ if (HFSPLUS_SB(sb)->hidden_dir &&
+ HFSPLUS_SB(sb)->hidden_dir->i_ino ==
+ be32_to_cpu(entry.folder.id))
goto next;
if (filldir(dirent, strbuf, len, filp->f_pos,
be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
filp->private_data = rd;
rd->file = filp;
- list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list);
+ list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
}
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
{
struct hfsplus_readdir_data *rd = file->private_data;
if (rd) {
+ mutex_lock(&inode->i_mutex);
list_del(&rd->list);
+ mutex_unlock(&inode->i_mutex);
kfree(rd);
}
return 0;
}
-static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
-{
- struct inode *inode;
- int res;
-
- inode = hfsplus_new_inode(dir->i_sb, mode);
- if (!inode)
- return -ENOSPC;
-
- res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
- if (res) {
- inode->i_nlink = 0;
- hfsplus_delete_inode(inode);
- iput(inode);
- return res;
- }
- hfsplus_instantiate(dentry, inode, inode->i_ino);
- mark_inode_dirty(inode);
- return 0;
-}
-
static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
struct dentry *dst_dentry)
{
- struct super_block *sb = dst_dir->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
struct inode *inode = src_dentry->d_inode;
struct inode *src_dir = src_dentry->d_parent->d_inode;
struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
if (HFSPLUS_IS_RSRC(inode))
return -EPERM;
+ if (!S_ISREG(inode->i_mode))
+ return -EPERM;
+ mutex_lock(&sbi->vh_mutex);
if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
for (;;) {
get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
str.len = sprintf(name, "iNode%d", id);
res = hfsplus_rename_cat(inode->i_ino,
src_dir, &src_dentry->d_name,
- HFSPLUS_SB(sb).hidden_dir, &str);
+ sbi->hidden_dir, &str);
if (!res)
break;
if (res != -EEXIST)
- return res;
+ goto out;
}
- HFSPLUS_I(inode).dev = id;
- cnid = HFSPLUS_SB(sb).next_cnid++;
+ HFSPLUS_I(inode)->linkid = id;
+ cnid = sbi->next_cnid++;
src_dentry->d_fsdata = (void *)(unsigned long)cnid;
res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
if (res)
/* panic? */
- return res;
- HFSPLUS_SB(sb).file_count++;
+ goto out;
+ sbi->file_count++;
}
- cnid = HFSPLUS_SB(sb).next_cnid++;
+ cnid = sbi->next_cnid++;
res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
if (res)
- return res;
+ goto out;
inc_nlink(inode);
hfsplus_instantiate(dst_dentry, inode, cnid);
atomic_inc(&inode->i_count);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- HFSPLUS_SB(sb).file_count++;
- sb->s_dirt = 1;
-
- return 0;
+ sbi->file_count++;
+ dst_dir->i_sb->s_dirt = 1;
+out:
+ mutex_unlock(&sbi->vh_mutex);
+ return res;
}
static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
{
- struct super_block *sb = dir->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode = dentry->d_inode;
struct qstr str;
char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
if (HFSPLUS_IS_RSRC(inode))
return -EPERM;
+ mutex_lock(&sbi->vh_mutex);
cnid = (u32)(unsigned long)dentry->d_fsdata;
if (inode->i_ino == cnid &&
- atomic_read(&HFSPLUS_I(inode).opencnt)) {
+ atomic_read(&HFSPLUS_I(inode)->opencnt)) {
str.name = name;
str.len = sprintf(name, "temp%lu", inode->i_ino);
res = hfsplus_rename_cat(inode->i_ino,
dir, &dentry->d_name,
- HFSPLUS_SB(sb).hidden_dir, &str);
+ sbi->hidden_dir, &str);
if (!res)
inode->i_flags |= S_DEAD;
- return res;
+ goto out;
}
res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
if (res)
- return res;
+ goto out;
if (inode->i_nlink > 0)
drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
if (!inode->i_nlink) {
if (inode->i_ino != cnid) {
- HFSPLUS_SB(sb).file_count--;
- if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
+ sbi->file_count--;
+ if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
res = hfsplus_delete_cat(inode->i_ino,
- HFSPLUS_SB(sb).hidden_dir,
+ sbi->hidden_dir,
NULL);
if (!res)
hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
} else
hfsplus_delete_inode(inode);
} else
- HFSPLUS_SB(sb).file_count--;
+ sbi->file_count--;
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
-
+out:
+ mutex_unlock(&sbi->vh_mutex);
return res;
}
-static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
- struct inode *inode;
- int res;
-
- inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
- if (!inode)
- return -ENOSPC;
-
- res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
- if (res) {
- inode->i_nlink = 0;
- hfsplus_delete_inode(inode);
- iput(inode);
- return res;
- }
- hfsplus_instantiate(dentry, inode, inode->i_ino);
- mark_inode_dirty(inode);
- return 0;
-}
-
static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+ struct inode *inode = dentry->d_inode;
int res;
- inode = dentry->d_inode;
if (inode->i_size != 2)
return -ENOTEMPTY;
+
+ mutex_lock(&sbi->vh_mutex);
res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
if (res)
- return res;
+ goto out;
clear_nlink(inode);
inode->i_ctime = CURRENT_TIME_SEC;
hfsplus_delete_inode(inode);
mark_inode_dirty(inode);
- return 0;
+out:
+ mutex_unlock(&sbi->vh_mutex);
+ return res;
}
static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
- struct super_block *sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode;
- int res;
+ int res = -ENOSPC;
- sb = dir->i_sb;
- inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO);
+ mutex_lock(&sbi->vh_mutex);
+ inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
if (!inode)
- return -ENOSPC;
+ goto out;
res = page_symlink(inode, symname, strlen(symname) + 1);
- if (res) {
- inode->i_nlink = 0;
- hfsplus_delete_inode(inode);
- iput(inode);
- return res;
- }
+ if (res)
+ goto out_err;
- mark_inode_dirty(inode);
res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
+ if (res)
+ goto out_err;
- if (!res) {
- hfsplus_instantiate(dentry, inode, inode->i_ino);
- mark_inode_dirty(inode);
- }
+ hfsplus_instantiate(dentry, inode, inode->i_ino);
+ mark_inode_dirty(inode);
+ goto out;
+out_err:
+ inode->i_nlink = 0;
+ hfsplus_delete_inode(inode);
+ iput(inode);
+out:
+ mutex_unlock(&sbi->vh_mutex);
return res;
}
static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
int mode, dev_t rdev)
{
- struct super_block *sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
struct inode *inode;
- int res;
+ int res = -ENOSPC;
- sb = dir->i_sb;
- inode = hfsplus_new_inode(sb, mode);
+ mutex_lock(&sbi->vh_mutex);
+ inode = hfsplus_new_inode(dir->i_sb, mode);
if (!inode)
- return -ENOSPC;
+ goto out;
+
+ if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
+ init_special_inode(inode, mode, rdev);
res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
inode->i_nlink = 0;
hfsplus_delete_inode(inode);
iput(inode);
- return res;
+ goto out;
}
- init_special_inode(inode, mode, rdev);
+
hfsplus_instantiate(dentry, inode, inode->i_ino);
mark_inode_dirty(inode);
+out:
+ mutex_unlock(&sbi->vh_mutex);
+ return res;
+}
- return 0;
+static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ return hfsplus_mknod(dir, dentry, mode, 0);
+}
+
+static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
}
static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- res = hfsplus_unlink(new_dir, new_dentry);
+ if (S_ISDIR(new_dentry->d_inode->i_mode))
+ res = hfsplus_rmdir(new_dir, new_dentry);
+ else
+ res = hfsplus_unlink(new_dir, new_dentry);
if (res)
return res;
}
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
{
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
int res;
- hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start,
- HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+ WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+ hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
+ HFSPLUS_IS_RSRC(inode) ?
+ HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+
res = hfs_brec_find(fd);
- if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) {
+ if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
if (res != -ENOENT)
return;
- hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec));
- HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+ hfs_brec_insert(fd, hip->cached_extents,
+ sizeof(hfsplus_extent_rec));
+ hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
} else {
if (res)
return;
- hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength);
- HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY;
+ hfs_bnode_write(fd->bnode, hip->cached_extents,
+ fd->entryoffset, fd->entrylength);
+ hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
}
}
-void hfsplus_ext_write_extent(struct inode *inode)
+static void hfsplus_ext_write_extent_locked(struct inode *inode)
{
- if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) {
+ if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
struct hfs_find_data fd;
- hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
__hfsplus_ext_write_extent(inode, &fd);
hfs_find_exit(&fd);
}
}
+void hfsplus_ext_write_extent(struct inode *inode)
+{
+ mutex_lock(&HFSPLUS_I(inode)->extents_lock);
+ hfsplus_ext_write_extent_locked(inode);
+ mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
+}
+
static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
struct hfsplus_extent *extent,
u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
{
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
int res;
- if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY)
+ WARN_ON(!mutex_is_locked(&hip->extents_lock));
+
+ if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
__hfsplus_ext_write_extent(inode, fd);
- res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino,
- block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
+ res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
+ block, HFSPLUS_IS_RSRC(inode) ?
+ HFSPLUS_TYPE_RSRC :
+ HFSPLUS_TYPE_DATA);
if (!res) {
- HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block);
- HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents);
+ hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
+ hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
} else {
- HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
- HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+ hip->cached_start = hip->cached_blocks = 0;
+ hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
}
return res;
}
static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
{
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
struct hfs_find_data fd;
int res;
- if (block >= HFSPLUS_I(inode).cached_start &&
- block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks)
+ if (block >= hip->cached_start &&
+ block < hip->cached_start + hip->cached_blocks)
return 0;
- hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
res = __hfsplus_ext_cache_extent(&fd, inode, block);
hfs_find_exit(&fd);
return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
int hfsplus_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- struct super_block *sb;
+ struct super_block *sb = inode->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
int res = -EIO;
u32 ablock, dblock, mask;
int shift;
- sb = inode->i_sb;
-
/* Convert inode block to disk allocation block */
- shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits;
- ablock = iblock >> HFSPLUS_SB(sb).fs_shift;
+ shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
+ ablock = iblock >> sbi->fs_shift;
- if (iblock >= HFSPLUS_I(inode).fs_blocks) {
- if (iblock > HFSPLUS_I(inode).fs_blocks || !create)
+ if (iblock >= hip->fs_blocks) {
+ if (iblock > hip->fs_blocks || !create)
return -EIO;
- if (ablock >= HFSPLUS_I(inode).alloc_blocks) {
+ if (ablock >= hip->alloc_blocks) {
res = hfsplus_file_extend(inode);
if (res)
return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
} else
create = 0;
- if (ablock < HFSPLUS_I(inode).first_blocks) {
- dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock);
+ if (ablock < hip->first_blocks) {
+ dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
goto done;
}
if (inode->i_ino == HFSPLUS_EXT_CNID)
return -EIO;
- mutex_lock(&HFSPLUS_I(inode).extents_lock);
+ mutex_lock(&hip->extents_lock);
res = hfsplus_ext_read_extent(inode, ablock);
if (!res) {
- dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
- HFSPLUS_I(inode).cached_start);
+ dblock = hfsplus_ext_find_block(hip->cached_extents,
+ ablock - hip->cached_start);
} else {
- mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&hip->extents_lock);
return -EIO;
}
- mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&hip->extents_lock);
done:
dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
- mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1;
- map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask));
+ mask = (1 << sbi->fs_shift) - 1;
+ map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
if (create) {
set_buffer_new(bh_result);
- HFSPLUS_I(inode).phys_size += sb->s_blocksize;
- HFSPLUS_I(inode).fs_blocks++;
+ hip->phys_size += sb->s_blocksize;
+ hip->fs_blocks++;
inode_add_bytes(inode, sb->s_blocksize);
mark_inode_dirty(inode);
}
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
if (total_blocks == blocks)
return 0;
- hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
do {
res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
int hfsplus_file_extend(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
u32 start, len, goal;
int res;
- if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) {
+ if (sbi->alloc_file->i_size * 8 <
+ sbi->total_blocks - sbi->free_blocks + 8) {
// extend alloc file
- printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8,
- HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks);
+ printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
+ sbi->alloc_file->i_size * 8,
+ sbi->total_blocks, sbi->free_blocks);
return -ENOSPC;
}
- mutex_lock(&HFSPLUS_I(inode).extents_lock);
- if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
- goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
+ mutex_lock(&hip->extents_lock);
+ if (hip->alloc_blocks == hip->first_blocks)
+ goal = hfsplus_ext_lastblock(hip->first_extents);
else {
- res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks);
+ res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
if (res)
goto out;
- goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents);
+ goal = hfsplus_ext_lastblock(hip->cached_extents);
}
- len = HFSPLUS_I(inode).clump_blocks;
- start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len);
- if (start >= HFSPLUS_SB(sb).total_blocks) {
+ len = hip->clump_blocks;
+ start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
+ if (start >= sbi->total_blocks) {
start = hfsplus_block_allocate(sb, goal, 0, &len);
if (start >= goal) {
res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
}
dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
- if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) {
- if (!HFSPLUS_I(inode).first_blocks) {
+
+ if (hip->alloc_blocks <= hip->first_blocks) {
+ if (!hip->first_blocks) {
dprint(DBG_EXTENT, "first extents\n");
/* no extents yet */
- HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start);
- HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len);
+ hip->first_extents[0].start_block = cpu_to_be32(start);
+ hip->first_extents[0].block_count = cpu_to_be32(len);
res = 0;
} else {
/* try to append to extents in inode */
- res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents,
- HFSPLUS_I(inode).alloc_blocks,
+ res = hfsplus_add_extent(hip->first_extents,
+ hip->alloc_blocks,
start, len);
if (res == -ENOSPC)
goto insert_extent;
}
if (!res) {
- hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
- HFSPLUS_I(inode).first_blocks += len;
+ hfsplus_dump_extent(hip->first_extents);
+ hip->first_blocks += len;
}
} else {
- res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents,
- HFSPLUS_I(inode).alloc_blocks -
- HFSPLUS_I(inode).cached_start,
+ res = hfsplus_add_extent(hip->cached_extents,
+ hip->alloc_blocks - hip->cached_start,
start, len);
if (!res) {
- hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
- HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
- HFSPLUS_I(inode).cached_blocks += len;
+ hfsplus_dump_extent(hip->cached_extents);
+ hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
+ hip->cached_blocks += len;
} else if (res == -ENOSPC)
goto insert_extent;
}
out:
- mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&hip->extents_lock);
if (!res) {
- HFSPLUS_I(inode).alloc_blocks += len;
+ hip->alloc_blocks += len;
mark_inode_dirty(inode);
}
return res;
insert_extent:
dprint(DBG_EXTENT, "insert new extent\n");
- hfsplus_ext_write_extent(inode);
+ hfsplus_ext_write_extent_locked(inode);
- memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
- HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start);
- HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len);
- hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
- HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
- HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks;
- HFSPLUS_I(inode).cached_blocks = len;
+ memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+ hip->cached_extents[0].start_block = cpu_to_be32(start);
+ hip->cached_extents[0].block_count = cpu_to_be32(len);
+ hfsplus_dump_extent(hip->cached_extents);
+ hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
+ hip->cached_start = hip->alloc_blocks;
+ hip->cached_blocks = len;
res = 0;
goto out;
@@ -437,13 +461,15 @@ insert_extent:
void hfsplus_file_truncate(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
struct hfs_find_data fd;
u32 alloc_cnt, blk_cnt, start;
int res;
- dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino,
- (long long)HFSPLUS_I(inode).phys_size, inode->i_size);
- if (inode->i_size > HFSPLUS_I(inode).phys_size) {
+ dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
+ inode->i_ino, (long long)hip->phys_size, inode->i_size);
+
+ if (inode->i_size > hip->phys_size) {
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
return;
mark_inode_dirty(inode);
return;
- } else if (inode->i_size == HFSPLUS_I(inode).phys_size)
+ } else if (inode->i_size == hip->phys_size)
return;
- blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift;
- alloc_cnt = HFSPLUS_I(inode).alloc_blocks;
+ blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
+ HFSPLUS_SB(sb)->alloc_blksz_shift;
+ alloc_cnt = hip->alloc_blocks;
if (blk_cnt == alloc_cnt)
goto out;
- mutex_lock(&HFSPLUS_I(inode).extents_lock);
- hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
+ mutex_lock(&hip->extents_lock);
+ hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
while (1) {
- if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
- hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents,
+ if (alloc_cnt == hip->first_blocks) {
+ hfsplus_free_extents(sb, hip->first_extents,
alloc_cnt, alloc_cnt - blk_cnt);
- hfsplus_dump_extent(HFSPLUS_I(inode).first_extents);
- HFSPLUS_I(inode).first_blocks = blk_cnt;
+ hfsplus_dump_extent(hip->first_extents);
+ hip->first_blocks = blk_cnt;
break;
}
res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
if (res)
break;
- start = HFSPLUS_I(inode).cached_start;
- hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents,
+ start = hip->cached_start;
+ hfsplus_free_extents(sb, hip->cached_extents,
alloc_cnt - start, alloc_cnt - blk_cnt);
- hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents);
+ hfsplus_dump_extent(hip->cached_extents);
if (blk_cnt > start) {
- HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY;
+ hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
break;
}
alloc_cnt = start;
- HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0;
- HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
+ hip->cached_start = hip->cached_blocks = 0;
+ hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
hfs_brec_remove(&fd);
}
hfs_find_exit(&fd);
- mutex_unlock(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&hip->extents_lock);
- HFSPLUS_I(inode).alloc_blocks = blk_cnt;
+ hip->alloc_blocks = blk_cnt;
out:
- HFSPLUS_I(inode).phys_size = inode->i_size;
- HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
+ hip->phys_size = inode->i_size;
+ hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+ inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
mark_inode_dirty(inode);
}
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
unsigned int depth;
//unsigned int map1_size, map_size;
- struct semaphore tree_lock;
+ struct mutex tree_lock;
unsigned int pages_per_bnode;
spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
u32 sect_count;
int fs_shift;
- /* Stuff in host order from Vol Header */
+ /* immutable data from the volume header */
u32 alloc_blksz;
int alloc_blksz_shift;
u32 total_blocks;
+ u32 data_clump_blocks, rsrc_clump_blocks;
+
+ /* mutable data from the volume header, protected by alloc_mutex */
u32 free_blocks;
- u32 next_alloc;
+ struct mutex alloc_mutex;
+
+ /* mutable data from the volume header, protected by vh_mutex */
u32 next_cnid;
u32 file_count;
u32 folder_count;
- u32 data_clump_blocks, rsrc_clump_blocks;
+ struct mutex vh_mutex;
/* Config options */
u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
int part, session;
unsigned long flags;
-
- struct hlist_head rsrc_inodes;
};
-#define HFSPLUS_SB_WRITEBACKUP 0x0001
-#define HFSPLUS_SB_NODECOMPOSE 0x0002
-#define HFSPLUS_SB_FORCE 0x0004
-#define HFSPLUS_SB_HFSX 0x0008
-#define HFSPLUS_SB_CASEFOLD 0x0010
+#define HFSPLUS_SB_WRITEBACKUP 0
+#define HFSPLUS_SB_NODECOMPOSE 1
+#define HFSPLUS_SB_FORCE 2
+#define HFSPLUS_SB_HFSX 3
+#define HFSPLUS_SB_CASEFOLD 4
struct hfsplus_inode_info {
- struct mutex extents_lock;
- u32 clump_blocks, alloc_blocks;
- sector_t fs_blocks;
- /* Allocation extents from catalog record or volume header */
- hfsplus_extent_rec first_extents;
- u32 first_blocks;
- hfsplus_extent_rec cached_extents;
- u32 cached_start, cached_blocks;
atomic_t opencnt;
- struct inode *rsrc_inode;
+ /*
+ * Extent allocation information, protected by extents_lock.
+ */
+ u32 first_blocks;
+ u32 clump_blocks;
+ u32 alloc_blocks;
+ u32 cached_start;
+ u32 cached_blocks;
+ hfsplus_extent_rec first_extents;
+ hfsplus_extent_rec cached_extents;
unsigned long flags;
+ struct mutex extents_lock;
+ /*
+ * Immutable data.
+ */
+ struct inode *rsrc_inode;
__be32 create_date;
- /* Device number in hfsplus_permissions in catalog */
- u32 dev;
- /* BSD system and user file flags */
- u8 rootflags;
- u8 userflags;
+ /*
+ * Protected by sbi->vh_mutex.
+ */
+ u32 linkid;
+
+ /*
+ * Protected by i_mutex.
+ */
+ sector_t fs_blocks;
+ u8 userflags; /* BSD user file flags */
struct list_head open_dir_list;
loff_t phys_size;
+
struct inode vfs_inode;
};
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
#define HFSPLUS_FLG_EXT_DIRTY 0x0002
#define HFSPLUS_FLG_EXT_NEW 0x0004
-#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC))
-#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)
+#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
+#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
struct hfs_find_data {
/* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
struct inode *, struct qstr *);
+void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
/* dir.c */
extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
int hfs_part_find(struct super_block *, sector_t *, sector_t *);
/* access macros */
-/*
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
return sb->s_fs_info;
}
+
static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
{
return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
}
-*/
-#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
-#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
-
-#if 1
-#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
-#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
-#else
-#define hfsplus_kmap(p) kmap(p)
-#define hfsplus_kunmap(p) kunmap(p)
-#endif
#define sb_bread512(sb, sec, data) ({ \
struct buffer_head *__bh; \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
-#define kdev_t_to_nr(x) (x)
-
#endif
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
struct hfsplus_unistr name;
} __packed;
+#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
/* Structs from hfs.h */
struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
__be32 start_block;
} __packed;
-#define HFSPLUS_EXT_KEYLEN 12
+#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
/* HFS+ generic BTree key */
typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..78449280dae0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
*pagep = NULL;
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hfsplus_get_block,
- &HFSPLUS_I(mapping->host).phys_size);
+ &HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret)) {
loff_t isize = mapping->host->i_size;
if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
switch (inode->i_ino) {
case HFSPLUS_EXT_CNID:
- tree = HFSPLUS_SB(sb).ext_tree;
+ tree = HFSPLUS_SB(sb)->ext_tree;
break;
case HFSPLUS_CAT_CNID:
- tree = HFSPLUS_SB(sb).cat_tree;
+ tree = HFSPLUS_SB(sb)->cat_tree;
break;
case HFSPLUS_ATTR_CNID:
- tree = HFSPLUS_SB(sb).attr_tree;
+ tree = HFSPLUS_SB(sb)->attr_tree;
break;
default:
BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
struct hfs_find_data fd;
struct super_block *sb = dir->i_sb;
struct inode *inode = NULL;
+ struct hfsplus_inode_info *hip;
int err;
if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
goto out;
- inode = HFSPLUS_I(dir).rsrc_inode;
+ inode = HFSPLUS_I(dir)->rsrc_inode;
if (inode)
goto out;
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
if (!inode)
return ERR_PTR(-ENOMEM);
+ hip = HFSPLUS_I(inode);
inode->i_ino = dir->i_ino;
- INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- mutex_init(&HFSPLUS_I(inode).extents_lock);
- HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
+ INIT_LIST_HEAD(&hip->open_dir_list);
+ mutex_init(&hip->extents_lock);
+ hip->flags = HFSPLUS_FLG_RSRC;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
err = hfsplus_find_cat(sb, dir->i_ino, &fd);
if (!err)
err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
iput(inode);
return ERR_PTR(err);
}
- HFSPLUS_I(inode).rsrc_inode = dir;
- HFSPLUS_I(dir).rsrc_inode = inode;
+ hip->rsrc_inode = dir;
+ HFSPLUS_I(dir)->rsrc_inode = inode;
igrab(dir);
- hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes);
+
+ /*
+ * __mark_inode_dirty expects inodes to be hashed. Since we don't
+ * want resource fork inodes in the regular inode space, we make them
+ * appear hashed, but do not put on any lists. hlist_del()
+ * will work fine and require no locking.
+ */
+ inode->i_hash.pprev = &inode->i_hash.next;
+
mark_inode_dirty(inode);
out:
d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
{
- struct super_block *sb = inode->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
u16 mode;
mode = be16_to_cpu(perms->mode);
inode->i_uid = be32_to_cpu(perms->owner);
if (!inode->i_uid && !mode)
- inode->i_uid = HFSPLUS_SB(sb).uid;
+ inode->i_uid = sbi->uid;
inode->i_gid = be32_to_cpu(perms->group);
if (!inode->i_gid && !mode)
- inode->i_gid = HFSPLUS_SB(sb).gid;
+ inode->i_gid = sbi->gid;
if (dir) {
- mode = mode ? (mode & S_IALLUGO) :
- (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
+ mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
mode |= S_IFDIR;
} else if (!mode)
- mode = S_IFREG | ((S_IRUGO|S_IWUGO) &
- ~(HFSPLUS_SB(sb).umask));
+ mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
inode->i_mode = mode;
- HFSPLUS_I(inode).rootflags = perms->rootflags;
- HFSPLUS_I(inode).userflags = perms->userflags;
+ HFSPLUS_I(inode)->userflags = perms->userflags;
if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
inode->i_flags |= S_IMMUTABLE;
else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
inode->i_flags &= ~S_APPEND;
}
-static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
-{
- if (inode->i_flags & S_IMMUTABLE)
- perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
- else
- perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
- if (inode->i_flags & S_APPEND)
- perms->rootflags |= HFSPLUS_FLG_APPEND;
- else
- perms->rootflags &= ~HFSPLUS_FLG_APPEND;
- perms->userflags = HFSPLUS_I(inode).userflags;
- perms->mode = cpu_to_be16(inode->i_mode);
- perms->owner = cpu_to_be32(inode->i_uid);
- perms->group = cpu_to_be32(inode->i_gid);
- perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
-}
-
static int hfsplus_file_open(struct inode *inode, struct file *file)
{
if (HFSPLUS_IS_RSRC(inode))
- inode = HFSPLUS_I(inode).rsrc_inode;
+ inode = HFSPLUS_I(inode)->rsrc_inode;
if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
return -EOVERFLOW;
- atomic_inc(&HFSPLUS_I(inode).opencnt);
+ atomic_inc(&HFSPLUS_I(inode)->opencnt);
return 0;
}
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
struct super_block *sb = inode->i_sb;
if (HFSPLUS_IS_RSRC(inode))
- inode = HFSPLUS_I(inode).rsrc_inode;
- if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
+ inode = HFSPLUS_I(inode)->rsrc_inode;
+ if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
mutex_lock(&inode->i_mutex);
hfsplus_file_truncate(inode);
if (inode->i_flags & S_DEAD) {
- hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
+ hfsplus_delete_cat(inode->i_ino,
+ HFSPLUS_SB(sb)->hidden_dir, NULL);
hfsplus_delete_inode(inode);
}
mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct inode *inode = new_inode(sb);
+ struct hfsplus_inode_info *hip;
+
if (!inode)
return NULL;
- inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
+ inode->i_ino = sbi->next_cnid++;
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_nlink = 1;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
- INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- mutex_init(&HFSPLUS_I(inode).extents_lock);
- atomic_set(&HFSPLUS_I(inode).opencnt, 0);
- HFSPLUS_I(inode).flags = 0;
- memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
- memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
- HFSPLUS_I(inode).alloc_blocks = 0;
- HFSPLUS_I(inode).first_blocks = 0;
- HFSPLUS_I(inode).cached_start = 0;
- HFSPLUS_I(inode).cached_blocks = 0;
- HFSPLUS_I(inode).phys_size = 0;
- HFSPLUS_I(inode).fs_blocks = 0;
- HFSPLUS_I(inode).rsrc_inode = NULL;
+
+ hip = HFSPLUS_I(inode);
+ INIT_LIST_HEAD(&hip->open_dir_list);
+ mutex_init(&hip->extents_lock);
+ atomic_set(&hip->opencnt, 0);
+ hip->flags = 0;
+ memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
+ memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+ hip->alloc_blocks = 0;
+ hip->first_blocks = 0;
+ hip->cached_start = 0;
+ hip->cached_blocks = 0;
+ hip->phys_size = 0;
+ hip->fs_blocks = 0;
+ hip->rsrc_inode = NULL;
if (S_ISDIR(inode->i_mode)) {
inode->i_size = 2;
- HFSPLUS_SB(sb).folder_count++;
+ sbi->folder_count++;
inode->i_op = &hfsplus_dir_inode_operations;
inode->i_fop = &hfsplus_dir_operations;
} else if (S_ISREG(inode->i_mode)) {
- HFSPLUS_SB(sb).file_count++;
+ sbi->file_count++;
inode->i_op = &hfsplus_file_inode_operations;
inode->i_fop = &hfsplus_file_operations;
inode->i_mapping->a_ops = &hfsplus_aops;
- HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks;
+ hip->clump_blocks = sbi->data_clump_blocks;
} else if (S_ISLNK(inode->i_mode)) {
- HFSPLUS_SB(sb).file_count++;
+ sbi->file_count++;
inode->i_op = &page_symlink_inode_operations;
inode->i_mapping->a_ops = &hfsplus_aops;
- HFSPLUS_I(inode).clump_blocks = 1;
+ hip->clump_blocks = 1;
} else
- HFSPLUS_SB(sb).file_count++;
+ sbi->file_count++;
insert_inode_hash(inode);
mark_inode_dirty(inode);
sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
struct super_block *sb = inode->i_sb;
if (S_ISDIR(inode->i_mode)) {
- HFSPLUS_SB(sb).folder_count--;
+ HFSPLUS_SB(sb)->folder_count--;
sb->s_dirt = 1;
return;
}
- HFSPLUS_SB(sb).file_count--;
+ HFSPLUS_SB(sb)->file_count--;
if (S_ISREG(inode->i_mode)) {
if (!inode->i_nlink) {
inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
{
struct super_block *sb = inode->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
u32 count;
int i;
- memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents,
- sizeof(hfsplus_extent_rec));
+ memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
for (count = 0, i = 0; i < 8; i++)
count += be32_to_cpu(fork->extents[i].block_count);
- HFSPLUS_I(inode).first_blocks = count;
- memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec));
- HFSPLUS_I(inode).cached_start = 0;
- HFSPLUS_I(inode).cached_blocks = 0;
-
- HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks);
- inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size);
- HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
- inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits);
- HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift;
- if (!HFSPLUS_I(inode).clump_blocks)
- HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks :
- HFSPLUS_SB(sb).data_clump_blocks;
+ hip->first_blocks = count;
+ memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
+ hip->cached_start = 0;
+ hip->cached_blocks = 0;
+
+ hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
+ hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
+ hip->fs_blocks =
+ (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+ inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
+ hip->clump_blocks =
+ be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
+ if (!hip->clump_blocks) {
+ hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
+ sbi->rsrc_clump_blocks :
+ sbi->data_clump_blocks;
+ }
}
void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
{
- memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents,
+ memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
sizeof(hfsplus_extent_rec));
fork->total_size = cpu_to_be64(inode->i_size);
- fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks);
+ fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
}
int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
- HFSPLUS_I(inode).dev = 0;
+ HFSPLUS_I(inode)->linkid = 0;
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_atime = hfsp_mt2ut(folder->access_date);
inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
- HFSPLUS_I(inode).create_date = folder->create_date;
- HFSPLUS_I(inode).fs_blocks = 0;
+ HFSPLUS_I(inode)->create_date = folder->create_date;
+ HFSPLUS_I(inode)->fs_blocks = 0;
inode->i_op = &hfsplus_dir_inode_operations;
inode->i_fop = &hfsplus_dir_operations;
} else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_atime = hfsp_mt2ut(file->access_date);
inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
- HFSPLUS_I(inode).create_date = file->create_date;
+ HFSPLUS_I(inode)->create_date = file->create_date;
} else {
printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
hfsplus_cat_entry entry;
if (HFSPLUS_IS_RSRC(inode))
- main_inode = HFSPLUS_I(inode).rsrc_inode;
+ main_inode = HFSPLUS_I(inode)->rsrc_inode;
if (!main_inode->i_nlink)
return 0;
- if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd))
+ if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
/* panic? */
return -EIO;
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
- hfsplus_set_perms(inode, &folder->permissions);
+ hfsplus_cat_set_perms(inode, &folder->permissions);
folder->access_date = hfsp_ut2mt(inode->i_atime);
folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
- if (S_ISREG(inode->i_mode))
- HFSPLUS_I(inode).dev = inode->i_nlink;
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
- hfsplus_set_perms(inode, &file->permissions);
+ hfsplus_cat_set_perms(inode, &file->permissions);
if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
else
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/xattr.h>
-#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include "hfsplus_fs.h"
-long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags) /* copy the inode's flags to user space as FS_*_FL bits */
{
- struct inode *inode = filp->f_path.dentry->d_inode;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
+ unsigned int flags = 0;
+
+ if (inode->i_flags & S_IMMUTABLE)
+ flags |= FS_IMMUTABLE_FL;
+ if (inode->i_flags & S_APPEND) /* test only: "|=" would set S_APPEND on every query and always be true */
+ flags |= FS_APPEND_FL;
+ if (hip->userflags & HFSPLUS_FLG_NODUMP)
+ flags |= FS_NODUMP_FL;
+
+ return put_user(flags, user_flags); /* put_user()'s result is the return value */
+}
+
+static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) /* apply FS_*_FL bits read from user space to the inode; returns 0 or a negative errno */
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
unsigned int flags;
+ int err = 0;
- lock_kernel();
- switch (cmd) {
- case HFSPLUS_IOC_EXT2_GETFLAGS:
- flags = 0;
- if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
- if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
- flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
- if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
- flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
- return put_user(flags, (int __user *)arg);
- case HFSPLUS_IOC_EXT2_SETFLAGS: {
- int err = 0;
- err = mnt_want_write(filp->f_path.mnt);
- if (err) {
- unlock_kernel();
- return err;
- }
+ err = mnt_want_write(file->f_path.mnt); /* paired with mnt_drop_write() at out_drop_write */
+ if (err)
+ goto out;
- if (!is_owner_or_cap(inode)) {
- err = -EACCES;
- goto setflags_out;
- }
- if (get_user(flags, (int __user *)arg)) {
- err = -EFAULT;
- goto setflags_out;
- }
- if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
- HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- err = -EPERM;
- goto setflags_out;
- }
- }
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto out_drop_write;
+ }
- /* don't silently ignore unsupported ext2 flags */
- if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
- err = -EOPNOTSUPP;
- goto setflags_out;
- }
- if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
- inode->i_flags |= S_IMMUTABLE;
- HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
- } else {
- inode->i_flags &= ~S_IMMUTABLE;
- HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
- }
- if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
- inode->i_flags |= S_APPEND;
- HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
- } else {
- inode->i_flags &= ~S_APPEND;
- HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
+ if (get_user(flags, user_flags)) {
+ err = -EFAULT;
+ goto out_drop_write;
+ }
+
+ mutex_lock(&inode->i_mutex); /* held across the i_flags/userflags update; released at out_unlock_inode */
+
+ if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
+ inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto out_unlock_inode;
}
- if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
- HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
- else
- HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
-
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
-setflags_out:
- mnt_drop_write(filp->f_path.mnt);
- unlock_kernel();
- return err;
}
+
+ /* don't silently ignore unsupported ext2 flags */
+ if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+ err = -EOPNOTSUPP;
+ goto out_unlock_inode;
+ }
+
+ if (flags & FS_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+
+ if (flags & FS_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+
+ if (flags & FS_NODUMP_FL)
+ hip->userflags |= HFSPLUS_FLG_NODUMP;
+ else
+ hip->userflags &= ~HFSPLUS_FLG_NODUMP;
+
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+
+out_unlock_inode:
+ mutex_unlock(&inode->i_mutex); /* release the lock taken above; taking it again here would self-deadlock */
+out_drop_write:
+ mnt_drop_write(file->f_path.mnt);
+out:
+ return err;
+}
+
+long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg) /* dispatch the flag ioctls to their helpers */
+{
+ void __user *argp = (void __user *)arg; /* arg is passed on as a user-space pointer for both commands */
+
+ switch (cmd) {
+ case HFSPLUS_IOC_EXT2_GETFLAGS:
+ return hfsplus_ioctl_getflags(file, argp); /* helper's result is returned unchanged */
+ case HFSPLUS_IOC_EXT2_SETFLAGS:
+ return hfsplus_ioctl_setflags(file, argp); /* helper's result is returned unchanged */
default:
- unlock_kernel();
return -ENOTTY;
}
}
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
return -EOPNOTSUPP;
- res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
if (res)
return res;
res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
return -EOPNOTSUPP;
if (size) {
- res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
+ res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
if (res)
return res;
res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
} else
res = size ? -ERANGE : 4;
} else
- res = -ENODATA;
+ res = -EOPNOTSUPP;
out:
if (size)
hfs_find_exit(&fd);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
kfree(p);
break;
case opt_decompose:
- sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE;
+ clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
break;
case opt_nodecompose:
- sbi->flags |= HFSPLUS_SB_NODECOMPOSE;
+ set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
break;
case opt_force:
- sbi->flags |= HFSPLUS_SB_FORCE;
+ set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
break;
default:
return 0;
@@ -171,7 +171,7 @@ done:
int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
{
- struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb);
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
seq_printf(seq, ",session=%u", sbi->session);
if (sbi->nls)
seq_printf(seq, ",nls=%s", sbi->nls->charset);
- if (sbi->flags & HFSPLUS_SB_NODECOMPOSE)
+ if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
seq_printf(seq, ",nodecompose");
return 0;
}
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
int hfs_part_find(struct super_block *sb,
sector_t *part_start, sector_t *part_size)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct buffer_head *bh;
__be16 *data;
int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
for (i = 0; i < size; p++, i++) {
if (p->pdStart && p->pdSize &&
p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
- (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+ (sbi->part < 0 || sbi->part == i)) {
*part_start += be32_to_cpu(p->pdStart);
*part_size = be32_to_cpu(p->pdSize);
res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
size = be32_to_cpu(pm->pmMapBlkCnt);
for (i = 0; i < size;) {
if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
- (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) {
+ (sbi->part < 0 || sbi->part == i)) {
*part_start += be32_to_cpu(pm->pmPyPartStart);
*part_size = be32_to_cpu(pm->pmPartBlkCnt);
res = 0;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/vfs.h>
#include <linux/nls.h>
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
#include "hfsplus_fs.h"
-struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+static int hfsplus_system_read_inode(struct inode *inode)
{
- struct hfs_find_data fd;
- struct hfsplus_vh *vhdr;
- struct inode *inode;
- long err = -EIO;
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
+ struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
- INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- mutex_init(&HFSPLUS_I(inode).extents_lock);
- HFSPLUS_I(inode).flags = 0;
- HFSPLUS_I(inode).rsrc_inode = NULL;
- atomic_set(&HFSPLUS_I(inode).opencnt, 0);
-
- if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
- read_inode:
- hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
- err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
- if (!err)
- err = hfsplus_cat_read_inode(inode, &fd);
- hfs_find_exit(&fd);
- if (err)
- goto bad_inode;
- goto done;
- }
- vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
- switch(inode->i_ino) {
- case HFSPLUS_ROOT_CNID:
- goto read_inode;
+ switch (inode->i_ino) {
case HFSPLUS_EXT_CNID:
hfsplus_inode_read_fork(inode, &vhdr->ext_file);
inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
inode->i_mapping->a_ops = &hfsplus_btree_aops;
break;
default:
- goto bad_inode;
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Look up (or create) the in-core inode for @ino on @sb.
+ *
+ * An inode that is already set up (not I_NEW) is returned as is. A new
+ * one gets its hfsplus-private fields initialised and is then filled in
+ * either from the catalog tree (user files and the root directory) or by
+ * hfsplus_system_read_inode() (all other inode numbers).
+ *
+ * Returns the inode, ERR_PTR(-ENOMEM) if iget_locked() fails, or
+ * ERR_PTR(err) with the error reported while reading the inode.
+ */
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
+{
+ struct hfs_find_data fd;
+ struct inode *inode;
+ int err;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
+ mutex_init(&HFSPLUS_I(inode)->extents_lock);
+ HFSPLUS_I(inode)->flags = 0;
+ HFSPLUS_I(inode)->rsrc_inode = NULL;
+ atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
+
+ if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+ inode->i_ino == HFSPLUS_ROOT_CNID) {
+ /*
+ * hfs_find_init() can fail; only search and tear down the
+ * find data when it was actually set up, as the xattr
+ * callers of hfs_find_init() do.
+ */
+ err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
+ if (!err) {
+ err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
+ if (!err)
+ err = hfsplus_cat_read_inode(inode, &fd);
+ hfs_find_exit(&fd);
+ }
+ } else {
+ err = hfsplus_system_read_inode(inode);
+ }
+
+ if (err) {
+ iget_failed(inode);
+ return ERR_PTR(err);
}
-done:
unlock_new_inode(inode);
return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(err);
}
-static int hfsplus_write_inode(struct inode *inode,
- struct writeback_control *wbc)
+/*
+ * Write back one of the special (system) inodes.
+ *
+ * The inode number selects the fork record inside the in-memory volume
+ * header and, for the extents, catalog and attributes files, the B-tree
+ * to write. If the recorded fork size differs from i_size the backup
+ * volume header is flagged for rewrite and the superblock is marked
+ * dirty. The fork is then passed to hfsplus_inode_write_fork() and the
+ * tree, if any, to hfs_btree_write().
+ *
+ * Returns 0 on success, -EIO for an inode number that is not handled.
+ */
+static int hfsplus_system_write_inode(struct inode *inode)
{
- struct hfsplus_vh *vhdr;
- int ret = 0;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
+ struct hfsplus_fork_raw *fork;
+ struct hfs_btree *tree = NULL;
- dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
- hfsplus_ext_write_extent(inode);
- if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
- return hfsplus_cat_write_inode(inode);
- }
- vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
switch (inode->i_ino) {
- case HFSPLUS_ROOT_CNID:
- ret = hfsplus_cat_write_inode(inode);
- break;
case HFSPLUS_EXT_CNID:
- if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) {
- HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
- inode->i_sb->s_dirt = 1;
- }
- hfsplus_inode_write_fork(inode, &vhdr->ext_file);
- hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
+ fork = &vhdr->ext_file;
+ tree = sbi->ext_tree;
break;
case HFSPLUS_CAT_CNID:
- if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) {
- HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
- inode->i_sb->s_dirt = 1;
- }
- hfsplus_inode_write_fork(inode, &vhdr->cat_file);
- hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
+ fork = &vhdr->cat_file;
+ tree = sbi->cat_tree;
break;
case HFSPLUS_ALLOC_CNID:
- if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) {
- HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
- inode->i_sb->s_dirt = 1;
- }
- hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
+ fork = &vhdr->alloc_file;
break;
case HFSPLUS_START_CNID:
- if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) {
- HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
- inode->i_sb->s_dirt = 1;
- }
- hfsplus_inode_write_fork(inode, &vhdr->start_file);
+ fork = &vhdr->start_file;
break;
case HFSPLUS_ATTR_CNID:
- if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) {
- HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
- inode->i_sb->s_dirt = 1;
- }
- hfsplus_inode_write_fork(inode, &vhdr->attr_file);
- hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree);
- break;
+ fork = &vhdr->attr_file;
+ tree = sbi->attr_tree;
+ /* must not fall through into default, which rejects the inode */
+ break;
+ default:
+ return -EIO;
+ }
+
+ if (fork->total_size != cpu_to_be64(inode->i_size)) {
+ set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
+ inode->i_sb->s_dirt = 1;
}
- return ret;
+ hfsplus_inode_write_fork(inode, fork);
+ if (tree)
+ hfs_btree_write(tree);
+ return 0;
+}
+
+/*
+ * ->write_inode handler. Calls hfsplus_ext_write_extent() for every
+ * inode first, then hands user inodes (>= HFSPLUS_FIRSTUSER_CNID) and
+ * the root directory to hfsplus_cat_write_inode(); all remaining inode
+ * numbers go to hfsplus_system_write_inode(). @wbc is not used here.
+ *
+ * Returns whatever the selected helper returns.
+ */
+static int hfsplus_write_inode(struct inode *inode,
+ struct writeback_control *wbc)
+{
+ dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
+
+ hfsplus_ext_write_extent(inode);
+
+ if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
+ inode->i_ino == HFSPLUS_ROOT_CNID)
+ return hfsplus_cat_write_inode(inode);
+ else
+ return hfsplus_system_write_inode(inode);
}
static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
if (HFSPLUS_IS_RSRC(inode)) {
- HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL;
- iput(HFSPLUS_I(inode).rsrc_inode);
+ HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
+ iput(HFSPLUS_I(inode)->rsrc_inode);
}
}
int hfsplus_sync_fs(struct super_block *sb, int wait)
{
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
dprint(DBG_SUPER, "hfsplus_write_super\n");
- lock_super(sb);
+ mutex_lock(&sbi->vh_mutex);
+ mutex_lock(&sbi->alloc_mutex);
sb->s_dirt = 0;
- vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
- vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
- vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid);
- vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count);
- vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
+ vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
+ vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
+ vhdr->folder_count = cpu_to_be32(sbi->folder_count);
+ vhdr->file_count = cpu_to_be32(sbi->file_count);
- mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
- if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) {
- if (HFSPLUS_SB(sb).sect_count) {
+ mark_buffer_dirty(sbi->s_vhbh);
+ if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
+ if (sbi->sect_count) {
struct buffer_head *bh;
u32 block, offset;
- block = HFSPLUS_SB(sb).blockoffset;
- block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9);
- offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1);
- printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset,
- HFSPLUS_SB(sb).sect_count, block, offset);
+ block = sbi->blockoffset;
+ block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
+ offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
+ printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
+ sbi->blockoffset, sbi->sect_count,
+ block, offset);
bh = sb_bread(sb, block);
if (bh) {
vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
- memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr));
+ memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
mark_buffer_dirty(bh);
brelse(bh);
} else
printk(KERN_WARNING "hfs: backup not found!\n");
}
}
- HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
}
- unlock_super(sb);
+ mutex_unlock(&sbi->alloc_mutex);
+ mutex_unlock(&sbi->vh_mutex);
return 0;
}
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
static void hfsplus_put_super(struct super_block *sb)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+
dprint(DBG_SUPER, "hfsplus_put_super\n");
+
if (!sb->s_fs_info)
return;
- lock_kernel();
-
if (sb->s_dirt)
hfsplus_write_super(sb);
- if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+ if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
vhdr->modify_date = hfsp_now2mt();
vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
- mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
- sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+ mark_buffer_dirty(sbi->s_vhbh);
+ sync_dirty_buffer(sbi->s_vhbh);
}
- hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
- hfs_btree_close(HFSPLUS_SB(sb).ext_tree);
- iput(HFSPLUS_SB(sb).alloc_file);
- iput(HFSPLUS_SB(sb).hidden_dir);
- brelse(HFSPLUS_SB(sb).s_vhbh);
- unload_nls(HFSPLUS_SB(sb).nls);
+ hfs_btree_close(sbi->cat_tree);
+ hfs_btree_close(sbi->ext_tree);
+ iput(sbi->alloc_file);
+ iput(sbi->hidden_dir);
+ brelse(sbi->s_vhbh);
+ unload_nls(sbi->nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
-
- unlock_kernel();
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = HFSPLUS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
- buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift;
+ buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
+ buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
buf->f_bavail = buf->f_bfree;
buf->f_files = 0xFFFFFFFF;
- buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid;
+ buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
if (!(*flags & MS_RDONLY)) {
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
+ struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
struct hfsplus_sb_info sbi;
memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
- sbi.nls = HFSPLUS_SB(sb).nls;
+ sbi.nls = HFSPLUS_SB(sb)->nls;
if (!hfsplus_parse_options(data, &sbi))
return -EINVAL;
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
"running fsck.hfsplus is recommended. leaving read-only.\n");
sb->s_flags |= MS_RDONLY;
*flags |= MS_RDONLY;
- } else if (sbi.flags & HFSPLUS_SB_FORCE) {
+ } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
/* nothing */
} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
sb->s_fs_info = sbi;
- INIT_HLIST_HEAD(&sbi->rsrc_inodes);
+ mutex_init(&sbi->alloc_mutex);
+ mutex_init(&sbi->vh_mutex);
hfsplus_fill_defaults(sbi);
if (!hfsplus_parse_options(data, sbi)) {
printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
goto cleanup;
}
- vhdr = HFSPLUS_SB(sb).s_vhdr;
+ vhdr = sbi->s_vhdr;
/* Copy parts of the volume header into the superblock */
sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_ERR "hfs: wrong filesystem version\n");
goto cleanup;
}
- HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks);
- HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks);
- HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc);
- HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid);
- HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count);
- HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count);
- HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
- if (!HFSPLUS_SB(sb).data_clump_blocks)
- HFSPLUS_SB(sb).data_clump_blocks = 1;
- HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift;
- if (!HFSPLUS_SB(sb).rsrc_clump_blocks)
- HFSPLUS_SB(sb).rsrc_clump_blocks = 1;
+ sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
+ sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
+ sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
+ sbi->file_count = be32_to_cpu(vhdr->file_count);
+ sbi->folder_count = be32_to_cpu(vhdr->folder_count);
+ sbi->data_clump_blocks =
+ be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
+ if (!sbi->data_clump_blocks)
+ sbi->data_clump_blocks = 1;
+ sbi->rsrc_clump_blocks =
+ be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
+ if (!sbi->rsrc_clump_blocks)
+ sbi->rsrc_clump_blocks = 1;
/* Set up operations so we can load metadata */
sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
"running fsck.hfsplus is recommended. mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
- } else if (sbi->flags & HFSPLUS_SB_FORCE) {
+ } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
"use the force option at your own risk, mounting read-only.\n");
sb->s_flags |= MS_RDONLY;
}
- sbi->flags &= ~HFSPLUS_SB_FORCE;
/* Load metadata objects (B*Trees) */
- HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
- if (!HFSPLUS_SB(sb).ext_tree) {
+ sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
+ if (!sbi->ext_tree) {
printk(KERN_ERR "hfs: failed to load extents file\n");
goto cleanup;
}
- HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
- if (!HFSPLUS_SB(sb).cat_tree) {
+ sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
+ if (!sbi->cat_tree) {
printk(KERN_ERR "hfs: failed to load catalog file\n");
goto cleanup;
}
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = PTR_ERR(inode);
goto cleanup;
}
- HFSPLUS_SB(sb).alloc_file = inode;
+ sbi->alloc_file = inode;
/* Load the root directory */
root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
str.name = HFSP_HIDDENDIR_NAME;
- hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
+ hfs_find_init(sbi->cat_tree, &fd);
hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
err = PTR_ERR(inode);
goto cleanup;
}
- HFSPLUS_SB(sb).hidden_dir = inode;
+ sbi->hidden_dir = inode;
} else
hfs_find_exit(&fd);
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
be32_add_cpu(&vhdr->write_count, 1);
vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
- mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
- sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
+ mark_buffer_dirty(sbi->s_vhbh);
+ sync_dirty_buffer(sbi->s_vhbh);
- if (!HFSPLUS_SB(sb).hidden_dir) {
+ if (!sbi->hidden_dir) {
printk(KERN_DEBUG "hfs: create hidden dir...\n");
- HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
- hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode,
- &str, HFSPLUS_SB(sb).hidden_dir);
- mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir);
+
+ mutex_lock(&sbi->vh_mutex);
+ sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+ hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
+ &str, sbi->hidden_dir);
+ mutex_unlock(&sbi->vh_mutex);
+
+ mark_inode_dirty(sbi->hidden_dir);
}
out:
unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
static void hfsplus_destroy_inode(struct inode *inode)
{
- kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode));
+ kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
}
#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
{
const hfsplus_unichr *ip;
- struct nls_table *nls = HFSPLUS_SB(sb).nls;
+ struct nls_table *nls = HFSPLUS_SB(sb)->nls;
u8 *op;
u16 cc, c0, c1;
u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
ustrlen = be16_to_cpu(ustr->length);
len = *len_p;
ce1 = NULL;
- compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+ compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
while (ustrlen > 0) {
c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
wchar_t *uc)
{
- int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc);
+ int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
if (size <= 0) {
*uc = '?';
size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
u16 *dstr, outlen = 0;
wchar_t c;
- decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+ decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
size = asc2unichar(sb, astr, len, &c);
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
wchar_t c;
u16 c2;
- casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
- decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+ casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+ decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
hash = init_name_hash();
astr = str->name;
len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
u16 c1, c2;
wchar_t c;
- casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD);
- decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE);
+ casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
+ decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
astr1 = s1->name;
len1 = s1->len;
astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
*start = 0;
*size = sb->s_bdev->bd_inode->i_size >> 9;
- if (HFSPLUS_SB(sb).session >= 0) {
- te.cdte_track = HFSPLUS_SB(sb).session;
+ if (HFSPLUS_SB(sb)->session >= 0) {
+ te.cdte_track = HFSPLUS_SB(sb)->session;
te.cdte_format = CDROM_LBA;
res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
/* Takes in super block, returns true if good data read */
int hfsplus_read_wrapper(struct super_block *sb)
{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
struct buffer_head *bh;
struct hfsplus_vh *vhdr;
struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
break;
if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
- HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX;
+ set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
break;
}
brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (blocksize < HFSPLUS_SECTOR_SIZE ||
((blocksize - 1) & blocksize))
return -EINVAL;
- HFSPLUS_SB(sb).alloc_blksz = blocksize;
- HFSPLUS_SB(sb).alloc_blksz_shift = 0;
+ sbi->alloc_blksz = blocksize;
+ sbi->alloc_blksz_shift = 0;
while ((blocksize >>= 1) != 0)
- HFSPLUS_SB(sb).alloc_blksz_shift++;
- blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE);
+ sbi->alloc_blksz_shift++;
+ blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
/* align block size to block offset */
while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
return -EINVAL;
}
- HFSPLUS_SB(sb).blockoffset = part_start >>
- (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
- HFSPLUS_SB(sb).sect_count = part_size;
- HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift -
- sb->s_blocksize_bits;
+ sbi->blockoffset =
+ part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
+ sbi->sect_count = part_size;
+ sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
if (!bh)
return -EIO;
/* should still be the same... */
- if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ?
- cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) :
- cpu_to_be16(HFSPLUS_VOLHEAD_SIG)))
- goto error;
- HFSPLUS_SB(sb).s_vhbh = bh;
- HFSPLUS_SB(sb).s_vhdr = vhdr;
+ if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+ if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
+ goto error;
+ } else {
+ if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
+ goto error;
+ }
+
+ sbi->s_vhbh = bh;
+ sbi->s_vhdr = vhdr;
return 0;
error:
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 56bd15c5bf6c..63b6f5632318 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,6 +1,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ depends on BKL # nontrivial to fix
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0e8014ea6b94..262419f83d80 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1371,6 +1371,10 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
if (!compat && !ro && !incompat)
return 1;
+ /* Load journal superblock if it is not loaded yet. */
+ if (journal->j_format_version == 0 &&
+ journal_get_superblock(journal) != 0)
+ return 0;
if (journal->j_format_version == 1)
return 0;
diff --git a/fs/libfs.c b/fs/libfs.c
index 0a9da95317f7..62baa0387d6e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -913,6 +913,35 @@ int generic_file_fsync(struct file *file, int datasync)
}
EXPORT_SYMBOL(generic_file_fsync);
+/**
+ * generic_check_addressable - Check addressability of file system
+ * @blocksize_bits: log of file system block size
+ * @num_blocks: number of blocks in file system
+ *
+ * Determine whether a file system with @num_blocks blocks (and a
+ * block size of 2**@blocksize_bits) is addressable by the sector_t
+ * and page cache of the system. Return 0 if so, -EFBIG if it is too
+ * large, and -EINVAL if @blocksize_bits is smaller than 9 or larger
+ * than PAGE_CACHE_SHIFT. A file system with zero blocks is always
+ * reported as addressable.
+ */
+int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
+{
+ u64 last_fs_block;
+ u64 last_fs_page;
+
+ if (unlikely(num_blocks == 0))
+ return 0;
+
+ if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+ return -EINVAL;
+
+ /*
+ * Derive the last block/page only after blocksize_bits has been
+ * range-checked, so that neither shift count below can be out of
+ * bounds.
+ */
+ last_fs_block = num_blocks - 1;
+ last_fs_page = last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+
+ if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
+ (last_fs_page > (pgoff_t)(~0ULL))) {
+ return -EFBIG;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(generic_check_addressable);
+
/*
* No-op implementation of ->fsync for in-memory filesystems.
*/
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index f7e13db613cb..b950415d7c43 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,7 @@
config NFS_FS
tristate "NFS client support"
depends on INET && FILE_LOCKING
+ depends on BKL # fix as soon as lockd is done
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 4264377552e2..7cf4ddafb4ab 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -2,6 +2,7 @@ config NFSD
tristate "NFS server support"
depends on INET
depends on FILE_LOCKING
+ depends on BKL # fix as soon as lockd is done
select LOCKD
select SUNRPC
select EXPORTFS
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index cdfb8c6a4206..c16f8d8331b5 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -196,8 +196,6 @@ fh_lock(struct svc_fh *fhp)
static inline void
fh_unlock(struct svc_fh *fhp)
{
- BUG_ON(!fhp->fh_dentry);
-
if (fhp->fh_locked) {
fill_post_wcc(fhp);
mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..b388443c3a09 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -3,4 +3,4 @@ config FSNOTIFY
source "fs/notify/dnotify/Kconfig"
source "fs/notify/inotify/Kconfig"
-source "fs/notify/fanotify/Kconfig"
+#source "fs/notify/fanotify/Kconfig"
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0de69c9a08be..5cfeee118158 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -883,8 +883,8 @@ struct ocfs2_write_ctxt {
* out in so that future reads from that region will get
* zero's.
*/
- struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int w_num_pages;
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;
/*
@@ -1642,7 +1642,8 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page)
@@ -1692,7 +1693,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mlog_errno(ret);
goto out;
} else if (ret == 1) {
- ret = ocfs2_refcount_cow(inode, di_bh,
+ ret = ocfs2_refcount_cow(inode, filp, di_bh,
wc->w_cpos, wc->w_clen, UINT_MAX);
if (ret) {
mlog_errno(ret);
@@ -1854,7 +1855,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep,
fsdata, di_bh, NULL);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index c48e93ffc513..7606f663da6d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
-int ocfs2_write_begin_nolock(struct address_space *mapping,
+int ocfs2_write_begin_nolock(struct file *filp,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
struct buffer_head *di_bh, struct page *mmap_page);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 41d5f1f92d56..52c7557f3e25 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -62,10 +62,51 @@ static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
+/*
+ * In global heartbeat, we maintain a series of region bitmaps.
+ * - o2hb_region_bitmap allows us to limit the region number to max region.
+ * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
+ * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
+ * heartbeat on it.
+ * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
+ */
+static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
+
+#define O2HB_DB_TYPE_LIVENODES 0
+#define O2HB_DB_TYPE_LIVEREGIONS 1
+#define O2HB_DB_TYPE_QUORUMREGIONS 2
+#define O2HB_DB_TYPE_FAILEDREGIONS 3
+#define O2HB_DB_TYPE_REGION_LIVENODES 4
+#define O2HB_DB_TYPE_REGION_NUMBER 5
+#define O2HB_DB_TYPE_REGION_ELAPSED_TIME 6
+struct o2hb_debug_buf {
+ int db_type;
+ int db_size;
+ int db_len;
+ void *db_data;
+};
+
+static struct o2hb_debug_buf *o2hb_db_livenodes;
+static struct o2hb_debug_buf *o2hb_db_liveregions;
+static struct o2hb_debug_buf *o2hb_db_quorumregions;
+static struct o2hb_debug_buf *o2hb_db_failedregions;
+
#define O2HB_DEBUG_DIR "o2hb"
#define O2HB_DEBUG_LIVENODES "livenodes"
+#define O2HB_DEBUG_LIVEREGIONS "live_regions"
+#define O2HB_DEBUG_QUORUMREGIONS "quorum_regions"
+#define O2HB_DEBUG_FAILEDREGIONS "failed_regions"
+#define O2HB_DEBUG_REGION_NUMBER "num"
+#define O2HB_DEBUG_REGION_ELAPSED_TIME "elapsed_time_in_ms"
+
static struct dentry *o2hb_debug_dir;
static struct dentry *o2hb_debug_livenodes;
+static struct dentry *o2hb_debug_liveregions;
+static struct dentry *o2hb_debug_quorumregions;
+static struct dentry *o2hb_debug_failedregions;
static LIST_HEAD(o2hb_all_regions);
@@ -77,7 +118,19 @@ static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
#define O2HB_DEFAULT_BLOCK_BITS 9
+enum o2hb_heartbeat_modes {
+ O2HB_HEARTBEAT_LOCAL = 0,
+ O2HB_HEARTBEAT_GLOBAL,
+ O2HB_HEARTBEAT_NUM_MODES,
+};
+
+char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = {
+ "local", /* O2HB_HEARTBEAT_LOCAL */
+ "global", /* O2HB_HEARTBEAT_GLOBAL */
+};
+
unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
+unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL;
/* Only sets a new threshold if there are no active regions.
*
@@ -94,6 +147,22 @@ static void o2hb_dead_threshold_set(unsigned int threshold)
}
}
+/*
+ * Switch the heartbeat mode. The mode is only changed when hb_mode is a
+ * known mode and no regions are currently registered.
+ *
+ * Returns 0 when the mode was changed, -1 otherwise.
+ */
+static int o2hb_global_hearbeat_mode_set(unsigned int hb_mode)
+{
+	int status = -1;
+
+	if (hb_mode >= O2HB_HEARTBEAT_NUM_MODES)
+		return status;
+
+	/* o2hb_all_regions is inspected under o2hb_live_lock */
+	spin_lock(&o2hb_live_lock);
+	if (list_empty(&o2hb_all_regions)) {
+		o2hb_heartbeat_mode = hb_mode;
+		status = 0;
+	}
+	spin_unlock(&o2hb_live_lock);
+
+	return status;
+}
+
struct o2hb_node_event {
struct list_head hn_item;
enum o2hb_callback_type hn_event_type;
@@ -135,6 +204,18 @@ struct o2hb_region {
struct block_device *hr_bdev;
struct o2hb_disk_slot *hr_slots;
+ /* live node map of this region */
+ unsigned long hr_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned int hr_region_num;
+
+ struct dentry *hr_debug_dir;
+ struct dentry *hr_debug_livenodes;
+ struct dentry *hr_debug_regnum;
+ struct dentry *hr_debug_elapsed_time;
+ struct o2hb_debug_buf *hr_db_livenodes;
+ struct o2hb_debug_buf *hr_db_regnum;
+ struct o2hb_debug_buf *hr_db_elapsed_time;
+
/* let the person setting up hb wait for it to return until it
* has reached a 'steady' state. This will be fixed when we have
* a more complete api that doesn't lead to this sort of fragility. */
@@ -163,8 +244,19 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
+/* Return the number of bits set in the first 'count' bits of bitmap 'map'. */
+static int o2hb_pop_count(void *map, int count)
+{
+	int nset = 0;
+	int bit;
+
+	for (bit = find_next_bit(map, count, 0); bit < count;
+	     bit = find_next_bit(map, count, bit + 1))
+		nset++;
+
+	return nset;
+}
+
static void o2hb_write_timeout(struct work_struct *work)
{
+ int failed, quorum;
+ unsigned long flags;
struct o2hb_region *reg =
container_of(work, struct o2hb_region,
hr_write_timeout_work.work);
@@ -172,6 +264,28 @@ static void o2hb_write_timeout(struct work_struct *work)
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
+
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock_irqsave(&o2hb_live_lock, flags);
+ if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+ set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+ O2NM_MAX_REGIONS);
+ quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ O2NM_MAX_REGIONS);
+ spin_unlock_irqrestore(&o2hb_live_lock, flags);
+
+ mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
+ quorum, failed);
+
+ /*
+ * Fence if the number of failed regions >= half the number
+ * of quorum regions
+ */
+ if ((failed << 1) < quorum)
+ return;
+ }
+
o2quo_disk_timeout();
}
@@ -180,6 +294,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
O2HB_MAX_WRITE_TIMEOUT_MS);
+ if (o2hb_global_heartbeat_active()) {
+ spin_lock(&o2hb_live_lock);
+ clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ }
cancel_delayed_work(&reg->hr_write_timeout_work);
reg->hr_last_timeout_start = jiffies;
schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -513,6 +632,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
{
assert_spin_locked(&o2hb_live_lock);
+ BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
+
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -554,6 +675,35 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
o2nm_node_put(node);
}
+/*
+ * Mark 'reg' as a quorum region if it qualifies. Must be called with
+ * o2hb_live_lock held. Does nothing unless global heartbeat is active.
+ */
+static void o2hb_set_quorum_device(struct o2hb_region *reg,
+				   struct o2hb_disk_slot *slot)
+{
+	assert_spin_locked(&o2hb_live_lock);
+
+	/* Nothing to do in local mode or if the region is already in quorum */
+	if (!o2hb_global_heartbeat_active() ||
+	    test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+		return;
+
+	/*
+	 * The region joins the quorum only once every live node has been
+	 * seen heartbeating on it, i.e. the region's live node map equals
+	 * the global live node map.
+	 */
+	if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
+		   sizeof(o2hb_live_node_bitmap)) != 0)
+		return;
+
+	/* The slot must also have changed often enough to count as live */
+	if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
+		return;
+
+	printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
+	       config_item_name(&reg->hr_item));
+
+	set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+}
+
+
static int o2hb_check_slot(struct o2hb_region *reg,
struct o2hb_disk_slot *slot)
{
@@ -565,14 +715,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
u64 cputime;
unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
unsigned int slot_dead_ms;
+ int tmp;
memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
- /* Is this correct? Do we assume that the node doesn't exist
- * if we're not configured for him? */
+ /*
+ * If a node is no longer configured but is still in the livemap, we
+ * may need to clear that bit from the livemap.
+ */
node = o2nm_get_node_by_num(slot->ds_node_num);
- if (!node)
- return 0;
+ if (!node) {
+ spin_lock(&o2hb_live_lock);
+ tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
+ spin_unlock(&o2hb_live_lock);
+ if (!tmp)
+ return 0;
+ }
if (!o2hb_verify_crc(reg, hb_block)) {
/* all paths from here will drop o2hb_live_lock for
@@ -639,8 +797,12 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
slot->ds_node_num, (long long)slot->ds_last_generation);
+ set_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* first on the list generates a callback */
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Add node %d to live nodes "
+ "bitmap\n", slot->ds_node_num);
set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
@@ -684,13 +846,18 @@ fire_callbacks:
mlog(ML_HEARTBEAT, "Node %d left my region\n",
slot->ds_node_num);
+ clear_bit(slot->ds_node_num, reg->hr_live_node_bitmap);
+
/* last off the live_slot generates a callback */
list_del_init(&slot->ds_live_item);
if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
+ mlog(ML_HEARTBEAT, "o2hb: Remove node %d from live "
+ "nodes bitmap\n", slot->ds_node_num);
clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
- o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
- slot->ds_node_num);
+ /* node can be null */
+ o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
+ node, slot->ds_node_num);
changed = 1;
}
@@ -706,11 +873,14 @@ fire_callbacks:
slot->ds_equal_samples = 0;
}
out:
+ o2hb_set_quorum_device(reg, slot);
+
spin_unlock(&o2hb_live_lock);
o2hb_run_event_list(&event);
- o2nm_node_put(node);
+ if (node)
+ o2nm_node_put(node);
return changed;
}
@@ -737,6 +907,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
int i, ret, highest_node, change = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct o2hb_bio_wait_ctxt write_wc;
ret = o2nm_configured_node_map(configured_nodes,
@@ -746,6 +917,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
return ret;
}
+ /*
+ * If a node is not configured but is in the livemap, we still need
+ * to read the slot so as to be able to remove it from the livemap.
+ */
+ o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+ i = -1;
+ while ((i = find_next_bit(live_node_bitmap,
+ O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+ set_bit(i, configured_nodes);
+ }
+
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
@@ -917,21 +1099,59 @@ static int o2hb_thread(void *data)
#ifdef CONFIG_DEBUG_FS
static int o2hb_debug_open(struct inode *inode, struct file *file)
{
+ struct o2hb_debug_buf *db = inode->i_private;
+ struct o2hb_region *reg;
unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
char *buf = NULL;
int i = -1;
int out = 0;
+ /* max_nodes should be the largest bitmap we pass here */
+ BUG_ON(sizeof(map) < db->db_size);
+
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto bail;
- o2hb_fill_node_map(map, sizeof(map));
+ switch (db->db_type) {
+ case O2HB_DB_TYPE_LIVENODES:
+ case O2HB_DB_TYPE_LIVEREGIONS:
+ case O2HB_DB_TYPE_QUORUMREGIONS:
+ case O2HB_DB_TYPE_FAILEDREGIONS:
+ spin_lock(&o2hb_live_lock);
+ memcpy(map, db->db_data, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_LIVENODES:
+ spin_lock(&o2hb_live_lock);
+ reg = (struct o2hb_region *)db->db_data;
+ memcpy(map, reg->hr_live_node_bitmap, db->db_size);
+ spin_unlock(&o2hb_live_lock);
+ break;
+
+ case O2HB_DB_TYPE_REGION_NUMBER:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%d\n",
+ reg->hr_region_num);
+ goto done;
+
+ case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
+ reg = (struct o2hb_region *)db->db_data;
+ out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
+ jiffies_to_msecs(jiffies -
+ reg->hr_last_timeout_start));
+ goto done;
+
+ default:
+ goto done;
+ }
- while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+ while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len)
out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
out += snprintf(buf + out, PAGE_SIZE - out, "\n");
+done:
i_size_write(inode, out);
file->private_data = buf;
@@ -978,10 +1198,104 @@ static const struct file_operations o2hb_debug_fops = {
void o2hb_exit(void)
{
- if (o2hb_debug_livenodes)
- debugfs_remove(o2hb_debug_livenodes);
- if (o2hb_debug_dir)
- debugfs_remove(o2hb_debug_dir);
+ kfree(o2hb_db_livenodes);
+ kfree(o2hb_db_liveregions);
+ kfree(o2hb_db_quorumregions);
+ kfree(o2hb_db_failedregions);
+ debugfs_remove(o2hb_debug_failedregions);
+ debugfs_remove(o2hb_debug_quorumregions);
+ debugfs_remove(o2hb_debug_liveregions);
+ debugfs_remove(o2hb_debug_livenodes);
+ debugfs_remove(o2hb_debug_dir);
+}
+
+/*
+ * Allocate a debug buffer descriptor of db_len bytes, fill it in and create
+ * a read-only debugfs file 'name' under 'dir' that carries the descriptor
+ * as its private data.
+ *
+ * The descriptor is stored in *db; it is left there even when the file
+ * creation fails. Returns the new dentry, or NULL if the allocation failed
+ * (otherwise whatever debugfs_create_file() returns).
+ */
+static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
+					struct o2hb_debug_buf **db, int db_len,
+					int type, int size, int len, void *data)
+{
+	struct o2hb_debug_buf *buf = kmalloc(db_len, GFP_KERNEL);
+
+	*db = buf;
+	if (!buf)
+		return NULL;
+
+	buf->db_type = type;
+	buf->db_size = size;
+	buf->db_len = len;
+	buf->db_data = data;
+
+	return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, buf,
+				   &o2hb_debug_fops);
+}
+
+/*
+ * Create the o2hb debugfs directory and the four global bitmap files
+ * (live nodes, live regions, quorum regions, failed regions).
+ *
+ * Returns 0 on success. On any failure everything created so far is torn
+ * down through o2hb_exit() and -ENOMEM is returned.
+ */
+static int o2hb_debug_init(void)
+{
+	int ret = -ENOMEM;
+
+	o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
+	if (!o2hb_debug_dir) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_livenodes =
+		o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+				  o2hb_debug_dir,
+				  &o2hb_db_livenodes,
+				  sizeof(*o2hb_db_livenodes),
+				  O2HB_DB_TYPE_LIVENODES,
+				  sizeof(o2hb_live_node_bitmap),
+				  O2NM_MAX_NODES,
+				  o2hb_live_node_bitmap);
+	if (!o2hb_debug_livenodes) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_liveregions =
+		o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS,
+				  o2hb_debug_dir,
+				  &o2hb_db_liveregions,
+				  sizeof(*o2hb_db_liveregions),
+				  O2HB_DB_TYPE_LIVEREGIONS,
+				  sizeof(o2hb_live_region_bitmap),
+				  O2NM_MAX_REGIONS,
+				  o2hb_live_region_bitmap);
+	if (!o2hb_debug_liveregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_quorumregions =
+		o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS,
+				  o2hb_debug_dir,
+				  &o2hb_db_quorumregions,
+				  sizeof(*o2hb_db_quorumregions),
+				  O2HB_DB_TYPE_QUORUMREGIONS,
+				  sizeof(o2hb_quorum_region_bitmap),
+				  O2NM_MAX_REGIONS,
+				  o2hb_quorum_region_bitmap);
+	if (!o2hb_debug_quorumregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	o2hb_debug_failedregions =
+		o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS,
+				  o2hb_debug_dir,
+				  &o2hb_db_failedregions,
+				  sizeof(*o2hb_db_failedregions),
+				  O2HB_DB_TYPE_FAILEDREGIONS,
+				  sizeof(o2hb_failed_region_bitmap),
+				  O2NM_MAX_REGIONS,
+				  o2hb_failed_region_bitmap);
+	if (!o2hb_debug_failedregions) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
+	return 0;
+
+bail:
+	/* error path only: undo whatever was created above */
+	o2hb_exit();
+	return ret;
 }
int o2hb_init(void)
@@ -997,24 +1311,12 @@ int o2hb_init(void)
INIT_LIST_HEAD(&o2hb_node_events);
memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
+ memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
+ memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
+ memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
+ memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
- o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
- if (!o2hb_debug_dir) {
- mlog_errno(-ENOMEM);
- return -ENOMEM;
- }
-
- o2hb_debug_livenodes = debugfs_create_file(O2HB_DEBUG_LIVENODES,
- S_IFREG|S_IRUSR,
- o2hb_debug_dir, NULL,
- &o2hb_debug_fops);
- if (!o2hb_debug_livenodes) {
- mlog_errno(-ENOMEM);
- debugfs_remove(o2hb_debug_dir);
- return -ENOMEM;
- }
-
- return 0;
+ return o2hb_debug_init();
}
/* if we're already in a callback then we're already serialized by the sem */
@@ -1078,6 +1380,13 @@ static void o2hb_region_release(struct config_item *item)
if (reg->hr_slots)
kfree(reg->hr_slots);
+ kfree(reg->hr_db_regnum);
+ kfree(reg->hr_db_livenodes);
+ debugfs_remove(reg->hr_debug_livenodes);
+ debugfs_remove(reg->hr_debug_regnum);
+ debugfs_remove(reg->hr_debug_elapsed_time);
+ debugfs_remove(reg->hr_debug_dir);
+
spin_lock(&o2hb_live_lock);
list_del(&reg->hr_all_item);
spin_unlock(&o2hb_live_lock);
@@ -1441,6 +1750,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
+ if (o2hb_global_heartbeat_active())
+ set_bit(reg->hr_region_num, o2hb_live_region_bitmap);
spin_unlock(&o2hb_live_lock);
if (hb_task)
@@ -1448,6 +1759,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
else
ret = -EIO;
+ if (hb_task && o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
+ config_item_name(&reg->hr_item));
+
out:
if (filp)
fput(filp);
@@ -1586,21 +1901,94 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
: NULL;
}
+/*
+ * Create the per-region debugfs directory (named after the region's config
+ * item) under 'dir', holding the region's live node map, region number and
+ * elapsed time files.
+ *
+ * Returns 0 on success or -ENOMEM on the first failure. Nothing is undone
+ * here on failure; entries already created stay attached to 'reg'.
+ */
+static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
+{
+	int ret = -ENOMEM;
+
+	reg->hr_debug_dir =
+		debugfs_create_dir(config_item_name(&reg->hr_item), dir);
+	if (!reg->hr_debug_dir) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	reg->hr_debug_livenodes =
+		o2hb_debug_create(O2HB_DEBUG_LIVENODES,
+				  reg->hr_debug_dir,
+				  &(reg->hr_db_livenodes),
+				  sizeof(*(reg->hr_db_livenodes)),
+				  O2HB_DB_TYPE_REGION_LIVENODES,
+				  sizeof(reg->hr_live_node_bitmap),
+				  O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_livenodes) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	reg->hr_debug_regnum =
+		o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER,
+				  reg->hr_debug_dir,
+				  &(reg->hr_db_regnum),
+				  sizeof(*(reg->hr_db_regnum)),
+				  O2HB_DB_TYPE_REGION_NUMBER,
+				  0, O2NM_MAX_NODES, reg);
+	if (!reg->hr_debug_regnum) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	reg->hr_debug_elapsed_time =
+		o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME,
+				  reg->hr_debug_dir,
+				  &(reg->hr_db_elapsed_time),
+				  sizeof(*(reg->hr_db_elapsed_time)),
+				  O2HB_DB_TYPE_REGION_ELAPSED_TIME,
+				  0, 0, reg);
+	if (!reg->hr_debug_elapsed_time) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+
static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
const char *name)
{
struct o2hb_region *reg = NULL;
+ int ret;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
return ERR_PTR(-ENOMEM);
- config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+ if (strlen(name) > O2HB_MAX_REGION_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
spin_lock(&o2hb_live_lock);
+ reg->hr_region_num = 0;
+ if (o2hb_global_heartbeat_active()) {
+ reg->hr_region_num = find_first_zero_bit(o2hb_region_bitmap,
+ O2NM_MAX_REGIONS);
+ if (reg->hr_region_num >= O2NM_MAX_REGIONS) {
+ spin_unlock(&o2hb_live_lock);
+ return ERR_PTR(-EFBIG);
+ }
+ set_bit(reg->hr_region_num, o2hb_region_bitmap);
+ }
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
+ config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
+
+ ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
+ if (ret) {
+ config_item_put(&reg->hr_item);
+ return ERR_PTR(ret);
+ }
+
return &reg->hr_item;
}
@@ -1612,6 +2000,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
/* stop the thread when the user removes the region dir */
spin_lock(&o2hb_live_lock);
+ if (o2hb_global_heartbeat_active()) {
+ clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+ clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+ }
hb_task = reg->hr_task;
reg->hr_task = NULL;
spin_unlock(&o2hb_live_lock);
@@ -1628,6 +2020,9 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
wake_up(&o2hb_steady_queue);
}
+ if (o2hb_global_heartbeat_active())
+ printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
+ config_item_name(&reg->hr_item));
config_item_put(item);
}
@@ -1688,6 +2083,41 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group
return count;
}
+/* configfs 'mode' show: print the name of the current heartbeat mode. */
+static
+ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group,
+				       char *page)
+{
+	char *desc = o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode];
+
+	return sprintf(page, "%s\n", desc);
+}
+
+/*
+ * configfs 'mode' store: select the heartbeat mode by name.
+ *
+ * 'page' holds 'count' bytes written by the user; one trailing newline is
+ * ignored. The name is matched case-insensitively against
+ * o2hb_heartbeat_mode_desc[] and must match an entry in full.
+ *
+ * Returns 'count' when the name is a known mode, -EINVAL for an empty or
+ * unknown name.
+ * NOTE(review): 'count' is also returned when the mode is recognised but
+ * o2hb_global_hearbeat_mode_set() refuses the change; the write then has
+ * no effect and no error is reported.
+ */
+static
+ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
+					const char *page, size_t count)
+{
+	unsigned int i;
+	int ret;
+	size_t len;
+
+	/* an empty write has no last byte to inspect */
+	if (!count)
+		return -EINVAL;
+
+	len = (page[count - 1] == '\n') ? count - 1 : count;
+	if (!len)
+		return -EINVAL;
+
+	for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
+		/*
+		 * Comparing only 'len' bytes would accept any prefix of a
+		 * mode name (e.g. "g"), so insist on equal lengths too.
+		 */
+		if (strlen(o2hb_heartbeat_mode_desc[i]) != len ||
+		    strnicmp(page, o2hb_heartbeat_mode_desc[i], len))
+			continue;
+
+		ret = o2hb_global_hearbeat_mode_set(i);
+		if (!ret)
+			printk(KERN_NOTICE "o2hb: Heartbeat mode set to %s\n",
+			       o2hb_heartbeat_mode_desc[i]);
+		return count;
+	}
+
+	return -EINVAL;
+}
+
static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
.attr = { .ca_owner = THIS_MODULE,
.ca_name = "dead_threshold",
@@ -1696,8 +2126,17 @@ static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold
.store = o2hb_heartbeat_group_threshold_store,
};
+static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "mode",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = o2hb_heartbeat_group_mode_show,
+ .store = o2hb_heartbeat_group_mode_store,
+};
+
static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
&o2hb_heartbeat_group_attr_threshold.attr,
+ &o2hb_heartbeat_group_attr_mode.attr,
NULL,
};
@@ -1963,3 +2402,34 @@ void o2hb_stop_all_regions(void)
spin_unlock(&o2hb_live_lock);
}
EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
+
+/*
+ * Copy the names of all registered heartbeat regions into 'region_uuids'.
+ *
+ * 'region_uuids' is treated as an array of fixed-width records of
+ * O2HB_MAX_REGION_NAME_LEN bytes each; at most 'max_regions' records are
+ * written. A name shorter than the record is zero-padded; a name of exactly
+ * O2HB_MAX_REGION_NAME_LEN bytes fills the record with no terminating NUL.
+ *
+ * Returns the total number of registered regions, which may be larger than
+ * 'max_regions'.
+ */
+int o2hb_get_all_regions(char *region_uuids, u8 max_regions)
+{
+	struct o2hb_region *reg;
+	int numregs = 0;
+	char *p;
+
+	spin_lock(&o2hb_live_lock);
+
+	p = region_uuids;
+	list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) {
+		mlog(0, "Region: %s\n", config_item_name(&reg->hr_item));
+		if (numregs < max_regions) {
+			/*
+			 * Region names may be shorter than the record, so a
+			 * fixed-size memcpy would read past the end of the
+			 * name. strncpy stops at the NUL and zero-fills the
+			 * rest of the record.
+			 */
+			strncpy(p, config_item_name(&reg->hr_item),
+				O2HB_MAX_REGION_NAME_LEN);
+			p += O2HB_MAX_REGION_NAME_LEN;
+		}
+		numregs++;
+	}
+
+	spin_unlock(&o2hb_live_lock);
+
+	return numregs;
+}
+EXPORT_SYMBOL_GPL(o2hb_get_all_regions);
+
+/* Return non-zero when the heartbeat mode is O2HB_HEARTBEAT_GLOBAL. */
+int o2hb_global_heartbeat_active(void)
+{
+	return o2hb_heartbeat_mode == O2HB_HEARTBEAT_GLOBAL;
+}
+EXPORT_SYMBOL(o2hb_global_heartbeat_active);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 2f1649253b49..00ad8e8fea51 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -31,6 +31,8 @@
#define O2HB_REGION_TIMEOUT_MS 2000
+#define O2HB_MAX_REGION_NAME_LEN 32
+
/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD 2
/* number of equal samples to be seen as dead */
@@ -81,5 +83,7 @@ int o2hb_check_node_heartbeating(u8 node_num);
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+int o2hb_get_all_regions(char *region_uuids, u8 numregions);
+int o2hb_global_heartbeat_active(void);
#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index fd96e2a2fa56..ea2ed9f56c94 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -119,7 +119,8 @@
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
-#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */
+#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */
#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index ed0c9f367fed..bb240647ca5f 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -711,6 +711,8 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
+ mlog(ML_CLUSTER, "o2nm: Registering node %s\n", name);
+
return &node->nd_item;
}
@@ -744,6 +746,9 @@ static void o2nm_node_group_drop_item(struct config_group *group,
}
write_unlock(&cluster->cl_nodes_lock);
+ mlog(ML_CLUSTER, "o2nm: Unregistered node %s\n",
+ config_item_name(&node->nd_item));
+
config_item_put(item);
}
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
index 5b9854bad571..49b594325bec 100644
--- a/fs/ocfs2/cluster/ocfs2_nodemanager.h
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -36,4 +36,10 @@
/* host name, group name, cluster name all 64 bytes */
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
+/*
+ * Maximum number of global heartbeat regions allowed.
+ * **CAUTION** Changing this number will break dlm compatibility.
+ */
+#define O2NM_MAX_REGIONS 32
+
#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc28..9aa426e42123 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
{
o2quo_hb_down(node_num);
+ if (!node)
+ return;
+
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
o2quo_hb_up(node_num);
+ BUG_ON(!node);
+
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b4957c7d9fe2..edaded48e7e9 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -40,6 +40,14 @@
#include "inode.h"
#include "super.h"
+/*
+ * Record the parent directory's current ip_dir_lock_gen value in the
+ * d_fsdata of a negative dentry. It is a bug to call this on a dentry that
+ * has an inode attached.
+ */
+void ocfs2_dentry_attach_gen(struct dentry *dentry)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+	unsigned long dir_gen = OCFS2_I(parent)->ip_dir_lock_gen;
+
+	BUG_ON(dentry->d_inode);
+	/* the generation is stored by value in the pointer-sized field */
+	dentry->d_fsdata = (void *)dir_gen;
+}
+
+
static int ocfs2_dentry_revalidate(struct dentry *dentry,
struct nameidata *nd)
@@ -51,11 +59,20 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
- /* Never trust a negative dentry - force a new lookup. */
+ /* For a negative dentry -
+ * check the generation number of the parent and compare with the
+ * one stored in the inode.
+ */
if (inode == NULL) {
- mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
- dentry->d_name.name);
- goto bail;
+ unsigned long gen = (unsigned long) dentry->d_fsdata;
+ unsigned long pgen =
+ OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
+ mlog(0, "negative dentry: %.*s parent gen: %lu "
+ "dentry gen: %lu\n",
+ dentry->d_name.len, dentry->d_name.name, pgen, gen);
+ if (gen != pgen)
+ goto bail;
+ goto valid;
}
BUG_ON(!osb);
@@ -96,6 +113,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
goto bail;
}
+valid:
ret = 1;
bail:
@@ -227,6 +245,12 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
if (!inode)
return 0;
+ if (!dentry->d_inode && dentry->d_fsdata) {
+ /* Converting a negative dentry to positive
+ Clear dentry->d_fsdata */
+ dentry->d_fsdata = dl = NULL;
+ }
+
if (dl) {
mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno,
" \"%.*s\": old parent: %llu, new: %llu\n",
@@ -452,6 +476,7 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode)
out:
iput(inode);
+ ocfs2_dentry_attach_gen(dentry);
}
/*
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index f5dd1789acf1..b79eff709958 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -64,5 +64,6 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target,
struct inode *old_dir, struct inode *new_dir);
extern spinlock_t dentry_attach_lock;
+void ocfs2_dentry_attach_gen(struct dentry *dentry);
#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 765298908f1d..b36d0bf77a5a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -445,7 +445,9 @@ enum {
DLM_LOCK_REQUEST_MSG, /* 515 */
DLM_RECO_DATA_DONE_MSG, /* 516 */
DLM_BEGIN_RECO_MSG, /* 517 */
- DLM_FINALIZE_RECO_MSG /* 518 */
+ DLM_FINALIZE_RECO_MSG, /* 518 */
+ DLM_QUERY_REGION, /* 519 */
+ DLM_QUERY_NODEINFO, /* 520 */
};
struct dlm_reco_node_data
@@ -727,6 +729,31 @@ struct dlm_cancel_join
u8 domain[O2NM_MAX_NAME_LEN];
};
+struct dlm_query_region {
+ u8 qr_node;
+ u8 qr_numregions;
+ u8 qr_namelen;
+ u8 pad1;
+ u8 qr_domain[O2NM_MAX_NAME_LEN];
+ u8 qr_regions[O2HB_MAX_REGION_NAME_LEN * O2NM_MAX_REGIONS];
+};
+
+struct dlm_node_info {
+ u8 ni_nodenum;
+ u8 pad1;
+ u16 ni_ipv4_port;
+ u32 ni_ipv4_address;
+};
+
+struct dlm_query_nodeinfo {
+ u8 qn_nodenum;
+ u8 qn_numnodes;
+ u8 qn_namelen;
+ u8 pad1;
+ u8 qn_domain[O2NM_MAX_NAME_LEN];
+ struct dlm_node_info qn_nodes[O2NM_MAX_NODES];
+};
+
struct dlm_exit_domain
{
u8 node_idx;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 901ca52bf86b..272ec8631a51 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -493,7 +493,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
struct hlist_head *bucket;
struct hlist_node *list;
int i, out = 0;
- unsigned long total = 0, longest = 0, bktcnt;
+ unsigned long total = 0, longest = 0, bucket_count = 0;
out += snprintf(db->buf + out, db->len - out,
"Dumping MLEs for Domain: %s\n", dlm->name);
@@ -505,13 +505,13 @@ static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
mle = hlist_entry(list, struct dlm_master_list_entry,
master_hash_node);
++total;
- ++bktcnt;
+ ++bucket_count;
if (db->len - out < 200)
continue;
out += dump_mle(mle, db->buf + out, db->len - out);
}
- longest = max(longest, bktcnt);
- bktcnt = 0;
+ longest = max(longest, bucket_count);
+ bucket_count = 0;
}
spin_unlock(&dlm->master_lock);
@@ -782,7 +782,9 @@ static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
/* Domain: xxxxxxxxxx Key: 0xdfbac769 */
out += snprintf(db->buf + out, db->len - out,
- "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
+ "Domain: %s Key: 0x%08x Protocol: %d.%d\n",
+ dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor);
/* Thread Pid: xxx Node: xxx State: xxxxx */
out += snprintf(db->buf + out, db->len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 11a5c87fd7f7..58a93b953735 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -128,10 +128,14 @@ static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
* will have a negotiated version with the same major number and a minor
* number equal or smaller. The dlm_ctxt->dlm_locking_proto field should
* be used to determine what a running domain is actually using.
+ *
+ * New in version 1.1:
+ * - Message DLM_QUERY_REGION added to support global heartbeat
+ * - Message DLM_QUERY_NODEINFO added to allow online node removes
*/
static const struct dlm_protocol_version dlm_protocol = {
.pv_major = 1,
- .pv_minor = 0,
+ .pv_minor = 1,
};
#define DLM_DOMAIN_BACKOFF_MS 200
@@ -142,6 +146,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+ void *data, void **ret_data);
static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data);
static int dlm_protocol_compare(struct dlm_protocol_version *existing,
@@ -921,6 +927,370 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
return 0;
}
+/*
+ * dlm_match_regions() - check that a joining node heartbeats on exactly
+ * the same set of regions as the local node.
+ *
+ * @dlm: local domain context; only dlm->node_num is read (for messages)
+ * @qr:  DLM_QUERY_REGION payload received from the joining node
+ *
+ * Returns 0 if both nodes have global heartbeat disabled, or if both have
+ * it enabled and every local region is listed by the remote node and vice
+ * versa.  Returns -EINVAL on any mismatch and -ENOMEM if the scratch
+ * buffer for the local region list cannot be allocated.
+ *
+ * dlm_query_region_handler() calls this with dlm_domain_lock and
+ * dlm->spinlock held, so the allocation below must not sleep.
+ */
+static int dlm_match_regions(struct dlm_ctxt *dlm,
+			     struct dlm_query_region *qr)
+{
+	char *local = NULL, *remote = qr->qr_regions;
+	char *l, *r;
+	int localnr, i, j, foundit;
+	int status = 0;
+
+	if (!o2hb_global_heartbeat_active()) {
+		if (qr->qr_numregions) {
+			mlog(ML_ERROR, "Domain %s: Joining node %d has global "
+			     "heartbeat enabled but local node %d does not\n",
+			     qr->qr_domain, qr->qr_node, dlm->node_num);
+			status = -EINVAL;
+		}
+		goto bail;
+	}
+
+	if (o2hb_global_heartbeat_active() && !qr->qr_numregions) {
+		mlog(ML_ERROR, "Domain %s: Local node %d has global "
+		     "heartbeat enabled but joining node %d does not\n",
+		     qr->qr_domain, dlm->node_num, qr->qr_node);
+		status = -EINVAL;
+		goto bail;
+	}
+
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN, r);
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* Spinlocks are held by the caller: GFP_KERNEL could sleep here. */
+	local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC);
+	if (!local) {
+		status = -ENOMEM;
+		goto bail;
+	}
+
+	localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
+
+	/* compare local regions with remote */
+	l = local;
+	for (i = 0; i < localnr; ++i) {
+		foundit = 0;
+		r = remote;
+		/*
+		 * Only the first qr_numregions slots were filled in by the
+		 * sender; stop before the slot at index qr_numregions so we
+		 * neither match unused data nor run off the end of the array.
+		 */
+		for (j = 0; j < qr->qr_numregions; ++j) {
+			if (!memcmp(l, r, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			r += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in local node %d but not in joining node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, l,
+			     dlm->node_num, qr->qr_node);
+			goto bail;
+		}
+		l += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+	/* compare remote with local regions */
+	r = remote;
+	for (i = 0; i < qr->qr_numregions; ++i) {
+		foundit = 0;
+		l = local;
+		for (j = 0; j < localnr; ++j) {
+			if (!memcmp(r, l, O2HB_MAX_REGION_NAME_LEN)) {
+				foundit = 1;
+				break;
+			}
+			l += O2HB_MAX_REGION_NAME_LEN;
+		}
+		if (!foundit) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Region '%.*s' registered "
+			     "in joining node %d but not in local node %d\n",
+			     qr->qr_domain, O2HB_MAX_REGION_NAME_LEN, r,
+			     qr->qr_node, dlm->node_num);
+			goto bail;
+		}
+		r += O2HB_MAX_REGION_NAME_LEN;
+	}
+
+bail:
+	/* kfree(NULL) is a no-op, so the early exits above are safe */
+	kfree(local);
+
+	return status;
+}
+
+/*
+ * Send a DLM_QUERY_REGION message carrying the local heartbeat region
+ * list to every node set in @node_map except ourselves.
+ *
+ * Returns 0 if the map is empty or every queried node answered with
+ * status 0, -ENOMEM if the message cannot be allocated, otherwise the
+ * first non-zero send error or remote status (sending stops there).
+ */
+static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_region *query;
+	int node, n, remote_status, ret = 0;
+
+	/* Empty map: nobody to ask. */
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		return ret;
+
+	query = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL);
+	if (!query) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	query->qr_node = dlm->node_num;
+	query->qr_namelen = strlen(dlm->name);
+	memcpy(query->qr_domain, dlm->name, query->qr_namelen);
+	/* if local hb, the numregions will be zero */
+	if (o2hb_global_heartbeat_active())
+		query->qr_numregions = o2hb_get_all_regions(query->qr_regions,
+							    O2NM_MAX_REGIONS);
+
+	for (n = 0; n < query->qr_numregions; ++n)
+		mlog(0, "Region %.*s\n", O2HB_MAX_REGION_NAME_LEN,
+		     query->qr_regions + n * O2HB_MAX_REGION_NAME_LEN);
+
+	for (node = find_next_bit(node_map, O2NM_MAX_NODES, 0);
+	     node < O2NM_MAX_NODES;
+	     node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) {
+		if (node == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending regions to node %d\n", node);
+
+		ret = o2net_send_message(DLM_QUERY_REGION, DLM_MOD_KEY, query,
+					 sizeof(struct dlm_query_region),
+					 node, &remote_status);
+		/* a successful send reports the remote handler's verdict */
+		if (ret >= 0)
+			ret = remote_status;
+		if (ret) {
+			mlog(ML_ERROR, "Region mismatch %d, node %d\n",
+			     ret, node);
+			break;
+		}
+	}
+
+	kfree(query);
+	return ret;
+}
+
+/*
+ * o2net handler for DLM_QUERY_REGION.
+ *
+ * Looks up the domain named in the message and, provided the sender is
+ * the node currently joining it and the active protocol is not 1.0,
+ * compares the sender's heartbeat regions with ours via
+ * dlm_match_regions().  Returns that result, or -EINVAL when any of the
+ * preconditions fails.
+ */
+static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
+				    void *data, void **ret_data)
+{
+	struct dlm_query_region *qr = (struct dlm_query_region *) msg->buf;
+	struct dlm_ctxt *dlm;
+	int status = -EINVAL;
+
+	mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
+	     qr->qr_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qr->qr_domain, qr->qr_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "before join domain\n", qr->qr_node, qr->qr_domain);
+		goto out_domain;
+	}
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->joining_node != qr->qr_node) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
+		     dlm->joining_node);
+		goto out_dlm;
+	}
+
+	/* Support for global heartbeat was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
+		     "but active dlm protocol is %d.%d\n", qr->qr_node,
+		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto out_dlm;
+	}
+
+	status = dlm_match_regions(dlm, qr);
+
+out_dlm:
+	spin_unlock(&dlm->spinlock);
+out_domain:
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
+/*
+ * Compare the node table sent by a joining node (@qn) against the local
+ * o2nm configuration, node number by node number.
+ *
+ * Returns 0 when every node number is either absent on both sides or
+ * present on both with the same number, IPv4 address and port.  Returns
+ * -EINVAL at the first node that differs; a node known to only one side
+ * is also logged.
+ */
+static int dlm_match_nodes(struct dlm_ctxt *dlm, struct dlm_query_nodeinfo *qn)
+{
+	struct o2nm_node *local;
+	struct dlm_node_info *remote;
+	int nodenum, k;
+	int status = 0;
+
+	for (k = 0; k < qn->qn_numnodes; ++k) {
+		remote = &(qn->qn_nodes[k]);
+		mlog(0, "Node %3d, %pI4:%u\n", remote->ni_nodenum,
+		     &(remote->ni_ipv4_address),
+		     ntohs(remote->ni_ipv4_port));
+	}
+
+	for (nodenum = 0; nodenum < O2NM_MAX_NODES; ++nodenum) {
+		local = o2nm_get_node_by_num(nodenum);
+
+		/* find the sender's entry for this node number, if any */
+		remote = NULL;
+		for (k = 0; k < qn->qn_numnodes; ++k) {
+			if (qn->qn_nodes[k].ni_nodenum != nodenum)
+				continue;
+			remote = &(qn->qn_nodes[k]);
+			break;
+		}
+
+		if (!local && !remote)
+			continue;
+
+		if (!local) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+			     "registered in joining node %d but not in "
+			     "local node %d\n", qn->qn_domain,
+			     remote->ni_nodenum,
+			     &(remote->ni_ipv4_address),
+			     ntohs(remote->ni_ipv4_port),
+			     qn->qn_nodenum, dlm->node_num);
+		} else if (!remote) {
+			status = -EINVAL;
+			mlog(ML_ERROR, "Domain %s: Node %d (%pI4:%u) "
+			     "registered in local node %d but not in "
+			     "joining node %d\n", qn->qn_domain,
+			     local->nd_num, &(local->nd_ipv4_address),
+			     ntohs(local->nd_ipv4_port),
+			     dlm->node_num, qn->qn_nodenum);
+		} else if ((remote->ni_nodenum != local->nd_num) ||
+			   (remote->ni_ipv4_port != local->nd_ipv4_port) ||
+			   (remote->ni_ipv4_address != local->nd_ipv4_address)) {
+			/* both sides know the node but disagree on it */
+			status = -EINVAL;
+		}
+
+		/* drop the reference taken by o2nm_get_node_by_num() */
+		if (local)
+			o2nm_node_put(local);
+
+		if (status)
+			break;
+	}
+
+	return status;
+}
+
+/*
+ * Send a DLM_QUERY_NODEINFO message describing every locally configured
+ * node to each node set in @node_map except ourselves.
+ *
+ * Returns 0 if the map is empty or every queried node answered with
+ * status 0, -ENOMEM if the message cannot be allocated, otherwise the
+ * first non-zero send error or remote status (sending stops there).
+ */
+static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map)
+{
+	struct dlm_query_nodeinfo *query;
+	struct dlm_node_info *slot;
+	struct o2nm_node *node;
+	int target, num, filled = 0, remote_status, ret = 0;
+
+	/* Empty map: nobody to ask. */
+	if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES)
+		return ret;
+
+	query = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL);
+	if (!query) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* pack the configured nodes densely into qn_nodes[] */
+	for (num = 0; num < O2NM_MAX_NODES; ++num) {
+		node = o2nm_get_node_by_num(num);
+		if (!node)
+			continue;
+		slot = &(query->qn_nodes[filled]);
+		slot->ni_nodenum = node->nd_num;
+		slot->ni_ipv4_port = node->nd_ipv4_port;
+		slot->ni_ipv4_address = node->nd_ipv4_address;
+		mlog(0, "Node %3d, %pI4:%u\n", node->nd_num,
+		     &(node->nd_ipv4_address), ntohs(node->nd_ipv4_port));
+		++filled;
+		o2nm_node_put(node);
+	}
+
+	query->qn_nodenum = dlm->node_num;
+	query->qn_numnodes = filled;
+	query->qn_namelen = strlen(dlm->name);
+	memcpy(query->qn_domain, dlm->name, query->qn_namelen);
+
+	for (target = find_next_bit(node_map, O2NM_MAX_NODES, 0);
+	     target < O2NM_MAX_NODES;
+	     target = find_next_bit(node_map, O2NM_MAX_NODES, target + 1)) {
+		if (target == dlm->node_num)
+			continue;
+
+		mlog(0, "Sending nodeinfo to node %d\n", target);
+
+		ret = o2net_send_message(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+					 query,
+					 sizeof(struct dlm_query_nodeinfo),
+					 target, &remote_status);
+		/* a successful send reports the remote handler's verdict */
+		if (ret >= 0)
+			ret = remote_status;
+		if (ret) {
+			mlog(ML_ERROR, "node mismatch %d, node %d\n",
+			     ret, target);
+			break;
+		}
+	}
+
+	kfree(query);
+	return ret;
+}
+
+/*
+ * o2net handler for DLM_QUERY_NODEINFO.
+ *
+ * Looks up the domain named in the message and, provided the sender is
+ * the node currently joining it and the active protocol is not 1.0,
+ * compares the sender's node table with ours via dlm_match_nodes().
+ * Returns that result, or -EINVAL when any of the preconditions fails.
+ */
+static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len,
+				      void *data, void **ret_data)
+{
+	struct dlm_query_nodeinfo *qn = (struct dlm_query_nodeinfo *) msg->buf;
+	struct dlm_ctxt *dlm;
+	int status = -EINVAL;
+
+	mlog(0, "Node %u queries nodes on domain %s\n", qn->qn_nodenum,
+	     qn->qn_domain);
+
+	spin_lock(&dlm_domain_lock);
+	dlm = __dlm_lookup_domain_full(qn->qn_domain, qn->qn_namelen);
+	if (!dlm) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s before "
+		     "join domain\n", qn->qn_nodenum, qn->qn_domain);
+		goto out_domain;
+	}
+
+	spin_lock(&dlm->spinlock);
+	if (dlm->joining_node != qn->qn_nodenum) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s but "
+		     "joining node is %d\n", qn->qn_nodenum, qn->qn_domain,
+		     dlm->joining_node);
+		goto out_dlm;
+	}
+
+	/* Support for node query was added in 1.1 */
+	if (dlm->dlm_locking_proto.pv_major == 1 &&
+	    dlm->dlm_locking_proto.pv_minor == 0) {
+		mlog(ML_ERROR, "Node %d queried nodes on domain %s "
+		     "but active dlm protocol is %d.%d\n", qn->qn_nodenum,
+		     qn->qn_domain, dlm->dlm_locking_proto.pv_major,
+		     dlm->dlm_locking_proto.pv_minor);
+		goto out_dlm;
+	}
+
+	status = dlm_match_nodes(dlm, qn);
+
+out_dlm:
+	spin_unlock(&dlm->spinlock);
+out_domain:
+	spin_unlock(&dlm_domain_lock);
+
+	return status;
+}
+
static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
@@ -1241,6 +1611,20 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
set_bit(dlm->node_num, dlm->domain_map);
spin_unlock(&dlm->spinlock);
+ /* Support for global heartbeat and node info was added in 1.1 */
+ if (dlm_protocol.pv_major > 1 || dlm_protocol.pv_minor > 0) {
+ status = dlm_send_nodeinfo(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = dlm_send_regions(dlm, ctxt->yes_resp_map);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
/* Joined state *must* be set before the joining node
@@ -1807,7 +2191,21 @@ static int dlm_register_net_handlers(void)
sizeof(struct dlm_cancel_join),
dlm_cancel_join_handler,
NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_REGION, DLM_MOD_KEY,
+ sizeof(struct dlm_query_region),
+ dlm_query_region_handler,
+ NULL, NULL, &dlm_join_handlers);
+ if (status)
+ goto bail;
+
+ status = o2net_register_handler(DLM_QUERY_NODEINFO, DLM_MOD_KEY,
+ sizeof(struct dlm_query_nodeinfo),
+ dlm_query_nodeinfo_handler,
+ NULL, NULL, &dlm_join_handlers);
bail:
if (status < 0)
dlm_unregister_net_handlers();
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 5e02a893f46e..e8d94d722ecb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3635,10 +3635,18 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
{
struct inode *inode;
struct address_space *mapping;
+ struct ocfs2_inode_info *oi;
inode = ocfs2_lock_res_inode(lockres);
mapping = inode->i_mapping;
+ if (S_ISDIR(inode->i_mode)) {
+ oi = OCFS2_I(inode);
+ oi->ip_dir_lock_gen++;
+ mlog(0, "generation: %u\n", oi->ip_dir_lock_gen);
+ goto out;
+ }
+
if (!S_ISREG(inode->i_mode))
goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9a03c151b5ce..9e8cc4346b76 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -64,12 +64,6 @@
#include "buffer_head_io.h"
-static int ocfs2_sync_inode(struct inode *inode)
-{
- filemap_fdatawrite(inode->i_mapping);
- return sync_mapping_buffers(inode->i_mapping);
-}
-
static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
struct ocfs2_file_private *fp;
@@ -180,16 +174,12 @@ static int ocfs2_sync_file(struct file *file, int datasync)
{
int err = 0;
journal_t *journal;
- struct dentry *dentry = file->f_path.dentry;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
- dentry->d_name.len, dentry->d_name.name);
-
- err = ocfs2_sync_inode(dentry->d_inode);
- if (err)
- goto bail;
+ mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+ file->f_path.dentry, file->f_path.dentry->d_name.len,
+ file->f_path.dentry->d_name.name);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
/*
@@ -370,7 +360,7 @@ static int ocfs2_cow_file_pos(struct inode *inode,
if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
goto out;
- return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+ return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
out:
return status;
@@ -913,8 +903,8 @@ static int ocfs2_zero_extend_get_range(struct inode *inode,
zero_clusters = last_cpos - zero_cpos;
if (needs_cow) {
- rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
- UINT_MAX);
+ rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
+ zero_clusters, UINT_MAX);
if (rc) {
mlog_errno(rc);
goto out;
@@ -2062,6 +2052,7 @@ out:
}
static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+ struct file *file,
loff_t pos, size_t count,
int *meta_level)
{
@@ -2079,7 +2070,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
*meta_level = 1;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+ ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
if (ret)
mlog_errno(ret);
out:
@@ -2087,7 +2078,7 @@ out:
return ret;
}
-static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
+static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t *ppos,
size_t count,
int appending,
@@ -2095,6 +2086,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
int *has_refcount)
{
int ret = 0, meta_level = 0;
+ struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
loff_t saved_pos, end;
@@ -2150,6 +2142,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
meta_level = -1;
ret = ocfs2_prepare_inode_for_refcount(inode,
+ file,
saved_pos,
count,
&meta_level);
@@ -2232,6 +2225,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs,
@@ -2255,16 +2250,39 @@ relock:
have_alloc_sem = 1;
}
- /* concurrent O_DIRECT writes are allowed */
- rw_level = !direct_io;
+ /*
+ * Concurrent O_DIRECT writes are allowed with
+ * mount_option "coherency=buffered".
+ */
+ rw_level = (!direct_io || full_coherency);
+
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
mlog_errno(ret);
goto out_sems;
}
+ /*
+ * O_DIRECT writes with "coherency=full" need to take EX cluster
+ * inode_lock to guarantee coherency.
+ */
+ if (direct_io && full_coherency) {
+ /*
+ * We need to take and drop the inode lock to force
+ * other nodes to drop their caches. Buffered I/O
+ * already does this in write_begin().
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_sems;
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ }
+
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+ ret = ocfs2_prepare_inode_for_write(file, ppos,
iocb->ki_left, appending,
&can_do_direct, &has_refcount);
if (ret < 0) {
@@ -2312,17 +2330,6 @@ relock:
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
ppos, count, ocount);
if (written < 0) {
- /*
- * direct write may have instantiated a few
- * blocks outside i_size. Trim these off again.
- * Don't need i_size_read because we hold i_mutex.
- *
- * XXX(truncate): this looks buggy because ocfs2 did not
- * actually implement ->truncate. Take a look at
- * the new truncate sequence and update this accordingly
- */
- if (*ppos + count > inode->i_size)
- truncate_setsize(inode, inode->i_size);
ret = written;
goto out_dio;
}
@@ -2394,7 +2401,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
{
int ret;
- ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
+ ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
sd->total_len, 0, NULL, NULL);
if (ret < 0) {
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index eece3e05d9d0..f935fd6600dd 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -335,6 +335,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
else
inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
+ OCFS2_I(inode)->ip_dir_lock_gen = 1;
break;
case S_IFLNK:
if (ocfs2_inode_is_fast_symlink(inode))
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 6de5a869db30..1c508b149b3a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,30 +46,28 @@ struct ocfs2_inode_info
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
- u32 ip_clusters;
struct list_head ip_io_markers;
+ u32 ip_clusters;
+ u16 ip_dyn_features;
struct mutex ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
- u16 ip_dyn_features;
/* protected by recovery_lock. */
struct inode *ip_next_orphan;
- u32 ip_dir_start_lookup;
-
struct ocfs2_caching_info ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode vfs_inode;
struct jbd2_inode ip_jinode;
+ u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
+ u32 ip_dir_lock_gen;
struct ocfs2_alloc_reservation ip_la_data_resv;
};
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7d9d9c132cef..7a4868196152 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -26,6 +26,26 @@
#include <linux/ext2_fs.h>
+/*
+ * Copy one whole request structure between kernel object 'a' and user
+ * pointer 'b'; the size is always sizeof(a).  Both expand to the
+ * underlying copy_{from,to}_user() call, so a non-zero result means the
+ * copy did not complete.  'b' is parenthesized so that an expression
+ * argument is cast as a whole.
+ */
+#define o2info_from_user(a, b)	\
+		copy_from_user(&(a), (b), sizeof(a))
+#define o2info_to_user(a, b)	\
+		copy_to_user((typeof(a) __user *)(b), &(a), sizeof(a))
+
+/*
+ * Flag a request as failed, both in the kernel copy and (best effort) in
+ * the user's copy.
+ *
+ * Nothing is returned: the caller is already on an error path that may
+ * itself be -EFAULT, and that error is what ioctl(2) reports.  Writing
+ * the flag back merely tells userspace which request was at fault, so a
+ * failing put_user() is deliberately ignored.
+ */
+static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
+					struct ocfs2_info_request __user *req)
+{
+	__u32 __user *uflags = (__u32 __user *)&(req->ir_flags);
+
+	kreq->ir_flags |= OCFS2_INFO_FL_ERROR;
+	(void)put_user(kreq->ir_flags, uflags);
+}
+
+/* 'a' is a request-specific structure; the helper receives &(a) cast to
+ * struct ocfs2_info_request *. */
+#define o2info_set_request_error(a, b) \
+		__o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
+
static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
{
int status;
@@ -109,6 +129,328 @@ bail:
return status;
}
+/*
+ * OCFS2_INFO_BLOCKSIZE: copy the request in from @req, fill ib_blocksize
+ * from inode->i_sb->s_blocksize, mark it OCFS2_INFO_FL_FILLED and copy
+ * it back.  Returns 0, or -EFAULT (after flagging the request as failed)
+ * if either user copy fails.
+ */
+int ocfs2_info_handle_blocksize(struct inode *inode,
+				struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_blocksize oib;
+
+	if (o2info_from_user(oib, req))
+		goto fault;
+
+	oib.ib_blocksize = inode->i_sb->s_blocksize;
+	oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oib, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oib, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_CLUSTERSIZE: copy the request in from @req, fill
+ * ic_clustersize from the ocfs2 superblock's s_clustersize, mark it
+ * OCFS2_INFO_FL_FILLED and copy it back.  Returns 0, or -EFAULT (after
+ * flagging the request as failed) if either user copy fails.
+ */
+int ocfs2_info_handle_clustersize(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_clustersize oic;
+
+	if (o2info_from_user(oic, req))
+		goto fault;
+
+	oic.ic_clustersize = osb->s_clustersize;
+	oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oic, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oic, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_MAXSLOTS: copy the request in from @req, fill im_max_slots
+ * from the ocfs2 superblock's max_slots, mark it OCFS2_INFO_FL_FILLED
+ * and copy it back.  Returns 0, or -EFAULT (after flagging the request
+ * as failed) if either user copy fails.
+ */
+int ocfs2_info_handle_maxslots(struct inode *inode,
+			       struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_maxslots oim;
+
+	if (o2info_from_user(oim, req))
+		goto fault;
+
+	oim.im_max_slots = osb->max_slots;
+	oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oim, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oim, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_LABEL: copy the request in from @req, copy
+ * OCFS2_MAX_VOL_LABEL_LEN bytes of the superblock's vol_label into
+ * il_label, mark it OCFS2_INFO_FL_FILLED and copy it back.  Returns 0,
+ * or -EFAULT (after flagging the request as failed) if either user copy
+ * fails.
+ */
+int ocfs2_info_handle_label(struct inode *inode,
+			    struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_label oil;
+
+	if (o2info_from_user(oil, req))
+		goto fault;
+
+	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
+	oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oil, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oil, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_UUID: copy the request in from @req, copy
+ * OCFS2_TEXT_UUID_LEN + 1 bytes of the superblock's uuid_str into
+ * iu_uuid_str, mark it OCFS2_INFO_FL_FILLED and copy it back.  Returns
+ * 0, or -EFAULT (after flagging the request as failed) if either user
+ * copy fails.
+ */
+int ocfs2_info_handle_uuid(struct inode *inode,
+			   struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_uuid oiu;
+
+	if (o2info_from_user(oiu, req))
+		goto fault;
+
+	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
+	oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oiu, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oiu, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_FS_FEATURES: copy the request in from @req, fill the
+ * compat, incompat and ro_compat feature words from the ocfs2
+ * superblock, mark it OCFS2_INFO_FL_FILLED and copy it back.  Returns 0,
+ * or -EFAULT (after flagging the request as failed) if either user copy
+ * fails.
+ */
+int ocfs2_info_handle_fs_features(struct inode *inode,
+				  struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_fs_features oif;
+
+	if (o2info_from_user(oif, req))
+		goto fault;
+
+	oif.if_compat_features = osb->s_feature_compat;
+	oif.if_incompat_features = osb->s_feature_incompat;
+	oif.if_ro_compat_features = osb->s_feature_ro_compat;
+	oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oif, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oif, req);
+	return -EFAULT;
+}
+
+/*
+ * OCFS2_INFO_JOURNAL_SIZE: copy the request in from @req, fill
+ * ij_journal_size from the i_size of the journal's inode, mark it
+ * OCFS2_INFO_FL_FILLED and copy it back.  Returns 0, or -EFAULT (after
+ * flagging the request as failed) if either user copy fails.
+ */
+int ocfs2_info_handle_journal_size(struct inode *inode,
+				   struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_info_journal_size oij;
+
+	if (o2info_from_user(oij, req))
+		goto fault;
+
+	oij.ij_journal_size = osb->journal->j_inode->i_size;
+	oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oij, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oij, req);
+	return -EFAULT;
+}
+
+/*
+ * Fallback for request codes this kernel does not recognise: copy the
+ * generic header in from @req, clear OCFS2_INFO_FL_FILLED and copy it
+ * back.  Returns 0, or -EFAULT (after flagging the request as failed)
+ * if either user copy fails.
+ */
+int ocfs2_info_handle_unknown(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		goto fault;
+
+	oir.ir_flags &= ~OCFS2_INFO_FL_FILLED;
+
+	if (o2info_to_user(oir, req))
+		goto fault;
+
+	return 0;
+
+fault:
+	o2info_set_request_error(oir, req);
+	return -EFAULT;
+}
+
+/*
+ * Validate one OCFS2_IOC_INFO request and hand it to its handler.
+ *
+ * The generic header is read from @req first:
+ * - a failed read returns -EFAULT;
+ * - a wrong ir_magic returns -EINVAL;
+ * - a known ir_code whose ir_size is not the size of the matching
+ *   request structure returns -EINVAL;
+ * - a known ir_code with the right size returns its handler's result;
+ * - any other ir_code goes to ocfs2_info_handle_unknown().
+ */
+int ocfs2_info_handle_request(struct inode *inode,
+			      struct ocfs2_info_request __user *req)
+{
+	struct ocfs2_info_request oir;
+
+	if (o2info_from_user(oir, req))
+		return -EFAULT;
+
+	if (oir.ir_magic != OCFS2_INFO_MAGIC)
+		return -EINVAL;
+
+	switch (oir.ir_code) {
+	case OCFS2_INFO_BLOCKSIZE:
+		if (oir.ir_size != sizeof(struct ocfs2_info_blocksize))
+			break;
+		return ocfs2_info_handle_blocksize(inode, req);
+	case OCFS2_INFO_CLUSTERSIZE:
+		if (oir.ir_size != sizeof(struct ocfs2_info_clustersize))
+			break;
+		return ocfs2_info_handle_clustersize(inode, req);
+	case OCFS2_INFO_MAXSLOTS:
+		if (oir.ir_size != sizeof(struct ocfs2_info_maxslots))
+			break;
+		return ocfs2_info_handle_maxslots(inode, req);
+	case OCFS2_INFO_LABEL:
+		if (oir.ir_size != sizeof(struct ocfs2_info_label))
+			break;
+		return ocfs2_info_handle_label(inode, req);
+	case OCFS2_INFO_UUID:
+		if (oir.ir_size != sizeof(struct ocfs2_info_uuid))
+			break;
+		return ocfs2_info_handle_uuid(inode, req);
+	case OCFS2_INFO_FS_FEATURES:
+		if (oir.ir_size != sizeof(struct ocfs2_info_fs_features))
+			break;
+		return ocfs2_info_handle_fs_features(inode, req);
+	case OCFS2_INFO_JOURNAL_SIZE:
+		if (oir.ir_size != sizeof(struct ocfs2_info_journal_size))
+			break;
+		return ocfs2_info_handle_journal_size(inode, req);
+	default:
+		return ocfs2_info_handle_unknown(inode, req);
+	}
+
+	/* recognised code, but the size does not match its structure */
+	return -EINVAL;
+}
+
+/*
+ * Fetch entry @idx of the user array that info->oi_requests points to
+ * and store it in *@req_addr.
+ *
+ * With @compat_flag set the array base is converted with compat_ptr();
+ * a kernel built without CONFIG_COMPAT hits BUG() in that case.
+ * Returns 0 on success or -EFAULT if the entry cannot be read.
+ */
+int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+			  u64 *req_addr, int compat_flag)
+{
+	/* base address of the array holding each request's address */
+	u64 __user *bp = NULL;
+
+	if (!compat_flag) {
+		bp = (u64 __user *)(unsigned long)(info->oi_requests);
+	} else {
+#ifdef CONFIG_COMPAT
+		bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests);
+#else
+		BUG();
+#endif
+	}
+
+	if (o2info_from_user(*req_addr, bp + idx))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * OCFS2_IOC_INFO handles an array of requests passed from userspace.
+ *
+ * ocfs2_info_handle() receives the aggregate header, validates the
+ * request count and array pointer, then walks the array and passes each
+ * request to ocfs2_info_handle_request() in turn.
+ *
+ * Keeping every request small is meant to help backward and forward
+ * compatibility: a small request is less likely to break if the disk
+ * layout changes.
+ *
+ * Returns 0 when every request succeeded.  Returns -EINVAL if oi_count
+ * exceeds OCFS2_INFO_MAX_REQUEST, oi_requests is zero, or an array entry
+ * is zero; otherwise the first error from fetching or handling a
+ * request, after which processing stops.
+ */
+int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+		      int compat_flag)
+{
+	struct ocfs2_info_request __user *reqp;
+	u64 req_addr;
+	int idx = 0, status = 0;
+
+	if (info->oi_count > OCFS2_INFO_MAX_REQUEST)
+		return -EINVAL;
+	if (!info->oi_requests)
+		return -EINVAL;
+
+	while (idx < info->oi_count) {
+		status = ocfs2_get_request_ptr(info, idx, &req_addr,
+					       compat_flag);
+		if (status)
+			return status;
+
+		reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+		if (!reqp)
+			return -EINVAL;
+
+		status = ocfs2_info_handle_request(inode, reqp);
+		if (status)
+			return status;
+
+		idx++;
+	}
+
+	return status;
+}
+
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct reflink_arguments args;
const char *old_path, *new_path;
bool preserve;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
preserve = (args.preserve != 0);
return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 0);
default:
return -ENOTTY;
}
@@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
bool preserve;
struct reflink_arguments args;
struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_info info;
switch (cmd) {
case OCFS2_IOC32_GETFLAGS:
@@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
compat_ptr(args.new_path), preserve);
+ case OCFS2_IOC_INFO:
+ if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
+ sizeof(struct ocfs2_info)))
+ return -EFAULT;
+
+ return ocfs2_info_handle(inode, &info, 1);
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9b57c0350ff9..faa2303dbf0a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
{
int status = 0;
unsigned int flushed;
- unsigned long old_id;
struct ocfs2_journal *journal = NULL;
mlog_entry_void();
@@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- old_id = ocfs2_inc_trans_id(journal);
+ ocfs2_inc_trans_id(journal);
flushed = atomic_read(&journal->j_num_trans);
atomic_set(&journal->j_num_trans, 0);
@@ -342,9 +341,6 @@ finally:
return status;
}
-/* pass it NULL and it will allocate a new handle object for you. If
- * you pass it a handle however, it may still return error, in which
- * case it has free'd the passed handle for you. */
handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
{
journal_t *journal = osb->journal->j_journal;
@@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
os = &osb->osb_orphan_scan;
+ mlog(0, "Begin orphan scan\n");
+
if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
goto out;
@@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
unlock:
ocfs2_orphan_scan_unlock(osb, seqno);
out:
+ mlog(0, "Orphan scan completed\n");
return;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8e710f..43e56b97f9c0 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head *j_bh; /* Journal disk inode block */
atomic_t j_num_trans; /* Number of transactions
* currently in the system. */
+ spinlock_t j_lock;
unsigned long j_trans_id;
struct rw_semaphore j_trans_barrier;
wait_queue_head_t j_checkpointed;
- spinlock_t j_lock;
+ /* both fields protected by j_lock*/
struct list_head j_la_cleanups;
struct work_struct j_recovery_work;
};
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 4c18f4ad93b4..7e32db9c2c99 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
return ret;
}
-static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
struct page *page)
{
int ret;
+ struct inode *inode = file->f_path.dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
unsigned int len = PAGE_CACHE_SIZE;
@@ -111,7 +112,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+ ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page,
&fsdata, di_bh, page);
if (ret) {
if (ret != -ENOSPC)
@@ -159,7 +160,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+ ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index a00dda2e4f16..e7bde21149ae 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -171,7 +171,8 @@ bail_add:
ret = ERR_PTR(status);
goto bail_unlock;
}
- }
+ } else
+ ocfs2_dentry_attach_gen(dentry);
bail_unlock:
/* Don't drop the cluster lock until *after* the d_add --
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b6b5a2..d8408217e3bd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
struct ocfs2_lock_res {
void *l_priv;
struct ocfs2_lock_res_ops *l_ops;
- spinlock_t l_lock;
+
struct list_head l_blocked_list;
struct list_head l_mask_waiters;
- enum ocfs2_lock_type l_type;
unsigned long l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
- int l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
- struct ocfs2_dlm_lksb l_lksb;
+ unsigned char l_level;
+
+ /* Data packed - type enum ocfs2_lock_type */
+ unsigned char l_type;
/* used from AST/BAST funcs. */
- enum ocfs2_ast_action l_action;
- enum ocfs2_unlock_action l_unlock_action;
- int l_requested;
- int l_blocking;
+ /* Data packed - enum type ocfs2_ast_action */
+ unsigned char l_action;
+ /* Data packed - enum type ocfs2_unlock_action */
+ unsigned char l_unlock_action;
+ unsigned char l_requested;
+ unsigned char l_blocking;
unsigned int l_pending_gen;
+ spinlock_t l_lock;
+
+ struct ocfs2_dlm_lksb l_lksb;
+
wait_queue_head_t l_event;
struct list_head l_debug_list;
@@ -243,7 +250,7 @@ enum ocfs2_local_alloc_state
enum ocfs2_mount_options
{
- OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
+ OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Local heartbeat */
OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
@@ -256,6 +263,10 @@ enum ocfs2_mount_options
control lists */
OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
+ OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12, /* Allow concurrent O_DIRECT
+ writes */
+ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
+ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -277,7 +288,8 @@ struct ocfs2_super
struct super_block *sb;
struct inode *root_inode;
struct inode *sys_root_inode;
- struct inode *system_inodes[NUM_SYSTEM_INODES];
+ struct inode *global_system_inodes[NUM_GLOBAL_SYSTEM_INODES];
+ struct inode **local_system_inodes;
struct ocfs2_slot_info *slot_info;
@@ -368,6 +380,8 @@ struct ocfs2_super
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
+ u8 osb_stackflags;
+
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
@@ -601,10 +615,35 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
return ret;
}
-static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+static inline int ocfs2_clusterinfo_valid(struct ocfs2_super *osb)
{
return (osb->s_feature_incompat &
- OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
+ (OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK |
+ OCFS2_FEATURE_INCOMPAT_CLUSTERINFO));
+}
+
+static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_o2cb_stack(struct ocfs2_super *osb)
+{
+ if (ocfs2_clusterinfo_valid(osb) &&
+ !memcmp(osb->osb_cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK,
+ OCFS2_STACK_LABEL_LEN))
+ return 1;
+ return 0;
+}
+
+static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
+{
+ return ocfs2_o2cb_stack(osb) &&
+ (osb->osb_stackflags & OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT);
}
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index fa31d05e41b7..c2e4f8222e2f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -101,7 +101,8 @@
| OCFS2_FEATURE_INCOMPAT_META_ECC \
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
| OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
- | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)
+ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
| OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -170,6 +171,13 @@
#define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000
/*
+ * Incompat bit to indicate useable clusterinfo with stackflags for all
+ * cluster stacks (userspace and o2cb). If this bit is set,
+ * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set.
+ */
+#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
+
+/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
*/
@@ -292,10 +300,13 @@
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
-/* The alternate, userspace stack fields */
+/* The cluster stack fields */
#define OCFS2_STACK_LABEL_LEN 4
#define OCFS2_CLUSTER_NAME_LEN 16
+/* Classic (historically speaking) cluster stack */
+#define OCFS2_CLASSIC_CLUSTER_STACK "o2cb"
+
/* Journal limits (in bytes) */
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
@@ -305,6 +316,11 @@
*/
#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+/*
+ * Cluster info flags (ocfs2_cluster_info.ci_stackflags)
+ */
+#define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01)
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -322,6 +338,7 @@ enum {
USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE,
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
+#define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
@@ -330,8 +347,12 @@ enum {
TRUNCATE_LOG_SYSTEM_INODE,
LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE
NUM_SYSTEM_INODES
};
+#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE
+#define NUM_LOCAL_SYSTEM_INODES \
+ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE)
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Global system inodes (single copy) */
@@ -360,6 +381,7 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
/* Parameter passed from mount.ocfs2 to module */
#define OCFS2_HB_NONE "heartbeat=none"
#define OCFS2_HB_LOCAL "heartbeat=local"
+#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
* OCFS2 directory file types. Only the low 3 bits are used. The
@@ -566,9 +588,21 @@ struct ocfs2_slot_map_extended {
*/
};
+/*
+ * ci_stackflags is only valid if the incompat bit
+ * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set.
+ */
struct ocfs2_cluster_info {
/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
- __le32 ci_reserved;
+ union {
+ __le32 ci_reserved;
+ struct {
+ __u8 ci_stackflags;
+ __u8 ci_reserved1;
+ __u8 ci_reserved2;
+ __u8 ci_reserved3;
+ };
+ };
/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
/*18*/
};
@@ -605,9 +639,9 @@ struct ocfs2_super_block {
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
-/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
- stack. Only valid
- with INCOMPAT flag. */
+/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either
+ userspace or clusterinfo
+ INCOMPAT flag set. */
/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
for this fs*/
__le16 s_reserved0;
diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h
index 5d241505690b..b46f39bf7438 100644
--- a/fs/ocfs2/ocfs2_ioctl.h
+++ b/fs/ocfs2/ocfs2_ioctl.h
@@ -76,4 +76,99 @@ struct reflink_arguments {
};
#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
+/* Following definitions dedicated for ocfs2_info_request ioctls. */
+#define OCFS2_INFO_MAX_REQUEST (50)
+#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2)
+
+/* Magic number of all requests */
+#define OCFS2_INFO_MAGIC (0x4F32494E)
+
+/*
+ * Always try to separate info requests into small pieces to
+ * guarantee backward and forward compatibility.
+ */
+struct ocfs2_info {
+ __u64 oi_requests; /* Array of __u64 pointers to requests */
+ __u32 oi_count; /* Number of requests in oi_requests */
+ __u32 oi_pad;
+};
+
+struct ocfs2_info_request {
+/*00*/ __u32 ir_magic; /* Magic number */
+ __u32 ir_code; /* Info request code */
+ __u32 ir_size; /* Size of request */
+ __u32 ir_flags; /* Request flags */
+/*10*/ /* Request specific fields */
+};
+
+struct ocfs2_info_clustersize {
+ struct ocfs2_info_request ic_req;
+ __u32 ic_clustersize;
+ __u32 ic_pad;
+};
+
+struct ocfs2_info_blocksize {
+ struct ocfs2_info_request ib_req;
+ __u32 ib_blocksize;
+ __u32 ib_pad;
+};
+
+struct ocfs2_info_maxslots {
+ struct ocfs2_info_request im_req;
+ __u32 im_max_slots;
+ __u32 im_pad;
+};
+
+struct ocfs2_info_label {
+ struct ocfs2_info_request il_req;
+ __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN];
+} __attribute__ ((packed));
+
+struct ocfs2_info_uuid {
+ struct ocfs2_info_request iu_req;
+ __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1];
+} __attribute__ ((packed));
+
+struct ocfs2_info_fs_features {
+ struct ocfs2_info_request if_req;
+ __u32 if_compat_features;
+ __u32 if_incompat_features;
+ __u32 if_ro_compat_features;
+ __u32 if_pad;
+};
+
+struct ocfs2_info_journal_size {
+ struct ocfs2_info_request ij_req;
+ __u64 ij_journal_size;
+};
+
+/* Codes for ocfs2_info_request */
+enum ocfs2_info_type {
+ OCFS2_INFO_CLUSTERSIZE = 1,
+ OCFS2_INFO_BLOCKSIZE,
+ OCFS2_INFO_MAXSLOTS,
+ OCFS2_INFO_LABEL,
+ OCFS2_INFO_UUID,
+ OCFS2_INFO_FS_FEATURES,
+ OCFS2_INFO_JOURNAL_SIZE,
+ OCFS2_INFO_NUM_TYPES
+};
+
+/* Flags for struct ocfs2_info_request */
+/* Filled by the caller */
+#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not
+ required. This is a hint.
+ It is up to ocfs2 whether
+ the request can be fulfilled
+ without locking. */
+/* Filled by ocfs2 */
+#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood
+ this request and
+ filled in the answer */
+
+#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during
+ request handling. */
+
+#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info)
+
#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index efdd75607406..b5f9160e93e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -49,6 +49,7 @@
struct ocfs2_cow_context {
struct inode *inode;
+ struct file *file;
u32 cow_start;
u32 cow_len;
struct ocfs2_extent_tree data_et;
@@ -2932,13 +2933,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to;
+ unsigned int from, to, readahead_pages;
loff_t offset, end, map_end;
struct address_space *mapping = context->inode->i_mapping;
mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
new_cluster, new_len, cpos);
+ readahead_pages =
+ (ocfs2_cow_contig_clusters(sb) <<
+ OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
@@ -2969,6 +2973,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
+ if (PageReadahead(page) && context->file) {
+ page_cache_async_readahead(mapping,
+ &context->file->f_ra,
+ context->file,
+ page, page_index,
+ readahead_pages);
+ }
+
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);
if (ret) {
@@ -3409,12 +3421,35 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
return ret;
}
+static void ocfs2_readahead_for_cow(struct inode *inode,
+ struct file *file,
+ u32 start, u32 len)
+{
+ struct address_space *mapping;
+ pgoff_t index;
+ unsigned long num_pages;
+ int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+
+ if (!file)
+ return;
+
+ mapping = file->f_mapping;
+ num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
+ if (!num_pages)
+ num_pages = 1;
+
+ index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
+ page_cache_sync_readahead(mapping, &file->f_ra, file,
+ index, num_pages);
+}
+
/*
* Starting at cpos, try to CoW write_len clusters. Don't CoW
* past max_cpos. This will stop when it runs into a hole or an
* unrefcounted extent.
*/
static int ocfs2_refcount_cow_hunk(struct inode *inode,
+ struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3443,6 +3478,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
BUG_ON(cow_len == 0);
+ ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
+
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
@@ -3464,6 +3501,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
context->ref_root_bh = ref_root_bh;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
context->get_clusters = ocfs2_di_get_clusters;
+ context->file = file;
ocfs2_init_dinode_extent_tree(&context->data_et,
INODE_CACHE(inode), di_bh);
@@ -3492,6 +3530,7 @@ out:
* clusters between cpos and cpos+write_len are safe to modify.
*/
int ocfs2_refcount_cow(struct inode *inode,
+ struct file *file,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
@@ -3511,7 +3550,7 @@ int ocfs2_refcount_cow(struct inode *inode,
num_clusters = write_len;
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
- ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
+ ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
num_clusters, max_cpos);
if (ret) {
mlog_errno(ret);
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1570e2..c8ce46f7d8e3 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
struct rb_node rf_node;
u64 rf_blkno;
u32 rf_generation;
+ struct kref rf_getcnt;
struct rw_semaphore rf_sem;
struct ocfs2_lock_res rf_lockres;
- struct kref rf_getcnt;
int rf_removed;
/* the following 4 fields are used by caching_info. */
- struct ocfs2_caching_info rf_ci;
spinlock_t rf_lock;
+ struct ocfs2_caching_info rf_ci;
struct mutex rf_io_mutex;
struct super_block *rf_sb;
};
@@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u32 clusters,
int *credits,
int *ref_blocks);
-int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_refcount_cow(struct inode *inode,
+ struct file *filep, struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos);
typedef int (ocfs2_post_refcount_func)(struct inode *inode,
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bfbd7e9e949f..ab4e0172cc1d 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -357,7 +357,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
{
int status = 0;
u64 blkno;
- unsigned long long blocks, bytes;
+ unsigned long long blocks, bytes = 0;
unsigned int i;
struct buffer_head *bh;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 0d3049f696c5..19965b00c43c 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -283,6 +283,8 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
/* for now we only have one cluster/node, make sure we see it
* in the heartbeat universe */
if (!o2hb_check_local_node_heartbeating()) {
+ if (o2hb_global_heartbeat_active())
+ mlog(ML_ERROR, "Global heartbeat not started\n");
rc = -EINVAL;
goto out;
}
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 849c2f0e0a0e..5fed60de7630 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1380,6 +1380,14 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -2419,6 +2427,14 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
(unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+ if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+ ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
+ " count %u but claims %u are freed. num_bits %d",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count), num_bits);
+ return -EROFS;
+ }
if (undo_fn)
jbd_unlock_bh_state(group_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b7d724393b5a..56f0cb395820 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -162,6 +162,7 @@ enum {
Opt_nointr,
Opt_hb_none,
Opt_hb_local,
+ Opt_hb_global,
Opt_data_ordered,
Opt_data_writeback,
Opt_atime_quantum,
@@ -177,6 +178,8 @@ enum {
Opt_noacl,
Opt_usrquota,
Opt_grpquota,
+ Opt_coherency_buffered,
+ Opt_coherency_full,
Opt_resv_level,
Opt_dir_resv_level,
Opt_err,
@@ -190,6 +193,7 @@ static const match_table_t tokens = {
{Opt_nointr, "nointr"},
{Opt_hb_none, OCFS2_HB_NONE},
{Opt_hb_local, OCFS2_HB_LOCAL},
+ {Opt_hb_global, OCFS2_HB_GLOBAL},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
@@ -205,6 +209,8 @@ static const match_table_t tokens = {
{Opt_noacl, "noacl"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_coherency_buffered, "coherency=buffered"},
+ {Opt_coherency_full, "coherency=full"},
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_err, NULL}
@@ -514,11 +520,11 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
mlog_entry_void();
- for (i = 0; i < NUM_SYSTEM_INODES; i++) {
- inode = osb->system_inodes[i];
+ for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
+ inode = osb->global_system_inodes[i];
if (inode) {
iput(inode);
- osb->system_inodes[i] = NULL;
+ osb->global_system_inodes[i] = NULL;
}
}
@@ -534,6 +540,20 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
osb->root_inode = NULL;
}
+ if (!osb->local_system_inodes)
+ goto out;
+
+ for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
+ if (osb->local_system_inodes[i]) {
+ iput(osb->local_system_inodes[i]);
+ osb->local_system_inodes[i] = NULL;
+ }
+ }
+
+ kfree(osb->local_system_inodes);
+ osb->local_system_inodes = NULL;
+
+out:
mlog_exit(0);
}
@@ -608,6 +628,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
int ret = 0;
struct mount_options parsed_options;
struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 tmp;
if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
!ocfs2_check_set_options(sb, &parsed_options)) {
@@ -615,8 +636,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
- (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE;
+ if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
ret = -EINVAL;
mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
goto out;
@@ -806,23 +828,29 @@ bail:
static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
{
- if (ocfs2_mount_local(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL;
+
+ if (osb->s_mount_opt & hb_enabled) {
+ if (ocfs2_mount_local(osb)) {
mlog(ML_ERROR, "Cannot heartbeat on a locally "
"mounted device.\n");
return -EINVAL;
}
- }
-
- if (ocfs2_userspace_stack(osb)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
+ if (ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Userspace stack expected, but "
"o2cb heartbeat arguments passed to mount\n");
return -EINVAL;
}
+ if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) &&
+ !ocfs2_cluster_o2cb_global_heartbeat(osb)) ||
+ ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) &&
+ ocfs2_cluster_o2cb_global_heartbeat(osb))) {
+ mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n");
+ return -EINVAL;
+ }
}
- if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
+ if (!(osb->s_mount_opt & hb_enabled)) {
if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
!ocfs2_userspace_stack(osb)) {
mlog(ML_ERROR, "Heartbeat has to be started to mount "
@@ -1288,6 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
{
int status;
char *p;
+ u32 tmp;
mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
options ? options : "(none)");
@@ -1319,7 +1348,10 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_HB_LOCAL;
break;
case Opt_hb_none:
- mopt->mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
+ mopt->mount_opt |= OCFS2_MOUNT_HB_NONE;
+ break;
+ case Opt_hb_global:
+ mopt->mount_opt |= OCFS2_MOUNT_HB_GLOBAL;
break;
case Opt_barrier:
if (match_int(&args[0], &option)) {
@@ -1435,6 +1467,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_grpquota:
mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
break;
+ case Opt_coherency_buffered:
+ mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
+ case Opt_coherency_full:
+ mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
+ break;
case Opt_acl:
mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
@@ -1474,6 +1512,15 @@ static int ocfs2_parse_options(struct super_block *sb,
}
}
+ /* Ensure only one heartbeat mode */
+ tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
+ OCFS2_MOUNT_HB_NONE);
+ if (hweight32(tmp) != 1) {
+ mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+ status = 0;
+ goto bail;
+ }
+
status = 1;
bail:
@@ -1487,10 +1534,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
unsigned long opts = osb->s_mount_opt;
unsigned int local_alloc_megs;
- if (opts & OCFS2_MOUNT_HB_LOCAL)
- seq_printf(s, ",_netdev,heartbeat=local");
- else
- seq_printf(s, ",heartbeat=none");
+ if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) {
+ seq_printf(s, ",_netdev");
+ if (opts & OCFS2_MOUNT_HB_LOCAL)
+ seq_printf(s, ",%s", OCFS2_HB_LOCAL);
+ else
+ seq_printf(s, ",%s", OCFS2_HB_GLOBAL);
+ } else
+ seq_printf(s, ",%s", OCFS2_HB_NONE);
if (opts & OCFS2_MOUNT_NOINTR)
seq_printf(s, ",nointr");
@@ -1533,6 +1584,11 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_GRPQUOTA)
seq_printf(s, ",grpquota");
+ if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
+ seq_printf(s, ",coherency=buffered");
+ else
+ seq_printf(s, ",coherency=full");
+
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
else
@@ -1983,6 +2039,36 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
return 0;
}
+/* Make sure entire volume is addressable by our journal. Requires
+ osb_clusters_at_boot to be valid and for the journal to have been
+ initialized by ocfs2_journal_init(). */
+static int ocfs2_journal_addressable(struct ocfs2_super *osb)
+{
+ int status = 0;
+ u64 max_block =
+ ocfs2_clusters_to_blocks(osb->sb,
+ osb->osb_clusters_at_boot) - 1;
+
+ /* 32-bit block number is always OK. */
+ if (max_block <= (u32)~0ULL)
+ goto out;
+
+ /* Volume is "huge", so see if our journal is new enough to
+ support it. */
+ if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb,
+ OCFS2_FEATURE_COMPAT_JBD2_SB) &&
+ jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0,
+ JBD2_FEATURE_INCOMPAT_64BIT))) {
+ mlog(ML_ERROR, "The journal cannot address the entire volume. "
+ "Enable the 'block64' journal option with tunefs.ocfs2");
+ status = -EFBIG;
+ goto out;
+ }
+
+ out:
+ return status;
+}
+
static int ocfs2_initialize_super(struct super_block *sb,
struct buffer_head *bh,
int sector_size,
@@ -1995,6 +2081,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
struct ocfs2_journal *journal;
__le32 uuid_net_key;
struct ocfs2_super *osb;
+ u64 total_blocks;
mlog_entry_void();
@@ -2053,6 +2140,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+ osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
+ if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
+ mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
+ osb->max_slots);
+ status = -EINVAL;
+ goto bail;
+ }
+ mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+
ocfs2_orphan_scan_init(osb);
status = ocfs2_recovery_init(osb);
@@ -2091,15 +2187,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
- if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
- mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
- osb->max_slots);
- status = -EINVAL;
- goto bail;
- }
- mlog(0, "max_slots for this device: %u\n", osb->max_slots);
-
osb->slot_recovery_generations =
kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
GFP_KERNEL);
@@ -2142,7 +2229,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_userspace_stack(osb)) {
+ if (ocfs2_clusterinfo_valid(osb)) {
+ osb->osb_stackflags =
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
memcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
OCFS2_STACK_LABEL_LEN);
@@ -2207,11 +2296,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
- > (u32)~0UL) {
- mlog(ML_ERROR, "Volume might try to write to blocks beyond "
- "what jbd can address in 32 bits.\n");
- status = -EINVAL;
+ total_blocks = ocfs2_clusters_to_blocks(osb->sb,
+ le32_to_cpu(di->i_clusters));
+
+ status = generic_check_addressable(osb->sb->s_blocksize_bits,
+ total_blocks);
+ if (status) {
+ mlog(ML_ERROR, "Volume too large "
+ "to mount safely on this system");
+ status = -EFBIG;
goto bail;
}
@@ -2373,6 +2466,12 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ /* Now that journal has been initialized, check to make sure
+ entire volume is addressable. */
+ status = ocfs2_journal_addressable(osb);
+ if (status)
+ goto finally;
+
/* If the journal was unmounted cleanly then we don't want to
* recover anything. Otherwise, journal_load will do that
* dirty work for us :) */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 32499d213fc4..9975457c981f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
}
/* Fast symlinks can't be large */
- len = strlen(target);
+ len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
link = kzalloc(len + 1, GFP_NOFS);
if (!link) {
status = -ENOMEM;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index bfe7190cdbf1..902efb23b6a6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -44,11 +44,6 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
int type,
u32 slot);
-static inline int is_global_system_inode(int type);
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot);
-
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key ocfs2_sysfile_cluster_lock_key[NUM_SYSTEM_INODES];
#endif
@@ -59,11 +54,52 @@ static inline int is_global_system_inode(int type)
type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
}
-static inline int is_in_system_inode_array(struct ocfs2_super *osb,
- int type,
- u32 slot)
+static struct inode **get_local_system_inode(struct ocfs2_super *osb,
+ int type,
+ u32 slot)
{
- return slot == osb->slot_num || is_global_system_inode(type);
+ int index;
+ struct inode **local_system_inodes, **free = NULL;
+
+ BUG_ON(slot == OCFS2_INVALID_SLOT);
+ BUG_ON(type < OCFS2_FIRST_LOCAL_SYSTEM_INODE ||
+ type > OCFS2_LAST_LOCAL_SYSTEM_INODE);
+
+ spin_lock(&osb->osb_lock);
+ local_system_inodes = osb->local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+
+ if (unlikely(!local_system_inodes)) {
+ local_system_inodes = kzalloc(sizeof(struct inode *) *
+ NUM_LOCAL_SYSTEM_INODES *
+ osb->max_slots,
+ GFP_NOFS);
+ if (!local_system_inodes) {
+ mlog_errno(-ENOMEM);
+ /*
+ * return NULL here so that ocfs2_get_system_file_inode
+ * will try to create an inode and use it. We will try
+ * to initialize local_system_inodes next time.
+ */
+ return NULL;
+ }
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_system_inodes) {
+ /* Someone has initialized it for us. */
+ free = local_system_inodes;
+ local_system_inodes = osb->local_system_inodes;
+ } else
+ osb->local_system_inodes = local_system_inodes;
+ spin_unlock(&osb->osb_lock);
+ if (unlikely(free))
+ kfree(free);
+ }
+
+ index = (slot * NUM_LOCAL_SYSTEM_INODES) +
+ (type - OCFS2_FIRST_LOCAL_SYSTEM_INODE);
+
+ return &local_system_inodes[index];
}
struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
@@ -74,8 +110,10 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
struct inode **arr = NULL;
/* avoid the lookup if cached in local system file array */
- if (is_in_system_inode_array(osb, type, slot))
- arr = &(osb->system_inodes[type]);
+ if (is_global_system_inode(type)) {
+ arr = &(osb->global_system_inodes[type]);
+ } else
+ arr = get_local_system_inode(osb, type, slot);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 06fa5e77c40e..67cd43914641 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7081,7 +7081,7 @@ static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
goto out;
}
- if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
+ if (!indexed)
ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
else
ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1c43e7c8a7b..8e4addaa5424 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2675,7 +2675,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
@@ -3011,7 +3011,7 @@ static const struct pid_entry tid_base_stuff[] = {
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
- INF("limits", S_IRUSR, proc_pid_limits),
+ INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index f53505de0712..5cbb81e134ac 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
int reiserfs_unpack(struct inode *inode, struct file *filp)
{
int retval = 0;
+ int depth;
int index;
struct page *page;
struct address_space *mapping;
@@ -188,8 +189,8 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
/* we need to make sure nobody is changing the file size beneath
** us
*/
- mutex_lock(&inode->i_mutex);
- reiserfs_write_lock(inode->i_sb);
+ reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb);
+ depth = reiserfs_write_lock_once(inode->i_sb);
write_from = inode->i_size & (blocksize - 1);
/* if we are on a block boundary, we are already unpacked. */
@@ -224,6 +225,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
out:
mutex_unlock(&inode->i_mutex);
- reiserfs_write_unlock(inode->i_sb);
+ reiserfs_write_unlock_once(inode->i_sb, depth);
return retval;
}
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
index e668127c8b2e..2bc24a8c4039 100644
--- a/fs/smbfs/Kconfig
+++ b/fs/smbfs/Kconfig
@@ -1,5 +1,6 @@
config SMB_FS
tristate "SMB file system support (OBSOLETE, please use CIFS)"
+ depends on BKL # probably unfixable
depends on INET
select NLS
help
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 23c1e598792a..442f34ff1af8 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -148,6 +148,65 @@ void sysfs_remove_group(struct kobject * kobj,
sysfs_put(sd);
}
+/**
+ * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to create and the attribute group they belong to.
+ *		Must not be NULL.
+ *
+ * This function returns an error if the group doesn't exist or any of the
+ * files already exist in that group, in which case none of the new files
+ * are created and files that were already present are left untouched.
+ *
+ * Returns 0 on success, -ENOENT if the group directory cannot be found, or
+ * the error reported by sysfs_add_file() for the first file that failed.
+ */
+int sysfs_merge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	int error = 0;
+	int i;
+
+	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	if (!dir_sd)
+		return -ENOENT;
+
+	for (i = 0; grp->attrs[i]; i++) {
+		error = sysfs_add_file(dir_sd, grp->attrs[i], SYSFS_KOBJ_ATTR);
+		if (error)
+			break;
+	}
+	if (error) {
+		/*
+		 * attrs[i] is the entry that failed and was never created by
+		 * us; undo only attrs[0..i-1] so a file that already existed
+		 * under the same name is not removed.
+		 */
+		while (--i >= 0)
+			sysfs_hash_and_remove(dir_sd, NULL,
+					      grp->attrs[i]->name);
+	}
+	sysfs_put(dir_sd);
+
+	return error;
+}
+EXPORT_SYMBOL_GPL(sysfs_merge_group);
+
+/**
+ * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * @kobj:	The kobject containing the group.
+ * @grp:	The files to remove and the attribute group they belong to.
+ *		Must not be NULL.
+ *
+ * Does nothing if the group directory named by @grp->name cannot be looked
+ * up under @kobj.
+ */
+void sysfs_unmerge_group(struct kobject *kobj,
+		       const struct attribute_group *grp)
+{
+	struct sysfs_dirent *dir_sd;
+	struct attribute *const *attr;
+
+	dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
+	if (!dir_sd)
+		return;
+
+	for (attr = grp->attrs; *attr; ++attr)
+		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	sysfs_put(dir_sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
+
EXPORT_SYMBOL_GPL(sysfs_create_group);
EXPORT_SYMBOL_GPL(sysfs_update_group);
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..f8def3c8ea4c 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,5 +1,6 @@
config UDF_FS
tristate "UDF file system support"
+ depends on BKL # needs serious work to remove
select CRC_ITU_T
help
This is the new file system used on some CD-ROMs and DVDs. Say Y if
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index e4f10a40768a..30c8f223253d 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,6 +1,7 @@
config UFS_FS
tristate "UFS file system support (read only)"
depends on BLOCK
+ depends on BKL # probably fixable
help
BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index d59c4a65d492..81976ffed7d6 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -668,14 +668,11 @@ xfs_inode_set_reclaim_tag(
xfs_perag_put(pag);
}
-void
-__xfs_inode_clear_reclaim_tag(
- xfs_mount_t *mp,
+STATIC void
+__xfs_inode_clear_reclaim(
xfs_perag_t *pag,
xfs_inode_t *ip)
{
- radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
pag->pag_ici_reclaimable--;
if (!pag->pag_ici_reclaimable) {
/* clear the reclaim tag from the perag radix tree */
@@ -689,6 +686,17 @@ __xfs_inode_clear_reclaim_tag(
}
}
+/*
+ * Clear XFS_ICI_RECLAIM_TAG for this inode in the per-AG inode radix tree
+ * (pag->pag_ici_root), then hand off to __xfs_inode_clear_reclaim() to update
+ * the per-AG reclaimable accounting.
+ */
+void
+__xfs_inode_clear_reclaim_tag(
+ xfs_mount_t *mp,
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+ __xfs_inode_clear_reclaim(pag, ip);
+}
+
/*
* Inodes in different states need to be treated differently, and the return
* value of xfs_iflush is not sufficient to get this right. The following table
@@ -838,6 +846,7 @@ reclaim:
if (!radix_tree_delete(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
ASSERT(0);
+ __xfs_inode_clear_reclaim(pag, ip);
write_unlock(&pag->pag_ici_lock);
/*
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ed575fb4b495..7e206fc1fa36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -405,9 +405,15 @@ xlog_cil_push(
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
new_ctx->ticket = xlog_cil_ticket_alloc(log);
- /* lock out transaction commit, but don't block on background push */
+ /*
+ * Lock out transaction commit, but don't block for background pushes
+ * unless we are well over the CIL space limit. See the definition of
+ * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
+ * used here.
+ */
if (!down_write_trylock(&cil->xc_ctx_lock)) {
- if (!push_seq)
+ if (!push_seq &&
+ cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
goto out_free_ticket;
down_write(&cil->xc_ctx_lock);
}
@@ -422,7 +428,7 @@ xlog_cil_push(
goto out_skip;
/* check for a previously pushed sequence */
- if (push_seq < cil->xc_ctx->sequence)
+ if (push_seq && push_seq < cil->xc_ctx->sequence)
goto out_skip;
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index ced52b98b322..edcdfe01617f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -426,13 +426,13 @@ struct xfs_cil {
};
/*
- * The amount of log space we should the CIL to aggregate is difficult to size.
- * Whatever we chose we have to make we can get a reservation for the log space
- * effectively, that it is large enough to capture sufficient relogging to
- * reduce log buffer IO significantly, but it is not too large for the log or
- * induces too much latency when writing out through the iclogs. We track both
- * space consumed and the number of vectors in the checkpoint context, so we
- * need to decide which to use for limiting.
+ * The amount of log space we allow the CIL to aggregate is difficult to size.
+ * Whatever we choose, we have to make sure that we can get a reservation for
+ * the log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, and that it is not so large
+ * that it is too big for the log or induces too much latency when writing out
+ * through the iclogs. We track both space consumed and the number of vectors
+ * in the checkpoint context, so we need to decide which to use for limiting.
*
* Every log buffer we write out during a push needs a header reserved, which
* is at least one sector and more for v2 logs. Hence we need a reservation of
@@ -459,16 +459,21 @@ struct xfs_cil {
* checkpoint transaction ticket is specific to the checkpoint context, rather
* than the CIL itself.
*
- * With dynamic reservations, we can basically make up arbitrary limits for the
- * checkpoint size so long as they don't violate any other size rules. Hence
- * the initial maximum size for the checkpoint transaction will be set to a
- * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
- * right now based on the latency of writing out a large amount of data through
- * the circular iclog buffers.
+ * With dynamic reservations, we can effectively make up arbitrary limits for
+ * the checkpoint size so long as they don't violate any other size rules.
+ * Recovery imposes a rule that no transaction exceed half the log, so we are
+ * limited by that. Furthermore, the log transaction reservation subsystem
+ * tries to keep 25% of the log free, so we need to keep below that limit or we
+ * risk running out of free log space to start any new transactions.
+ *
+ * In order to keep background CIL push efficient, we will set a lower
+ * threshold at which background pushing is attempted without blocking current
+ * transaction commits. A separate, higher bound defines when CIL pushes are
+ * enforced to ensure we stay within our maximum checkpoint size bounds, yet
+ * still gives us plenty of space for aggregation on large logs.
*/
-
-#define XLOG_CIL_SPACE_LIMIT(log) \
- (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
+/*
+ * XLOG_CIL_SPACE_LIMIT is one eighth of the log size; XLOG_CIL_HARD_SPACE_LIMIT
+ * is three sixteenths of the log size. The argument is parenthesised so that
+ * any pointer-valued expression can be passed safely.
+ */
+#define XLOG_CIL_SPACE_LIMIT(log)	((log)->l_logsize >> 3)
+#define XLOG_CIL_HARD_SPACE_LIMIT(log)	(3 * ((log)->l_logsize >> 4))
/*
* The reservation head lsn is not made up of a cycle number and block number.