diff options
Diffstat (limited to 'fs')
105 files changed, 1578 insertions, 1194 deletions
| diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 88e3787c6ea9..e298fe194093 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)  const struct file_operations v9fs_dir_operations = {  	.read = generic_read_dir, +	.llseek = generic_file_llseek,  	.readdir = v9fs_dir_readdir,  	.open = v9fs_file_open,  	.release = v9fs_dir_release, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c95295c65045..e83aa5ebe861 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,  	return NULL;  error: -	if (fid) -		p9_client_clunk(fid); +	p9_client_clunk(fid);  	return ERR_PTR(result);  } diff --git a/fs/Kconfig b/fs/Kconfig index d3873583360b..abccb5dab9a8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1930,6 +1930,16 @@ config CIFS_WEAK_PW_HASH  	  If unsure, say N. +config CIFS_UPCALL +	  bool "Kerberos/SPNEGO advanced session setup" +	  depends on CIFS && KEYS +	  help +	    Enables an upcall mechanism for CIFS which accesses +	    userspace helper utilities to provide SPNEGO packaged (RFC 4178) +	    Kerberos tickets which are needed to mount to certain secure servers +	    (for which more secure Kerberos authentication is required). If +	    unsure, say N. +  config CIFS_XATTR          bool "CIFS extended attributes"          depends on CIFS @@ -1982,17 +1992,6 @@ config CIFS_EXPERIMENTAL  	    (which is disabled by default). See the file fs/cifs/README   	    for more details.  If unsure, say N. 
-config CIFS_UPCALL -	  bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" -	  depends on CIFS_EXPERIMENTAL -	  depends on KEYS -	  help -	    Enables an upcall mechanism for CIFS which accesses -	    userspace helper utilities to provide SPNEGO packaged (RFC 4178) -	    Kerberos tickets which are needed to mount to certain secure servers -	    (for which more secure Kerberos authentication is required). If -	    unsure, say N. -  config CIFS_DFS_UPCALL  	  bool "DFS feature support (EXPERIMENTAL)"  	  depends on CIFS_EXPERIMENTAL diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fc1a8dc64d78..85a30e929800 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,6 +197,7 @@ out:  const struct file_operations adfs_dir_operations = {  	.read		= generic_read_dir, +	.llseek		= generic_file_llseek,  	.readdir	= adfs_readdir,  	.fsync		= file_fsync,  }; diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 6e3f282424b0..7b36904dbeac 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);  const struct file_operations affs_dir_operations = {  	.read		= generic_read_dir, +	.llseek		= generic_file_llseek,  	.readdir	= affs_readdir,  	.fsync		= file_fsync,  }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bcfb2dc0a61b..2a41c2a7fc52 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = {  	.release	= dcache_dir_close,  	.read		= generic_read_dir,  	.readdir	= dcache_readdir, +	.llseek		= dcache_dir_lseek,  	.ioctl		= autofs4_root_ioctl,  }; @@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = {  	.release	= dcache_dir_close,  	.read		= generic_read_dir,  	.readdir	= dcache_readdir, +	.llseek		= dcache_dir_lseek,  };  const struct inode_operations autofs4_indirect_root_inode_operations = { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 02c6e62b72f8..740f53672a8a 100644 --- a/fs/befs/linuxvfs.c 
+++ b/fs/befs/linuxvfs.c @@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;  static const struct file_operations befs_dir_operations = {  	.read		= generic_read_dir,  	.readdir	= befs_readdir, +	.llseek		= generic_file_llseek,  };  static const struct inode_operations befs_dir_inode_operations = { diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 87ee5ccee348..ed8feb052df9 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,  							inode->i_ino);  	if (err) {  		inode_dec_link_count(inode); -		iput(inode);  		mutex_unlock(&info->bfs_lock); +		iput(inode);  		return err;  	}  	mutex_unlock(&info->bfs_lock); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 56372ecf1690..dfc0197905ca 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -914,7 +914,9 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)  	/* Stash our initial stack pointer into the mm structure */  	current->mm->start_stack = (unsigned long )sp; -	 +#ifdef FLAT_PLAT_INIT +	FLAT_PLAT_INIT(regs); +#endif  	DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",  		(int)regs, (int)start_addr, (int)current->mm->start_stack); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 756205314c24..8d7e88e02e0f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -120,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)  	if (bprm->misc_bang)  		goto _ret; -	bprm->misc_bang = 1; -  	/* to keep locking time low, we copy the interpreter string */  	read_lock(&entries_lock);  	fmt = check_file(bprm); @@ -199,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)  	if (retval < 0)  		goto _error; +	bprm->misc_bang = 1; +  	retval = search_binary_handler (bprm, regs);  	if (retval < 0)  		goto _error; @@ -469,20 +469,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)  	kfree(bmd);  } -static 
struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, +					       gfp_t gfp_mask)  { -	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); +	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);  	if (!bmd)  		return NULL; -	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); +	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);  	if (!bmd->iovecs) {  		kfree(bmd);  		return NULL;  	} -	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); +	bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);  	if (bmd->sgvecs)  		return bmd; @@ -491,8 +492,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)  	return NULL;  } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, -			  int uncopy) +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, +			  struct sg_iovec *iov, int iov_count, int uncopy)  {  	int ret = 0, i;  	struct bio_vec *bvec; @@ -502,7 +503,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,  	__bio_for_each_segment(bvec, bio, i, 0) {  		char *bv_addr = page_address(bvec->bv_page); -		unsigned int bv_len = bvec->bv_len; +		unsigned int bv_len = iovecs[i].bv_len;  		while (bv_len && iov_idx < iov_count) {  			unsigned int bytes; @@ -554,7 +555,7 @@ int bio_uncopy_user(struct bio *bio)  	struct bio_map_data *bmd = bio->bi_private;  	int ret; -	ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); +	ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1);  	bio_free_map_data(bmd);  	bio_put(bio); @@ -596,7 +597,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,  		len += iov[i].iov_len;  	} -	bmd = bio_alloc_map_data(nr_pages, iov_count); +	bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL);  	if (!bmd)  		return ERR_PTR(-ENOMEM); @@ 
-633,7 +634,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,  	 * success  	 */  	if (!write_to_vm) { -		ret = __bio_copy_iov(bio, iov, iov_count, 0); +		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0);  		if (ret)  			goto cleanup;  	} @@ -942,19 +943,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)  {  	struct bio_vec *bvec;  	const int read = bio_data_dir(bio) == READ; -	char *p = bio->bi_private; +	struct bio_map_data *bmd = bio->bi_private;  	int i; +	char *p = bmd->sgvecs[0].iov_base;  	__bio_for_each_segment(bvec, bio, i, 0) {  		char *addr = page_address(bvec->bv_page); +		int len = bmd->iovecs[i].bv_len;  		if (read && !err) -			memcpy(p, addr, bvec->bv_len); +			memcpy(p, addr, len);  		__free_page(bvec->bv_page); -		p += bvec->bv_len; +		p += len;  	} +	bio_free_map_data(bmd);  	bio_put(bio);  } @@ -978,11 +982,21 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,  	const int nr_pages = end - start;  	struct bio *bio;  	struct bio_vec *bvec; +	struct bio_map_data *bmd;  	int i, ret; +	struct sg_iovec iov; + +	iov.iov_base = data; +	iov.iov_len = len; + +	bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask); +	if (!bmd) +		return ERR_PTR(-ENOMEM); +	ret = -ENOMEM;  	bio = bio_alloc(gfp_mask, nr_pages);  	if (!bio) -		return ERR_PTR(-ENOMEM); +		goto out_bmd;  	while (len) {  		struct page *page; @@ -1016,14 +1030,18 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,  		}  	} -	bio->bi_private = data; +	bio->bi_private = bmd;  	bio->bi_end_io = bio_copy_kern_endio; + +	bio_set_map_data(bmd, bio, &iov, 1);  	return bio;  cleanup:  	bio_for_each_segment(bvec, bio, i)  		__free_page(bvec->bv_page);  	bio_put(bio); +out_bmd: +	bio_free_map_data(bmd);  	return ERR_PTR(ret);  } diff --git a/fs/buffer.c b/fs/buffer.c index 38653e36e225..ac78d4c19b3b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct 
buffer_head * bh)  	BUG_ON(!buffer_mapped(bh));  	BUG_ON(!bh->b_end_io); -	if (buffer_ordered(bh) && (rw == WRITE)) -		rw = WRITE_BARRIER; +	/* +	 * Mask in barrier bit for a write (could be either a WRITE or a +	 * WRITE_SYNC +	 */ +	if (buffer_ordered(bh) && (rw & WRITE)) +		rw |= WRITE_BARRIER;  	/* -	 * Only clear out a write error when rewriting, should this -	 * include WRITE_SYNC as well? +	 * Only clear out a write error when rewriting  	 */ -	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) +	if (test_set_buffer_req(bh) && (rw & WRITE))  		clear_buffer_write_io_error(bh);  	/* diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f5d0083e09fa..06e521a945c3 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up  on EAGAIN from the socket too quickly on large writes).  Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.  Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. +architectures.  Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers.  For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba).  Fix memory leak +on dns_upcall (resolving DFS referrals).  Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though).  Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcedirectio) mount.  
Version 1.53  ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe556f88..bd2343d4c6a6 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -542,10 +542,20 @@ SecurityFlags		Flags which control security negotiation and  			hashing mechanisms (as "must use") on the other hand   			does not make much sense. Default flags are   				0x07007  -			(NTLM, NTLMv2 and packet signing allowed).  Maximum  +			(NTLM, NTLMv2 and packet signing allowed).  The maximum   			allowable flags if you want to allow mounts to servers  			using weaker password hashes is 0x37037 (lanman, -			plaintext, ntlm, ntlmv2, signing allowed): +			plaintext, ntlm, ntlmv2, signing allowed).  Some +			SecurityFlags require the corresponding menuconfig +			options to be enabled (lanman and plaintext require +			CONFIG_CIFS_WEAK_PW_HASH for example).  Enabling +			plaintext authentication currently requires also +			enabling lanman authentication in the security flags +			because the cifs module only supports sending +			plaintext passwords using the older lanman dialect +			form of the session setup SMB.  (e.g. for authentication +			using plain text passwords, set the SecurityFlags +			to 0x30030):  			may use packet signing 				0x00001  			must use packet signing				0x01001 @@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in  that they represent all for that share, not just those for which the server  returned success. -Also note that "cat /proc/fs/cifs/DebugData" will display information about  +Also note that "cat /proc/fs/cifs/DebugData" will display information about  the active sessions and the shares that are mounted. -Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is -on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and -LANMAN support do not require this helper. 
+ +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file.  The cifs.upcall helper program is from the Samba +project (http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +Enabling DFS support (used to access shares transparently in an MS-DFS +global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled.  In +addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to IP addresses, and the user space helper must also +be configured in the file /etc/request-key.conf. + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 5fabd2caf93c..1b09f1670061 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,  	unsigned int cls, con, tag, oidlen, rc;  	bool use_ntlmssp = false;  	bool use_kerberos = false; +	bool use_mskerberos = false;  	*secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length,  					 *(oid + 1), *(oid + 2), *(oid + 3)));  				if (compare_oid(oid, oidlen, MSKRB5_OID, -						MSKRB5_OID_LEN)) -					use_kerberos = true; +						MSKRB5_OID_LEN) && +						!use_kerberos) +					use_mskerberos = true;  				else if 
(compare_oid(oid, oidlen, KRB5_OID, -						     KRB5_OID_LEN)) +						     KRB5_OID_LEN) && +						     !use_mskerberos)  					use_kerberos = true;  				else if (compare_oid(oid, oidlen, NTLMSSP_OID,  						     NTLMSSP_OID_LEN)) @@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length,  	if (use_kerberos)  		*secType = Kerberos; +	else if (use_mskerberos) +		*secType = MSKerberos;  	else if (use_ntlmssp)  		*secType = NTLMSSP; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2434ab0e8791..117ef4bba68e 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -114,9 +114,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)  	dp = description + strlen(description); -	/* for now, only sec=krb5 is valid */ +	/* for now, only sec=krb5 and sec=mskrb5 are valid */  	if (server->secType == Kerberos)  		sprintf(dp, ";sec=krb5"); +	else if (server->secType == MSKerberos) +		sprintf(dp, ";sec=mskrb5");  	else  		goto out; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b17a1ab..e4041ec4d712 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -23,7 +23,7 @@  #ifndef _CIFS_SPNEGO_H  #define _CIFS_SPNEGO_H -#define CIFS_SPNEGO_UPCALL_VERSION 1 +#define CIFS_SPNEGO_UPCALL_VERSION 2  /*   * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 83fd40dc1ef0..bd5f13d38450 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)  	if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)  		if (extended_security & CIFSSEC_MAY_PLNTXT) { +			memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);  			memcpy(lnm_session_key, password_with_pad,  				CIFS_ENCPWD_SIZE);  			return; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e8da4ee761b5..25ecbd5b0404 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -175,6 +175,8 @@ out_no_root:  	if (inode)  		iput(inode); +	cifs_umount(sb, cifs_sb); +  out_mount_failed:  	if (cifs_sb) {  #ifdef CONFIG_CIFS_DFS_UPCALL diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7e1cf262effe..8dfd6f24d488 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,7 +80,8 @@ enum securityEnum {  	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */  	RawNTLMSSP,		/* NTLMSSP without SPNEGO */  	NTLMSSP,		/* NTLMSSP via SPNEGO */ -	Kerberos		/* Kerberos via SPNEGO */ +	Kerberos,		/* Kerberos via SPNEGO */ +	MSKerberos,		/* MS Kerberos via SPNEGO */  };  enum protocolEnum { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0711db65afe8..4c13bcdb92a5 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,  	char ntlm_session_key[CIFS_SESS_KEY_SIZE];  	bool ntlmv2_flag = false;  	int first_time = 0; +	struct TCP_Server_Info *server = pSesInfo->server;  	/* what if server changes its buffer size after dropping the session? 
*/ -	if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { +	if (server->maxBuf == 0) /* no need to send on reconnect */ {  		rc = CIFSSMBNegotiate(xid, pSesInfo); -		if (rc == -EAGAIN) /* retry only once on 1st time connection */ { +		if (rc == -EAGAIN) { +			/* retry only once on 1st time connection */  			rc = CIFSSMBNegotiate(xid, pSesInfo);  			if (rc == -EAGAIN)  				rc = -EHOSTDOWN;  		}  		if (rc == 0) {  			spin_lock(&GlobalMid_Lock); -			if (pSesInfo->server->tcpStatus != CifsExiting) -				pSesInfo->server->tcpStatus = CifsGood; +			if (server->tcpStatus != CifsExiting) +				server->tcpStatus = CifsGood;  			else  				rc = -EHOSTDOWN;  			spin_unlock(&GlobalMid_Lock); @@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,  		goto ss_err_exit;  	pSesInfo->flags = 0; -	pSesInfo->capabilities = pSesInfo->server->capabilities; +	pSesInfo->capabilities = server->capabilities;  	if (linuxExtEnabled == 0)  		pSesInfo->capabilities &= (~CAP_UNIX);  	/*	pSesInfo->sequence_number = 0;*/  	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", -		 pSesInfo->server->secMode, -		 pSesInfo->server->capabilities, -		 pSesInfo->server->timeAdj)); +		 server->secMode, server->capabilities, server->timeAdj)); +  	if (experimEnabled < 2)  		rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);  	else if (extended_security  			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) -			&& (pSesInfo->server->secType == NTLMSSP)) { +			&& (server->secType == NTLMSSP)) {  		rc = -EOPNOTSUPP;  	} else if (extended_security  			&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) -			&& (pSesInfo->server->secType == RawNTLMSSP)) { +			&& (server->secType == RawNTLMSSP)) {  		cFYI(1, ("NTLMSSP sesssetup"));  		rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,  						   nls_info); @@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,  			} else {  				
SMBNTencrypt(pSesInfo->password, -					     pSesInfo->server->cryptKey, +					     server->cryptKey,  					     ntlm_session_key);  				if (first_time)  					cifs_calculate_mac_key( -					     &pSesInfo->server->mac_signing_key, +					     &server->mac_signing_key,  					     ntlm_session_key,  					     pSesInfo->password);  			} @@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,  						      nls_info);  		}  	} else { /* old style NTLM 0.12 session setup */ -		SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, +		SMBNTencrypt(pSesInfo->password, server->cryptKey,  			     ntlm_session_key);  		if (first_time) -			cifs_calculate_mac_key( -					&pSesInfo->server->mac_signing_key, -					ntlm_session_key, pSesInfo->password); +			cifs_calculate_mac_key(&server->mac_signing_key, +						ntlm_session_key, +						pSesInfo->password);  		rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);  	} diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef35499e..a2e0673e1b08 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -47,11 +47,18 @@ static int dns_resolver_instantiate(struct key *key, const void *data,  	return rc;  } +static void +dns_resolver_destroy(struct key *key) +{ +	kfree(key->payload.data); +} +  struct key_type key_type_dns_resolver = {  	.name        = "dns_resolver",  	.def_datalen = sizeof(struct in_addr),  	.describe    = user_describe,  	.instantiate = dns_resolver_instantiate, +	.destroy     = dns_resolver_destroy,  	.match       = user_match,  }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ff14d14903a0..cbefe1f1f9fe 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,  		return -EBADF;  	open_file = (struct cifsFileInfo *) file->private_data; +	rc = generic_write_checks(file, poffset, &write_size, 0); +	if (rc) +		return rc; +  	xid = GetXid();  	if (*poffset > 
file->f_path.dentry->d_inode->i_size) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 28a22092d450..9c548f110102 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode,  		if ((inode->i_mode & S_IWUGO) == 0 &&  		    (attr & ATTR_READONLY) == 0)  			inode->i_mode |= (S_IWUGO & default_mode); -			inode->i_mode &= ~S_IFMT; + +		inode->i_mode &= ~S_IFMT;  	}  	/* clear write bits if ATTR_READONLY is set */  	if (attr & ATTR_READONLY) @@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)  		inode->i_fop = &simple_dir_operations;  		inode->i_uid = cifs_sb->mnt_uid;  		inode->i_gid = cifs_sb->mnt_gid; +	} else if (rc) {  		_FreeXid(xid);  		iget_failed(inode);  		return ERR_PTR(rc); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150efbe27c..252fdc0567f1 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,  #ifdef CONFIG_CIFS_WEAK_PW_HASH  		char lnm_session_key[CIFS_SESS_KEY_SIZE]; +		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; +  		/* no capabilities flags in old lanman negotiation */  		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); @@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,  			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);  		} else  			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); -	} else if (type == Kerberos) { +	} else if (type == Kerberos || type == MSKerberos) {  #ifdef CONFIG_CIFS_UPCALL  		struct cifs_spnego_msg *msg;  		spnego_key = cifs_get_spnego_key(ses); @@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,  		}  		msg = spnego_key->payload.data; +		/* check version field to make sure that cifs.upcall is +		   sending us a response in an expected form */ +		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { +			cERROR(1, ("incorrect version of cifs.upcall 
(expected" +				   " %d but got %d)", +				   CIFS_SPNEGO_UPCALL_VERSION, msg->version)); +			rc = -EKEYREJECTED; +			goto ssetup_exit; +		}  		/* bail out if key is too long */  		if (msg->sesskey_len >  		    sizeof(ses->server->mac_signing_key.data.krb5)) { diff --git a/fs/compat.c b/fs/compat.c index c9d1472e65c5..075d0509970d 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,  	if (buf->result)  		return -EINVAL;  	d_ino = ino; -	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) +	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { +		buf->result = -EOVERFLOW;  		return -EOVERFLOW; +	}  	buf->result++;  	dirent = buf->dirent;  	if (!access_ok(VERIFY_WRITE, dirent, @@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,  	if (reclen > buf->count)  		return -EINVAL;  	d_ino = ino; -	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) +	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { +		buf->error = -EOVERFLOW;  		return -EOVERFLOW; +	}  	dirent = buf->previous;  	if (dirent) {  		if (__put_user(offset, &dirent->d_off)) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7a8db78a91d2..8e93341f3e82 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)  	 * Ensure that no racing symlink() will make detach_prep() fail while  	 * the new link is temporarily attached  	 */ -	mutex_lock(&configfs_symlink_mutex); -	spin_lock(&configfs_dirent_lock);  	do {  		struct mutex *wait_mutex; +		mutex_lock(&configfs_symlink_mutex); +		spin_lock(&configfs_dirent_lock);  		ret = configfs_detach_prep(dentry, &wait_mutex); -		if (ret) { +		if (ret)  			configfs_detach_rollback(dentry); -			spin_unlock(&configfs_dirent_lock); -			mutex_unlock(&configfs_symlink_mutex); +		spin_unlock(&configfs_dirent_lock); +		mutex_unlock(&configfs_symlink_mutex); + +		if (ret) {  			if 
(ret != -EAGAIN) {  				config_item_put(parent_item);  				return ret; @@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)  			/* Wait until the racing operation terminates */  			mutex_lock(wait_mutex);  			mutex_unlock(wait_mutex); - -			mutex_lock(&configfs_symlink_mutex); -			spin_lock(&configfs_dirent_lock);  		}  	} while (ret == -EAGAIN); -	spin_unlock(&configfs_dirent_lock); -	mutex_unlock(&configfs_symlink_mutex);  	/* Get a working ref for the duration of this function */  	item = configfs_get_config_item(dentry); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 0c3b618c15b3..f40423eb1a14 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);  static int cramfs_iget5_test(struct inode *inode, void *opaque)  {  	struct cramfs_inode *cramfs_inode = opaque; - -	if (inode->i_ino != CRAMINO(cramfs_inode)) -		return 0; /* does not match */ - -	if (inode->i_ino != 1) -		return 1; - -	/* all empty directories, char, block, pipe, and sock, share inode #1 */ - -	if ((inode->i_mode != cramfs_inode->mode) || -	    (inode->i_gid != cramfs_inode->gid) || -	    (inode->i_uid != cramfs_inode->uid)) -		return 0; /* does not match */ - -	if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) && -	    (inode->i_rdev != old_decode_dev(cramfs_inode->size))) -		return 0; /* does not match */ - -	return 1; /* matches */ +	return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;  }  static int cramfs_iget5_set(struct inode *inode, void *opaque)  { -	static struct timespec zerotime;  	struct cramfs_inode *cramfs_inode = opaque; -	inode->i_mode = cramfs_inode->mode; -	inode->i_uid = cramfs_inode->uid; -	inode->i_size = cramfs_inode->size; -	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; -	inode->i_gid = cramfs_inode->gid; -	/* Struct copy intentional */ -	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;  	inode->i_ino = CRAMINO(cramfs_inode); -	/* 
inode->i_nlink is left 1 - arguably wrong for directories, -	   but it's the best we can do without reading the directory -           contents.  1 yields the right result in GNU find, even -	   without -noleaf option. */ -	if (S_ISREG(inode->i_mode)) { -		inode->i_fop = &generic_ro_fops; -		inode->i_data.a_ops = &cramfs_aops; -	} else if (S_ISDIR(inode->i_mode)) { -		inode->i_op = &cramfs_dir_inode_operations; -		inode->i_fop = &cramfs_directory_operations; -	} else if (S_ISLNK(inode->i_mode)) { -		inode->i_op = &page_symlink_inode_operations; -		inode->i_data.a_ops = &cramfs_aops; -	} else { -		inode->i_size = 0; -		inode->i_blocks = 0; -		init_special_inode(inode, inode->i_mode, -			old_decode_dev(cramfs_inode->size)); -	}  	return 0;  } @@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,  	struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),  					    cramfs_iget5_test, cramfs_iget5_set,  					    cramfs_inode); +	static struct timespec zerotime; +  	if (inode && (inode->i_state & I_NEW)) { +		inode->i_mode = cramfs_inode->mode; +		inode->i_uid = cramfs_inode->uid; +		inode->i_size = cramfs_inode->size; +		inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; +		inode->i_gid = cramfs_inode->gid; +		/* Struct copy intentional */ +		inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; +		/* inode->i_nlink is left 1 - arguably wrong for directories, +		   but it's the best we can do without reading the directory +		   contents.  1 yields the right result in GNU find, even +		   without -noleaf option. 
*/ +		if (S_ISREG(inode->i_mode)) { +			inode->i_fop = &generic_ro_fops; +			inode->i_data.a_ops = &cramfs_aops; +		} else if (S_ISDIR(inode->i_mode)) { +			inode->i_op = &cramfs_dir_inode_operations; +			inode->i_fop = &cramfs_directory_operations; +		} else if (S_ISLNK(inode->i_mode)) { +			inode->i_op = &page_symlink_inode_operations; +			inode->i_data.a_ops = &cramfs_aops; +		} else { +			inode->i_size = 0; +			inode->i_blocks = 0; +			init_special_inode(inode, inode->i_mode, +				old_decode_dev(cramfs_inode->size)); +		}  		unlock_new_inode(inode);  	}  	return inode;  } +static void cramfs_drop_inode(struct inode *inode) +{ +	if (inode->i_ino == 1) +		generic_delete_inode(inode); +	else +		generic_drop_inode(inode); +} +  /*   * We have our own block cache: don't fill up the buffer cache   * with the rom-image, because the way the filesystem is set @@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {  	.put_super	= cramfs_put_super,  	.remount_fs	= cramfs_remount,  	.statfs		= cramfs_statfs, +	.drop_inode	= cramfs_drop_inode,  };  static int cramfs_get_sb(struct file_system_type *fs_type, diff --git a/fs/dcache.c b/fs/dcache.c index 101663d15e9f..e7a1a99b7464 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)   * If no entry exists with the exact case name, allocate new dentry with   * the exact case, and return the spliced entry.   */ -struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry, +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,  			struct qstr *name)  {  	int error; @@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)  		if (dentry->d_parent != parent)  			goto next; +		/* non-existing due to RCU? */ +		if (d_unhashed(dentry)) +			goto next; +  		/*  		 * It is safe to compare names since d_move() cannot  		 * change the qstr (protected by d_lock). 
@@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)  				goto next;  		} -		if (!d_unhashed(dentry)) { -			atomic_inc(&dentry->d_count); -			found = dentry; -		} +		atomic_inc(&dentry->d_count); +		found = dentry;  		spin_unlock(&dentry->d_lock);  		break;  next: diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7fad53..291abb11e20e 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei  	}  	unlock_kernel(); -	d_add(dentry, inode); -	return NULL; +	return d_splice_alias(inode, dentry);  }  static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, diff --git a/fs/exec.c b/fs/exec.c index 32993beecbe9..cecee501ce78 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)  	tsk->active_mm = mm;  	activate_mm(active_mm, mm);  	task_unlock(tsk); -	mm_update_next_owner(old_mm);  	arch_pick_mmap_layout(mm);  	if (old_mm) {  		up_read(&old_mm->mmap_sem);  		BUG_ON(active_mm != old_mm); +		mm_update_next_owner(old_mm);  		mmput(old_mm);  		return 0;  	} diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ae5004e93fc..e9fa960ba6da 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1626,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,  		free_blocks =  			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);  #endif +	if (free_blocks <= root_blocks) +		/* we don't have free space */ +		return 0;  	if (free_blocks - root_blocks < nblocks)  		return free_blocks - root_blocks;  	return nblocks; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d73c08b..ec8e33b45219 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent,  				get_dtype(sb, fname->file_type));  		if (error) {  			filp->f_pos = curr_pos; -			info->extra_fname = fname->next; +			info->extra_fname = fname;  		
	return error;  		}  		fname = fname->next; @@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp,  	 * If there are any leftover names on the hash collision  	 * chain, return them first.  	 */ -	if (info->extra_fname && -	    call_filldir(filp, dirent, filldir, info->extra_fname)) -		goto finished; +	if (info->extra_fname) { +		if (call_filldir(filp, dirent, filldir, info->extra_fname)) +			goto finished; -	if (!info->curr_node) +		info->extra_fname = NULL; +		info->curr_node = rb_next(info->curr_node); +		if (!info->curr_node) { +			if (info->next_hash == ~0) { +				filp->f_pos = EXT4_HTREE_EOF; +				goto finished; +			} +			info->curr_hash = info->next_hash; +			info->curr_minor_hash = 0; +		} +	} else if (!info->curr_node)  		info->curr_node = rb_first(&info->root);  	while (1) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c7924d9e358..295003241d3d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1072,6 +1072,8 @@ extern void ext4_set_inode_flags(struct inode *);  extern void ext4_get_inode_flags(struct ext4_inode_info *);  extern void ext4_set_aops(struct inode *inode);  extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);  extern int ext4_block_truncate_page(handle_t *handle,  		struct address_space *mapping, loff_t from);  extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1227,6 +1229,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;  /* extents.c */  extern int ext4_ext_tree_init(handle_t *handle, struct inode *);  extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, +				       int chunk);  extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,  			ext4_lblk_t iblock,  			unsigned long max_blocks, struct buffer_head 
*bh_result, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0a54b7..d33dc56d6986 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);  extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);  extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);  extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, +						   int num, +						   struct ext4_ext_path *path);  extern int ext4_ext_try_to_merge(struct inode *inode,  				 struct ext4_ext_path *path,  				 struct ext4_extent *); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3afe6e9..b455c685a98b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@  					 EXT4_XATTR_TRANS_BLOCKS - 2 + \  					 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb)	(EXT4_XATTR_TRANS_BLOCKS + \ +					2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +  /* Delete operations potentially hit one directory's namespace plus an   * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be   * generous.  We can grow the delete transaction later if necessary. */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 612c3d2c3824..b24d3c53f20c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1747,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,  }  /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. 
- * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem.   */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,  						struct ext4_ext_path *path)  { -	int depth, needed; -  	if (path) { +		int depth = ext_depth(inode); +		int ret = 0; +  		/* probably there is space in leaf? */ -		depth = ext_depth(inode);  		if (le16_to_cpu(path[depth].p_hdr->eh_entries) -				< le16_to_cpu(path[depth].p_hdr->eh_max)) -			return 1; -	} +				< le16_to_cpu(path[depth].p_hdr->eh_max)) { -	/* -	 * given 32-bit logical block (4294967296 blocks), max. tree -	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000. -	 * Let's also add one more level for imbalance. -	 */ -	depth = 5; - -	/* allocation of new data block(s) */ -	needed = 2; +			/* +			 *  There are some space in the leaf tree, no +			 *  need to account for leaf block credit +			 * +			 *  bitmaps and block group descriptor blocks +			 *  and other metadat blocks still need to be +			 *  accounted. 
+			 */ +			/* 1 bitmap, 1 block group descriptor */ +			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); +		} +	} -	/* -	 * tree can be full, so it would need to grow in depth: -	 * we need one credit to modify old root, credits for -	 * new root will be added in split accounting -	 */ -	needed += 1; +	return ext4_chunk_trans_blocks(inode, nrblocks); +} -	/* -	 * Index split can happen, we would need: -	 *    allocate intermediate indexes (bitmap + group) -	 *  + change two blocks at each level, but root (already included) -	 */ -	needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. 
+ */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ +	int index; +	int depth = ext_depth(inode); -	/* any allocation modifies superblock */ -	needed += 1; +	if (chunk) +		index = depth * 2; +	else +		index = depth * 3; -	return needed; +	return index;  }  static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1921,9 +1928,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,  			correct_index = 1;  			credits += (ext_depth(inode)) + 1;  		} -#ifdef CONFIG_QUOTA  		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif  		err = ext4_ext_journal_restart(handle, credits);  		if (err) @@ -2805,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode)  	/*  	 * probably first extent we're gonna free will be last in block  	 */ -	err = ext4_writepage_trans_blocks(inode) + 3; +	err = ext4_writepage_trans_blocks(inode);  	handle = ext4_journal_start(inode, err);  	if (IS_ERR(handle))  		return; @@ -2819,7 +2824,7 @@ void ext4_ext_truncate(struct inode *inode)  	down_write(&EXT4_I(inode)->i_data_sem);  	ext4_ext_invalidate_cache(inode); -	ext4_mb_discard_inode_preallocations(inode); +	ext4_discard_reservation(inode);  	/*  	 * TODO: optimization is possible here. 
@@ -2858,27 +2863,6 @@ out_stop:  	ext4_journal_stop(handle);  } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ -	int needed; - -	needed = ext4_ext_calc_credits_for_insert(inode, NULL); - -	/* caller wants to allocate num blocks, but note it includes sb */ -	needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA -	needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - -	return needed; -} -  static void ext4_falloc_update_inode(struct inode *inode,  				int mode, loff_t new_size, int update_ctime)  { @@ -2939,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)  	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)  							- block;  	/* -	 * credits to insert 1 extent into extent tree + buffers to be able to -	 * modify 1 super block, 1 block bitmap and 1 group descriptor. 
+	 * credits to insert 1 extent into extent tree  	 */ -	credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; +	credits = ext4_chunk_trans_blocks(inode, max_blocks);  	mutex_lock(&inode->i_mutex);  retry:  	while (ret >= 0 && ret < max_blocks) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 655e760212b8..f344834bbf58 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -351,7 +351,7 @@ find_close_to_parent:  			goto found_flexbg;  		} -		if (best_flex < 0 || +		if (flex_group[best_flex].free_inodes == 0 ||  		    (flex_group[i].free_blocks >  		     flex_group[best_flex].free_blocks &&  		     flex_group[i].free_inodes)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe899acc..7e91913e325b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,8 @@  #include "acl.h"  #include "ext4_extents.h" +#define MPAGE_DA_EXTENT_TAIL 0x01 +  static inline int ext4_begin_ordered_truncate(struct inode *inode,  					      loff_t new_size)  { @@ -1005,6 +1007,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)   */  static int ext4_calc_metadata_amount(struct inode *inode, int blocks)  { +	if (!blocks) +		return 0; +  	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)  		return ext4_ext_calc_metadata_amount(inode, blocks); @@ -1041,18 +1046,6 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)  	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);  } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. 
- */ -#define DIO_CREDITS 25 - -  /*   * The ext4_get_blocks_wrap() function try to look up the requested blocks,   * and returns if the blocks are already mapped. @@ -1164,19 +1157,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,  	return retval;  } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 +  static int ext4_get_block(struct inode *inode, sector_t iblock,  			struct buffer_head *bh_result, int create)  {  	handle_t *handle = ext4_journal_current_handle();  	int ret = 0, started = 0;  	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; +	int dio_credits;  	if (create && !handle) {  		/* Direct IO write... */  		if (max_blocks > DIO_MAX_BLOCKS)  			max_blocks = DIO_MAX_BLOCKS; -		handle = ext4_journal_start(inode, DIO_CREDITS + -			      2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); +		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); +		handle = ext4_journal_start(inode, dio_credits);  		if (IS_ERR(handle)) {  			ret = PTR_ERR(handle);  			goto out; @@ -1559,7 +1556,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free)  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);  	int total, mdb, mdb_free, release; +	if (!to_free) +		return;		/* Nothing to release, exit */ +  	spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + +	if (!EXT4_I(inode)->i_reserved_data_blocks) { +		/* +		 * if there is no reserved blocks, but we try to free some +		 * then the counter is messed up somewhere. 
+		 * but since this function is called from invalidate +		 * page, it's harmless to return without any action +		 */ +		printk(KERN_INFO "ext4 delalloc try to release %d reserved " +			    "blocks for inode %lu, but there is no reserved " +			    "data blocks\n", to_free, inode->i_ino); +		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +		return; +	} +  	/* recalculate the number of metablocks still need to be reserved */  	total = EXT4_I(inode)->i_reserved_data_blocks - to_free;  	mdb = ext4_calc_metadata_amount(inode, total); @@ -1613,11 +1628,13 @@ struct mpage_da_data {  	unsigned long first_page, next_page;	/* extent of pages */  	get_block_t *get_block;  	struct writeback_control *wbc; +	int io_done; +	long pages_written;  };  /*   * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() + * them with writepage() call back   *   * @mpd->inode: inode   * @mpd->first_page: first page of the extent @@ -1632,18 +1649,11 @@ struct mpage_da_data {  static int mpage_da_submit_io(struct mpage_da_data *mpd)  {  	struct address_space *mapping = mpd->inode->i_mapping; -	struct mpage_data mpd_pp = { -		.bio = NULL, -		.last_block_in_bio = 0, -		.get_block = mpd->get_block, -		.use_writepage = 1, -	};  	int ret = 0, err, nr_pages, i;  	unsigned long index, end;  	struct pagevec pvec;  	BUG_ON(mpd->next_page <= mpd->first_page); -  	pagevec_init(&pvec, 0);  	index = mpd->first_page;  	end = mpd->next_page - 1; @@ -1661,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)  				break;  			index++; -			err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - +			err = mapping->a_ops->writepage(page, mpd->wbc); +			if (!err) +				mpd->pages_written++;  			/*  			 * In error case, we have to continue because  			 * remaining pages are still locked @@ -1673,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)  		}  		pagevec_release(&pvec);  	} -	if (mpd_pp.bio) -		mpage_bio_submit(WRITE, 
mpd_pp.bio); -  	return ret;  } @@ -1698,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,  	int blocks = exbh->b_size >> inode->i_blkbits;  	sector_t pblock = exbh->b_blocknr, cur_logical;  	struct buffer_head *head, *bh; -	unsigned long index, end; +	pgoff_t index, end;  	struct pagevec pvec;  	int nr_pages, i; @@ -1741,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,  				if (buffer_delay(bh)) {  					bh->b_blocknr = pblock;  					clear_buffer_delay(bh); +					bh->b_bdev = inode->i_sb->s_bdev; +				} else if (buffer_unwritten(bh)) { +					bh->b_blocknr = pblock; +					clear_buffer_unwritten(bh); +					set_buffer_mapped(bh); +					set_buffer_new(bh); +					bh->b_bdev = inode->i_sb->s_bdev;  				} else if (buffer_mapped(bh))  					BUG_ON(bh->b_blocknr != pblock); @@ -1776,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode,   *   * The function skips space we know is already mapped to disk blocks.   
* - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage()   */  static void mpage_da_map_blocks(struct mpage_da_data *mpd)  { +	int err = 0;  	struct buffer_head *lbh = &mpd->lbh; -	int err = 0, remain = lbh->b_size;  	sector_t next = lbh->b_blocknr;  	struct buffer_head new; @@ -1792,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)  	if (buffer_mapped(lbh) && !buffer_delay(lbh))  		return; -	while (remain) { -		new.b_state = lbh->b_state; -		new.b_blocknr = 0; -		new.b_size = remain; -		err = mpd->get_block(mpd->inode, next, &new, 1); -		if (err) { -			/* -			 * Rather than implement own error handling -			 * here, we just leave remaining blocks -			 * unallocated and try again with ->writepage() -			 */ -			break; -		} -		BUG_ON(new.b_size == 0); +	new.b_state = lbh->b_state; +	new.b_blocknr = 0; +	new.b_size = lbh->b_size; -		if (buffer_new(&new)) -			__unmap_underlying_blocks(mpd->inode, &new); +	/* +	 * If we didn't accumulate anything +	 * to write simply return +	 */ +	if (!new.b_size) +		return; +	err = mpd->get_block(mpd->inode, next, &new, 1); +	if (err) +		return; +	BUG_ON(new.b_size == 0); -		/* -		 * If blocks are delayed marked, we need to -		 * put actual blocknr and drop delayed bit -		 */ -		if (buffer_delay(lbh)) -			mpage_put_bnr_to_bhs(mpd, next, &new); +	if (buffer_new(&new)) +		__unmap_underlying_blocks(mpd->inode, &new); -		/* go for the remaining blocks */ -		next += new.b_size >> mpd->inode->i_blkbits; -		remain -= new.b_size; -	} +	/* +	 * If blocks are delayed marked, we need to +	 * put actual blocknr and drop delayed bit +	 */ +	if (buffer_delay(lbh) || buffer_unwritten(lbh)) +		mpage_put_bnr_to_bhs(mpd, next, &new); + +	return;  } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ +		(1 << BH_Delay) | (1 << BH_Unwritten))  /*   * mpage_add_bh_to_extent - try to 
add one more block to extent of blocks @@ -1837,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)  static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,  				   sector_t logical, struct buffer_head *bh)  { -	struct buffer_head *lbh = &mpd->lbh;  	sector_t next; +	size_t b_size = bh->b_size; +	struct buffer_head *lbh = &mpd->lbh; +	int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; -	next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - +	/* check if thereserved journal credits might overflow */ +	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { +		if (nrblocks >= EXT4_MAX_TRANS_DATA) { +			/* +			 * With non-extent format we are limited by the journal +			 * credit available.  Total credit needed to insert +			 * nrblocks contiguous blocks is dependent on the +			 * nrblocks.  So limit nrblocks. +			 */ +			goto flush_it; +		} else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > +				EXT4_MAX_TRANS_DATA) { +			/* +			 * Adding the new buffer_head would make it cross the +			 * allowed limit for which we have journal credit +			 * reserved. So limit the new bh->b_size +			 */ +			b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << +						mpd->inode->i_blkbits; +			/* we will do mpage_da_submit_io in the next loop */ +		} +	}  	/*  	 * First block in the extent  	 */  	if (lbh->b_size == 0) {  		lbh->b_blocknr = logical; -		lbh->b_size = bh->b_size; +		lbh->b_size = b_size;  		lbh->b_state = bh->b_state & BH_FLAGS;  		return;  	} +	next = lbh->b_blocknr + nrblocks;  	/*  	 * Can we merge the block to our big extent?  	 
*/  	if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { -		lbh->b_size += bh->b_size; +		lbh->b_size += b_size;  		return;  	} +flush_it:  	/*  	 * We couldn't merge the block to our extent, so we  	 * need to flush current  extent and start new one  	 */  	mpage_da_map_blocks(mpd); - -	/* -	 * Now start a new extent -	 */ -	lbh->b_size = bh->b_size; -	lbh->b_state = bh->b_state & BH_FLAGS; -	lbh->b_blocknr = logical; +	mpage_da_submit_io(mpd); +	mpd->io_done = 1; +	return;  }  /* @@ -1891,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page,  	struct buffer_head *bh, *head, fake;  	sector_t logical; +	if (mpd->io_done) { +		/* +		 * Rest of the page in the page_vec +		 * redirty then and skip then. We will +		 * try to to write them again after +		 * starting a new transaction +		 */ +		redirty_page_for_writepage(wbc, page); +		unlock_page(page); +		return MPAGE_DA_EXTENT_TAIL; +	}  	/*  	 * Can we merge this page to current extent?  	 */  	if (mpd->next_page != page->index) {  		/*  		 * Nope, we can't. 
So, we map non-allocated blocks -		 * and start IO on them using __mpage_writepage() +		 * and start IO on them using writepage()  		 */  		if (mpd->next_page != mpd->first_page) {  			mpage_da_map_blocks(mpd);  			mpage_da_submit_io(mpd); +			/* +			 * skip rest of the page in the page_vec +			 */ +			mpd->io_done = 1; +			redirty_page_for_writepage(wbc, page); +			unlock_page(page); +			return MPAGE_DA_EXTENT_TAIL;  		}  		/* @@ -1932,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page,  		set_buffer_dirty(bh);  		set_buffer_uptodate(bh);  		mpage_add_bh_to_extent(mpd, logical, bh); +		if (mpd->io_done) +			return MPAGE_DA_EXTENT_TAIL;  	} else {  		/*  		 * Page with regular buffer heads, just add all dirty ones @@ -1940,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page,  		bh = head;  		do {  			BUG_ON(buffer_locked(bh)); -			if (buffer_dirty(bh)) +			if (buffer_dirty(bh) && +				(!buffer_mapped(bh) || buffer_delay(bh))) {  				mpage_add_bh_to_extent(mpd, logical, bh); +				if (mpd->io_done) +					return MPAGE_DA_EXTENT_TAIL; +			}  			logical++;  		} while ((bh = bh->b_this_page) != head);  	} @@ -1960,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page,   *   * This is a library function, which implements the writepages()   * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. 
- * - * See comments to mpage_writepages()   */  static int mpage_da_writepages(struct address_space *mapping,  			       struct writeback_control *wbc,  			       get_block_t get_block)  {  	struct mpage_da_data mpd; +	long to_write;  	int ret;  	if (!get_block) @@ -1989,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping,  	mpd.first_page = 0;  	mpd.next_page = 0;  	mpd.get_block = get_block; +	mpd.io_done = 0; +	mpd.pages_written = 0; + +	to_write = wbc->nr_to_write;  	ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);  	/*  	 * Handle last extent of pages  	 */ -	if (mpd.next_page != mpd.first_page) { +	if (!mpd.io_done && mpd.next_page != mpd.first_page) {  		mpage_da_map_blocks(&mpd);  		mpage_da_submit_io(&mpd);  	} +	wbc->nr_to_write = to_write - mpd.pages_written;  	return ret;  } @@ -2204,63 +2255,95 @@ static int ext4_da_writepage(struct page *page,  }  /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation.   */ -#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ +	int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + +	/* +	 * With non-extent format the journal credit needed to +	 * insert nrblocks contiguous block is dependent on +	 * number of contiguous block. 
So we will limit +	 * number of contiguous block to a sane value +	 */ +	if (!(inode->i_flags & EXT4_EXTENTS_FL) && +	    (max_blocks > EXT4_MAX_TRANS_DATA)) +		max_blocks = EXT4_MAX_TRANS_DATA; + +	return ext4_chunk_trans_blocks(inode, max_blocks); +}  static int ext4_da_writepages(struct address_space *mapping, -				struct writeback_control *wbc) +			      struct writeback_control *wbc)  { -	struct inode *inode = mapping->host;  	handle_t *handle = NULL; -	int needed_blocks; -	int ret = 0; -	long to_write;  	loff_t range_start = 0; +	struct inode *inode = mapping->host; +	int needed_blocks, ret = 0, nr_to_writebump = 0; +	long to_write, pages_skipped = 0; +	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);  	/*  	 * No pages to write? This is mainly a kludge to avoid starting  	 * a transaction for special inodes like journal inode on last iput()  	 * because that could violate lock ordering on umount  	 */ -	if (!mapping->nrpages) +	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))  		return 0; -  	/* -	 * Estimate the worse case needed credits to write out -	 * EXT4_MAX_BUF_BLOCKS pages +	 * Make sure nr_to_write is >= sbi->s_mb_stream_request +	 * This make sure small files blocks are allocated in +	 * single attempt. This ensure that small files +	 * get less fragmented.  	 
*/ -	needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; +	if (wbc->nr_to_write < sbi->s_mb_stream_request) { +		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; +		wbc->nr_to_write = sbi->s_mb_stream_request; +	} -	to_write = wbc->nr_to_write; -	if (!wbc->range_cyclic) { +	if (!wbc->range_cyclic)  		/*  		 * If range_cyclic is not set force range_cont  		 * and save the old writeback_index  		 */  		wbc->range_cont = 1; -		range_start =  wbc->range_start; -	} -	while (!ret && to_write) { +	range_start =  wbc->range_start; +	pages_skipped = wbc->pages_skipped; + +restart_loop: +	to_write = wbc->nr_to_write; +	while (!ret && to_write > 0) { + +		/* +		 * we  insert one extent at a time. So we need +		 * credit needed for single extent allocation. +		 * journalled mode is currently not supported +		 * by delalloc +		 */ +		BUG_ON(ext4_should_journal_data(inode)); +		needed_blocks = ext4_da_writepages_trans_blocks(inode); +  		/* start a new transaction*/  		handle = ext4_journal_start(inode, needed_blocks);  		if (IS_ERR(handle)) {  			ret = PTR_ERR(handle); +			printk(KERN_EMERG "%s: jbd2_start: " +			       "%ld pages, ino %lu; err %d\n", __func__, +				wbc->nr_to_write, inode->i_ino, ret); +			dump_stack();  			goto out_writepages;  		}  		if (ext4_should_order_data(inode)) {  			/*  			 * With ordered mode we need to add -			 * the inode to the journal handle +			 * the inode to the journal handl  			 * when we do block allocation.  			 
*/  			ret = ext4_jbd2_file_inode(handle, inode); @@ -2268,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping,  				ext4_journal_stop(handle);  				goto out_writepages;  			} -  		} -		/* -		 * set the max dirty pages could be write at a time -		 * to fit into the reserved transaction credits -		 */ -		if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) -			wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;  		to_write -= wbc->nr_to_write;  		ret = mpage_da_writepages(mapping, wbc, -						ext4_da_get_block_write); +					  ext4_da_get_block_write);  		ext4_journal_stop(handle); -		if (wbc->nr_to_write) { +		if (ret == MPAGE_DA_EXTENT_TAIL) { +			/* +			 * got one extent now try with +			 * rest of the pages +			 */ +			to_write += wbc->nr_to_write; +			ret = 0; +		} else if (wbc->nr_to_write) {  			/*  			 * There is no more writeout needed  			 * or we requested for a noblocking writeout @@ -2293,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping,  		wbc->nr_to_write = to_write;  	} -out_writepages: -	wbc->nr_to_write = to_write; -	if (range_start) +	if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { +		/* We skipped pages in this loop */  		wbc->range_start = range_start; +		wbc->nr_to_write = to_write + +				wbc->pages_skipped - pages_skipped; +		wbc->pages_skipped = pages_skipped; +		goto restart_loop; +	} + +out_writepages: +	wbc->nr_to_write = to_write - nr_to_writebump; +	wbc->range_start = range_start;  	return ret;  } @@ -3486,6 +3577,9 @@ void ext4_truncate(struct inode *inode)  	 * modify the block allocation tree.  	 
*/  	down_write(&ei->i_data_sem); + +	ext4_discard_reservation(inode); +  	/*  	 * The orphan list entry will now protect us from any crash which  	 * occurs before the truncate completes, so it is now safe to propagate @@ -3555,8 +3649,6 @@ do_indirects:  		;  	} -	ext4_discard_reservation(inode); -  	up_write(&ei->i_data_sem);  	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);  	ext4_mark_inode_dirty(handle, inode); @@ -4324,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,  	return 0;  } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, +				      int chunk) +{ +	int indirects; + +	/* if nrblocks are contiguous */ +	if (chunk) { +		/* +		 * With N contiguous data blocks, it need at most +		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks +		 * 2 dindirect blocks +		 * 1 tindirect block +		 */ +		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); +		return indirects + 3; +	} +	/* +	 * if nrblocks are not contiguous, worse case, each block touch +	 * a indirect block, and each indirect block touch a double indirect +	 * block, plus a triple indirect block +	 */ +	indirects = nrblocks * 2 + 1; +	return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ +	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) +		return ext4_indirect_trans_blocks(inode, nrblocks, 0); +	return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +}  /* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. 
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups   * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary.   * - * With ordered or writeback data it's the same, less the N data blocks. + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ +	int groups, gdpblocks; +	int idxblocks; +	int ret = 0; + +	/* +	 * How many index blocks need to touch to modify nrblocks? +	 * The "Chunk" flag indicating whether the nrblocks is +	 * physically contiguous on disk +	 * +	 * For Direct IO and fallocate, they calls get_block to allocate +	 * one single extent at a time, so they could set the "Chunk" flag +	 */ +	idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + +	ret = idxblocks; + +	/* +	 * Now let's see how many group bitmaps and group descriptors need +	 * to account +	 */ +	groups = idxblocks; +	if (chunk) +		groups += 1; +	else +		groups += nrblocks; + +	gdpblocks = groups; +	if (groups > EXT4_SB(inode->i_sb)->s_groups_count) +		groups = EXT4_SB(inode->i_sb)->s_groups_count; +	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) +		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + +	/* bitmaps and block group descriptor blocks */ +	ret += groups + gdpblocks; + +	/* Blocks for super block, inode, quota and xattr blocks */ +	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + +	return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations.   
* - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * This could be called via ext4_write_begin()   * - * This still overestimates under most circumstances.  If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched.  Pah. + * We need to consider the worse case, when + * one new block per extent.   */ -  int ext4_writepage_trans_blocks(struct inode *inode)  {  	int bpp = ext4_journal_blocks_per_page(inode); -	int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;  	int ret; -	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) -		return ext4_ext_writepage_trans_blocks(inode, bpp); +	ret = ext4_meta_trans_blocks(inode, bpp, 0); +	/* Account for data blocks for journalled mode */  	if (ext4_should_journal_data(inode)) -		ret = 3 * (bpp + indirects) + 2; -	else -		ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA -	/* We know that structure was already allocated during DQUOT_INIT so -	 * we will be updating only the data blocks + inodes */ -	ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - +		ret += bpp;  	return ret;  }  /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ +	return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/*   * The caller must have previously called ext4_reserve_inode_write().   * Give this, we know that the caller already has write access to iloc->bh.   
*/ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 865e9ddb44d4..e0e3a5eb1ddb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3282,6 +3282,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,  }  /* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, +			struct ext4_prealloc_space *pa, +			struct ext4_prealloc_space *cpa) +{ +	ext4_fsblk_t cur_distance, new_distance; + +	if (cpa == NULL) { +		atomic_inc(&pa->pa_count); +		return pa; +	} +	cur_distance = abs(goal_block - cpa->pa_pstart); +	new_distance = abs(goal_block - pa->pa_pstart); + +	if (cur_distance < new_distance) +		return cpa; + +	/* drop the previous reference */ +	atomic_dec(&cpa->pa_count); +	atomic_inc(&pa->pa_count); +	return pa; +} + +/*   * search goal blocks in preallocated space   */  static noinline_for_stack int @@ -3290,7 +3319,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)  	int order, i;  	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);  	struct ext4_locality_group *lg; -	struct ext4_prealloc_space *pa; +	struct ext4_prealloc_space *pa, *cpa = NULL; +	ext4_fsblk_t goal_block;  	/* only data can be preallocated */  	if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3333,6 +3363,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)  		/* The max size of hash table is PREALLOC_TB_SIZE */  		order = PREALLOC_TB_SIZE - 1; +	goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + +		     ac->ac_g_ex.fe_start + +		     le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); +	/* +	 * search for the prealloc space that is having +	 * minimal distance from the goal block. 
+	 */  	for (i = order; i < PREALLOC_TB_SIZE; i++) {  		rcu_read_lock();  		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], @@ -3340,17 +3377,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)  			spin_lock(&pa->pa_lock);  			if (pa->pa_deleted == 0 &&  					pa->pa_free >= ac->ac_o_ex.fe_len) { -				atomic_inc(&pa->pa_count); -				ext4_mb_use_group_pa(ac, pa); -				spin_unlock(&pa->pa_lock); -				ac->ac_criteria = 20; -				rcu_read_unlock(); -				return 1; + +				cpa = ext4_mb_check_group_pa(goal_block, +								pa, cpa);  			}  			spin_unlock(&pa->pa_lock);  		}  		rcu_read_unlock();  	} +	if (cpa) { +		ext4_mb_use_group_pa(ac, cpa); +		ac->ac_criteria = 20; +		return 1; +	}  	return 0;  } diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077ba07e9..46fc0b5b12ba 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,  	 * credit. But below we try to not accumalate too much  	 * of them by restarting the journal.  	 
*/ -	needed = ext4_ext_calc_credits_for_insert(inode, path); +	needed = ext4_ext_calc_credits_for_single_extent(inode, +		    lb->last_block - lb->first_block + 1, path);  	/*  	 * Make sure the credit we accumalated is not really high diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0a9265164265..b3d35604ea18 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)  	if (reserved_gdb || gdb_off == 0) {  		if (!EXT4_HAS_COMPAT_FEATURE(sb, -					     EXT4_FEATURE_COMPAT_RESIZE_INODE)){ +					     EXT4_FEATURE_COMPAT_RESIZE_INODE) +		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {  			ext4_warning(sb, __func__,  				     "No reserved GDT blocks, can't resize");  			return -EPERM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d5d77958b861..566344b926b7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)  #endif  	ei->i_block_alloc_info = NULL;  	ei->vfs_inode.i_version = 1; +	ei->vfs_inode.i_data.writeback_index = 0;  	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));  	INIT_LIST_HEAD(&ei->i_prealloc_list);  	spin_lock_init(&ei->i_prealloc_lock); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6d266d793e2c..80ff3381fa21 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -562,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)  	struct buffer_head *bh;  	struct msdos_dir_entry *raw_entry;  	loff_t i_pos; -	int err = 0; +	int err;  retry:  	i_pos = MSDOS_I(inode)->i_pos;  	if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)  		return 0; -	lock_super(sb);  	bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);  	if (!bh) {  		printk(KERN_ERR "FAT: unable to read inode block "  		       "for updating (i_pos %lld)\n", i_pos); -		err = -EIO; -		goto out; +		return -EIO;  	}  	spin_lock(&sbi->inode_hash_lock);  	if (i_pos != MSDOS_I(inode)->i_pos) {  		
spin_unlock(&sbi->inode_hash_lock);  		brelse(bh); -		unlock_super(sb);  		goto retry;  	} @@ -607,11 +604,10 @@ retry:  	}  	spin_unlock(&sbi->inode_hash_lock);  	mark_buffer_dirty(bh); +	err = 0;  	if (wait)  		err = sync_dirty_buffer(bh);  	brelse(bh); -out: -	unlock_super(sb);  	return err;  } diff --git a/fs/inode.c b/fs/inode.c index b6726f644530..0487ddba1397 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)  		mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);  		mapping->assoc_mapping = NULL;  		mapping->backing_dev_info = &default_backing_dev_info; +		mapping->writeback_index = 0;  		/*  		 * If the block_device provides a backing_dev_info for client diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 60249429a253..d85c7d931cdf 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -323,7 +323,7 @@ out:  }  /* - * remove_kevent - cleans up and ultimately frees the given kevent + * remove_kevent - cleans up the given kevent   *   * Caller must hold dev->ev_mutex.   */ @@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,  	dev->event_count--;  	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} +/* + * free_kevent - frees the given kevent. 
+ */ +static void free_kevent(struct inotify_kernel_event *kevent) +{  	kfree(kevent->name);  	kmem_cache_free(event_cachep, kevent);  } @@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)  		struct inotify_kernel_event *kevent;  		kevent = inotify_dev_get_event(dev);  		remove_kevent(dev, kevent); +		free_kevent(kevent);  	}  } @@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  	dev = file->private_data;  	while (1) { -		int events;  		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);  		mutex_lock(&dev->ev_mutex); -		events = !list_empty(&dev->events); -		mutex_unlock(&dev->ev_mutex); -		if (events) { +		if (!list_empty(&dev->events)) {  			ret = 0;  			break;  		} +		mutex_unlock(&dev->ev_mutex);  		if (file->f_flags & O_NONBLOCK) {  			ret = -EAGAIN; @@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  	if (ret)  		return ret; -	mutex_lock(&dev->ev_mutex);  	while (1) {  		struct inotify_kernel_event *kevent; @@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  			}  			break;  		} +		remove_kevent(dev, kevent); + +		/* +		 * Must perform the copy_to_user outside the mutex in order +		 * to avoid a lock order reversal with mmap_sem. 
+		 */ +		mutex_unlock(&dev->ev_mutex);  		if (copy_to_user(buf, &kevent->event, event_size)) {  			ret = -EFAULT; @@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,  			count -= kevent->event.len;  		} -		remove_kevent(dev, kevent); +		free_kevent(kevent); + +		mutex_lock(&dev->ev_mutex);  	}  	mutex_unlock(&dev->ev_mutex); diff --git a/fs/ioprio.c b/fs/ioprio.c index c4a1c3c65aac..da3cc460d4df 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)  				pgrp = task_pgrp(current);  			else  				pgrp = find_vpid(who); -			do_each_pid_task(pgrp, PIDTYPE_PGID, p) { +			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {  				ret = set_task_ioprio(p, ioprio);  				if (ret)  					break; -			} while_each_pid_task(pgrp, PIDTYPE_PGID, p); +			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);  			break;  		case IOPRIO_WHO_USER:  			if (!who) @@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)  				pgrp = task_pgrp(current);  			else  				pgrp = find_vpid(who); -			do_each_pid_task(pgrp, PIDTYPE_PGID, p) { +			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {  				tmpio = get_task_ioprio(p);  				if (tmpio < 0)  					continue; @@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)  					ret = tmpio;  				else  					ret = ioprio_best(ret, tmpio); -			} while_each_pid_task(pgrp, PIDTYPE_PGID, p); +			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);  			break;  		case IOPRIO_WHO_USER:  			if (!who) diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdde..4c41db91eaa4 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@  #ifndef _JFFS2_FS_I  #define _JFFS2_FS_I -#include <linux/version.h>  #include <linux/rbtree.h>  #include <linux/posix_acl.h>  #include <linux/mutex.h> diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9abcd2b329f7..e9b20173fef3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1279,6 
+1279,12 @@ static int nfs_parse_mount_options(char *raw,  		}  	} +	if (errors > 0) { +		dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", +				errors, (errors == 1 ? "" : "s")); +		if (!sloppy) +			return 0; +	}  	return 1;  out_nomem: diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed38380ab8..54b8b4140c8f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)  	 * enough space for either:  	 */  	alloc = sizeof(struct posix_ace_state_array) -		+ cnt*sizeof(struct posix_ace_state); +		+ cnt*sizeof(struct posix_user_ace_state);  	state->users = kzalloc(alloc, GFP_KERNEL);  	if (!state->users)  		return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2e51adac65de..e5b51ffafc6c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,  	int		slack_bytes;  	__be32		status; -	status = nfserr_resource; -	cstate = cstate_alloc(); -	if (cstate == NULL) -		goto out; -  	resp->xbuf = &rqstp->rq_res;  	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;  	resp->tagp = resp->p; @@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,  	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)  		goto out; +	status = nfserr_resource; +	cstate = cstate_alloc(); +	if (cstate == NULL) +		goto out; +  	status = nfs_ok;  	while (!status && resp->opcnt < args->opcnt) {  		op = &args->ops[resp->opcnt++]; @@ -957,9 +957,9 @@ encode_op:  		nfsd4_increment_op_stats(op->opnum);  	} +	cstate_free(cstate);  out:  	nfsd4_release_compoundargs(args); -	cstate_free(cstate);  	dprintk("nfsv4 compound returned %d\n", ntohl(status));  	return status;  } diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8b1650..9e8a95be7a1e 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,  	// TODO: Consider 
moving this lot to a separate function! (AIA)  handle_name:     { -	struct dentry *real_dent, *new_dent;  	MFT_RECORD *m;  	ntfs_attr_search_ctx *ctx;  	ntfs_inode *ni = NTFS_I(dent_inode); @@ -255,93 +254,9 @@ handle_name:  	}  	nls_name.hash = full_name_hash(nls_name.name, nls_name.len); -	/* -	 * Note: No need for dent->d_lock lock as i_mutex is held on the -	 * parent inode. -	 */ - -	/* Does a dentry matching the nls_name exist already? */ -	real_dent = d_lookup(dent->d_parent, &nls_name); -	/* If not, create it now. */ -	if (!real_dent) { -		real_dent = d_alloc(dent->d_parent, &nls_name); -		kfree(nls_name.name); -		if (!real_dent) { -			err = -ENOMEM; -			goto err_out; -		} -		new_dent = d_splice_alias(dent_inode, real_dent); -		if (new_dent) -			dput(real_dent); -		else -			new_dent = real_dent; -		ntfs_debug("Done.  (Created new dentry.)"); -		return new_dent; -	} +	dent = d_add_ci(dent, dent_inode, &nls_name);  	kfree(nls_name.name); -	/* Matching dentry exists, check if it is negative. */ -	if (real_dent->d_inode) { -		if (unlikely(real_dent->d_inode != dent_inode)) { -			/* This can happen because bad inodes are unhashed. */ -			BUG_ON(!is_bad_inode(dent_inode)); -			BUG_ON(!is_bad_inode(real_dent->d_inode)); -		} -		/* -		 * Already have the inode and the dentry attached, decrement -		 * the reference count to balance the ntfs_iget() we did -		 * earlier on.  We found the dentry using d_lookup() so it -		 * cannot be disconnected and thus we do not need to worry -		 * about any NFS/disconnectedness issues here. -		 */ -		iput(dent_inode); -		ntfs_debug("Done.  (Already had inode and dentry.)"); -		return real_dent; -	} -	/* -	 * Negative dentry: instantiate it unless the inode is a directory and -	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), -	 * in which case d_move() that in place of the found dentry. -	 */ -	if (!S_ISDIR(dent_inode->i_mode)) { -		/* Not a directory; everything is easy. 
*/ -		d_instantiate(real_dent, dent_inode); -		ntfs_debug("Done.  (Already had negative file dentry.)"); -		return real_dent; -	} -	spin_lock(&dcache_lock); -	if (list_empty(&dent_inode->i_dentry)) { -		/* -		 * Directory without a 'disconnected' dentry; we need to do -		 * d_instantiate() by hand because it takes dcache_lock which -		 * we already hold. -		 */ -		list_add(&real_dent->d_alias, &dent_inode->i_dentry); -		real_dent->d_inode = dent_inode; -		spin_unlock(&dcache_lock); -		security_d_instantiate(real_dent, dent_inode); -		ntfs_debug("Done.  (Already had negative directory dentry.)"); -		return real_dent; -	} -	/* -	 * Directory with a 'disconnected' dentry; get a reference to the -	 * 'disconnected' dentry. -	 */ -	new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, -			d_alias); -	dget_locked(new_dent); -	spin_unlock(&dcache_lock); -	/* Do security vodoo. */ -	security_d_instantiate(real_dent, dent_inode); -	/* Move new_dent in place of real_dent. */ -	d_move(new_dent, real_dent); -	/* Balance the ntfs_iget() we did above. */ -	iput(dent_inode); -	/* Throw away real_dent. */ -	dput(real_dent); -	/* Use new_dent as the actual dentry. */ -	ntfs_debug("Done.  (Already had negative, disconnected directory " -			"dentry.)"); -	return new_dent; +	return dent;  eio_err_out:  	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75351e8..4087fbdac327 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -113,7 +113,7 @@ typedef struct {   * Reason flags (32-bit).  Cumulative flags describing the change(s) to the   * file since it was last opened.  
I think the names speak for themselves but   * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * documentation: http://www.linux-ntfs.org/   */  enum {  	USN_REASON_DATA_OVERWRITE	= const_cpu_to_le32(0x00000001), @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;   * Source info flags (32-bit).  Information about the source of the change(s)   * to the file.  For detailed descriptions of what these mean, see the Linux   * NTFS project NTFS documentation: - *	http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + *	http://www.linux-ntfs.org/   */  enum {  	USN_SOURCE_DATA_MANAGEMENT	  = const_cpu_to_le32(0x00000001), diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 506c24fb5078..a53da1466277 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -594,7 +594,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,  		goto bail;  	} -	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { +	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {  		ocfs2_error(inode->i_sb,  			    "Inode %llu has a hole at block %llu\n",  			    (unsigned long long)OCFS2_I(inode)->ip_blkno, diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index d8bfa0eb41b2..52276c02f710 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)  			   "  message id:   %d\n"  			   "  message type: %u\n"  			   "  message key:  0x%08x\n" -			   "  sock acquiry: %lu.%lu\n" -			   "  send start:   %lu.%lu\n" -			   "  wait start:   %lu.%lu\n", +			   "  sock acquiry: %lu.%ld\n" +			   "  send start:   %lu.%ld\n" +			   "  wait start:   %lu.%ld\n",  			   nst, (unsigned long)nst->st_task->pid,  			   (unsigned long)nst->st_task->tgid,  			   nst->st_task->comm, nst->st_node,  			   nst->st_sc, nst->st_id, nst->st_msg_type,  			   
nst->st_msg_key,  			   nst->st_sock_time.tv_sec, -			   (unsigned long)nst->st_sock_time.tv_usec, +			   (long)nst->st_sock_time.tv_usec,  			   nst->st_send_time.tv_sec, -			   (unsigned long)nst->st_send_time.tv_usec, +			   (long)nst->st_send_time.tv_usec,  			   nst->st_status_time.tv_sec, -			   nst->st_status_time.tv_usec); +			   (long)nst->st_status_time.tv_usec);  	}  	spin_unlock(&o2net_debug_lock); @@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)  	return sc; /* unused, just needs to be null when done */  } -#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec  static int sc_seq_show(struct seq_file *seq, void *v)  { @@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)  			   "  remote node:     %s\n"  			   "  page off:        %zu\n"  			   "  handshake ok:    %u\n" -			   "  timer:           %lu.%lu\n" -			   "  data ready:      %lu.%lu\n" -			   "  advance start:   %lu.%lu\n" -			   "  advance stop:    %lu.%lu\n" -			   "  func start:      %lu.%lu\n" -			   "  func stop:       %lu.%lu\n" +			   "  timer:           %lu.%ld\n" +			   "  data ready:      %lu.%ld\n" +			   "  advance start:   %lu.%ld\n" +			   "  advance stop:    %lu.%ld\n" +			   "  func start:      %lu.%ld\n" +			   "  func stop:       %lu.%ld\n"  			   "  func key:        %u\n"  			   "  func type:       %u\n",  			   sc, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d61581bd6..2bcf706d9dd3 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);  static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);  #ifdef CONFIG_DEBUG_FS -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, -		    u32 msgkey, struct task_struct *task, u8 node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, +			   u32 
msgkey, struct task_struct *task, u8 node)  {  	INIT_LIST_HEAD(&nst->st_net_debug_item);  	nst->st_task = task; @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,  	nst->st_node = node;  } -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)  {  	do_gettimeofday(&nst->st_sock_time);  } -void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)  {  	do_gettimeofday(&nst->st_send_time);  } -void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)  {  	do_gettimeofday(&nst->st_status_time);  } -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,  					 struct o2net_sock_container *sc)  {  	nst->st_sc = sc;  } -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)  {  	nst->st_id = msg_id;  } + +#else  /* CONFIG_DEBUG_FS */ + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, +				  u32 msgkey, struct task_struct *task, u8 node) +{ +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +						struct o2net_sock_container *sc) +{ +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, +					u32 msg_id) +{ +} +  #endif /* CONFIG_DEBUG_FS */  static inline int o2net_reconnect_delay(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 
18307ff81b77..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,42 +224,10 @@ struct o2net_send_tracking {  	struct timeval			st_send_time;  	struct timeval			st_status_time;  }; - -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, -		    u32 msgkey, struct task_struct *task, u8 node); -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); -void o2net_set_nst_send_time(struct o2net_send_tracking *nst); -void o2net_set_nst_status_time(struct o2net_send_tracking *nst); -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, -				  struct o2net_sock_container *sc); -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); -  #else  struct o2net_send_tracking {  	u32	dummy;  }; - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, -				  u32 msgkey, struct task_struct *task, u8 node) -{ -} -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, -						struct o2net_sock_container *sc) -{ -} -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, -					u32 msg_id) -{ -}  #endif	/* CONFIG_DEBUG_FS */  #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a1875848080..9cce563fd627 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1300,7 +1300,6 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  	di->i_size = cpu_to_le64(sb->s_blocksize);  	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);  	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); -	dir->i_blocks = ocfs2_inode_sector_count(dir);  	/*  	 * This should never fail as our extent list is 
empty and all @@ -1310,9 +1309,15 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  				  NULL);  	if (ret) {  		mlog_errno(ret); -		goto out; +		goto out_commit;  	} +	/* +	 * Set i_blocks after the extent insert for the most up to +	 * date ip_clusters value. +	 */ +	dir->i_blocks = ocfs2_inode_sector_count(dir); +  	ret = ocfs2_journal_dirty(handle, di_bh);  	if (ret) {  		mlog_errno(ret); @@ -1336,7 +1341,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,  					  len, 0, NULL);  		if (ret) {  			mlog_errno(ret); -			goto out; +			goto out_commit;  		}  	} diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7a37240f7a31..c47bc2a809c2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1418,13 +1418,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)  {  	unsigned int node_num;  	int status, i; +	u32 gen;  	struct buffer_head *bh = NULL;  	struct ocfs2_dinode *di;  	/* This is called with the super block cluster lock, so we  	 * know that the slot map can't change underneath us. 
*/ -	spin_lock(&osb->osb_lock);  	for (i = 0; i < osb->max_slots; i++) {  		/* Read journal inode to get the recovery generation */  		status = ocfs2_read_journal_inode(osb, i, &bh, NULL); @@ -1433,23 +1433,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)  			goto bail;  		}  		di = (struct ocfs2_dinode *)bh->b_data; -		osb->slot_recovery_generations[i] = -					ocfs2_get_recovery_generation(di); +		gen = ocfs2_get_recovery_generation(di);  		brelse(bh);  		bh = NULL; +		spin_lock(&osb->osb_lock); +		osb->slot_recovery_generations[i] = gen; +  		mlog(0, "Slot %u recovery generation is %u\n", i,  		     osb->slot_recovery_generations[i]); -		if (i == osb->slot_num) +		if (i == osb->slot_num) { +			spin_unlock(&osb->osb_lock);  			continue; +		}  		status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); -		if (status == -ENOENT) +		if (status == -ENOENT) { +			spin_unlock(&osb->osb_lock);  			continue; +		} -		if (__ocfs2_recovery_map_test(osb, node_num)) +		if (__ocfs2_recovery_map_test(osb, node_num)) { +			spin_unlock(&osb->osb_lock);  			continue; +		}  		spin_unlock(&osb->osb_lock);  		/* Ok, we have a slot occupied by another node which @@ -1465,10 +1473,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)  			mlog_errno(status);  			goto bail;  		} - -		spin_lock(&osb->osb_lock);  	} -	spin_unlock(&osb->osb_lock);  	status = 0;  bail: diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149ae5e3a..07f348b8d721 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,  		goto out;  	} -	/* Ok, the stack is pinned */ -	p->sp_count++;  	active_stack = p; -  	rc = 0;  out: +	/* If we found it, pin it */ +	if (!rc) +		active_stack->sp_count++; +  	spin_unlock(&ocfs2_stack_lock);  	return rc;  } diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 697663b01bae..e1c0ec0ae989 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -92,7 +92,7 @@ int 
omfs_allocate_block(struct super_block *sb, u64 block)  	struct buffer_head *bh;  	struct omfs_sb_info *sbi = OMFS_SB(sb);  	int bits_per_entry = 8 * sb->s_blocksize; -	int map, bit; +	unsigned int map, bit;  	int ret = 0;  	u64 tmp; @@ -176,7 +176,8 @@ int omfs_clear_range(struct super_block *sb, u64 block, int count)  	struct omfs_sb_info *sbi = OMFS_SB(sb);  	int bits_per_entry = 8 * sb->s_blocksize;  	u64 tmp; -	int map, bit, ret; +	unsigned int map, bit; +	int ret;  	tmp = block;  	bit = do_div(tmp, bits_per_entry); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 7e2499053e4d..834b2331f6b3 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -26,6 +26,13 @@ static int omfs_sync_file(struct file *file, struct dentry *dentry,  	return err ? -EIO : 0;  } +static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) +{ +	return (sbi->s_sys_blocksize - offset - +		sizeof(struct omfs_extent)) / +		sizeof(struct omfs_extent_entry) + 1; +} +  void omfs_make_empty_table(struct buffer_head *bh, int offset)  {  	struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset]; @@ -45,6 +52,7 @@ int omfs_shrink_inode(struct inode *inode)  	struct buffer_head *bh;  	u64 next, last;  	u32 extent_count; +	u32 max_extents;  	int ret;  	/* traverse extent table, freeing each entry that is greater @@ -62,15 +70,18 @@ int omfs_shrink_inode(struct inode *inode)  		goto out;  	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); +	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);  	for (;;) { -		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) { -			brelse(bh); -			goto out; -		} +		if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next)) +			goto out_brelse;  		extent_count = be32_to_cpu(oe->e_extent_count); + +		if (extent_count > max_extents) +			goto out_brelse; +  		last = next;  		next = be64_to_cpu(oe->e_next);  		entry = &oe->e_entry; @@ -98,10 +109,14 @@ int omfs_shrink_inode(struct inode *inode)  		if (!bh)  			goto out;  		oe 
= (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); +		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);  	}  	ret = 0;  out:  	return ret; +out_brelse: +	brelse(bh); +	return ret;  }  static void omfs_truncate(struct inode *inode) @@ -154,9 +169,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,  			goto out;  		}  	} -	max_count = (sbi->s_sys_blocksize - OMFS_EXTENT_START - -		sizeof(struct omfs_extent)) / -		sizeof(struct omfs_extent_entry) + 1; +	max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);  	/* TODO: add a continuation block here */  	if (be32_to_cpu(oe->e_extent_count) > max_count-1) @@ -225,6 +238,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,  	sector_t next, offset;  	int ret;  	u64 new_block; +	u32 max_extents;  	int extent_count;  	struct omfs_extent *oe;  	struct omfs_extent_entry *entry; @@ -238,6 +252,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,  		goto out;  	oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]); +	max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);  	next = inode->i_ino;  	for (;;) { @@ -249,6 +264,9 @@ static int omfs_get_block(struct inode *inode, sector_t block,  		next = be64_to_cpu(oe->e_next);  		entry = &oe->e_entry; +		if (extent_count > max_extents) +			goto out_brelse; +  		offset = find_block(inode, entry, block, extent_count, &remain);  		if (offset > 0) {  			ret = 0; @@ -266,6 +284,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,  		if (!bh)  			goto out;  		oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]); +		max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);  	}  	if (create) {  		ret = omfs_grow_extent(inode, oe, &new_block); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index a95fe5984f4b..d29047b1b9b0 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -232,8 +232,7 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)  		inode->i_mode = S_IFDIR | (S_IRWXUGO & 
~sbi->s_dmask);  		inode->i_op = &omfs_dir_inops;  		inode->i_fop = &omfs_dir_operations; -		inode->i_size = be32_to_cpu(oi->i_head.h_body_size) + -			sizeof(struct omfs_header); +		inode->i_size = sbi->s_sys_blocksize;  		inc_nlink(inode);  		break;  	case OMFS_FILE: diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7d6b34e201db..ecc3330972e5 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -499,9 +499,9 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)  		if (!size)  			continue;  		if (from + size > get_capacity(disk)) { -			printk(KERN_ERR " %s: p%d exceeds device capacity\n", +			printk(KERN_WARNING +				"%s: p%d exceeds device capacity\n",  				disk->disk_name, p); -			continue;  		}  		res = add_partition(disk, p, from, size, state->parts[p].flags);  		if (res) { diff --git a/fs/proc/array.c b/fs/proc/array.c index 0d6eb33597c6..71c9be59c9c2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -337,65 +337,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,  	return 0;  } -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -static cputime_t task_utime(struct task_struct *p) -{ -	return p->utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ -	return p->stime; -} -#else -static cputime_t task_utime(struct task_struct *p) -{ -	clock_t utime = cputime_to_clock_t(p->utime), -		total = utime + cputime_to_clock_t(p->stime); -	u64 temp; - -	/* -	 * Use CFS's precise accounting: -	 */ -	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - -	if (total) { -		temp *= utime; -		do_div(temp, total); -	} -	utime = (clock_t)temp; - -	p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); -	return p->prev_utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ -	clock_t stime; - -	/* -	 * Use CFS's precise accounting. 
(we subtract utime from -	 * the total, to make sure the total observed by userspace -	 * grows monotonically - apps rely on that): -	 */ -	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - -			cputime_to_clock_t(task_utime(p)); - -	if (stime >= 0) -		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - -	return p->prev_stime; -} -#endif - -static cputime_t task_gtime(struct task_struct *p) -{ -	return p->gtime; -} -  static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,  			struct pid *pid, struct task_struct *task, int whole)  { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4fb81e9c94e3..7821589a17d5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -330,6 +330,7 @@ retry:  		spin_lock(&proc_inum_lock);  		ida_remove(&proc_inum_ida, i);  		spin_unlock(&proc_inum_lock); +		return 0;  	}  	return PROC_DYNAMIC_FIRST + i;  } @@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp  	for (tmp = dir->subdir; tmp; tmp = tmp->next)  		if (strcmp(tmp->name, dp->name) == 0) { -			printk(KERN_WARNING "proc_dir_entry '%s' already " -					"registered\n", dp->name); +			printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", +				dir->name, dp->name);  			dump_stack();  			break;  		} diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 79ecd281d2cb..3f87d2632947 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)  	}  	seq_printf(m, -		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", +		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",  		   vma->vm_start,  		   vma->vm_end,  		   flags & VM_READ ? 'r' : '-',  		   flags & VM_WRITE ? 'w' : '-',  		   flags & VM_EXEC ? 'x' : '-',  		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 
'S' : 's' : 'p', -		   vma->vm_pgoff << PAGE_SHIFT, +		   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,  		   MAJOR(dev), MINOR(dev), ino, &len);  	if (file) { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ded969862960..29e20c6b1f7f 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -24,6 +24,7 @@  #include <linux/tty.h>  #include <linux/string.h>  #include <linux/mman.h> +#include <linux/quicklist.h>  #include <linux/proc_fs.h>  #include <linux/ioport.h>  #include <linux/mm.h> @@ -182,6 +183,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,  		"SReclaimable: %8lu kB\n"  		"SUnreclaim:   %8lu kB\n"  		"PageTables:   %8lu kB\n" +#ifdef CONFIG_QUICKLIST +		"Quicklists:   %8lu kB\n" +#endif  		"NFS_Unstable: %8lu kB\n"  		"Bounce:       %8lu kB\n"  		"WritebackTmp: %8lu kB\n" @@ -214,6 +218,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,  		K(global_page_state(NR_SLAB_RECLAIMABLE)),  		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),  		K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST +		K(quicklist_total_size()), +#endif  		K(global_page_state(NR_UNSTABLE_NFS)),  		K(global_page_state(NR_BOUNCE)),  		K(global_page_state(NR_WRITEBACK_TEMP)), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7546a918f790..73d1891ee625 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -219,14 +219,14 @@ static int show_map(struct seq_file *m, void *v)  		ino = inode->i_ino;  	} -	seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", +	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",  			vma->vm_start,  			vma->vm_end,  			flags & VM_READ ? 'r' : '-',  			flags & VM_WRITE ? 'w' : '-',  			flags & VM_EXEC ? 'x' : '-',  			flags & VM_MAYSHARE ? 
's' : 'p', -			vma->vm_pgoff << PAGE_SHIFT, +			((loff_t)vma->vm_pgoff) << PAGE_SHIFT,  			MAJOR(dev), MINOR(dev), ino, &len);  	/* diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 52312ec93ff4..5145cb9125af 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {   * size 0 on the assumption that it's going to be used for an mmap of shared   * memory   */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)  {  	struct pagevec lru_pvec;  	unsigned long npages, xpages, loop, limit; diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5407fb..93a7559bbfd8 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset  	if (buf->result)  		return -EINVAL;  	d_ino = ino; -	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) +	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { +		buf->result = -EOVERFLOW;  		return -EOVERFLOW; +	}  	buf->result++;  	dirent = buf->dirent;  	if (!access_ok(VERIFY_WRITE, dirent, @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,  	if (reclen > buf->count)  		return -EINVAL;  	d_ino = ino; -	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) +	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { +		buf->error = -EOVERFLOW;  		return -EOVERFLOW; +	}  	dirent = buf->previous;  	if (dirent) {  		if (__put_user(offset, &dirent->d_off)) diff --git a/fs/seq_file.c b/fs/seq_file.c index 5d54205e486b..bd20f7f5a933 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  			goto Done;  	}  	/* we need at least one record in buffer */ +	pos = m->index; +	p = m->op->start(m, &pos);  	while (1) { -		pos = m->index; -		p = m->op->start(m, 
&pos);  		err = PTR_ERR(p);  		if (!p || IS_ERR(p))  			break; @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  			break;  		if (unlikely(err))  			m->count = 0; +		if (unlikely(!m->count)) { +			p = m->op->next(m, p, &pos); +			m->index = pos; +			continue; +		}  		if (m->count < m->size)  			goto Fill;  		m->op->stop(m, p); @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)  			goto Enomem;  		m->count = 0;  		m->version = 0; +		pos = m->index; +		p = m->op->start(m, &pos);  	}  	m->op->stop(m, p);  	m->count = 0; diff --git a/fs/splice.c b/fs/splice.c index 1bbc6f4bb09c..a1e701c27156 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,  	if (unlikely(!(out->f_mode & FMODE_WRITE)))  		return -EBADF; +	if (unlikely(out->f_flags & O_APPEND)) +		return -EINVAL; +  	ret = rw_verify_area(WRITE, out, ppos, len);  	if (unlikely(ret < 0))  		return ret; diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d81fb9ed2b8e..73db464cd08b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -263,8 +263,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)  	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; -	/* And make sure we have twice the index size of space reserved */ -	idx_size <<= 1; +	/* And make sure we have thrice the index size of space reserved */ +	idx_size = idx_size + (idx_size << 1);  	/*  	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)  	int subtract_lebs;  	long long available; -	/* -	 * Force the amount available to the total size reported if the used -	 * space is zero. 
-	 */ -	if (c->lst.total_used <= UBIFS_INO_NODE_SZ && -	    c->budg_data_growth + c->budg_dd_growth == 0) { -		/* Do the same calculation as for c->block_cnt */ -		available = c->main_lebs - 2; -		available *= c->leb_size - c->dark_wm; -		return available; -	} -  	available = c->main_bytes - c->lst.total_used;  	/* @@ -388,11 +376,11 @@ static int can_use_rp(struct ubifs_info *c)   * This function makes sure UBIFS has enough free eraseblocks for index growth   * and data.   * - * When budgeting index space, UBIFS reserves twice as more LEBs as the index + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index   * would take if it was consolidated and written to the flash. This guarantees   * that the "in-the-gaps" commit method always succeeds and UBIFS will always   * be able to commit dirty index. So this function basically adds amount of - * budgeted index space to the size of the current index, multiplies this by 2, + * budgeted index space to the size of the current index, multiplies this by 3,   * and makes sure this does not exceed the amount of free eraseblocks.   
*   * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: @@ -543,8 +531,16 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)  	int err, idx_growth, data_growth, dd_growth;  	struct retries_info ri; +	ubifs_assert(req->new_page <= 1); +	ubifs_assert(req->dirtied_page <= 1); +	ubifs_assert(req->new_dent <= 1); +	ubifs_assert(req->mod_dent <= 1); +	ubifs_assert(req->new_ino <= 1); +	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);  	ubifs_assert(req->dirtied_ino <= 4);  	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); +	ubifs_assert(!(req->new_ino_d & 7)); +	ubifs_assert(!(req->dirtied_ino_d & 7));  	data_growth = calc_data_growth(c, req);  	dd_growth = calc_dd_growth(c, req); @@ -618,8 +614,16 @@ again:   */  void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)  { +	ubifs_assert(req->new_page <= 1); +	ubifs_assert(req->dirtied_page <= 1); +	ubifs_assert(req->new_dent <= 1); +	ubifs_assert(req->mod_dent <= 1); +	ubifs_assert(req->new_ino <= 1); +	ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);  	ubifs_assert(req->dirtied_ino <= 4);  	ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); +	ubifs_assert(!(req->new_ino_d & 7)); +	ubifs_assert(!(req->dirtied_ino_d & 7));  	if (!req->recalculate) {  		ubifs_assert(req->idx_growth >= 0);  		ubifs_assert(req->data_growth >= 0); @@ -647,7 +651,11 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)  	ubifs_assert(c->budg_idx_growth >= 0);  	ubifs_assert(c->budg_data_growth >= 0); +	ubifs_assert(c->budg_dd_growth >= 0);  	ubifs_assert(c->min_idx_lebs < c->main_lebs); +	ubifs_assert(!(c->budg_idx_growth & 7)); +	ubifs_assert(!(c->budg_data_growth & 7)); +	ubifs_assert(!(c->budg_dd_growth & 7));  	spin_unlock(&c->space_lock);  } @@ -686,41 +694,114 @@ void ubifs_convert_page_budget(struct ubifs_info *c)  void ubifs_release_dirty_inode_budget(struct ubifs_info *c,  				      struct ubifs_inode *ui)  { -	struct 
ubifs_budget_req req = {.dd_growth = c->inode_budget, -				       .dirtied_ino_d = ui->data_len}; +	struct ubifs_budget_req req; +	memset(&req, 0, sizeof(struct ubifs_budget_req)); +	req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);  	ubifs_release_budget(c, &req);  }  /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ +	int divisor, factor, f; + +	/* +	 * Reported space size is @free * X, where X is UBIFS block size +	 * divided by UBIFS block size + all overhead one data block +	 * introduces. The overhead is the node header + indexing overhead. +	 * +	 * Indexing overhead calculations are based on the following formula: +	 * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number +	 * of data nodes, f - fanout. Because effective UBIFS fanout is twice +	 * as less than maximum fanout, we assume that each data node +	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. 
+	 * Note, the multiplier 3 is because UBIFS reseves thrice as more space +	 * for the index. +	 */ +	f = c->fanout > 3 ? c->fanout >> 1 : 2; +	factor = UBIFS_BLOCK_SIZE; +	divisor = UBIFS_MAX_DATA_NODE_SZ; +	divisor += (c->max_idx_node_sz * 3) / (f - 1); +	free *= factor; +	do_div(free, divisor); +	return free; +} + +/** + * ubifs_get_free_space - return amount of free space.   * @c: UBIFS file-system description object   * - * This function returns amount of free space on the file-system. + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * amount of free flash space it has (well, because not all dirty space is + * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectetion about what free space is. Users seem to + * accustomed to assume that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account.   
*/ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space(struct ubifs_info *c)  { -	int min_idx_lebs, rsvd_idx_lebs; +	int min_idx_lebs, rsvd_idx_lebs, lebs;  	long long available, outstanding, free; -	/* Do exactly the same calculations as in 'do_budget_space()' */  	spin_lock(&c->space_lock);  	min_idx_lebs = ubifs_calc_min_idx_lebs(c); +	outstanding = c->budg_data_growth + c->budg_dd_growth; -	if (min_idx_lebs > c->lst.idx_lebs) -		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; -	else -		rsvd_idx_lebs = 0; - -	if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -				- c->lst.taken_empty_lebs) { +	/* +	 * Force the amount available to the total size reported if the used +	 * space is zero. +	 */ +	if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {  		spin_unlock(&c->space_lock); -		return 0; +		return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;  	}  	available = ubifs_calc_available(c, min_idx_lebs); -	outstanding = c->budg_data_growth + c->budg_dd_growth; -	c->min_idx_lebs = min_idx_lebs; + +	/* +	 * When reporting free space to user-space, UBIFS guarantees that it is +	 * possible to write a file of free space size. This means that for +	 * empty LEBs we may use more precise calculations than +	 * 'ubifs_calc_available()' is using. Namely, we know that in empty +	 * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. +	 * Thus, amend the available space. +	 * +	 * Note, the calculations below are similar to what we have in +	 * 'do_budget_space()', so refer there for comments. 
+	 */ +	if (min_idx_lebs > c->lst.idx_lebs) +		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; +	else +		rsvd_idx_lebs = 0; +	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - +	       c->lst.taken_empty_lebs; +	lebs -= rsvd_idx_lebs; +	available += lebs * (c->dark_wm - c->leb_overhead);  	spin_unlock(&c->space_lock);  	if (available > outstanding) diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 3b516316c9b3..0a6aa2cc78f0 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -74,6 +74,7 @@ static int do_commit(struct ubifs_info *c)  			goto out_up;  	} +	c->cmt_no += 1;  	err = ubifs_gc_start_commit(c);  	if (err)  		goto out_up; @@ -115,7 +116,7 @@ static int do_commit(struct ubifs_info *c)  		goto out;  	mutex_lock(&c->mst_mutex); -	c->mst_node->cmt_no      = cpu_to_le64(++c->cmt_no); +	c->mst_node->cmt_no      = cpu_to_le64(c->cmt_no);  	c->mst_node->log_lnum    = cpu_to_le32(new_ltail_lnum);  	c->mst_node->root_lnum   = cpu_to_le32(zroot.lnum);  	c->mst_node->root_offs   = cpu_to_le32(zroot.offs); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4e3aaeba4eca..d7f7645779f2 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)  		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);  		for (i = 0; i < n; i++)  			printk(KERN_DEBUG "\t  ino %llu\n", -			       le64_to_cpu(orph->inos[i])); +			       (unsigned long long)le64_to_cpu(orph->inos[i]));  		break;  	}  	default: @@ -568,8 +568,8 @@ void dbg_dump_budget_req(const struct ubifs_budget_req *req)  void dbg_dump_lstats(const struct ubifs_lp_stats *lst)  {  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "Lprops statistics: empty_lebs %d, idx_lebs  %d\n", -	       lst->empty_lebs, lst->idx_lebs); +	printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " +	       "idx_lebs  %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);  	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "  	       
"total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,  	       lst->total_dirty); @@ -587,8 +587,8 @@ void dbg_dump_budg(struct ubifs_info *c)  	struct ubifs_gced_idx_leb *idx_gc;  	spin_lock(&dbg_lock); -	printk(KERN_DEBUG "Budgeting info: budg_data_growth %lld, " -	       "budg_dd_growth %lld, budg_idx_growth %lld\n", +	printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " +	       "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,  	       c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);  	printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "  	       "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, @@ -634,7 +634,7 @@ void dbg_dump_lprops(struct ubifs_info *c)  	struct ubifs_lprops lp;  	struct ubifs_lp_stats lst; -	printk(KERN_DEBUG "Dumping LEB properties\n"); +	printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);  	ubifs_get_lp_stats(c, &lst);  	dbg_dump_lstats(&lst); @@ -655,7 +655,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)  	if (dbg_failure_mode)  		return; -	printk(KERN_DEBUG "Dumping LEB %d\n", lnum); +	printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);  	sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);  	if (IS_ERR(sleb)) { @@ -720,8 +720,8 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)  {  	int i; -	printk(KERN_DEBUG "Dumping heap cat %d (%d elements)\n", -	       cat, heap->cnt); +	printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", +	       current->pid, cat, heap->cnt);  	for (i = 0; i < heap->cnt; i++) {  		struct ubifs_lprops *lprops = heap->arr[i]; @@ -736,7 +736,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,  {  	int i; -	printk(KERN_DEBUG "Dumping pnode:\n"); +	printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);  	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",  	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);  	
printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", @@ -755,7 +755,7 @@ void dbg_dump_tnc(struct ubifs_info *c)  	int level;  	printk(KERN_DEBUG "\n"); -	printk(KERN_DEBUG "Dumping the TNC tree\n"); +	printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);  	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);  	level = znode->level;  	printk(KERN_DEBUG "== Level %d ==\n", level); @@ -2208,16 +2208,17 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,  int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,  		  int offset, int len, int dtype)  { -	int err; +	int err, failing;  	if (in_failure_mode(desc))  		return -EIO; -	if (do_fail(desc, lnum, 1)) +	failing = do_fail(desc, lnum, 1); +	if (failing)  		cut_data(buf, len);  	err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);  	if (err)  		return err; -	if (in_failure_mode(desc)) +	if (failing)  		return -EIO;  	return 0;  } diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index 3c4f1e93c9e0..50315fc57185 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -27,7 +27,7 @@  #define UBIFS_DBG(op) op -#define ubifs_assert(expr)  do {                                               \ +#define ubifs_assert(expr) do {                                                \  	if (unlikely(!(expr))) {                                               \  		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \  		       __func__, __LINE__, current->pid);                      \ @@ -73,50 +73,50 @@ const char *dbg_key_str1(const struct ubifs_info *c,  			 const union ubifs_key *key);  /* - * DBGKEY macros require dbg_lock to be held, which it is in the dbg message + * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message   * macros.   */  #define DBGKEY(key) dbg_key_str0(c, (key))  #define DBGKEY1(key) dbg_key_str1(c, (key))  /* General messages */ -#define dbg_gen(fmt, ...)        
dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...)   dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)  /* Additional journal messages */ -#define dbg_jnl(fmt, ...)        dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...)   dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)  /* Additional TNC messages */ -#define dbg_tnc(fmt, ...)        dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...)   dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)  /* Additional lprops messages */ -#define dbg_lp(fmt, ...)         dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...)    dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)  /* Additional LEB find messages */ -#define dbg_find(fmt, ...)       dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...)  dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)  /* Additional mount messages */ -#define dbg_mnt(fmt, ...)        dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...)   dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)  /* Additional I/O messages */ -#define dbg_io(fmt, ...)         dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...)    dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)  /* Additional commit messages */ -#define dbg_cmt(fmt, ...)        dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...)   dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)  /* Additional budgeting messages */ -#define dbg_budg(fmt, ...)       dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...)  dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)  /* Additional log messages */ -#define dbg_log(fmt, ...)        dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...)   dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)  /* Additional gc messages */ -#define dbg_gc(fmt, ...)         dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...)    
dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)  /* Additional scan messages */ -#define dbg_scan(fmt, ...)       dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...)  dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)  /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...)      dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)  /*   * Debugging message type flags (must match msg_type_names in debug.c). @@ -239,34 +239,23 @@ typedef int (*dbg_leaf_callback)(struct ubifs_info *c,  				 struct ubifs_zbranch *zbr, void *priv);  typedef int (*dbg_znode_callback)(struct ubifs_info *c,  				  struct ubifs_znode *znode, void *priv); -  int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,  		   dbg_znode_callback znode_cb, void *priv);  /* Checking functions */  int dbg_check_lprops(struct ubifs_info *c); -  int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);  int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); -  int dbg_check_cats(struct ubifs_info *c); -  int dbg_check_ltab(struct ubifs_info *c); -  int dbg_check_synced_i_size(struct inode *inode); -  int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); -  int dbg_check_tnc(struct ubifs_info *c, int extra); -  int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); -  int dbg_check_filesystem(struct ubifs_info *c); -  void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,  		    int add_pos); -  int dbg_check_lprops(struct ubifs_info *c);  int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,  			int row, int col); @@ -329,71 +318,77 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,  #else /* !CONFIG_UBIFS_FS_DEBUG */  #define UBIFS_DBG(op) -#define ubifs_assert(expr)                         ({}) -#define ubifs_assert_cmt_locked(c) + +/* Use "if (0)" to make 
compiler check arguments even if debugging is off */ +#define ubifs_assert(expr)  do {                                               \ +	if (0 && (expr))                                                       \ +		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ +		       __func__, __LINE__, current->pid);                      \ +} while (0) + +#define dbg_err(fmt, ...)   do {                                               \ +	if (0)                                                                 \ +		ubifs_err(fmt, ##__VA_ARGS__);                                 \ +} while (0) + +#define dbg_msg(fmt, ...) do {                                                 \ +	if (0)                                                                 \ +		printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n",         \ +		       current->pid, __func__, ##__VA_ARGS__);                 \ +} while (0) +  #define dbg_dump_stack() -#define dbg_err(fmt, ...)                          ({}) -#define dbg_msg(fmt, ...)                          ({}) -#define dbg_key(c, key, fmt, ...)                  ({}) - -#define dbg_gen(fmt, ...)                          ({}) -#define dbg_jnl(fmt, ...)                          ({}) -#define dbg_tnc(fmt, ...)                          ({}) -#define dbg_lp(fmt, ...)                           ({}) -#define dbg_find(fmt, ...)                         ({}) -#define dbg_mnt(fmt, ...)                          ({}) -#define dbg_io(fmt, ...)                           ({}) -#define dbg_cmt(fmt, ...)                          ({}) -#define dbg_budg(fmt, ...)                         ({}) -#define dbg_log(fmt, ...)                          ({}) -#define dbg_gc(fmt, ...)                           ({}) -#define dbg_scan(fmt, ...)                         ({}) -#define dbg_rcvry(fmt, ...)                        
({}) - -#define dbg_ntype(type)                            "" -#define dbg_cstate(cmt_state)                      "" -#define dbg_get_key_dump(c, key)                   ({}) -#define dbg_dump_inode(c, inode)                   ({}) -#define dbg_dump_node(c, node)                     ({}) -#define dbg_dump_budget_req(req)                   ({}) -#define dbg_dump_lstats(lst)                       ({}) -#define dbg_dump_budg(c)                           ({}) -#define dbg_dump_lprop(c, lp)                      ({}) -#define dbg_dump_lprops(c)                         ({}) -#define dbg_dump_leb(c, lnum)                      ({}) -#define dbg_dump_znode(c, znode)                   ({}) -#define dbg_dump_heap(c, heap, cat)                ({}) -#define dbg_dump_pnode(c, pnode, parent, iip)      ({}) -#define dbg_dump_tnc(c)                            ({}) -#define dbg_dump_index(c)                          ({}) +#define ubifs_assert_cmt_locked(c) -#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 +#define dbg_gen(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...)   dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...)    dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...)  dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) 
dbg_msg(fmt, ##__VA_ARGS__) + +#define DBGKEY(key)  ((char *)(key)) +#define DBGKEY1(key) ((char *)(key)) + +#define dbg_ntype(type)                       "" +#define dbg_cstate(cmt_state)                 "" +#define dbg_get_key_dump(c, key)              ({}) +#define dbg_dump_inode(c, inode)              ({}) +#define dbg_dump_node(c, node)                ({}) +#define dbg_dump_budget_req(req)              ({}) +#define dbg_dump_lstats(lst)                  ({}) +#define dbg_dump_budg(c)                      ({}) +#define dbg_dump_lprop(c, lp)                 ({}) +#define dbg_dump_lprops(c)                    ({}) +#define dbg_dump_leb(c, lnum)                 ({}) +#define dbg_dump_znode(c, znode)              ({}) +#define dbg_dump_heap(c, heap, cat)           ({}) +#define dbg_dump_pnode(c, pnode, parent, iip) ({}) +#define dbg_dump_tnc(c)                       ({}) +#define dbg_dump_index(c)                     ({}) +#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0  #define dbg_old_index_check_init(c, zroot)         0  #define dbg_check_old_index(c, zroot)              0 -  #define dbg_check_cats(c)                          0 -  #define dbg_check_ltab(c)                          0 -  #define dbg_check_synced_i_size(inode)             0 -  #define dbg_check_dir_size(c, dir)                 0 -  #define dbg_check_tnc(c, x)                        0 -  #define dbg_check_idx_size(c, idx_size)            0 -  #define dbg_check_filesystem(c)                    0 -  #define dbg_check_heap(c, heap, cat, add_pos)      ({}) -  #define dbg_check_lprops(c)                        0  #define dbg_check_lpt_nodes(c, cnode, row, col)    0 -  #define dbg_force_in_the_gaps_enabled              0  #define dbg_force_in_the_gaps()                    0 -  #define dbg_failure_mode                           0  #define dbg_failure_mode_registration(c)           ({})  #define dbg_failure_mode_deregistration(c)         ({}) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 
e90374be7d3b..526c01ec8003 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -165,7 +165,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,  	}  	inode->i_ino = ++c->highest_inum; -	inode->i_generation = ++c->vfs_gen;  	/*  	 * The creation sequence number remains with this inode for its  	 * lifetime. All nodes for this inode have a greater sequence number, @@ -220,15 +219,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,  	err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);  	if (err) { -		/* -		 * Do not hash the direntry if parent 'i_nlink' is zero, because -		 * this has side-effects - '->delete_inode()' call will not be -		 * called for the parent orphan inode, because 'd_count' of its -		 * direntry will stay 1 (it'll be negative direntry I guess) -		 * and prevent 'iput_final()' until the dentry is destroyed due -		 * to unmount or memory pressure. -		 */ -		if (err == -ENOENT && dir->i_nlink != 0) { +		if (err == -ENOENT) {  			dbg_gen("not found");  			goto done;  		} @@ -435,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)  	while (1) {  		dbg_gen("feed '%s', ino %llu, new f_pos %#x", -			dent->name, le64_to_cpu(dent->inum), +			dent->name, (unsigned long long)le64_to_cpu(dent->inum),  			key_hash_flash(c, &dent->key));  		ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); @@ -525,7 +516,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,  	struct ubifs_inode *dir_ui = ubifs_inode(dir);  	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);  	struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, -					.dirtied_ino_d = ui->data_len }; +				.dirtied_ino_d = ALIGN(ui->data_len, 8) };  	/*  	 * Budget request settings: new direntry, changing the target inode, @@ -596,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)  	if (err) {  		if (err != -ENOSPC)  			return err; -		err 
= 0;  		budgeted = 0;  	} @@ -727,8 +717,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)  	struct ubifs_inode *dir_ui = ubifs_inode(dir);  	struct ubifs_info *c = dir->i_sb->s_fs_info;  	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); -	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, -					.dirtied_ino_d = 1 }; +	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };  	/*  	 * Budget request settings: new inode, new direntry and changing parent @@ -789,7 +778,8 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,  	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);  	int err, devlen = 0;  	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, -					.new_ino_d = devlen, .dirtied_ino = 1 }; +					.new_ino_d = ALIGN(devlen, 8), +					.dirtied_ino = 1 };  	/*  	 * Budget request settings: new inode, new direntry and changing parent @@ -863,7 +853,8 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,  	int err, len = strlen(symname);  	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);  	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, -					.new_ino_d = len, .dirtied_ino = 1 }; +					.new_ino_d = ALIGN(len, 8), +					.dirtied_ino = 1 };  	/*  	 * Budget request settings: new inode, new direntry and changing parent @@ -1012,7 +1003,7 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,  	struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,  					.dirtied_ino = 3 };  	struct ubifs_budget_req ino_req = { .dirtied_ino = 1, -				.dirtied_ino_d = old_inode_ui->data_len }; +			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };  	struct timespec time;  	/* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8565e586e533..3d698e2022b1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,  	int err;  	struct ubifs_budget_req req;  	loff_t 
old_size = inode->i_size, new_size = attr->ia_size; -	int offset = new_size & (UBIFS_BLOCK_SIZE - 1); +	int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;  	struct ubifs_inode *ui = ubifs_inode(inode);  	dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,  	/* A funny way to budget for truncation node */  	req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;  	err = ubifs_budget_space(c, &req); -	if (err) -		return err; +	if (err) { +		/* +		 * Treat truncations to zero as deletion and always allow them, +		 * just like we do for '->unlink()'. +		 */ +		if (new_size || err != -ENOSPC) +			return err; +		budgeted = 0; +	}  	err = vmtruncate(inode, new_size);  	if (err) @@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,  	err = ubifs_jnl_truncate(c, inode, old_size, new_size);  	mutex_unlock(&ui->ui_mutex);  out_budg: -	ubifs_release_budget(c, &req); +	if (budgeted) +		ubifs_release_budget(c, &req); +	else { +		c->nospace = c->nospace_rp = 0; +		smp_wmb(); +	}  	return err;  } @@ -890,7 +902,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,  	loff_t new_size = attr->ia_size;  	struct ubifs_inode *ui = ubifs_inode(inode);  	struct ubifs_budget_req req = { .dirtied_ino = 1, -					.dirtied_ino_d = ui->data_len }; +				.dirtied_ino_d = ALIGN(ui->data_len, 8) };  	err = ubifs_budget_space(c, &req);  	if (err) @@ -941,7 +953,8 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr)  	struct inode *inode = dentry->d_inode;  	struct ubifs_info *c = inode->i_sb->s_fs_info; -	dbg_gen("ino %lu, ia_valid %#x", inode->i_ino, attr->ia_valid); +	dbg_gen("ino %lu, mode %#x, ia_valid %#x", +		inode->i_ino, inode->i_mode, attr->ia_valid);  	err = inode_change_ok(inode, attr);  	if (err)  		return err; @@ -1051,7 +1064,7 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)  	if 
(mctime_update_needed(inode, &now)) {  		int err, release;  		struct ubifs_budget_req req = { .dirtied_ino = 1, -						.dirtied_ino_d = ui->data_len }; +				.dirtied_ino_d = ALIGN(ui->data_len, 8) };  		err = ubifs_budget_space(c, &req);  		if (err) @@ -1270,6 +1283,7 @@ struct file_operations ubifs_file_operations = {  	.fsync          = ubifs_fsync,  	.unlocked_ioctl = ubifs_ioctl,  	.splice_read	= generic_file_splice_read, +	.splice_write	= generic_file_splice_write,  #ifdef CONFIG_COMPAT  	.compat_ioctl   = ubifs_compat_ioctl,  #endif diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 10394c548367..47814cde2407 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,   * dirty index heap, and it falls-back to LPT scanning if the heaps are empty   * or do not have an LEB which satisfies the @min_space criteria.   * - * Note: - *   o LEBs which have less than dead watermark of dirty space are never picked - *   by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function.   *   * The additional @pick_free argument controls if this function has to return a   * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,   *   * In addition @pick_free is set to %2 by the recovery process in order to   * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. 
The returned LEB is marked as "taken".   */  int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,  			 int min_space, int pick_free) @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,  		int lebs, rsvd_idx_lebs = 0;  		spin_lock(&c->space_lock); -		lebs = c->lst.empty_lebs; +		lebs = c->lst.empty_lebs + c->idx_gc_cnt;  		lebs += c->freeable_cnt - c->lst.taken_empty_lebs;  		/* @@ -290,9 +288,14 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,  		idx_lp = idx_heap->arr[0];  		sum = idx_lp->free + idx_lp->dirty;  		/* -		 * Since we reserve twice as more space for the index than it +		 * Since we reserve thrice as much space for the index than it  		 * actually takes, it does not make sense to pick indexing LEBs -		 * with less than half LEB of dirty space. +		 * with less than, say, half LEB of dirty space. May be half is +		 * not the optimal boundary - this should be tested and +		 * checked. This boundary should determine how much we use +		 * in-the-gaps to consolidate the index comparing to how much +		 * we use garbage collector to consolidate it. The "half" +		 * criteria just feels to be fine.  		 
*/  		if (sum < min_space || sum < c->half_leb_size)  			idx_lp = NULL; @@ -312,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,  		lp = idx_lp;  	if (lp) { -		ubifs_assert(lp->dirty >= c->dead_wm); +		ubifs_assert(lp->free + lp->dirty >= c->dead_wm);  		goto found;  	} @@ -504,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,  		rsvd_idx_lebs = 0;  	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -  	       c->lst.taken_empty_lebs; -	ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs);  	if (rsvd_idx_lebs < lebs)  		/*  		 * OK to allocate an empty LEB, but we still don't want to go diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac29081..02aba36fe3d4 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)  		err = move_nodes(c, sleb);  		if (err) -			goto out; +			goto out_inc_seq;  		err = gc_sync_wbufs(c);  		if (err) -			goto out; +			goto out_inc_seq;  		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);  		if (err) -			goto out; +			goto out_inc_seq; + +		/* Allow for races with TNC */ +		c->gced_lnum = lnum; +		smp_wmb(); +		c->gc_seq += 1; +		smp_wmb();  		if (c->gc_lnum == -1) {  			c->gc_lnum = lnum; @@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)  out:  	ubifs_scan_destroy(sleb);  	return err; + +out_inc_seq: +	/* We may have moved at least some nodes so allow for races with TNC */ +	c->gced_lnum = lnum; +	smp_wmb(); +	c->gc_seq += 1; +	smp_wmb(); +	goto out;  }  /** diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3374f91b6709..054363f2b207 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -54,6 +54,20 @@  #include "ubifs.h"  /** + * ubifs_ro_mode - switch UBIFS to read read-only mode. 
+ * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ +	if (!c->ro_media) { +		c->ro_media = 1; +		ubifs_warn("switched to read-only mode, error %d", err); +		dbg_dump_stack(); +	} +} + +/**   * ubifs_check_node - check node.   * @c: UBIFS file-system description object   * @buf: node to check diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 283155abe5f5..22993f867d19 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -447,13 +447,11 @@ static int get_dent_type(int mode)   * @ino: buffer in which to pack inode node   * @inode: inode to pack   * @last: indicates the last node of the group - * @last_reference: non-zero if this is a deletion inode   */  static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, -		       const struct inode *inode, int last, -		       int last_reference) +		       const struct inode *inode, int last)  { -	int data_len = 0; +	int data_len = 0, last_reference = !inode->i_nlink;  	struct ubifs_inode *ui = ubifs_inode(inode);  	ino->ch.node_type = UBIFS_INO_NODE; @@ -596,9 +594,9 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,  	ubifs_prep_grp_node(c, dent, dlen, 0);  	ino = (void *)dent + aligned_dlen; -	pack_inode(c, ino, inode, 0, last_reference); +	pack_inode(c, ino, inode, 0);  	ino = (void *)ino + aligned_ilen; -	pack_inode(c, ino, dir, 1, 0); +	pack_inode(c, ino, dir, 1);  	if (last_reference) {  		err = ubifs_add_orphan(c, inode->i_ino); @@ -606,6 +604,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,  			release_head(c, BASEHD);  			goto out_finish;  		} +		ui->del_cmtno = c->cmt_no;  	}  	err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); @@ -750,30 +749,25 @@ out_free:   * ubifs_jnl_write_inode - flush inode to the journal.   
* @c: UBIFS file-system description object   * @inode: inode to flush - * @deletion: inode has been deleted   *   * This function writes inode @inode to the journal. If the inode is   * synchronous, it also synchronizes the write-buffer. Returns zero in case of   * success and a negative error code in case of failure.   */ -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, -			  int deletion) +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)  { -	int err, len, lnum, offs, sync = 0; +	int err, lnum, offs;  	struct ubifs_ino_node *ino;  	struct ubifs_inode *ui = ubifs_inode(inode); +	int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; -	dbg_jnl("ino %lu%s", inode->i_ino, -		deletion ? " (last reference)" : ""); -	if (deletion) -		ubifs_assert(inode->i_nlink == 0); +	dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); -	len = UBIFS_INO_NODE_SZ;  	/*  	 * If the inode is being deleted, do not write the attached data. No  	 * need to synchronize the write-buffer either.  	 */ -	if (!deletion) { +	if (!last_reference) {  		len += ui->data_len;  		sync = IS_SYNC(inode);  	} @@ -786,7 +780,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,  	if (err)  		goto out_free; -	pack_inode(c, ino, inode, 1, deletion); +	pack_inode(c, ino, inode, 1);  	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);  	if (err)  		goto out_release; @@ -795,7 +789,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode,  					  inode->i_ino);  	release_head(c, BASEHD); -	if (deletion) { +	if (last_reference) {  		err = ubifs_tnc_remove_ino(c, inode->i_ino);  		if (err)  			goto out_ro; @@ -828,6 +822,65 @@ out_free:  }  /** + * ubifs_jnl_delete_inode - delete an inode. 
+ * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. + * + * When regular file inodes are unlinked or a directory inode is removed, the + * 'ubifs_jnl_update()' function writes a corresponding deletion inode and + * direntry to the media, and adds the inode to orphans. After this, when the + * last reference to this inode has been dropped, this function is called. In + * general, it has to write one more deletion inode to the media, because if + * a commit happened between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal + * anymore, and in fact it might not be on the flash anymore, because it might + * have been garbage-collected already. And for optimization reasons UBIFS does + * not read the orphan area if it has been unmounted cleanly, so it would have + * no indication in the journal that there is a deleted inode which has to be + * removed from TNC. + * + * However, if there was no commit between 'ubifs_jnl_update()' and + * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion + * inode to the media for the second time. And this is quite a typical case. + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) +{ +	int err; +	struct ubifs_inode *ui = ubifs_inode(inode); + +	ubifs_assert(inode->i_nlink == 0); + +	if (ui->del_cmtno != c->cmt_no) +		/* A commit happened for sure */ +		return ubifs_jnl_write_inode(c, inode); + +	down_read(&c->commit_sem); +	/* +	 * Check commit number again, because the first test has been done +	 * without @c->commit_sem, so a commit might have happened. 
+	 */ +	if (ui->del_cmtno != c->cmt_no) { +		up_read(&c->commit_sem); +		return ubifs_jnl_write_inode(c, inode); +	} + +	err = ubifs_tnc_remove_ino(c, inode->i_ino); +	if (err) +		ubifs_ro_mode(c, err); +	else +		ubifs_delete_orphan(c, inode->i_ino); +	up_read(&c->commit_sem); +	return err; +} + +/**   * ubifs_jnl_rename - rename a directory entry.   * @c: UBIFS file-system description object   * @old_dir: parent inode of directory entry to rename @@ -917,16 +970,16 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,  	p = (void *)dent2 + aligned_dlen2;  	if (new_inode) { -		pack_inode(c, p, new_inode, 0, last_reference); +		pack_inode(c, p, new_inode, 0);  		p += ALIGN(ilen, 8);  	}  	if (!move) -		pack_inode(c, p, old_dir, 1, 0); +		pack_inode(c, p, old_dir, 1);  	else { -		pack_inode(c, p, old_dir, 0, 0); +		pack_inode(c, p, old_dir, 0);  		p += ALIGN(plen, 8); -		pack_inode(c, p, new_dir, 1, 0); +		pack_inode(c, p, new_dir, 1);  	}  	if (last_reference) { @@ -935,6 +988,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,  			release_head(c, BASEHD);  			goto out_finish;  		} +		new_ui->del_cmtno = c->cmt_no;  	}  	err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync); @@ -1131,7 +1185,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,  	if (err)  		goto out_free; -	pack_inode(c, ino, inode, 0, 0); +	pack_inode(c, ino, inode, 0);  	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 
0 : 1);  	if (dlen)  		ubifs_prep_grp_node(c, dn, dlen, 1); @@ -1251,9 +1305,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,  	ubifs_prep_grp_node(c, xent, xlen, 0);  	ino = (void *)xent + aligned_xlen; -	pack_inode(c, ino, inode, 0, 1); +	pack_inode(c, ino, inode, 0);  	ino = (void *)ino + UBIFS_INO_NODE_SZ; -	pack_inode(c, ino, host, 1, 0); +	pack_inode(c, ino, host, 1);  	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);  	if (!sync && !err) @@ -1320,7 +1374,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,  			   const struct inode *host)  {  	int err, len1, len2, aligned_len, aligned_len1, lnum, offs; -	struct ubifs_inode *host_ui = ubifs_inode(inode); +	struct ubifs_inode *host_ui = ubifs_inode(host);  	struct ubifs_ino_node *ino;  	union ubifs_key key;  	int sync = IS_DIRSYNC(host); @@ -1344,8 +1398,8 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,  	if (err)  		goto out_free; -	pack_inode(c, ino, host, 0, 0); -	pack_inode(c, (void *)ino + aligned_len1, inode, 1, 0); +	pack_inode(c, ino, host, 0); +	pack_inode(c, (void *)ino + aligned_len1, inode, 1);  	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);  	if (!sync && !err) { diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 36857b9ed59e..3e0aa7367556 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -317,6 +317,8 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)  	return 0;  out_unlock: +	if (err != -EAGAIN) +		ubifs_ro_mode(c, err);  	mutex_unlock(&c->log_mutex);  	kfree(ref);  	kfree(bud); @@ -410,7 +412,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)  		return -ENOMEM;  	cs->ch.node_type = UBIFS_CS_NODE; -	cs->cmt_no = cpu_to_le64(c->cmt_no + 1); +	cs->cmt_no = cpu_to_le64(c->cmt_no);  	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);  	/* diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 4beccfc256d2..4c12a9215d7f 100644 --- 
a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -80,20 +80,6 @@ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)  }  /** - * ubifs_ro_mode - switch UBIFS to read read-only mode. - * @c: UBIFS file-system description object - * @err: error code which is the reason of switching to R/O mode - */ -static inline void ubifs_ro_mode(struct ubifs_info *c, int err) -{ -	if (!c->ro_media) { -		c->ro_media = 1; -		ubifs_warn("switched to read-only mode, error %d", err); -		dbg_dump_stack(); -	} -} - -/**   * ubifs_compr_present - check if compressor was compiled in.   * @compr_type: compressor type to check   * @@ -298,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c,  }  /** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). - * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. 
- */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, -					     uint64_t free) -{ -	int divisor, factor; - -	divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz << 1); -	factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; -	do_div(free, divisor); - -	return free * factor; -} - -/**   * ubifs_current_time - round current time to time granularity.   * @inode: inode   */ @@ -339,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode)  		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;  } +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, +				   const union ubifs_key *key, void *node) +{ +	return ubifs_tnc_locate(c, key, node, NULL, NULL); +} +  #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 3afeb9242c6a..02d3462f4d3e 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -310,10 +310,10 @@ static int write_orph_node(struct ubifs_info *c, int atomic)  	c->cmt_orphans -= cnt;  	spin_unlock(&c->orphan_lock);  	if (c->cmt_orphans) -		orph->cmt_no = cpu_to_le64(c->cmt_no + 1); +		orph->cmt_no = cpu_to_le64(c->cmt_no);  	else  		/* Mark the last node of the commit */ -		orph->cmt_no = cpu_to_le64((c->cmt_no + 1) | (1ULL << 63)); +		orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));  	ubifs_assert(c->ohead_offs + len <= c->leb_size);  	ubifs_assert(c->ohead_lnum >= c->orph_first);  	ubifs_assert(c->ohead_lnum <= c->orph_last); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ca1e2d4e03cc..3f4902060c7a 100644 --- 
a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -30,7 +30,6 @@  #include <linux/slab.h>  #include <linux/module.h>  #include <linux/ctype.h> -#include <linux/random.h>  #include <linux/kthread.h>  #include <linux/parser.h>  #include <linux/seq_file.h> @@ -149,7 +148,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)  	if (err)  		goto out_invalid; -	/* Disable readahead */ +	/* Disable read-ahead */  	inode->i_mapping->backing_dev_info = &c->bdi;  	switch (inode->i_mode & S_IFMT) { @@ -278,7 +277,7 @@ static void ubifs_destroy_inode(struct inode *inode)   */  static int ubifs_write_inode(struct inode *inode, int wait)  { -	int err; +	int err = 0;  	struct ubifs_info *c = inode->i_sb->s_fs_info;  	struct ubifs_inode *ui = ubifs_inode(inode); @@ -299,10 +298,18 @@ static int ubifs_write_inode(struct inode *inode, int wait)  		return 0;  	} -	dbg_gen("inode %lu", inode->i_ino); -	err = ubifs_jnl_write_inode(c, inode, 0); -	if (err) -		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); +	/* +	 * As an optimization, do not write orphan inodes to the media just +	 * because this is not needed. +	 */ +	dbg_gen("inode %lu, mode %#x, nlink %u", +		inode->i_ino, (int)inode->i_mode, inode->i_nlink); +	if (inode->i_nlink) { +		err = ubifs_jnl_write_inode(c, inode); +		if (err) +			ubifs_err("can't write inode %lu, error %d", +				  inode->i_ino, err); +	}  	ui->dirty = 0;  	mutex_unlock(&ui->ui_mutex); @@ -314,8 +321,9 @@ static void ubifs_delete_inode(struct inode *inode)  {  	int err;  	struct ubifs_info *c = inode->i_sb->s_fs_info; +	struct ubifs_inode *ui = ubifs_inode(inode); -	if (ubifs_inode(inode)->xattr) +	if (ui->xattr)  		/*  		 * Extended attribute inode deletions are fully handled in  		 * 'ubifs_removexattr()'. 
These inodes are special and have @@ -323,7 +331,7 @@ static void ubifs_delete_inode(struct inode *inode)  		 */  		goto out; -	dbg_gen("inode %lu", inode->i_ino); +	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);  	ubifs_assert(!atomic_read(&inode->i_count));  	ubifs_assert(inode->i_nlink == 0); @@ -331,15 +339,19 @@ static void ubifs_delete_inode(struct inode *inode)  	if (is_bad_inode(inode))  		goto out; -	ubifs_inode(inode)->ui_size = inode->i_size = 0; -	err = ubifs_jnl_write_inode(c, inode, 1); +	ui->ui_size = inode->i_size = 0; +	err = ubifs_jnl_delete_inode(c, inode);  	if (err)  		/*  		 * Worst case we have a lost orphan inode wasting space, so a -		 * simple error message is ok here. +		 * simple error message is OK here.  		 */ -		ubifs_err("can't write inode %lu, error %d", inode->i_ino, err); +		ubifs_err("can't delete inode %lu, error %d", +			  inode->i_ino, err); +  out: +	if (ui->dirty) +		ubifs_release_dirty_inode_budget(c, ui);  	clear_inode(inode);  } @@ -358,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)  {  	struct ubifs_info *c = dentry->d_sb->s_fs_info;  	unsigned long long free; +	__le32 *uuid = (__le32 *)c->uuid; -	free = ubifs_budg_get_free_space(c); +	free = ubifs_get_free_space(c);  	dbg_gen("free space %lld bytes (%lld blocks)",  		free, free >> UBIFS_BLOCK_SHIFT); @@ -374,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)  	buf->f_files = 0;  	buf->f_ffree = 0;  	buf->f_namelen = UBIFS_MAX_NLEN; - +	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); +	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);  	return 0;  } @@ -518,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c)  	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);  	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); +	/* +	 * Calculate how many bytes would be wasted at the end of LEB if it was +	 * fully filled with data nodes of maximum size. 
This is used in +	 * calculations when reporting free space. +	 */ +	c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;  	return 0;  } @@ -635,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c)  	 * internally because it does not make much sense for UBIFS, but it is  	 * necessary to report something for the 'statfs()' call.  	 * -	 * Subtract the LEB reserved for GC and the LEB which is reserved for -	 * deletions. -	 * -	 * Review 'ubifs_calc_available()' if changing this calculation. +	 * Subtract the LEB reserved for GC, the LEB which is reserved for +	 * deletions, and assume only one journal head is available.  	 */ -	tmp64 = c->main_lebs - 2; -	tmp64 *= (uint64_t)c->leb_size - c->dark_wm; +	tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; +	tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;  	tmp64 = ubifs_reported_space(c, tmp64);  	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; @@ -1006,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c)  		goto out_dereg;  	} +	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);  	if (!mounted_read_only) {  		err = alloc_wbufs(c);  		if (err)  			goto out_cbuf;  		/* Create background thread */ -		sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, -			c->vi.vol_id);  		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);  		if (!c->bgt)  			c->bgt = ERR_PTR(-EINVAL); @@ -1122,8 +1139,8 @@ static int mount_ubifs(struct ubifs_info *c)  	if (err)  		goto out_infos; -	ubifs_msg("mounted UBI device %d, volume %d", c->vi.ubi_num, -		  c->vi.vol_id); +	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", +		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);  	if (mounted_read_only)  		ubifs_msg("mounted read-only");  	x = (long long)c->main_lebs * c->leb_size; @@ -1469,6 +1486,7 @@ static void ubifs_put_super(struct super_block *sb)  	 */  	ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);  	ubifs_assert(c->budg_idx_growth == 0); +	ubifs_assert(c->budg_dd_growth == 0);  	
ubifs_assert(c->budg_data_growth == 0);  	/* @@ -1657,7 +1675,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)  	INIT_LIST_HEAD(&c->orph_new);  	c->highest_inum = UBIFS_FIRST_INO; -	get_random_bytes(&c->vfs_gen, sizeof(int));  	c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;  	ubi_get_volume_info(ubi, &c->vi); @@ -1671,10 +1688,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)  	}  	/* -	 * UBIFS provids 'backing_dev_info' in order to disable readahead. For +	 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For  	 * UBIFS, I/O is not deferred, it is done immediately in readpage,  	 * which means the user would have to wait not just for their own I/O -	 * but the readahead I/O as well i.e. completely pointless. +	 * but the read-ahead I/O as well i.e. completely pointless.  	 *  	 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.  	 */ diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a96443..7634c5970887 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,  		if (keys_cmp(c, key, &node_key) != 0)  			ret = 0;  	} -	if (ret == 0) +	if (ret == 0 && c->replaying)  		dbg_mnt("dangling branch LEB %d:%d len %d, key %s",  			zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));  	return ret; @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,  }  /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected.   * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number   * - * This function look up and reads node with key @key. The caller has to make - * sure the @node buffer is large enough to fit the node. 
Returns zero in case - * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned.   */ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, -		     void *node) +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)  { -	int found, n, err; -	struct ubifs_znode *znode; -	struct ubifs_zbranch zbr, *zt; +	int gc_seq2, gced_lnum; -	mutex_lock(&c->tnc_mutex); -	found = ubifs_lookup_level0(c, key, &znode, &n); -	if (!found) { -		err = -ENOENT; -		goto out; -	} else if (found < 0) { -		err = found; -		goto out; -	} -	zt = &znode->zbranch[n]; -	if (is_hash_key(c, key)) { -		/* -		 * In this case the leaf node cache gets used, so we pass the -		 * address of the zbranch and keep the mutex locked -		 */ -		err = tnc_read_node_nm(c, zt, node); -		goto out; -	} -	zbr = znode->zbranch[n]; -	mutex_unlock(&c->tnc_mutex); - -	err = ubifs_tnc_read_node(c, &zbr, node); -	return err; - -out: -	mutex_unlock(&c->tnc_mutex); -	return err; +	gced_lnum = c->gced_lnum; +	smp_rmb(); +	gc_seq2 = c->gc_seq; +	/* Same seq means no GC */ +	if (gc_seq1 == gc_seq2) +		return 0; +	/* Different by more than 1 means we don't know */ +	if (gc_seq1 + 1 != gc_seq2) +		return 1; +	/* +	 * We have seen the sequence number has increased by 1. Now we need to +	 * be sure we read the right LEB number, so read it again. +	 */ +	smp_rmb(); +	if (gced_lnum != c->gced_lnum) +		return 1; +	/* Finally we can check lnum */ +	if (gced_lnum == lnum) +		return 1; +	return 0;  }  /** @@ -1436,16 +1425,19 @@ out:   * @lnum: LEB number is returned here   * @offs: offset is returned here   * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * This function look up and reads node with key @key. 
The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs.   */  int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,  		     void *node, int *lnum, int *offs)  { -	int found, n, err; +	int found, n, err, safely = 0, gc_seq1;  	struct ubifs_znode *znode;  	struct ubifs_zbranch zbr, *zt; +again:  	mutex_lock(&c->tnc_mutex);  	found = ubifs_lookup_level0(c, key, &znode, &n);  	if (!found) { @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,  		goto out;  	}  	zt = &znode->zbranch[n]; +	if (lnum) { +		*lnum = zt->lnum; +		*offs = zt->offs; +	}  	if (is_hash_key(c, key)) {  		/*  		 * In this case the leaf node cache gets used, so we pass the  		 * address of the zbranch and keep the mutex locked  		 */ -		*lnum = zt->lnum; -		*offs = zt->offs;  		err = tnc_read_node_nm(c, zt, node);  		goto out;  	} +	if (safely) { +		err = ubifs_tnc_read_node(c, zt, node); +		goto out; +	} +	/* Drop the TNC mutex prematurely and race with garbage collection */  	zbr = znode->zbranch[n]; +	gc_seq1 = c->gc_seq;  	mutex_unlock(&c->tnc_mutex); -	*lnum = zbr.lnum; -	*offs = zbr.offs; +	if (ubifs_get_wbuf(c, zbr.lnum)) { +		/* We do not GC journal heads */ +		err = ubifs_tnc_read_node(c, &zbr, node); +		return err; +	} -	err = ubifs_tnc_read_node(c, &zbr, node); -	return err; +	err = fallible_read_node(c, key, &zbr, node); +	if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { +		/* +		 * The node may have been GC'ed out from under us so try again +		 * while keeping the TNC mutex locked. 
+		 */ +		safely = 1; +		goto again; +	} +	return 0;  out:  	mutex_unlock(&c->tnc_mutex); @@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,  {  	int found, n, err;  	struct ubifs_znode *znode; -	struct ubifs_zbranch zbr;  	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));  	mutex_lock(&c->tnc_mutex); @@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,  		goto out_unlock;  	} -	zbr = znode->zbranch[n]; -	mutex_unlock(&c->tnc_mutex); - -	err = tnc_read_node_nm(c, &zbr, node); -	return err; +	err = tnc_read_node_nm(c, &znode->zbranch[n], node);  out_unlock:  	mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 8117e65ba2e9..8ac76b1c2d55 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -372,26 +372,25 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)  		written = layout_leb_in_gaps(c, p);  		if (written < 0) {  			err = written; -			if (err == -ENOSPC) { -				if (!dbg_force_in_the_gaps_enabled) { -					/* -					 * Do not print scary warnings if the -					 * debugging option which forces -					 * in-the-gaps is enabled. -					 */ -					ubifs_err("out of space"); -					spin_lock(&c->space_lock); -					dbg_dump_budg(c); -					spin_unlock(&c->space_lock); -					dbg_dump_lprops(c); -				} -				/* Try to commit anyway */ -				err = 0; -				break; +			if (err != -ENOSPC) { +				kfree(c->gap_lebs); +				c->gap_lebs = NULL; +				return err;  			} -			kfree(c->gap_lebs); -			c->gap_lebs = NULL; -			return err; +			if (!dbg_force_in_the_gaps_enabled) { +				/* +				 * Do not print scary warnings if the debugging +				 * option which forces in-the-gaps is enabled. 
+				 */ +				ubifs_err("out of space"); +				spin_lock(&c->space_lock); +				dbg_dump_budg(c); +				spin_unlock(&c->space_lock); +				dbg_dump_lprops(c); +			} +			/* Try to commit anyway */ +			err = 0; +			break;  		}  		p++;  		cnt -= written; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 0cc7da9bed47..a9ecbd9af20d 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -87,7 +87,7 @@  #define UBIFS_SK_LEN 8  /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3  /* Maximum number of levels in UBIFS indexing B-tree */  #define UBIFS_MAX_LEVELS 512 @@ -228,10 +228,10 @@ enum {  /* Minimum number of orphan area logical eraseblocks */  #define UBIFS_MIN_ORPH_LEBS 1  /* - * Minimum number of main area logical eraseblocks (buds, 2 for the index, 1 + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1   * for GC, 1 for deletions, and at least 1 for committed data).   */ -#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 5) +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)  /* Minimum number of logical eraseblocks */  #define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index e4f89f271827..17c620b93eec 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -20,8 +20,6 @@   *          Adrian Hunter   */ -/* Implementation version 0.7 */ -  #ifndef __UBIFS_H__  #define __UBIFS_H__ @@ -322,6 +320,8 @@ struct ubifs_gced_idx_leb {   * struct ubifs_inode - UBIFS in-memory inode description.   
* @vfs_inode: VFS inode description object   * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + *             protected by @c->commit_sem;   * @xattr_size: summarized size of all extended attributes in bytes   * @xattr_cnt: count of extended attributes this inode has   * @xattr_names: sum of lengths of all extended attribute names belonging to @@ -373,6 +373,7 @@ struct ubifs_gced_idx_leb {  struct ubifs_inode {  	struct inode vfs_inode;  	unsigned long long creat_sqnum; +	unsigned long long del_cmtno;  	unsigned int xattr_size;  	unsigned int xattr_cnt;  	unsigned int xattr_names; @@ -779,7 +780,7 @@ struct ubifs_compressor {  /**   * struct ubifs_budget_req - budget requirements of an operation.   * - * @fast: non-zero if the budgeting should try to aquire budget quickly and + * @fast: non-zero if the budgeting should try to acquire budget quickly and   *        should not try to call write-back   * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields   *               have to be re-calculated @@ -805,21 +806,31 @@ struct ubifs_compressor {   * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d   * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made   * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned.   
*/  struct ubifs_budget_req {  	unsigned int fast:1;  	unsigned int recalculate:1; +#ifndef UBIFS_DEBUG  	unsigned int new_page:1;  	unsigned int dirtied_page:1;  	unsigned int new_dent:1;  	unsigned int mod_dent:1;  	unsigned int new_ino:1;  	unsigned int new_ino_d:13; -#ifndef UBIFS_DEBUG  	unsigned int dirtied_ino:4;  	unsigned int dirtied_ino_d:15;  #else  	/* Not bit-fields to check for overflows */ +	unsigned int new_page; +	unsigned int dirtied_page; +	unsigned int new_dent; +	unsigned int mod_dent; +	unsigned int new_ino; +	unsigned int new_ino_d;  	unsigned int dirtied_ino;  	unsigned int dirtied_ino_d;  #endif @@ -860,13 +871,13 @@ struct ubifs_mount_opts {   * struct ubifs_info - UBIFS file-system description data structure   * (per-superblock).   * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable readahead + * @bdi: backing device info object to make VFS happy and disable read-ahead   *   * @highest_inum: highest used inode number - * @vfs_gen: VFS inode generation counter   * @max_sqnum: current global sequence number - * @cmt_no: commit number (last successfully completed commit) - * @cnt_lock: protects @highest_inum, @vfs_gen, and @max_sqnum counters + * @cmt_no: commit number of the last successfully completed commit, protected + *          by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters   * @fmt_version: UBIFS on-flash format version   * @uuid: UUID from super block   * @@ -984,6 +995,9 @@ struct ubifs_mount_opts {   * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary   * @max_inode_sz: maximum possible inode size in bytes   * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + *                data nodes of maximum size - used in free space reporting   * @dead_wm: LEB dead space watermark   * @dark_wm: LEB dark space watermark   * @block_cnt: count of 4KiB blocks on the FS @@ 
-1017,6 +1031,8 @@ struct ubifs_mount_opts {   * @sbuf: a buffer of LEB size used by GC and replay for scanning   * @idx_gc: list of index LEBs that have been garbage collected   * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected   *   * @infos_list: links all 'ubifs_info' objects   * @umount_mutex: serializes shrinker and un-mount @@ -1103,7 +1119,6 @@ struct ubifs_info {  	struct backing_dev_info bdi;  	ino_t highest_inum; -	unsigned int vfs_gen;  	unsigned long long max_sqnum;  	unsigned long long cmt_no;  	spinlock_t cnt_lock; @@ -1214,6 +1229,8 @@ struct ubifs_info {  	int max_idx_node_sz;  	long long max_inode_sz;  	int max_znode_sz; + +	int leb_overhead;  	int dead_wm;  	int dark_wm;  	int block_cnt; @@ -1247,6 +1264,8 @@ struct ubifs_info {  	void *sbuf;  	struct list_head idx_gc;  	int idx_gc_cnt; +	volatile int gc_seq; +	volatile int gced_lnum;  	struct list_head infos_list;  	struct mutex umount_mutex; @@ -1346,6 +1365,7 @@ extern struct backing_dev_info ubifs_backing_dev_info;  extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];  /* io.c */ +void ubifs_ro_mode(struct ubifs_info *c, int err);  int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);  int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,  			   int dtype); @@ -1399,8 +1419,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,  		     int deletion, int xent);  int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,  			 const union ubifs_key *key, const void *buf, int len); -int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode, -			  int last_reference); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);  int ubifs_jnl_rename(struct ubifs_info 
*c, const struct inode *old_dir,  		     const struct dentry *old_dentry,  		     const struct inode *new_dir, @@ -1423,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,  				struct ubifs_budget_req *req);  void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,  			 struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c);  int ubifs_calc_min_idx_lebs(struct ubifs_info *c);  void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);  long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);  /* find.c */ @@ -1440,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);  /* tnc.c */  int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,  			struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, -		     void *node);  int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,  			void *node, const struct qstr *nm);  int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 1388a078e1a9..649bec78b645 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -61,7 +61,7 @@  /*   * Limit the number of extended attributes per inode so that the total size - * (xattr_size) is guaranteeded to fit in an 'unsigned int'. + * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.   
*/  #define MAX_XATTRS_PER_INODE 65535 @@ -103,14 +103,14 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,  	struct inode *inode;  	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);  	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, -					.new_ino_d = size, .dirtied_ino = 1, -					.dirtied_ino_d = host_ui->data_len}; +				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, +				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };  	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)  		return -ENOSPC;  	/*  	 * Linux limits the maximum size of the extended attribute names list -	 * to %XATTR_LIST_MAX. This means we should not allow creating more* +	 * to %XATTR_LIST_MAX. This means we should not allow creating more  	 * extended attributes if the name list becomes larger. This limitation  	 * is artificial for UBIFS, though.  	 */ @@ -128,7 +128,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,  		goto out_budg;  	} -	mutex_lock(&host_ui->ui_mutex);  	/* Re-define all operations to be "nothing" */  	inode->i_mapping->a_ops = &none_address_operations;  	inode->i_op = &none_inode_operations; @@ -141,23 +140,19 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,  	ui->data = kmalloc(size, GFP_NOFS);  	if (!ui->data) {  		err = -ENOMEM; -		goto out_unlock; +		goto out_free;  	} -  	memcpy(ui->data, value, size); +	inode->i_size = ui->ui_size = size; +	ui->data_len = size; + +	mutex_lock(&host_ui->ui_mutex);  	host->i_ctime = ubifs_current_time(host);  	host_ui->xattr_cnt += 1;  	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);  	host_ui->xattr_size += CALC_XATTR_BYTES(size);  	host_ui->xattr_names += nm->len; -	/* -	 * We do not use i_size_write() because nobody can race with us as we -	 * are holding host @host->i_mutex - every xattr operation for this -	 * inode is serialized by it. 
-	 */ -	inode->i_size = ui->ui_size = size; -	ui->data_len = size;  	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);  	if (err)  		goto out_cancel; @@ -172,8 +167,8 @@ out_cancel:  	host_ui->xattr_cnt -= 1;  	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);  	host_ui->xattr_size -= CALC_XATTR_BYTES(size); -out_unlock:  	mutex_unlock(&host_ui->ui_mutex); +out_free:  	make_bad_inode(inode);  	iput(inode);  out_budg: @@ -200,29 +195,28 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,  	struct ubifs_inode *host_ui = ubifs_inode(host);  	struct ubifs_inode *ui = ubifs_inode(inode);  	struct ubifs_budget_req req = { .dirtied_ino = 2, -				.dirtied_ino_d = size + host_ui->data_len }; +		.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };  	ubifs_assert(ui->data_len == inode->i_size);  	err = ubifs_budget_space(c, &req);  	if (err)  		return err; -	mutex_lock(&host_ui->ui_mutex); -	host->i_ctime = ubifs_current_time(host); -	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); -	host_ui->xattr_size += CALC_XATTR_BYTES(size); -  	kfree(ui->data);  	ui->data = kmalloc(size, GFP_NOFS);  	if (!ui->data) {  		err = -ENOMEM; -		goto out_unlock; +		goto out_free;  	} -  	memcpy(ui->data, value, size);  	inode->i_size = ui->ui_size = size;  	ui->data_len = size; +	mutex_lock(&host_ui->ui_mutex); +	host->i_ctime = ubifs_current_time(host); +	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); +	host_ui->xattr_size += CALC_XATTR_BYTES(size); +  	/*  	 * It is important to write the host inode after the xattr inode  	 * because if the host inode gets synchronized (via 'fsync()'), then @@ -240,9 +234,9 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,  out_cancel:  	host_ui->xattr_size -= CALC_XATTR_BYTES(size);  	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len); -	make_bad_inode(inode); -out_unlock:  	mutex_unlock(&host_ui->ui_mutex); +	make_bad_inode(inode); +out_free:  	ubifs_release_budget(c, &req);  	return err;  } @@ 
-312,6 +306,7 @@ int ubifs_setxattr(struct dentry *dentry, const char *name,  	dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,  		host->i_ino, dentry->d_name.len, dentry->d_name.name, size); +	ubifs_assert(mutex_is_locked(&host->i_mutex));  	if (size > UBIFS_MAX_INO_DATA)  		return -ERANGE; @@ -384,7 +379,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,  	if (!xent)  		return -ENOMEM; -	mutex_lock(&host->i_mutex);  	xent_key_init(c, &key, host->i_ino, &nm);  	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);  	if (err) { @@ -419,7 +413,6 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,  out_iput:  	iput(inode);  out_unlock: -	mutex_unlock(&host->i_mutex);  	kfree(xent);  	return err;  } @@ -449,8 +442,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)  		return -ERANGE;  	lowest_xent_key(c, &key, host->i_ino); - -	mutex_lock(&host->i_mutex);  	while (1) {  		int type; @@ -479,7 +470,6 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)  		pxent = xent;  		key_read(c, &xent->key, &key);  	} -	mutex_unlock(&host->i_mutex);  	kfree(pxent);  	if (err != -ENOENT) { @@ -497,8 +487,8 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,  	int err;  	struct ubifs_inode *host_ui = ubifs_inode(host);  	struct ubifs_inode *ui = ubifs_inode(inode); -	struct ubifs_budget_req req = { .dirtied_ino = 1, .mod_dent = 1, -					.dirtied_ino_d = host_ui->data_len }; +	struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1, +				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };  	ubifs_assert(ui->data_len == inode->i_size); diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ed6e146a0d9..eb91f3b70320 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {  	.release		= udf_release_file,  	.fsync			= udf_fsync_file,  	.splice_read		= generic_file_splice_read, +	.llseek			= 
generic_file_llseek,  };  const struct inode_operations udf_file_inode_operations = { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index eb9cfa23dc3d..a4f2b3ce45b0 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)  	*err = -ENOSPC;  	iinfo = UDF_I(inode); -	iinfo->i_unique = 0; -	iinfo->i_lenExtents = 0; -	iinfo->i_next_alloc_block = 0; -	iinfo->i_next_alloc_goal = 0; -	iinfo->i_strat4096 = 0; +	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { +		iinfo->i_efe = 1; +		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) +			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; +		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - +					    sizeof(struct extendedFileEntry), +					    GFP_KERNEL); +	} else { +		iinfo->i_efe = 0; +		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - +					    sizeof(struct fileEntry), +					    GFP_KERNEL); +	} +	if (!iinfo->i_ext.i_data) { +		iput(inode); +		*err = -ENOMEM; +		return NULL; +	}  	block = udf_new_block(dir->i_sb, NULL,  			      dinfo->i_location.partitionReferenceNum, @@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)  		lvhd->uniqueID = cpu_to_le64(uniqueID);  		mark_buffer_dirty(sbi->s_lvid_bh);  	} +	mutex_unlock(&sbi->s_alloc_mutex);  	inode->i_mode = mode;  	inode->i_uid = current->fsuid;  	if (dir->i_mode & S_ISGID) { @@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)  	iinfo->i_lenEAttr = 0;  	iinfo->i_lenAlloc = 0;  	iinfo->i_use = 0; -	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { -		iinfo->i_efe = 1; -		if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) -			sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; -		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - -					    sizeof(struct extendedFileEntry), -					    GFP_KERNEL); -	} else { -		iinfo->i_efe = 0; -		iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - -					    
sizeof(struct fileEntry), -					    GFP_KERNEL); -	} -	if (!iinfo->i_ext.i_data) { -		iput(inode); -		*err = -ENOMEM; -		mutex_unlock(&sbi->s_alloc_mutex); -		return NULL; -	}  	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))  		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;  	else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)  		iinfo->i_crtime = current_fs_time(inode->i_sb);  	insert_inode_hash(inode);  	mark_inode_dirty(inode); -	mutex_unlock(&sbi->s_alloc_mutex);  	if (DQUOT_ALLOC_INODE(inode)) {  		DQUOT_DROP(inode); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f42f80a3b1fa..a44d68eb50b5 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1338,6 +1338,10 @@ __xfs_get_blocks(  	offset = (xfs_off_t)iblock << inode->i_blkbits;  	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));  	size = bh_result->b_size; + +	if (!create && direct && offset >= i_size_read(inode)) +		return 0; +  	error = xfs_iomap(XFS_I(inode), offset, size,  			     create ? flags : BMAPI_READ, &iomap, &niomap);  	if (error) diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363b9343..5311c1acdd40 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = {  const struct file_operations xfs_dir_file_operations = {  	.read		= generic_read_dir,  	.readdir	= xfs_file_readdir, +	.llseek		= generic_file_llseek,  	.unlocked_ioctl	= xfs_file_ioctl,  #ifdef CONFIG_COMPAT  	.compat_ioctl	= xfs_file_compat_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 91bcd979242c..095d271f3434 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -355,7 +355,7 @@ xfs_vn_ci_lookup(  	/* else case-insensitive match... 
*/  	dname.name = ci_name.name;  	dname.len = ci_name.len; -	dentry = d_add_ci(VFS_I(ip), dentry, &dname); +	dentry = d_add_ci(dentry, VFS_I(ip), &dname);  	kmem_free(ci_name.name);  	return dentry;  } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 73c65f19e549..18d3c8487835 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1302,9 +1302,29 @@ xfs_fs_remount(  			mp->m_flags &= ~XFS_MOUNT_BARRIER;  			break;  		default: +			/* +			 * Logically we would return an error here to prevent +			 * users from believing they might have changed +			 * mount options using remount which can't be changed. +			 * +			 * But unfortunately mount(8) adds all options from +			 * mtab and fstab to the mount arguments in some cases +			 * so we can't blindly reject options, but have to +			 * check for each specified option if it actually +			 * differs from the currently set option and only +			 * reject it if that's the case. +			 * +			 * Until that is implemented we return success for +			 * every remount request, and silently ignore all +			 * options that we can't actually change. 
+			 */ +#if 0  			printk(KERN_INFO  	"XFS: mount option \"%s\" not supported for remount\n", p);  			return -EINVAL; +#else +			return 0; +#endif  		}  	} diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 608c30c3f76b..002fc2617c8e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,6 +732,7 @@ xfs_buf_item_init(  	bip->bli_item.li_ops = &xfs_buf_item_ops;  	bip->bli_item.li_mountp = mp;  	bip->bli_buf = bp; +	xfs_buf_hold(bp);  	bip->bli_format.blf_type = XFS_LI_BUF;  	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);  	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); @@ -867,6 +868,21 @@ xfs_buf_item_dirty(  	return (bip->bli_flags & XFS_BLI_DIRTY);  } +STATIC void +xfs_buf_item_free( +	xfs_buf_log_item_t	*bip) +{ +#ifdef XFS_TRANS_DEBUG +	kmem_free(bip->bli_orig); +	kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE +	ktrace_free(bip->bli_trace); +#endif +	kmem_zone_free(xfs_buf_item_zone, bip); +} +  /*   * This is called when the buf log item is no longer needed.  It should   * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse(  	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {  		XFS_BUF_CLR_IODONE_FUNC(bp);  	} - -#ifdef XFS_TRANS_DEBUG -	kmem_free(bip->bli_orig); -	bip->bli_orig = NULL; -	kmem_free(bip->bli_logged); -	bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE -	ktrace_free(bip->bli_trace); -#endif -	kmem_zone_free(xfs_buf_item_zone, bip); +	xfs_buf_rele(bp); +	xfs_buf_item_free(bip);  } @@ -1120,6 +1126,7 @@ xfs_buf_iodone(  	ASSERT(bip->bli_buf == bp); +	xfs_buf_rele(bp);  	mp = bip->bli_item.li_mountp;  	/* @@ -1136,18 +1143,7 @@ xfs_buf_iodone(  	 * xfs_trans_delete_ail() drops the AIL lock.  	 
*/  	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG -	kmem_free(bip->bli_orig); -	bip->bli_orig = NULL; -	kmem_free(bip->bli_logged); -	bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE -	ktrace_free(bip->bli_trace); -#endif -	kmem_zone_free(xfs_buf_item_zone, bip); +	xfs_buf_item_free(bip);  }  #if defined(XFS_BLI_TRACE) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 760f4c5b5160..75b0cd4da0ea 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -149,7 +149,14 @@ xfs_swap_extents(  	sbp = &sxp->sx_stat; -	xfs_lock_two_inodes(ip, tip, lock_flags); +	/* +	 * we have to do two separate lock calls here to keep lockdep +	 * happy. If we try to get all the locks in one call, lock will +	 * report false positives when we drop the ILOCK and regain them +	 * below. +	 */ +	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); +	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);  	locked = 1;  	/* Verify that both files have the same format */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index cdc2d3464a1a..2813cdd72375 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@  #ifndef __XFS_DMAPI_H__  #define __XFS_DMAPI_H__ -#include <linux/version.h>  /*	Values used to define the on-disk version of dm_attrname_t. All   *	on-disk attribute names start with the 8-byte string "SGI_DMI_".   
* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00e80df9dd9d..dbd9cef852ec 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct(  	ASSERT(nextents <= XFS_LINEAR_EXTS);  	size = nextents * sizeof(xfs_bmbt_rec_t); -	xfs_iext_irec_compact_full(ifp); +	xfs_iext_irec_compact_pages(ifp);  	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);  	ep = ifp->if_u1.if_ext_irec->er_extbuf; @@ -4449,8 +4449,7 @@ xfs_iext_irec_remove(   * compaction policy is as follows:   *   *    Full Compaction: Extents fit into a single page (or inline buffer) - *    Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space   *      No Compaction: Extents occupy at least 50% of allocated space   */  void @@ -4471,8 +4470,6 @@ xfs_iext_irec_compact(  		xfs_iext_direct_to_inline(ifp, nextents);  	} else if (nextents <= XFS_LINEAR_EXTS) {  		xfs_iext_indirect_to_direct(ifp); -	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { -		xfs_iext_irec_compact_full(ifp);  	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {  		xfs_iext_irec_compact_pages(ifp);  	} @@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages(  		erp_next = erp + 1;  		if (erp_next->er_extcount <=  		    (XFS_LINEAR_EXTS - erp->er_extcount)) { -			memmove(&erp->er_extbuf[erp->er_extcount], +			memcpy(&erp->er_extbuf[erp->er_extcount],  				erp_next->er_extbuf, erp_next->er_extcount *  				sizeof(xfs_bmbt_rec_t));  			erp->er_extcount += erp_next->er_extcount; @@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages(  }  /* - * Fully compact the extent records managed by the indirection array. 
- */ -void -xfs_iext_irec_compact_full( -	xfs_ifork_t	*ifp)			/* inode fork pointer */ -{ -	xfs_bmbt_rec_host_t *ep, *ep_next;	/* extent record pointers */ -	xfs_ext_irec_t	*erp, *erp_next;	/* extent irec pointers */ -	int		erp_idx = 0;		/* extent irec index */ -	int		ext_avail;		/* empty entries in ex list */ -	int		ext_diff;		/* number of exts to add */ -	int		nlists;			/* number of irec's (ex lists) */ - -	ASSERT(ifp->if_flags & XFS_IFEXTIREC); - -	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; -	erp = ifp->if_u1.if_ext_irec; -	ep = &erp->er_extbuf[erp->er_extcount]; -	erp_next = erp + 1; -	ep_next = erp_next->er_extbuf; - -	while (erp_idx < nlists - 1) { -		/* -		 * Check how many extent records are available in this irec. -		 * If there is none skip the whole exercise. -		 */ -		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; -		if (ext_avail) { - -			/* -			 * Copy over as many as possible extent records into -			 * the previous page. -			 */ -			ext_diff = MIN(ext_avail, erp_next->er_extcount); -			memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); -			erp->er_extcount += ext_diff; -			erp_next->er_extcount -= ext_diff; - -			/* -			 * If the next irec is empty now we can simply -			 * remove it. -			 */ -			if (erp_next->er_extcount == 0) { -				/* -				 * Free page before removing extent record -				 * so er_extoffs don't get modified in -				 * xfs_iext_irec_remove. -				 */ -				kmem_free(erp_next->er_extbuf); -				erp_next->er_extbuf = NULL; -				xfs_iext_irec_remove(ifp, erp_idx + 1); -				erp = &ifp->if_u1.if_ext_irec[erp_idx]; -				nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - -			/* -			 * If the next irec is not empty move up the content -			 * that has not been copied to the previous page to -			 * the beggining of this one. 
-			 */ -			} else { -				memmove(erp_next->er_extbuf, &ep_next[ext_diff], -					erp_next->er_extcount * -					sizeof(xfs_bmbt_rec_t)); -				ep_next = erp_next->er_extbuf; -				memset(&ep_next[erp_next->er_extcount], 0, -					(XFS_LINEAR_EXTS - -						erp_next->er_extcount) * -					sizeof(xfs_bmbt_rec_t)); -			} -		} - -		if (erp->er_extcount == XFS_LINEAR_EXTS) { -			erp_idx++; -			if (erp_idx < nlists) -				erp = &ifp->if_u1.if_ext_irec[erp_idx]; -			else -				break; -		} -		ep = &erp->er_extbuf[erp->er_extcount]; -		erp_next = erp + 1; -		ep_next = erp_next->er_extbuf; -	} -} - -/*   * This is called to update the er_extoff field in the indirection   * array when extents have been added or removed from one of the   * extent lists. erp_idx contains the irec index to begin updating diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccba14eb9dbe..503ea89e8b9a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,  STATIC int	xlog_iclogs_empty(xlog_t *log);  #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE	2048 +#define XLOG_TRACE_ICLOG_SIZE		256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ +	log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ +	ktrace_free(log->l_grant_trace); +} +  void  xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)  {  	unsigned long cnts; -	if (!log->l_grant_trace) { -		log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); -		if (!log->l_grant_trace) -			return; -	}  	/* ticket counts are 1 byte each */  	cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)  }  void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ +	iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void 
+xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ +	ktrace_free(iclog->ic_trace); +} + +void  xlog_trace_iclog(xlog_in_core_t *iclog, uint state)  { -	if (!iclog->ic_trace) -		iclog->ic_trace = ktrace_alloc(256, KM_NOFS);  	ktrace_enter(iclog->ic_trace,  		     (void *)((unsigned long)state),  		     (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)  		     (void *)NULL, (void *)NULL);  }  #else + +#define	xlog_trace_loggrant_alloc(log) +#define	xlog_trace_loggrant_dealloc(log)  #define	xlog_trace_loggrant(log,tic,string) + +#define	xlog_trace_iclog_alloc(iclog) +#define	xlog_trace_iclog_dealloc(iclog)  #define	xlog_trace_iclog(iclog,state) +  #endif /* XFS_LOG_TRACE */ @@ -1009,7 +1037,7 @@ xlog_iodone(xfs_buf_t *bp)  	 * layer, it means the underlyin device no longer supports  	 * barrier I/O. Warn loudly and turn off barriers.  	 */ -	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { +	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ISORDERED(bp)) {  		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;  		xfs_fs_cmn_err(CE_WARN, l->l_mp,  				"xlog_iodone: Barriers are no longer supported" @@ -1231,6 +1259,7 @@ xlog_alloc_log(xfs_mount_t	*mp,  	spin_lock_init(&log->l_grant_lock);  	sv_init(&log->l_flush_wait, 0, "flush_wait"); +	xlog_trace_loggrant_alloc(log);  	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */  	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1285,6 +1314,8 @@ xlog_alloc_log(xfs_mount_t	*mp,  		sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");  		sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); +		xlog_trace_iclog_alloc(iclog); +  		iclogp = &iclog->ic_next;  	}  	*iclogp = log->l_iclog;			/* complete ring */ @@ -1565,11 +1596,7 @@ xlog_dealloc_log(xlog_t *log)  		sv_destroy(&iclog->ic_force_wait);  		sv_destroy(&iclog->ic_write_wait);  		xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE -		if (iclog->ic_trace != NULL) { -			
ktrace_free(iclog->ic_trace); -		} -#endif +		xlog_trace_iclog_dealloc(iclog);  		next_iclog = iclog->ic_next;  		kmem_free(iclog);  		iclog = next_iclog; @@ -1578,14 +1605,7 @@ xlog_dealloc_log(xlog_t *log)  	spinlock_destroy(&log->l_grant_lock);  	xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE -	if (log->l_trace != NULL) { -		ktrace_free(log->l_trace); -	} -	if (log->l_grant_trace != NULL) { -		ktrace_free(log->l_grant_trace); -	} -#endif +	xlog_trace_loggrant_dealloc(log);  	log->l_mp->m_log = NULL;  	kmem_free(log);  }	/* xlog_dealloc_log */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c8a5b22ee3e3..e7d8f84443fa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -448,7 +448,6 @@ typedef struct log {  	int			l_grant_write_bytes;  #ifdef XFS_LOG_TRACE -	struct ktrace		*l_trace;  	struct ktrace		*l_grant_trace;  #endif diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index aa238c8fbd7a..8b6812f66a15 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1838,6 +1838,12 @@ again:  #endif  } +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */  void  xfs_lock_two_inodes(  	xfs_inode_t		*ip0, @@ -1848,6 +1854,8 @@ xfs_lock_two_inodes(  	int			attempts = 0;  	xfs_log_item_t		*lp; +	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) +		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);  	ASSERT(ip0->i_ino != ip1->i_ino);  	if (ip0->i_ino > ip1->i_ino) { @@ -3152,6 +3160,13 @@ error1:	/* Just cancel transaction */  /*   * Zero file bytes between startoff and endoff inclusive.   * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. 
+ * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding.   */  STATIC int  xfs_zero_remaining_bytes( @@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes(  	int			nimap;  	int			error = 0; +	/* +	 * Avoid doing I/O beyond eof - it's not necessary +	 * since nothing can read beyond eof.  The space will +	 * be zeroed when the file is extended anyway. +	 */ +	if (startoff >= ip->i_size) +		return 0; + +	if (endoff > ip->i_size) +		endoff = ip->i_size; +  	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,  				XFS_IS_REALTIME_INODE(ip) ?  				mp->m_rtdev_targp : mp->m_ddev_targp); | 
