/*
 * linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/fs/minix/inode.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 *
 * Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 * Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 * 64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/ext3_jbd.h>
#include <linux/jbd.h>
#include <linux/smp_lock.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include "xattr.h"
#include "acl.h"

static int ext3_writepage_trans_blocks(struct inode *inode);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext3_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT3_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) &&
		inode->i_blocks - ea_blocks == 0);
}

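/*
 * Worked example (illustrative numbers; the only invariant relied on is
 * that i_blocks counts 512-byte sectors): on a 4K filesystem an inode
 * whose only block is an xattr block has i_blocks == 8 and
 * ea_blocks == (4096 >> 9) == 8, so a symlink stored directly in i_data
 * still tests as "fast".  A symlink long enough to need a data block
 * adds another 8 sectors and the test fails, as it should.
 */
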
/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (e.g. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */

int ext3_forget(handle_t *handle, int is_metadata,
		struct inode *inode, struct buffer_head *bh,
		int blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			return ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		ext3_abort(inode->i_sb, __FUNCTION__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}

/*
 * Work out how many blocks we need to progress with the next chunk of a
 * truncate transaction.
 */

static unsigned long blocks_for_truncate(struct inode *inode)
{
	unsigned long needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext3 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT3_MAX_TRANS_DATA)
		needed = EXT3_MAX_TRANS_DATA;

	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}

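/*
 * Worked example (illustrative numbers only): with 4K blocks,
 * s_blocksize_bits == 12, so "i_blocks >> (12 - 9)" divides the count of
 * 512-byte sectors by 8 - roughly one journal credit per data block.  A
 * 1MB file (2048 sectors) therefore yields 256, which is clamped to
 * EXT3_MAX_TRANS_DATA before the fixed EXT3_DATA_TRANS_BLOCKS()
 * overhead is added on top.
 */
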
/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */

static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext3_journal_restart(handle, blocks_for_truncate(inode));
}

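/*
 * Sketch of the intended caller pattern for the two helpers above (a
 * simplified illustration of the truncate path, not a verbatim copy):
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		... make sure everything is consistently dirtied ...
 *		ext3_journal_test_restart(handle, inode);
 *	}
 *	... keep freeing blocks with a fresh set of credits ...
 */
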
/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext3_delete_inode (struct inode * inode)
{
	handle_t *handle;

	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}

static int ext3_alloc_block (handle_t *handle,
			struct inode * inode, unsigned long goal, int *err)
{
	unsigned long result;

	result = ext3_new_block(handle, inode, goal, err);
	return result;
}


typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 * ext3_block_to_path - parse the block number into array of offsets
 * @inode: inode in question (we are only interested in its superblock)
 * @i_block: block number to be parsed
 * @offsets: array to store the offsets in
 * @boundary: set this non-zero if the referred-to block is likely to be
 *	followed (on disk) by an indirect block.
 *
 * To store the locations of a file's data, ext3 uses a data structure
 * common to UNIX filesystems - a tree of pointers anchored in the inode,
 * with data blocks at the leaves and indirect blocks in intermediate
 * nodes.  This function translates the block number into a path in that
 * tree - the return value is the path length and @offsets[n] is the
 * offset of the pointer to the (n+1)th node in the nth one.  If @block
 * is out of range (negative or too large) a warning is printed and zero
 * is returned.
 *
 * Note: the function doesn't find node addresses, so no IO is needed.
 * All we need to know is the capacity of indirect blocks (taken from
 * inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}

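/*
 * Worked example (assuming a 1K block size, so ptrs == 256): logical
 * block 11 is the last direct block: n == 1, offsets == {11}.  Block 12
 * moves into the indirect tree: n == 2, offsets == {EXT3_IND_BLOCK, 0}.
 * Block 12 + 256 + 5 == 273 lands in the double-indirect tree: n == 3,
 * offsets == {EXT3_DIND_BLOCK, 0, 5}.  *boundary receives the number of
 * pointers left after ours in the deepest indirect block, so 0 means we
 * sit right at the end of that block.
 */
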
/**
 * ext3_get_branch - read the chain of indirect blocks leading to data
 * @inode: inode in question
 * @depth: depth of the chain (1 - direct pointer, etc.)
 * @offsets: offsets of pointers in inode/indirect blocks
 * @chain: place to store the result
 * @err: here we store the error value
 *
 * Function fills the array of triples <key, p, bh> and returns %NULL
 * if everything went OK or the pointer to the last filled triple
 * (incomplete one) otherwise.  Upon the return chain[i].key contains
 * the number of (i+1)-th block in the chain (as it is stored in memory,
 * i.e. little-endian 32-bit), chain[i].p contains the address of that
 * number (it points into struct inode for i==0 and into the bh->b_data
 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 * block for i>0 and NULL for i==0.  In other words, it holds the block
 * numbers of the chain, addresses they were taken from (and where we can
 * verify that chain did not change) and buffer_heads hosting these
 * numbers.
 *
 * Function stops when it stumbles upon zero pointer (absent block)
 *	(pointer to last triple returned, *@err == 0)
 * or when it gets an IO error reading an indirect block
 *	(ditto, *@err == -EIO)
 * or when it notices that chain had been changed while it was reading
 *	(ditto, *@err == -EAGAIN)
 * or when it reads all @depth-1 indirect blocks successfully and finds
 * the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}

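/*
 * Illustration of the chain produced by a depth-3 lookup that fully
 * resolves (the block numbers are made up): chain[0] is
 * { p = &EXT3_I(inode)->i_data[EXT3_DIND_BLOCK], key = 9000, bh = NULL },
 * while chain[1] and chain[2] each hold the bh of one indirect block, a
 * p pointing into that bh->b_data, and the key read through it.  A zero
 * key at any level ends the walk early, and the partially filled triple
 * is what gets returned.
 */
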
/**
 * ext3_find_near - find a place for allocation with sufficient locality
 * @inode: owner
 * @ind: descriptor of indirect block.
 *
 * This function returns the preferred place for block allocation.
 * It is used when the heuristic for sequential allocation fails.
 * Rules are:
 *   + if there is a block to the left of our position - allocate near it.
 *   + if the pointer will live in an indirect block - allocate near that
 *     block.
 *   + if the pointer will live in the inode - allocate in the same
 *     cylinder group.
 *
 * In the latter case we colour the starting block by the caller's PID to
 * prevent it from clashing with concurrent allocations for a different inode
 * in the same block group.  The PID is used here so that functionally related
 * files will be close-by on-disk.
 *
 * Caller must make sure that @ind is valid and will stay that way.
 */

static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
	__le32 *p;
	unsigned long bg_start;
	unsigned long colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--)
		if (*p)
			return le32_to_cpu(*p);

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * Is it going to be referred to from the inode itself? OK, just put
	 * it into the same cylinder group then.
	 */
	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}

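/*
 * Worked example for the colouring above (illustrative numbers): with
 * 32768 blocks per group, each of the 16 PID classes owns a 2048-block
 * slice, so a caller with pid 35 (35 % 16 == 3) starts searching at
 * bg_start + 6144.  Two unrelated writers that fall into the same group
 * thus begin in different slices instead of racing for the same blocks.
 */
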
/**
 * ext3_find_goal - find a preferred place for allocation.
 * @inode: owner
 * @block: block we want
 * @chain: chain of indirect blocks
 * @partial: pointer to the last triple within a chain
 *
 * This function returns the preferred place for block allocation:
 * either the block following the last allocation, if the write is
 * sequential, or a block with decent locality otherwise.
 */

static unsigned long ext3_find_goal(struct inode *inode, long block,
		Indirect chain[4], Indirect *partial)
{
	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}

/**
 * ext3_alloc_branch - allocate and set up a chain of blocks.
 * @inode: owner
 * @num: depth of the chain (number of blocks to allocate)
 * @offsets: offsets (in the blocks) to store the pointers to next.
 * @branch: place to store the chain in.
 *
 * This function allocates @num blocks, zeroes out all but the last one,
 * links them into a chain and (if we are synchronous) writes them to disk.
 * In other words, it prepares a branch that can be spliced onto the
 * inode.  It stores the information about that chain in branch[], in
 * the same format as ext3_get_branch() would do.  We are calling it after
 * we had read the existing part of the chain and partial points to the last
 * triple of that (the one with a zero ->key).  Upon exit we have the same
 * picture as after a successful ext3_get_block(), except that in one
 * place the chain is disconnected - *branch->p is still zero (we did not
 * set the last link), but branch->key contains the number that should
 * be placed into *branch->p to fill that gap.
 *
 * If allocation fails we free all blocks we've allocated (and forget
 * their buffer_heads) and return the error value from the failed
 * ext3_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 * as described above and return 0.
 */

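/*
 * Schematic of the "disconnected" state described above, for num == 2
 * (field values are purely illustrative):
 *
 *	branch[0]: key = new indirect block#,  *p == 0    (link still unset)
 *	branch[1]: key = new data block#,      *p == key  (set in the new bh)
 *
 * Only when the caller stores branch[0].key into *branch[0].p does the
 * tree reach the new blocks - which is what makes a failed allocation
 * trivial to undo.
 */
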
static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			     int num,
			     unsigned long goal,
			     int *offsets,
			     Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int n = 0, keys = 0;
	int err = 0;
	int i;
	int parent = ext3_alloc_block(handle, inode, goal, &err);

	branch[0].key = cpu_to_le32(parent);
	if (parent) {
		for (n = 1; n < num; n++) {
			struct buffer_head *bh;
			/* Allocate the next block */
			int nr = ext3_alloc_block(handle, inode, parent, &err);
			if (!nr)
				break;
			branch[n].key = cpu_to_le32(nr);

			/*
			 * Get buffer_head for parent block, zero it out
			 * and set the pointer to new one, then send
			 * parent to disk.
			 */
			bh = sb_getblk(inode->i_sb, parent);
			if (!bh)
				break;
			keys = n+1;
			branch[n].bh = bh;
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			err = ext3_journal_get_create_access(handle, bh);
			if (err) {
				unlock_buffer(bh);
				brelse(bh);
				break;
			}

			memset(bh->b_data, 0, blocksize);
			branch[n].p = (__le32*) bh->b_data + offsets[n];
			*branch[n].p = branch[n].key;
			BUFFER_TRACE(bh, "marking uptodate");
			set_buffer_uptodate(bh);
			unlock_buffer(bh);

			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (err)
				break;

			parent = nr;
		}
	}
	if (n == num)
		return 0;

	/* Allocation failed, free what we already allocated */
	for (i = 1; i < keys; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < keys; i++)
		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
	return err;
}

/**
 * ext3_splice_branch - splice the allocated branch onto inode.
 * @inode: owner
 * @block: (logical) number of block we are adding
 * @chain: chain of indirect blocks (with a missing link - see
 *	ext3_alloc_branch)
 * @where: location of missing link
 * @num: number of blocks we are adding
 *
 * This function fills the missing link and does all housekeeping needed in
 * inode (->i_blocks, etc.).  In case of success we end up with the full
 * chain to the new block and return 0.
 */

static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
			      Indirect chain[4], Indirect *where, int num)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation.
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block;
		block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * akpm: If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i < num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
	}
	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf.  So let's do it before attaching anything
 * to the tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from the inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 */

int
ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
		unsigned long maxblocks, struct buffer_head *bh_result,
		int create, int extend_disksize)
{
	int err = -EIO;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	unsigned long goal;
	int left;
	int blocks_to_boundary = 0;
	int depth;
	struct ext3_inode_info *ei = EXT3_I(inode);
	int count = 0;
	unsigned long first_block = 0;

	J_ASSERT(handle != NULL || create == 0);
	depth = ext3_block_to_path(inode, iblock, offsets, &blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext3_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/* map more blocks */
		while (count < maxblocks && count <= blocks_to_boundary) {
			if (!verify_chain(chain, partial)) {
				/*
				 * Indirect block might be removed by
				 * truncate while we were reading it.
				 * Handling of that case: forget what we've
				 * got now.  Flag the err as EAGAIN, so it
				 * will reread.
				 */
				err = -EAGAIN;
				count = 0;
				break;
			}
			if (le32_to_cpu(*(chain[depth-1].p + count)) ==
					(first_block + count))
				count++;
			else
				break;
		}
		if (err != -EAGAIN)
			goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	mutex_lock(&ei->truncate_mutex);

	/*
	 * If the indirect block is missing while we are reading
	 * the chain (ext3_get_branch() returns -EAGAIN err), or
	 * if the chain has been changed after we grab the semaphore
	 * (either because another process truncated this branch, or
	 * another get_block allocated this branch), re-grab the chain to
	 * see if the requested block has been allocated or not.
	 *
	 * Since we already block the truncate/other get_block
	 * at this point, we will have the current copy of the chain when we
	 * splice the branch into the tree.
	 */
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		while (partial > chain) {
			brelse(partial->bh);
			partial--;
		}
		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
		if (!partial) {
			count++;
			mutex_unlock(&ei->truncate_mutex);
			if (err)
				goto cleanup;
			clear_buffer_new(bh_result);
			goto got_it;
		}
	}

	/*
	 * Okay, we need to do block allocation.  Lazily initialize the block
	 * allocation info here if necessary.
	 */
	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
		ext3_init_block_alloc_info(inode);

	goal = ext3_find_goal(inode, iblock, chain, partial);

	left = (chain + depth) - partial;

	/*
	 * Block out ext3_truncate while we alter the tree
	 */
	err = ext3_alloc_branch(handle, inode, left, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext3_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case. --sct
	 */
	if (!err)
		err = ext3_splice_branch(handle, inode, iblock, chain,
					 partial, left);
	/*
	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
	 * protect it if you're about to implement concurrent
	 * ext3_get_block() -bzzz
	 */
	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
		ei->i_disksize = inode->i_size;
	mutex_unlock(&ei->truncate_mutex);
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (blocks_to_boundary == 0)
		set_buffer_boundary(bh_result);
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}

#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)

static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
		unsigned long max_blocks, struct buffer_head *bh_result,
		int create)
{
	handle_t *handle = journal_current_handle();
	int ret = 0;

	if (!create)
		goto get_block;		/* A read */

	if (max_blocks == 1)
		goto get_block;		/* A single block get */

	if (handle->h_transaction->t_state == T_LOCKED) {
		/*
		 * Huge direct-io writes can hold off commits for long
		 * periods of time.  Let this commit run.
		 */
		ext3_journal_stop(handle);
		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle))
			ret = PTR_ERR(handle);
		goto get_block;
	}

	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
		/*
		 * Getting low on buffer credits...
		 */
		ret = ext3_journal_extend(handle, DIO_CREDITS);
		if (ret > 0) {
			/*
			 * Couldn't extend the transaction.  Start a new one.
			 */
			ret = ext3_journal_restart(handle, DIO_CREDITS);
		}
	}

get_block:
	if (ret == 0) {
		ret = ext3_get_blocks_handle(handle, inode, iblock,
					max_blocks, bh_result, create, 0);
		if (ret > 0) {
			bh_result->b_size = (ret << inode->i_blkbits);
			ret = 0;
		}
	}
	return ret;
}

static int ext3_get_blocks(struct inode *inode, sector_t iblock,
		unsigned long maxblocks, struct buffer_head *bh_result,
		int create)
{
	return ext3_direct_io_get_blocks(inode, iblock, maxblocks,
					bh_result, create);
}

static int ext3_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create)
{
	return ext3_get_blocks(inode, iblock, 1, bh_result, create);
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
				long block, int create, int *errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	err = ext3_get_blocks_handle(handle, inode, block, 1,
					&dummy, create, 1);
	if (err == 1) {
		err = 0;
	} else if (err >= 0) {
		WARN_ON(1);
		err = -EIO;
	}
	*errp = err;
	if (!err && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (!bh) {
			*errp = -EIO;
			goto err;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != 0);

			/*
			 * Now that we do not always journal data, we should
			 * keep in mind whether this should always journal the
			 * new buffer as metadata.  For now, regular file
			 * writes use ext3_get_block instead, so it's not a
			 * problem.
			 */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext3_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
err:
	return NULL;
}

955
956struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
957 int block, int create, int *err)
958{
959 struct buffer_head * bh;
960
961 bh = ext3_getblk(handle, inode, block, create, err);
962 if (!bh)
963 return bh;
964 if (buffer_uptodate(bh))
965 return bh;
966 ll_rw_block(READ, 1, &bh);
967 wait_on_buffer(bh);
968 if (buffer_uptodate(bh))
969 return bh;
970 put_bh(bh);
971 *err = -EIO;
972 return NULL;
973}
974
static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	struct buffer_head *next;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next) {
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext3_get_block()
 * and the commit_write().  So doing the journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext3_writepage() ->
 * block_write_full_page().  In that case, we *know* that ext3_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext3 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */

static int do_journal_get_write_access(handle_t *handle,
					struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return ext3_journal_get_write_access(handle, bh);
}

static int ext3_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
	handle_t *handle;
	int retries = 0;

retry:
	handle = ext3_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_prepare_write(page, from, to, ext3_get_block);
	else
		ret = block_prepare_write(page, from, to, ext3_get_block);
	if (ret)
		goto prepare_write_failed;

	if (ext3_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}
prepare_write_failed:
	if (ret)
		ext3_journal_stop(handle);
	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	int err = journal_dirty_data(handle, bh);
	if (err)
		ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
						bh, handle, err);
	return err;
}

/* For commit_write() in data=journal mode */
static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext3_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - e.g. when called from page_symlink().
 *
 * ext3 never places buffers on inode->i_mapping->private_list.  Metadata
 * buffers are managed internally.
 */

static int ext3_ordered_commit_write(struct file *file, struct page *page,
			     unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;

	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, ext3_journal_dirty_data);

	if (ret == 0) {
		/*
		 * generic_commit_write() will run mark_inode_dirty() if i_size
		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
		 * into that.
		 */
		loff_t new_i_size;

		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
		if (new_i_size > EXT3_I(inode)->i_disksize)
			EXT3_I(inode)->i_disksize = new_i_size;
		ret = generic_commit_write(file, page, from, to);
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_writeback_commit_write(struct file *file, struct page *page,
				       unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	loff_t new_i_size;

	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (new_i_size > EXT3_I(inode)->i_disksize)
		EXT3_I(inode)->i_disksize = new_i_size;

	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_commit_write(file, page, from, to);
	else
		ret = generic_commit_write(file, page, from, to);

	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_journalled_commit_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	loff_t pos;

	/*
	 * Here we duplicate the generic_commit_write() functionality.
	 */
	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, commit_write_fn);
	if (!partial)
		SetPageUptodate(page);
	if (pos > inode->i_size)
		i_size_write(inode, pos);
	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
	if (inode->i_size > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext3 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext3_get_block);
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	if (buffer_mapped(bh))
		return ext3_journal_dirty_data(handle, bh);
	return 0;
}

/*
 * Note that we always start a transaction even if we're not journalling
 * data.  This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext3_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext3_writepage()
 *
 * Similar for:
 *
 *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext3_get_block().  We will deadlock on various things like
 * lock_journal and i_truncate_mutex.
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero.  We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied.  So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: it affects the final partial page, for journalled
 *   data, where the file is subject to both write() and writepage() in the
 *   same transaction.  To fix it we'll need a custom
 *   block_write_full_page().  We'll probably need that anyway for
 *   journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
 * we don't need to open a transaction here.
 */
static int ext3_ordered_writepage(struct page *page,
				  struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));

	/*
	 * We give up here if we're reentered, because it might be for a
	 * different filesystem.
	 */
	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, inode->i_sb->s_blocksize,
				(1 << BH_Dirty)|(1 << BH_Uptodate));
	}
	page_bufs = page_buffers(page);
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bget_one);

	ret = block_write_full_page(page, ext3_get_block, wbc);

	/*
	 * The page can become unlocked at any point now, and
	 * truncate can then come in and change things.  So we
	 * can't touch *page from now on.  But *page_bufs is
	 * safe due to elevated refcount.
	 */

	/*
	 * And attach them to the current transaction.  But only if
	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
	 * and generally junk.
	 */
	if (ret == 0) {
		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
					NULL, journal_dirty_data_fn);
		if (!ret)
			ret = err;
	}
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bput_one);
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}

static int ext3_writeback_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_writepage(page, ext3_get_block, wbc);
	else
		ret = block_write_full_page(page, ext3_get_block, wbc);

	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}

static int ext3_journalled_writepage(struct page *page,
				struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	if (ext3_journal_current_handle())
		goto no_write;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto no_write;
	}

	if (!page_has_buffers(page) || PageChecked(page)) {
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		ClearPageChecked(page);
		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
					ext3_get_block);
		if (ret != 0) {
			ext3_journal_stop(handle);
			goto out_unlock;
		}
		ret = walk_page_buffers(handle, page_buffers(page), 0,
			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

		err = walk_page_buffers(handle, page_buffers(page), 0,
				PAGE_CACHE_SIZE, NULL, commit_write_fn);
		if (ret == 0)
			ret = err;
		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
		unlock_page(page);
	} else {
		/*
		 * It may be a page full of checkpoint-mode buffers.  We don't
		 * really know unless we go poke around in the buffer_heads.
		 * But block_write_full_page will do the right thing.
		 */
		ret = block_write_full_page(page, ext3_get_block, wbc);
	}
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
out:
	return ret;

no_write:
	redirty_page_for_writepage(wbc, page);
out_unlock:
	unlock_page(page);
	goto out;
}

static int ext3_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, ext3_get_block);
}

static int
ext3_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
}

static void ext3_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	/*
	 * If it's a full truncate we just forget about the pending dirtying
	 */
	if (offset == 0)
		ClearPageChecked(page);

	journal_invalidatepage(journal, page, offset);
}

static int ext3_releasepage(struct page *page, gfp_t wait)
{
	journal_t *journal = EXT3_JOURNAL(page->mapping->host);

	WARN_ON(PageChecked(page));
	if (!page_has_buffers(page))
		return 0;
	return journal_try_to_free_buffers(journal, page, wait);
}

/*
 * If the O_DIRECT write will extend the file then add this inode to the
 * orphan list.  So recovery will truncate it back to the original size
 * if the machine crashes during the write.
 *
 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 * crashes then stale disk data _may_ be exposed inside the file.
 */
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
			const struct iovec *iov, loff_t offset,
			unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct ext3_inode_info *ei = EXT3_I(inode);
	handle_t *handle = NULL;
	ssize_t ret;
	int orphan = 0;
	size_t count = iov_length(iov, nr_segs);

	if (rw == WRITE) {
		loff_t final_size = offset + count;

		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}
		if (final_size > inode->i_size) {
			ret = ext3_orphan_add(handle, inode);
			if (ret)
				goto out_stop;
			orphan = 1;
			ei->i_disksize = inode->i_size;
		}
	}

	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				 offset, nr_segs,
				 ext3_direct_io_get_blocks, NULL);

	/*
	 * Reacquire the handle: ext3_direct_io_get_blocks() can restart the
	 * transaction
	 */
	handle = journal_current_handle();

out_stop:
	if (handle) {
		int err;

		if (orphan && inode->i_nlink)
			ext3_orphan_del(handle, inode);
		if (orphan && ret > 0) {
			loff_t end = offset + ret;
			if (end > inode->i_size) {
				ei->i_disksize = end;
				i_size_write(inode, end);
				/*
				 * We're going to return a positive `ret'
				 * here due to non-zero-length I/O, so there's
				 * no way of reporting error returns from
				 * ext3_mark_inode_dirty() to userspace.  So
				 * ignore it.
				 */
				ext3_mark_inode_dirty(handle, inode);
			}
		}
		err = ext3_journal_stop(handle);
		if (ret == 0)
			ret = err;
	}
out:
	return ret;
}

/*
 * Pages can be marked dirty completely asynchronously from ext3's journalling
 * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
 * much here because ->set_page_dirty is called under VFS locks.  The page is
 * not necessarily locked.
 *
 * We cannot just dirty the page and leave attached buffers clean, because the
 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
 * or jbddirty because all the journalling code will explode.
 *
 * So what we do is to mark the page "pending dirty" and next time writepage
 * is called, propagate that into the buffers appropriately.
 */
static int ext3_journalled_set_page_dirty(struct page *page)
{
	SetPageChecked(page);
	return __set_page_dirty_nobuffers(page);
}

static struct address_space_operations ext3_ordered_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_ordered_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_ordered_commit_write,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
	.direct_IO	= ext3_direct_IO,
	.migratepage	= buffer_migrate_page,
};

static struct address_space_operations ext3_writeback_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_writeback_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_writeback_commit_write,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
	.direct_IO	= ext3_direct_IO,
	.migratepage	= buffer_migrate_page,
};

static struct address_space_operations ext3_journalled_aops = {
	.readpage	= ext3_readpage,
	.readpages	= ext3_readpages,
	.writepage	= ext3_journalled_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= ext3_prepare_write,
	.commit_write	= ext3_journalled_commit_write,
	.set_page_dirty	= ext3_journalled_set_page_dirty,
	.bmap		= ext3_bmap,
	.invalidatepage	= ext3_invalidatepage,
	.releasepage	= ext3_releasepage,
};

void ext3_set_aops(struct inode *inode)
{
	if (ext3_should_order_data(inode))
		inode->i_mapping->a_ops = &ext3_ordered_aops;
	else if (ext3_should_writeback_data(inode))
		inode->i_mapping->a_ops = &ext3_writeback_aops;
	else
		inode->i_mapping->a_ops = &ext3_journalled_aops;
}

1649/*
1650 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1651 * up to the end of the block which corresponds to `from'.
1652 * This is required during truncate. We need to physically zero the tail end
1653 * of that block so it doesn't yield old data if the file is later grown.
1654 */
1655static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1656 struct address_space *mapping, loff_t from)
1657{
1658 unsigned long index = from >> PAGE_CACHE_SHIFT;
1659 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1660 unsigned blocksize, iblock, length, pos;
1661 struct inode *inode = mapping->host;
1662 struct buffer_head *bh;
1663 int err = 0;
1664 void *kaddr;
1665
1666 blocksize = inode->i_sb->s_blocksize;
1667 length = blocksize - (offset & (blocksize - 1));
1668 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1669
1670 /*
1671	 * For the "nobh" option, we can only work if we don't need to
1672	 * read in the page - otherwise we create buffers to do the IO.
1673 */
1674	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1675 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1676 kaddr = kmap_atomic(page, KM_USER0);
1677 memset(kaddr + offset, 0, length);
1678 flush_dcache_page(page);
1679 kunmap_atomic(kaddr, KM_USER0);
1680 set_page_dirty(page);
1681 goto unlock;
1682	}
1683
1684 if (!page_has_buffers(page))
1685 create_empty_buffers(page, blocksize, 0);
1686
1687 /* Find the buffer that contains "offset" */
1688 bh = page_buffers(page);
1689 pos = blocksize;
1690 while (offset >= pos) {
1691 bh = bh->b_this_page;
1692 iblock++;
1693 pos += blocksize;
1694 }
1695
1696 err = 0;
1697 if (buffer_freed(bh)) {
1698 BUFFER_TRACE(bh, "freed: skip");
1699 goto unlock;
1700 }
1701
1702 if (!buffer_mapped(bh)) {
1703 BUFFER_TRACE(bh, "unmapped");
1704 ext3_get_block(inode, iblock, bh, 0);
1705 /* unmapped? It's a hole - nothing to do */
1706 if (!buffer_mapped(bh)) {
1707 BUFFER_TRACE(bh, "still unmapped");
1708 goto unlock;
1709 }
1710 }
1711
1712 /* Ok, it's mapped. Make sure it's up-to-date */
1713 if (PageUptodate(page))
1714 set_buffer_uptodate(bh);
1715
1716 if (!buffer_uptodate(bh)) {
1717 err = -EIO;
1718 ll_rw_block(READ, 1, &bh);
1719 wait_on_buffer(bh);
1720 /* Uhhuh. Read error. Complain and punt. */
1721 if (!buffer_uptodate(bh))
1722 goto unlock;
1723 }
1724
1725 if (ext3_should_journal_data(inode)) {
1726 BUFFER_TRACE(bh, "get write access");
1727 err = ext3_journal_get_write_access(handle, bh);
1728 if (err)
1729 goto unlock;
1730 }
1731
1732 kaddr = kmap_atomic(page, KM_USER0);
1733 memset(kaddr + offset, 0, length);
1734 flush_dcache_page(page);
1735 kunmap_atomic(kaddr, KM_USER0);
1736
1737 BUFFER_TRACE(bh, "zeroed end of block");
1738
1739 err = 0;
1740 if (ext3_should_journal_data(inode)) {
1741 err = ext3_journal_dirty_metadata(handle, bh);
1742 } else {
1743 if (ext3_should_order_data(inode))
1744 err = ext3_journal_dirty_data(handle, bh);
1745 mark_buffer_dirty(bh);
1746 }
1747
1748unlock:
1749 unlock_page(page);
1750 page_cache_release(page);
1751 return err;
1752}
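
/*
 * A worked sketch of the offset arithmetic above, assuming 4K pages and
 * a 1K blocksize. The helper below is illustrative only - it is not
 * built, and the truncation point is made up:
 */
#if 0
static void example_truncate_page_arithmetic(struct inode *inode)
{
	loff_t from = 5000;	/* hypothetical truncation point */
	unsigned long index = from >> PAGE_CACHE_SHIFT;		/* page 1 */
	unsigned offset = from & (PAGE_CACHE_SIZE - 1);		/* byte 904 */
	unsigned blocksize = inode->i_sb->s_blocksize;		/* 1024 */
	unsigned length = blocksize - (offset & (blocksize - 1)); /* 120 */

	/*
	 * So we zero bytes 904..1023 of page 1 - the tail of file block
	 * 4, which is where the buffer walk above ends up: iblock starts
	 * at index << 2 == 4 for this geometry, and offset 904 lies in
	 * that page's first 1K buffer.
	 */
}
#endif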
1753
1754/*
1755 * Probably it should be a library function... search for first non-zero word
1756 * or memcmp with zero_page, whatever is better for particular architecture.
1757 * Linus?
1758 */
1759static inline int all_zeroes(__le32 *p, __le32 *q)
1760{
1761 while (p < q)
1762 if (*p++)
1763 return 0;
1764 return 1;
1765}
1766
1767/**
1768 * ext3_find_shared - find the indirect blocks for partial truncation.
1769 * @inode: inode in question
1770 * @depth: depth of the affected branch
1771 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1772 * @chain: place to store the pointers to partial indirect blocks
1773 *	@top: place to store the (detached) top of the branch
1774 *
1775 * This is a helper function used by ext3_truncate().
1776 *
1777 * When we do truncate() we may have to clean the ends of several
1778 * indirect blocks but leave the blocks themselves alive. Block is
1779 *	partially truncated if some data below the new i_size is referenced
1780 * from it (and it is on the path to the first completely truncated
1781 * data block, indeed). We have to free the top of that path along
1782 * with everything to the right of the path. Since no allocation
1783 * past the truncation point is possible until ext3_truncate()
1784 *	finishes, we may safely do the latter, but the top of the branch may
1785 * require special attention - pageout below the truncation point
1786 * might try to populate it.
1787 *
1788 *	We atomically detach the top of the branch from the tree, store the
1789 * block number of its root in *@top, pointers to buffer_heads of
1790 * partially truncated blocks - in @chain[].bh and pointers to
1791 * their last elements that should not be removed - in
1792 * @chain[].p. Return value is the pointer to last filled element
1793 * of @chain.
1794 *
1795 *	The work left to the caller is the actual freeing of subtrees:
1796 * a) free the subtree starting from *@top
1797 * b) free the subtrees whose roots are stored in
1798 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1799 * c) free the subtrees growing from the inode past the @chain[0].
1800 * (no partially truncated stuff there). */
1801
1802static Indirect *ext3_find_shared(struct inode *inode,
1803 int depth,
1804 int offsets[4],
1805 Indirect chain[4],
1806 __le32 *top)
1807{
1808 Indirect *partial, *p;
1809 int k, err;
1810
1811 *top = 0;
1812	/* Make k index the deepest non-null offset + 1 */
1813 for (k = depth; k > 1 && !offsets[k-1]; k--)
1814 ;
1815 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1816 /* Writer: pointers */
1817 if (!partial)
1818 partial = chain + k-1;
1819 /*
1820	 * If the branch acquired a continuation since we last looked at it -
1821 * fine, it should all survive and (new) top doesn't belong to us.
1822 */
1823 if (!partial->key && *partial->p)
1824 /* Writer: end */
1825 goto no_top;
1826 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1827 ;
1828 /*
1829 * OK, we've found the last block that must survive. The rest of our
1830 * branch should be detached before unlocking. However, if that rest
1831	 * of the branch is all ours and does not grow immediately from the inode
1832 * it's easier to cheat and just decrement partial->p.
1833 */
1834 if (p == chain + k - 1 && p > chain) {
1835 p->p--;
1836 } else {
1837 *top = *p->p;
1838 /* Nope, don't do this in ext3. Must leave the tree intact */
1839#if 0
1840 *p->p = 0;
1841#endif
1842 }
1843 /* Writer: end */
1844
1845	while (partial > p) {
1847 brelse(partial->bh);
1848 partial--;
1849 }
1850no_top:
1851 return partial;
1852}
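
/*
 * A worked instance of the above, assuming 1K blocks (256 pointers per
 * indirect block): a truncate whose first freed file block is block 20
 * arrives here with depth == 2 and offsets == {12, 8}. Slots 0..7 of
 * the indirect block still map live data, so all_zeroes() fails at
 * once, the partial->p-- "cheat" fires, *top stays zero, and
 * ext3_truncate() goes on to free slots 8..255 of that indirect block
 * through the partial->p + 1 path.
 */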
1853
1854/*
1855 * Zero a number of block pointers in either an inode or an indirect block.
1856 * If we restart the transaction we must again get write access to the
1857 * indirect block for further modification.
1858 *
1859 * We release `count' blocks on disk, but (last - first) may be greater
1860 * than `count' because there can be holes in there.
1861 */
1862static void
1863ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1864 unsigned long block_to_free, unsigned long count,
1865 __le32 *first, __le32 *last)
1866{
1867 __le32 *p;
1868 if (try_to_extend_transaction(handle, inode)) {
1869 if (bh) {
1870 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1871 ext3_journal_dirty_metadata(handle, bh);
1872 }
1873 ext3_mark_inode_dirty(handle, inode);
1874 ext3_journal_test_restart(handle, inode);
1875 if (bh) {
1876 BUFFER_TRACE(bh, "retaking write access");
1877 ext3_journal_get_write_access(handle, bh);
1878 }
1879 }
1880
1881 /*
1882 * Any buffers which are on the journal will be in memory. We find
1883 * them on the hash table so journal_revoke() will run journal_forget()
1884 * on them. We've already detached each block from the file, so
1885 * bforget() in journal_forget() should be safe.
1886 *
1887 * AKPM: turn on bforget in journal_forget()!!!
1888 */
1889 for (p = first; p < last; p++) {
1890 u32 nr = le32_to_cpu(*p);
1891 if (nr) {
1892 struct buffer_head *bh;
1893
1894 *p = 0;
1895 bh = sb_find_get_block(inode->i_sb, nr);
1896 ext3_forget(handle, 0, inode, bh, nr);
1897 }
1898 }
1899
1900 ext3_free_blocks(handle, inode, block_to_free, count);
1901}
1902
1903/**
1904 * ext3_free_data - free a list of data blocks
1905 * @handle: handle for this transaction
1906 * @inode: inode we are dealing with
1907 * @this_bh: indirect buffer_head which contains *@first and *@last
1908 * @first: array of block numbers
1909 * @last: points immediately past the end of array
1910 *
1911 * We are freeing all blocks referenced from that array (numbers are stored as
1912 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1913 *
1914 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1915 * blocks are contiguous then releasing them at one time will only affect one
1916 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1917 * actually use a lot of journal space.
1918 *
1919 * @this_bh will be %NULL if @first and @last point into the inode's direct
1920 * block pointers.
1921 */
1922static void ext3_free_data(handle_t *handle, struct inode *inode,
1923 struct buffer_head *this_bh,
1924 __le32 *first, __le32 *last)
1925{
1926 unsigned long block_to_free = 0; /* Starting block # of a run */
1927 unsigned long count = 0; /* Number of blocks in the run */
1928 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
1929 corresponding to
1930 block_to_free */
1931 unsigned long nr; /* Current block # */
1932 __le32 *p; /* Pointer into inode/ind
1933 for current block */
1934 int err;
1935
1936 if (this_bh) { /* For indirect block */
1937 BUFFER_TRACE(this_bh, "get_write_access");
1938 err = ext3_journal_get_write_access(handle, this_bh);
1939 /* Important: if we can't update the indirect pointers
1940 * to the blocks, we can't free them. */
1941 if (err)
1942 return;
1943 }
1944
1945 for (p = first; p < last; p++) {
1946 nr = le32_to_cpu(*p);
1947 if (nr) {
1948 /* accumulate blocks to free if they're contiguous */
1949 if (count == 0) {
1950 block_to_free = nr;
1951 block_to_free_p = p;
1952 count = 1;
1953 } else if (nr == block_to_free + count) {
1954 count++;
1955 } else {
1956 ext3_clear_blocks(handle, inode, this_bh,
1957 block_to_free,
1958 count, block_to_free_p, p);
1959 block_to_free = nr;
1960 block_to_free_p = p;
1961 count = 1;
1962 }
1963 }
1964 }
1965
1966 if (count > 0)
1967 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1968 count, block_to_free_p, p);
1969
1970 if (this_bh) {
1971 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1972 ext3_journal_dirty_metadata(handle, this_bh);
1973 }
1974}
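
/*
 * A sketch of the run accumulation above. It is illustrative only:
 * not built, and the block numbers are invented.
 */
#if 0
static void example_free_data_runs(handle_t *handle, struct inode *inode)
{
	/* A little-endian pointer array with one hole in it */
	__le32 blocks[6] = {
		cpu_to_le32(100), cpu_to_le32(101), cpu_to_le32(102),
		0, cpu_to_le32(200), cpu_to_le32(201),
	};

	/*
	 * This would issue ext3_clear_blocks() twice: once for the run
	 * starting at block 100 with count 3, once for the run starting
	 * at block 200 with count 2. The hole is skipped.
	 */
	ext3_free_data(handle, inode, NULL, blocks, blocks + 6);
}
#endif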
1975
1976/**
1977 * ext3_free_branches - free an array of branches
1978 * @handle: JBD handle for this transaction
1979 * @inode: inode we are dealing with
1980 * @parent_bh: the buffer_head which contains *@first and *@last
1981 * @first: array of block numbers
1982 * @last: pointer immediately past the end of array
1983 * @depth: depth of the branches to free
1984 *
1985 *	We are freeing all blocks referenced from these branches (numbers are
1986 * stored as little-endian 32-bit) and updating @inode->i_blocks
1987 * appropriately.
1988 */
1989static void ext3_free_branches(handle_t *handle, struct inode *inode,
1990 struct buffer_head *parent_bh,
1991 __le32 *first, __le32 *last, int depth)
1992{
1993 unsigned long nr;
1994 __le32 *p;
1995
1996 if (is_handle_aborted(handle))
1997 return;
1998
1999 if (depth--) {
2000 struct buffer_head *bh;
2001 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2002 p = last;
2003 while (--p >= first) {
2004 nr = le32_to_cpu(*p);
2005 if (!nr)
2006 continue; /* A hole */
2007
2008 /* Go read the buffer for the next level down */
2009 bh = sb_bread(inode->i_sb, nr);
2010
2011 /*
2012 * A read failure? Report error and clear slot
2013 * (should be rare).
2014 */
2015 if (!bh) {
2016 ext3_error(inode->i_sb, "ext3_free_branches",
2017					   "Read failure, inode=%lu, block=%lu",
2018 inode->i_ino, nr);
2019 continue;
2020 }
2021
2022 /* This zaps the entire block. Bottom up. */
2023 BUFFER_TRACE(bh, "free child branches");
2024 ext3_free_branches(handle, inode, bh,
2025 (__le32*)bh->b_data,
2026 (__le32*)bh->b_data + addr_per_block,
2027 depth);
2028
2029 /*
2030 * We've probably journalled the indirect block several
2031 * times during the truncate. But it's no longer
2032 * needed and we now drop it from the transaction via
2033 * journal_revoke().
2034 *
2035 * That's easy if it's exclusively part of this
2036 * transaction. But if it's part of the committing
2037 * transaction then journal_forget() will simply
2038 * brelse() it. That means that if the underlying
2039 * block is reallocated in ext3_get_block(),
2040 * unmap_underlying_metadata() will find this block
2041 * and will try to get rid of it. damn, damn.
2042 *
2043 * If this block has already been committed to the
2044 * journal, a revoke record will be written. And
2045 * revoke records must be emitted *before* clearing
2046 * this block's bit in the bitmaps.
2047 */
2048 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2049
2050 /*
2051			 * Everything below this pointer has been
2052 * released. Now let this top-of-subtree go.
2053 *
2054 * We want the freeing of this indirect block to be
2055 * atomic in the journal with the updating of the
2056 * bitmap block which owns it. So make some room in
2057 * the journal.
2058 *
2059 * We zero the parent pointer *after* freeing its
2060 * pointee in the bitmaps, so if extend_transaction()
2061 * for some reason fails to put the bitmap changes and
2062 * the release into the same transaction, recovery
2063 * will merely complain about releasing a free block,
2064 * rather than leaking blocks.
2065 */
2066 if (is_handle_aborted(handle))
2067 return;
2068 if (try_to_extend_transaction(handle, inode)) {
2069 ext3_mark_inode_dirty(handle, inode);
2070 ext3_journal_test_restart(handle, inode);
2071 }
2072
2073 ext3_free_blocks(handle, inode, nr, 1);
2074
2075 if (parent_bh) {
2076 /*
2077 * The block which we have just freed is
2078 * pointed to by an indirect block: journal it
2079 */
2080 BUFFER_TRACE(parent_bh, "get_write_access");
2081 if (!ext3_journal_get_write_access(handle,
2082 parent_bh)){
2083 *p = 0;
2084 BUFFER_TRACE(parent_bh,
2085 "call ext3_journal_dirty_metadata");
2086 ext3_journal_dirty_metadata(handle,
2087 parent_bh);
2088 }
2089 }
2090 }
2091 } else {
2092 /* We have reached the bottom of the tree. */
2093 BUFFER_TRACE(parent_bh, "free data blocks");
2094 ext3_free_data(handle, inode, parent_bh, first, last);
2095 }
2096}
2097
2098/*
2099 * ext3_truncate()
2100 *
2101 * We block out ext3_get_block() block instantiations across the entire
2102 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2103 * simultaneously on behalf of the same inode.
2104 *
2105 * As we work through the truncate and commit bits of it to the journal there
2106 * is one core, guiding principle: the file's tree must always be consistent on
2107 * disk. We must be able to restart the truncate after a crash.
2108 *
2109 * The file's tree may be transiently inconsistent in memory (although it
2110 * probably isn't), but whenever we close off and commit a journal transaction,
2111 * the contents of (the filesystem + the journal) must be consistent and
2112 * restartable. It's pretty simple, really: bottom up, right to left (although
2113 * left-to-right works OK too).
2114 *
2115 * Note that at recovery time, journal replay occurs *before* the restart of
2116 * truncate against the orphan inode list.
2117 *
2118 * The committed inode has the new, desired i_size (which is the same as
2119 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2120 * that this inode's truncate did not complete and it will again call
2121 * ext3_truncate() to have another go. So there will be instantiated blocks
2122 * to the right of the truncation point in a crashed ext3 filesystem. But
2123 * that's fine - as long as they are linked from the inode, the post-crash
2124 * ext3_truncate() run will find them and release them.
2125 */
2126
2127void ext3_truncate(struct inode * inode)
2128{
2129 handle_t *handle;
2130 struct ext3_inode_info *ei = EXT3_I(inode);
2131 __le32 *i_data = ei->i_data;
2132 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2133 struct address_space *mapping = inode->i_mapping;
2134 int offsets[4];
2135 Indirect chain[4];
2136 Indirect *partial;
2137 __le32 nr = 0;
2138 int n;
2139 long last_block;
2140 unsigned blocksize = inode->i_sb->s_blocksize;
2141 struct page *page;
2142
2143 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2144 S_ISLNK(inode->i_mode)))
2145 return;
2146 if (ext3_inode_is_fast_symlink(inode))
2147 return;
2148 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2149 return;
2150
2151 /*
2152 * We have to lock the EOF page here, because lock_page() nests
2153 * outside journal_start().
2154 */
2155 if ((inode->i_size & (blocksize - 1)) == 0) {
2156 /* Block boundary? Nothing to do */
2157 page = NULL;
2158 } else {
2159 page = grab_cache_page(mapping,
2160 inode->i_size >> PAGE_CACHE_SHIFT);
2161 if (!page)
2162 return;
2163 }
2164
2165 handle = start_transaction(inode);
2166 if (IS_ERR(handle)) {
2167 if (page) {
2168 clear_highpage(page);
2169 flush_dcache_page(page);
2170 unlock_page(page);
2171 page_cache_release(page);
2172 }
2173 return; /* AKPM: return what? */
2174 }
2175
2176 last_block = (inode->i_size + blocksize-1)
2177 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2178
2179 if (page)
2180 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2181
2182 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2183 if (n == 0)
2184 goto out_stop; /* error */
2185
2186 /*
2187 * OK. This truncate is going to happen. We add the inode to the
2188 * orphan list, so that if this truncate spans multiple transactions,
2189 * and we crash, we will resume the truncate when the filesystem
2190 * recovers. It also marks the inode dirty, to catch the new size.
2191 *
2192 * Implication: the file must always be in a sane, consistent
2193 * truncatable state while each transaction commits.
2194 */
2195 if (ext3_orphan_add(handle, inode))
2196 goto out_stop;
2197
2198 /*
2199 * The orphan list entry will now protect us from any crash which
2200 * occurs before the truncate completes, so it is now safe to propagate
2201 * the new, shorter inode size (held for now in i_size) into the
2202 * on-disk inode. We do this via i_disksize, which is the value which
2203 * ext3 *really* writes onto the disk inode.
2204 */
2205 ei->i_disksize = inode->i_size;
2206
2207 /*
2208 * From here we block out all ext3_get_block() callers who want to
2209 * modify the block allocation tree.
2210 */
2211	mutex_lock(&ei->truncate_mutex);
2212
2213 if (n == 1) { /* direct blocks */
2214 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2215 i_data + EXT3_NDIR_BLOCKS);
2216 goto do_indirects;
2217 }
2218
2219 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2220 /* Kill the top of shared branch (not detached) */
2221 if (nr) {
2222 if (partial == chain) {
2223 /* Shared branch grows from the inode */
2224 ext3_free_branches(handle, inode, NULL,
2225 &nr, &nr+1, (chain+n-1) - partial);
2226 *partial->p = 0;
2227 /*
2228 * We mark the inode dirty prior to restart,
2229 * and prior to stop. No need for it here.
2230 */
2231 } else {
2232 /* Shared branch grows from an indirect block */
2233 BUFFER_TRACE(partial->bh, "get_write_access");
2234 ext3_free_branches(handle, inode, partial->bh,
2235 partial->p,
2236 partial->p+1, (chain+n-1) - partial);
2237 }
2238 }
2239 /* Clear the ends of indirect blocks on the shared branch */
2240 while (partial > chain) {
2241 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2242 (__le32*)partial->bh->b_data+addr_per_block,
2243 (chain+n-1) - partial);
2244 BUFFER_TRACE(partial->bh, "call brelse");
2245 brelse (partial->bh);
2246 partial--;
2247 }
2248do_indirects:
2249 /* Kill the remaining (whole) subtrees */
2250 switch (offsets[0]) {
2251 default:
2252 nr = i_data[EXT3_IND_BLOCK];
2253 if (nr) {
2254 ext3_free_branches(handle, inode, NULL,
2255 &nr, &nr+1, 1);
2256 i_data[EXT3_IND_BLOCK] = 0;
2257 }
2258 case EXT3_IND_BLOCK:
2259 nr = i_data[EXT3_DIND_BLOCK];
2260 if (nr) {
2261 ext3_free_branches(handle, inode, NULL,
2262 &nr, &nr+1, 2);
2263 i_data[EXT3_DIND_BLOCK] = 0;
2264 }
2265 case EXT3_DIND_BLOCK:
2266 nr = i_data[EXT3_TIND_BLOCK];
2267 if (nr) {
2268 ext3_free_branches(handle, inode, NULL,
2269 &nr, &nr+1, 3);
2270 i_data[EXT3_TIND_BLOCK] = 0;
2271 }
2272 case EXT3_TIND_BLOCK:
2273 ;
2274 }
2275
2276 ext3_discard_reservation(inode);
2277
2278	mutex_unlock(&ei->truncate_mutex);
2279	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2280 ext3_mark_inode_dirty(handle, inode);
2281
2282 /* In a multi-transaction truncate, we only make the final
2283 * transaction synchronous */
2284 if (IS_SYNC(inode))
2285 handle->h_sync = 1;
2286out_stop:
2287 /*
2288 * If this was a simple ftruncate(), and the file will remain alive
2289 * then we need to clear up the orphan record which we created above.
2290 * However, if this was a real unlink then we were called by
2291 * ext3_delete_inode(), and we allow that function to clean up the
2292 * orphan info for us.
2293 */
2294 if (inode->i_nlink)
2295 ext3_orphan_del(handle, inode);
2296
2297 ext3_journal_stop(handle);
2298}
2299
2300static unsigned long ext3_get_inode_block(struct super_block *sb,
2301 unsigned long ino, struct ext3_iloc *iloc)
2302{
2303 unsigned long desc, group_desc, block_group;
2304 unsigned long offset, block;
2305 struct buffer_head *bh;
2306 struct ext3_group_desc * gdp;
2307
2308
2309 if ((ino != EXT3_ROOT_INO &&
2310 ino != EXT3_JOURNAL_INO &&
2311 ino != EXT3_RESIZE_INO &&
2312 ino < EXT3_FIRST_INO(sb)) ||
2313 ino > le32_to_cpu(
2314 EXT3_SB(sb)->s_es->s_inodes_count)) {
2315 ext3_error (sb, "ext3_get_inode_block",
2316 "bad inode number: %lu", ino);
2317 return 0;
2318 }
2319 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2320 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2321 ext3_error (sb, "ext3_get_inode_block",
2322 "group >= groups count");
2323 return 0;
2324 }
2325 smp_rmb();
2326 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2327 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2328 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2329 if (!bh) {
2330 ext3_error (sb, "ext3_get_inode_block",
2331 "Descriptor not loaded");
2332 return 0;
2333 }
2334
2335 gdp = (struct ext3_group_desc *) bh->b_data;
2336 /*
2337 * Figure out the offset within the block group inode table
2338 */
2339 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2340 EXT3_INODE_SIZE(sb);
2341 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2342 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2343
2344 iloc->block_group = block_group;
2345 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2346 return block;
2347}
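
/*
 * Worked numbers for the lookup above, assuming 8192 inodes per group,
 * 128-byte inodes and a 1K blocksize: ino 10000 falls in block group 1
 * ((10000 - 1) / 8192) at byte offset (9999 % 8192) * 128 == 231296
 * into that group's inode table, i.e. table block 231296 >> 10 == 225
 * with iloc->offset == 231296 & 1023 == 896.
 */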
2348
2349/*
2350 * ext3_get_inode_loc returns with an extra refcount against the inode's
2351 * underlying buffer_head on success. If 'in_mem' is true, we have all
2352 * data in memory that is needed to recreate the on-disk version of this
2353 * inode.
2354 */
2355static int __ext3_get_inode_loc(struct inode *inode,
2356 struct ext3_iloc *iloc, int in_mem)
2357{
2358 unsigned long block;
2359 struct buffer_head *bh;
2360
2361 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2362 if (!block)
2363 return -EIO;
2364
2365 bh = sb_getblk(inode->i_sb, block);
2366 if (!bh) {
2367 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2368 "unable to read inode block - "
2369 "inode=%lu, block=%lu", inode->i_ino, block);
2370 return -EIO;
2371 }
2372 if (!buffer_uptodate(bh)) {
2373 lock_buffer(bh);
2374 if (buffer_uptodate(bh)) {
2375 /* someone brought it uptodate while we waited */
2376 unlock_buffer(bh);
2377 goto has_buffer;
2378 }
2379
2380 /*
2381 * If we have all information of the inode in memory and this
2382 * is the only valid inode in the block, we need not read the
2383 * block.
2384 */
2385 if (in_mem) {
2386 struct buffer_head *bitmap_bh;
2387 struct ext3_group_desc *desc;
2388 int inodes_per_buffer;
2389 int inode_offset, i;
2390 int block_group;
2391 int start;
2392
2393 block_group = (inode->i_ino - 1) /
2394 EXT3_INODES_PER_GROUP(inode->i_sb);
2395 inodes_per_buffer = bh->b_size /
2396 EXT3_INODE_SIZE(inode->i_sb);
2397 inode_offset = ((inode->i_ino - 1) %
2398 EXT3_INODES_PER_GROUP(inode->i_sb));
2399 start = inode_offset & ~(inodes_per_buffer - 1);
2400
2401 /* Is the inode bitmap in cache? */
2402 desc = ext3_get_group_desc(inode->i_sb,
2403 block_group, NULL);
2404 if (!desc)
2405 goto make_io;
2406
2407 bitmap_bh = sb_getblk(inode->i_sb,
2408 le32_to_cpu(desc->bg_inode_bitmap));
2409 if (!bitmap_bh)
2410 goto make_io;
2411
2412 /*
2413 * If the inode bitmap isn't in cache then the
2414 * optimisation may end up performing two reads instead
2415 * of one, so skip it.
2416 */
2417 if (!buffer_uptodate(bitmap_bh)) {
2418 brelse(bitmap_bh);
2419 goto make_io;
2420 }
2421 for (i = start; i < start + inodes_per_buffer; i++) {
2422 if (i == inode_offset)
2423 continue;
2424 if (ext3_test_bit(i, bitmap_bh->b_data))
2425 break;
2426 }
2427 brelse(bitmap_bh);
2428 if (i == start + inodes_per_buffer) {
2429 /* all other inodes are free, so skip I/O */
2430 memset(bh->b_data, 0, bh->b_size);
2431 set_buffer_uptodate(bh);
2432 unlock_buffer(bh);
2433 goto has_buffer;
2434 }
2435 }
2436
2437make_io:
2438 /*
2439 * There are other valid inodes in the buffer, this inode
2440 * has in-inode xattrs, or we don't have this inode in memory.
2441 * Read the block from disk.
2442 */
2443 get_bh(bh);
2444 bh->b_end_io = end_buffer_read_sync;
2445 submit_bh(READ, bh);
2446 wait_on_buffer(bh);
2447 if (!buffer_uptodate(bh)) {
2448 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2449 "unable to read inode block - "
2450 "inode=%lu, block=%lu",
2451 inode->i_ino, block);
2452 brelse(bh);
2453 return -EIO;
2454 }
2455 }
2456has_buffer:
2457 iloc->bh = bh;
2458 return 0;
2459}
2460
2461int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2462{
2463 /* We have all inode data except xattrs in memory here. */
2464 return __ext3_get_inode_loc(inode, iloc,
2465 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2466}
2467
2468void ext3_set_inode_flags(struct inode *inode)
2469{
2470 unsigned int flags = EXT3_I(inode)->i_flags;
2471
2472 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2473 if (flags & EXT3_SYNC_FL)
2474 inode->i_flags |= S_SYNC;
2475 if (flags & EXT3_APPEND_FL)
2476 inode->i_flags |= S_APPEND;
2477 if (flags & EXT3_IMMUTABLE_FL)
2478 inode->i_flags |= S_IMMUTABLE;
2479 if (flags & EXT3_NOATIME_FL)
2480 inode->i_flags |= S_NOATIME;
2481 if (flags & EXT3_DIRSYNC_FL)
2482 inode->i_flags |= S_DIRSYNC;
2483}
2484
2485void ext3_read_inode(struct inode * inode)
2486{
2487 struct ext3_iloc iloc;
2488 struct ext3_inode *raw_inode;
2489 struct ext3_inode_info *ei = EXT3_I(inode);
2490 struct buffer_head *bh;
2491 int block;
2492
2493#ifdef CONFIG_EXT3_FS_POSIX_ACL
2494 ei->i_acl = EXT3_ACL_NOT_CACHED;
2495 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2496#endif
2497 ei->i_block_alloc_info = NULL;
2498
2499 if (__ext3_get_inode_loc(inode, &iloc, 0))
2500 goto bad_inode;
2501 bh = iloc.bh;
2502 raw_inode = ext3_raw_inode(&iloc);
2503 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2504 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2505 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2506 if(!(test_opt (inode->i_sb, NO_UID32))) {
2507 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2508 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2509 }
2510 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2511 inode->i_size = le32_to_cpu(raw_inode->i_size);
2512 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2513 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2514 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2515 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2516
2517 ei->i_state = 0;
2518 ei->i_dir_start_lookup = 0;
2519 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2520 /* We now have enough fields to check if the inode was active or not.
2521	 * This is needed because nfsd might try to access dead inodes;
2522	 * the test is the same one that e2fsck uses.
2523 * NeilBrown 1999oct15
2524 */
2525 if (inode->i_nlink == 0) {
2526 if (inode->i_mode == 0 ||
2527 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2528 /* this inode is deleted */
2529 brelse (bh);
2530 goto bad_inode;
2531 }
2532 /* The only unlinked inodes we let through here have
2533 * valid i_mode and are being read by the orphan
2534 * recovery code: that's fine, we're about to complete
2535 * the process of deleting those. */
2536 }
2537 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2538 * (for stat), not the fs block
2539 * size */
2540 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2541 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2542#ifdef EXT3_FRAGMENTS
2543 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2544 ei->i_frag_no = raw_inode->i_frag;
2545 ei->i_frag_size = raw_inode->i_fsize;
2546#endif
2547 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2548 if (!S_ISREG(inode->i_mode)) {
2549 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2550 } else {
2551 inode->i_size |=
2552 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2553 }
2554 ei->i_disksize = inode->i_size;
2555 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2556 ei->i_block_group = iloc.block_group;
2557 /*
2558 * NOTE! The in-memory inode i_data array is in little-endian order
2559 * even on big-endian machines: we do NOT byteswap the block numbers!
2560 */
2561 for (block = 0; block < EXT3_N_BLOCKS; block++)
2562 ei->i_data[block] = raw_inode->i_block[block];
2563 INIT_LIST_HEAD(&ei->i_orphan);
2564
2565 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2566 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2567 /*
2568 * When mke2fs creates big inodes it does not zero out
2569 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2570 * so ignore those first few inodes.
2571 */
2572 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2573 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2574 EXT3_INODE_SIZE(inode->i_sb))
2575 goto bad_inode;
2576 if (ei->i_extra_isize == 0) {
2577 /* The extra space is currently unused. Use it. */
2578 ei->i_extra_isize = sizeof(struct ext3_inode) -
2579 EXT3_GOOD_OLD_INODE_SIZE;
2580 } else {
2581 __le32 *magic = (void *)raw_inode +
2582 EXT3_GOOD_OLD_INODE_SIZE +
2583 ei->i_extra_isize;
2584 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2585 ei->i_state |= EXT3_STATE_XATTR;
2586 }
2587 } else
2588 ei->i_extra_isize = 0;
2589
2590 if (S_ISREG(inode->i_mode)) {
2591 inode->i_op = &ext3_file_inode_operations;
2592 inode->i_fop = &ext3_file_operations;
2593 ext3_set_aops(inode);
2594 } else if (S_ISDIR(inode->i_mode)) {
2595 inode->i_op = &ext3_dir_inode_operations;
2596 inode->i_fop = &ext3_dir_operations;
2597 } else if (S_ISLNK(inode->i_mode)) {
2598 if (ext3_inode_is_fast_symlink(inode))
2599 inode->i_op = &ext3_fast_symlink_inode_operations;
2600 else {
2601 inode->i_op = &ext3_symlink_inode_operations;
2602 ext3_set_aops(inode);
2603 }
2604 } else {
2605 inode->i_op = &ext3_special_inode_operations;
2606 if (raw_inode->i_block[0])
2607 init_special_inode(inode, inode->i_mode,
2608 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2609 else
2610 init_special_inode(inode, inode->i_mode,
2611 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2612 }
2613 brelse (iloc.bh);
2614 ext3_set_inode_flags(inode);
2615 return;
2616
2617bad_inode:
2618 make_bad_inode(inode);
2619 return;
2620}
2621
2622/*
2623 * Post the struct inode info into an on-disk inode location in the
2624 * buffer-cache. This gobbles the caller's reference to the
2625 * buffer_head in the inode location struct.
2626 *
2627 * The caller must have write access to iloc->bh.
2628 */
2629static int ext3_do_update_inode(handle_t *handle,
2630 struct inode *inode,
2631 struct ext3_iloc *iloc)
2632{
2633 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2634 struct ext3_inode_info *ei = EXT3_I(inode);
2635 struct buffer_head *bh = iloc->bh;
2636 int err = 0, rc, block;
2637
2638	/* For fields not tracked in the in-memory inode,
2639 * initialise them to zero for new inodes. */
2640 if (ei->i_state & EXT3_STATE_NEW)
2641 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2642
2643 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2644 if(!(test_opt(inode->i_sb, NO_UID32))) {
2645 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2646 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2647/*
2648 * Fix up interoperability with old kernels. Otherwise, old inodes get
2649 * re-used with the upper 16 bits of the uid/gid intact
2650 */
2651 if(!ei->i_dtime) {
2652 raw_inode->i_uid_high =
2653 cpu_to_le16(high_16_bits(inode->i_uid));
2654 raw_inode->i_gid_high =
2655 cpu_to_le16(high_16_bits(inode->i_gid));
2656 } else {
2657 raw_inode->i_uid_high = 0;
2658 raw_inode->i_gid_high = 0;
2659 }
2660 } else {
2661 raw_inode->i_uid_low =
2662 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2663 raw_inode->i_gid_low =
2664 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2665 raw_inode->i_uid_high = 0;
2666 raw_inode->i_gid_high = 0;
2667 }
2668 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2669 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2670 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2671 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2672 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2673 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2674 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2675 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2676#ifdef EXT3_FRAGMENTS
2677 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2678 raw_inode->i_frag = ei->i_frag_no;
2679 raw_inode->i_fsize = ei->i_frag_size;
2680#endif
2681 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2682 if (!S_ISREG(inode->i_mode)) {
2683 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2684 } else {
2685 raw_inode->i_size_high =
2686 cpu_to_le32(ei->i_disksize >> 32);
2687 if (ei->i_disksize > 0x7fffffffULL) {
2688 struct super_block *sb = inode->i_sb;
2689 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2690 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2691 EXT3_SB(sb)->s_es->s_rev_level ==
2692 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2693 /* If this is the first large file
2694 * created, add a flag to the superblock.
2695 */
2696 err = ext3_journal_get_write_access(handle,
2697 EXT3_SB(sb)->s_sbh);
2698 if (err)
2699 goto out_brelse;
2700 ext3_update_dynamic_rev(sb);
2701 EXT3_SET_RO_COMPAT_FEATURE(sb,
2702 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2703 sb->s_dirt = 1;
2704 handle->h_sync = 1;
2705 err = ext3_journal_dirty_metadata(handle,
2706 EXT3_SB(sb)->s_sbh);
2707 }
2708 }
2709 }
2710 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2711 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2712 if (old_valid_dev(inode->i_rdev)) {
2713 raw_inode->i_block[0] =
2714 cpu_to_le32(old_encode_dev(inode->i_rdev));
2715 raw_inode->i_block[1] = 0;
2716 } else {
2717 raw_inode->i_block[0] = 0;
2718 raw_inode->i_block[1] =
2719 cpu_to_le32(new_encode_dev(inode->i_rdev));
2720 raw_inode->i_block[2] = 0;
2721 }
2722 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2723 raw_inode->i_block[block] = ei->i_data[block];
2724
2725	if (ei->i_extra_isize)
2726		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2727
2728 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2729 rc = ext3_journal_dirty_metadata(handle, bh);
2730 if (!err)
2731 err = rc;
2732 ei->i_state &= ~EXT3_STATE_NEW;
2733
2734out_brelse:
2735 brelse (bh);
2736 ext3_std_error(inode->i_sb, err);
2737 return err;
2738}
2739
2740/*
2741 * ext3_write_inode()
2742 *
2743 * We are called from a few places:
2744 *
2745 * - Within generic_file_write() for O_SYNC files.
2746 * Here, there will be no transaction running. We wait for any running
2747 *   transaction to commit.
2748 *
2749 * - Within sys_sync(), kupdate and such.
2750 *   We wait on commit, if told to.
2751 *
2752 * - Within prune_icache() (PF_MEMALLOC == true)
2753 * Here we simply return. We can't afford to block kswapd on the
2754 * journal commit.
2755 *
2756 * In all cases it is actually safe for us to return without doing anything,
2757 * because the inode has been copied into a raw inode buffer in
2758 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2759 * knfsd.
2760 *
2761 * Note that we are absolutely dependent upon all inode dirtiers doing the
2762 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2763 * which we are interested.
2764 *
2765 * It would be a bug for them to not do this. The code:
2766 *
2767 * mark_inode_dirty(inode)
2768 * stuff();
2769 * inode->i_size = expr;
2770 *
2771 * is in error because a kswapd-driven write_inode() could occur while
2772 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2773 * will no longer be on the superblock's dirty inode list.
2774 */
2775int ext3_write_inode(struct inode *inode, int wait)
2776{
2777 if (current->flags & PF_MEMALLOC)
2778 return 0;
2779
2780 if (ext3_journal_current_handle()) {
2781 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2782 dump_stack();
2783 return -EIO;
2784 }
2785
2786 if (!wait)
2787 return 0;
2788
2789 return ext3_force_commit(inode->i_sb);
2790}
2791
2792/*
2793 * ext3_setattr()
2794 *
2795 * Called from notify_change.
2796 *
2797 * We want to trap VFS attempts to truncate the file as soon as
2798 * possible. In particular, we want to make sure that when the VFS
2799 * shrinks i_size, we put the inode on the orphan list and modify
2800 * i_disksize immediately, so that during the subsequent flushing of
2801 * dirty pages and freeing of disk blocks, we can guarantee that any
2802 * commit will leave the blocks being flushed in an unused state on
2803 * disk. (On recovery, the inode will get truncated and the blocks will
2804 * be freed, so we have a strong guarantee that no future commit will
2805 * leave these blocks visible to the user.)
2806 *
2807 * Called with inode->sem down.
2808 */
2809int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2810{
2811 struct inode *inode = dentry->d_inode;
2812 int error, rc = 0;
2813 const unsigned int ia_valid = attr->ia_valid;
2814
2815 error = inode_change_ok(inode, attr);
2816 if (error)
2817 return error;
2818
2819 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2820 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2821 handle_t *handle;
2822
2823 /* (user+group)*(old+new) structure, inode write (sb,
2824 * inode block, ? - but truncate inode update has it) */
2825		handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
2826 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2827		if (IS_ERR(handle)) {
2828 error = PTR_ERR(handle);
2829 goto err_out;
2830 }
2831 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2832 if (error) {
2833 ext3_journal_stop(handle);
2834 return error;
2835 }
2836 /* Update corresponding info in inode so that everything is in
2837 * one transaction */
2838 if (attr->ia_valid & ATTR_UID)
2839 inode->i_uid = attr->ia_uid;
2840 if (attr->ia_valid & ATTR_GID)
2841 inode->i_gid = attr->ia_gid;
2842 error = ext3_mark_inode_dirty(handle, inode);
2843 ext3_journal_stop(handle);
2844 }
2845
2846 if (S_ISREG(inode->i_mode) &&
2847 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2848 handle_t *handle;
2849
2850 handle = ext3_journal_start(inode, 3);
2851 if (IS_ERR(handle)) {
2852 error = PTR_ERR(handle);
2853 goto err_out;
2854 }
2855
2856 error = ext3_orphan_add(handle, inode);
2857 EXT3_I(inode)->i_disksize = attr->ia_size;
2858 rc = ext3_mark_inode_dirty(handle, inode);
2859 if (!error)
2860 error = rc;
2861 ext3_journal_stop(handle);
2862 }
2863
2864 rc = inode_setattr(inode, attr);
2865
2866 /* If inode_setattr's call to ext3_truncate failed to get a
2867 * transaction handle at all, we need to clean up the in-core
2868 * orphan list manually. */
2869 if (inode->i_nlink)
2870 ext3_orphan_del(NULL, inode);
2871
2872 if (!rc && (ia_valid & ATTR_MODE))
2873 rc = ext3_acl_chmod(inode);
2874
2875err_out:
2876 ext3_std_error(inode->i_sb, error);
2877 if (!error)
2878 error = rc;
2879 return error;
2880}
2881
2882
2883/*
2884 * akpm: how many blocks doth make a writepage()?
2885 *
2886 * With N blocks per page, it may be:
2887 * N data blocks
2888 * 2 indirect blocks
2889 * 2 dindirect blocks
2890 * 1 tindirect block
2891 * N+5 bitmap blocks (from the above)
2892 * N+5 group descriptor summary blocks
2893 * 1 inode block
2894 * 1 superblock.
2895 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
2896 *
2897 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2898 *
2899 * With ordered or writeback data it's the same, less the N data blocks.
2900 *
2901 * If the inode's direct blocks can hold an integral number of pages then a
2902 * page cannot straddle two indirect blocks, and we can only touch one indirect
2903 * and dindirect block, and the "5" above becomes "3".
2904 *
2905 * This still overestimates under most circumstances. If we were to pass the
2906 * start and end offsets in here as well we could do block_to_path() on each
2907 * block and work out the exact number of indirects which are touched. Pah.
2908 */
2909
2910static int ext3_writepage_trans_blocks(struct inode *inode)
2911{
2912 int bpp = ext3_journal_blocks_per_page(inode);
2913 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2914 int ret;
2915
2916 if (ext3_should_journal_data(inode))
2917 ret = 3 * (bpp + indirects) + 2;
2918 else
2919 ret = 2 * (bpp + indirects) + 2;
2920
2921#ifdef CONFIG_QUOTA
2922	/* We know that the structure was already allocated during DQUOT_INIT so
2923	 * we will be updating only the data blocks + inodes */
2924	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
2925#endif
2926
2927 return ret;
2928}
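
/*
 * Worked numbers for the estimate above: with 4K pages and 1K blocks,
 * bpp == 4, and EXT3_NDIR_BLOCKS (12) divides it evenly, so indirects
 * == 3. Full data journalling then reserves 3 * (4 + 3) + 2 == 23
 * blocks per page; ordered and writeback modes reserve
 * 2 * (4 + 3) + 2 == 16, plus the quota blocks when CONFIG_QUOTA is
 * set.
 */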
2929
2930/*
2931 * The caller must have previously called ext3_reserve_inode_write().
2932 * Given this, we know that the caller already has write access to iloc->bh.
2933 */
2934int ext3_mark_iloc_dirty(handle_t *handle,
2935 struct inode *inode, struct ext3_iloc *iloc)
2936{
2937 int err = 0;
2938
2939 /* the do_update_inode consumes one bh->b_count */
2940 get_bh(iloc->bh);
2941
2942 /* ext3_do_update_inode() does journal_dirty_metadata */
2943 err = ext3_do_update_inode(handle, inode, iloc);
2944 put_bh(iloc->bh);
2945 return err;
2946}
2947
2948/*
2949 * On success, we end up with an outstanding reference count against
2950 * iloc->bh. This _must_ be cleaned up later.
2951 */
2952
2953int
2954ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
2955 struct ext3_iloc *iloc)
2956{
2957 int err = 0;
2958 if (handle) {
2959 err = ext3_get_inode_loc(inode, iloc);
2960 if (!err) {
2961 BUFFER_TRACE(iloc->bh, "get_write_access");
2962 err = ext3_journal_get_write_access(handle, iloc->bh);
2963 if (err) {
2964 brelse(iloc->bh);
2965 iloc->bh = NULL;
2966 }
2967 }
2968 }
2969 ext3_std_error(inode->i_sb, err);
2970 return err;
2971}
2972
2973/*
2974 * akpm: What we do here is to mark the in-core inode as clean
2975 * with respect to inode dirtiness (it may still be data-dirty).
2976 * This means that the in-core inode may be reaped by prune_icache
2977 * without having to perform any I/O. This is a very good thing,
2978 * because *any* task may call prune_icache - even ones which
2979 * have a transaction open against a different journal.
2980 *
2981 * Is this cheating? Not really. Sure, we haven't written the
2982 * inode out, but prune_icache isn't a user-visible syncing function.
2983 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2984 * we start and wait on commits.
2985 *
2986 * Is this efficient/effective? Well, we're being nice to the system
2987 * by cleaning up our inodes proactively so they can be reaped
2988 * without I/O. But we are potentially leaving up to five seconds'
2989 * worth of inodes floating about which prune_icache wants us to
2990 * write out. One way to fix that would be to get prune_icache()
2991 * to do a write_super() to free up some memory. It has the desired
2992 * effect.
2993 */
2994int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2995{
2996 struct ext3_iloc iloc;
2997 int err;
2998
2999 might_sleep();
3000 err = ext3_reserve_inode_write(handle, inode, &iloc);
3001 if (!err)
3002 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3003 return err;
3004}
3005
3006/*
3007 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3008 *
3009 * We're really interested in the case where a file is being extended.
3010 * i_size has been changed by generic_commit_write() and we thus need
3011 * to include the updated inode in the current transaction.
3012 *
3013 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3014 * are allocated to the file.
3015 *
3016 * If the inode is marked synchronous, we don't honour that here - doing
3017 * so would cause a commit on atime updates, which we don't bother doing.
3018 * We handle synchronous inodes at the highest possible level.
3019 */
3020void ext3_dirty_inode(struct inode *inode)
3021{
3022 handle_t *current_handle = ext3_journal_current_handle();
3023 handle_t *handle;
3024
3025 handle = ext3_journal_start(inode, 2);
3026 if (IS_ERR(handle))
3027 goto out;
3028 if (current_handle &&
3029 current_handle->h_transaction != handle->h_transaction) {
3030 /* This task has a transaction open against a different fs */
3031 printk(KERN_EMERG "%s: transactions do not match!\n",
3032 __FUNCTION__);
3033 } else {
3034 jbd_debug(5, "marking dirty. outer handle=%p\n",
3035 current_handle);
3036 ext3_mark_inode_dirty(handle, inode);
3037 }
3038 ext3_journal_stop(handle);
3039out:
3040 return;
3041}
3042
3043#ifdef AKPM
3044/*
3045 * Bind an inode's backing buffer_head into this transaction, to prevent
3046 * it from being flushed to disk early. Unlike
3047 * ext3_reserve_inode_write, this leaves behind no bh reference and
3048 * returns no iloc structure, so the caller needs to repeat the iloc
3049 * lookup to mark the inode dirty later.
3050 */
3051static inline int
3052ext3_pin_inode(handle_t *handle, struct inode *inode)
3053{
3054 struct ext3_iloc iloc;
3055
3056 int err = 0;
3057 if (handle) {
3058 err = ext3_get_inode_loc(inode, &iloc);
3059 if (!err) {
3060 BUFFER_TRACE(iloc.bh, "get_write_access");
3061 err = journal_get_write_access(handle, iloc.bh);
3062 if (!err)
3063 err = ext3_journal_dirty_metadata(handle,
3064 iloc.bh);
3065 brelse(iloc.bh);
3066 }
3067 }
3068 ext3_std_error(inode->i_sb, err);
3069 return err;
3070}
3071#endif
3072
3073int ext3_change_inode_journal_flag(struct inode *inode, int val)
3074{
3075 journal_t *journal;
3076 handle_t *handle;
3077 int err;
3078
3079 /*
3080 * We have to be very careful here: changing a data block's
3081 * journaling status dynamically is dangerous. If we write a
3082 * data block to the journal, change the status and then delete
3083 * that block, we risk forgetting to revoke the old log record
3084 * from the journal and so a subsequent replay can corrupt data.
3085 * So, first we make sure that the journal is empty and that
3086 * nobody is changing anything.
3087 */
3088
3089 journal = EXT3_JOURNAL(inode);
3090 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3091 return -EROFS;
3092
3093 journal_lock_updates(journal);
3094 journal_flush(journal);
3095
3096 /*
3097 * OK, there are no updates running now, and all cached data is
3098 * synced to disk. We are now in a completely consistent state
3099 * which doesn't have anything in the journal, and we know that
3100 * no filesystem updates are running, so it is safe to modify
3101 * the inode's in-core data-journaling state flag now.
3102 */
3103
3104 if (val)
3105 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3106 else
3107 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3108 ext3_set_aops(inode);
3109
3110 journal_unlock_updates(journal);
3111
3112 /* Finally we can mark the inode as dirty. */
3113
3114 handle = ext3_journal_start(inode, 1);
3115 if (IS_ERR(handle))
3116 return PTR_ERR(handle);
3117
3118 err = ext3_mark_inode_dirty(handle, inode);
3119 handle->h_sync = 1;
3120 ext3_journal_stop(handle);
3121 ext3_std_error(inode->i_sb, err);
3122
3123 return err;
3124}