Blame - fs/buffer.c - kernel/hikey-linaro

blob: 4342ab0ad99a90639e30abceeeba06ce5e4dec88 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required on older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
				21	#include <linux/config.h>
				22	#include <linux/kernel.h>
				23	#include <linux/syscalls.h>
				24	#include <linux/fs.h>
				25	#include <linux/mm.h>
				26	#include <linux/percpu.h>
				27	#include <linux/slab.h>
				28	#include <linux/smp_lock.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	29	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	30	#include <linux/blkdev.h>
				31	#include <linux/file.h>
				32	#include <linux/quotaops.h>
				33	#include <linux/highmem.h>
				34	#include <linux/module.h>
				35	#include <linux/writeback.h>
				36	#include <linux/hash.h>
				37	#include <linux/suspend.h>
				38	#include <linux/buffer_head.h>
				39	#include <linux/bio.h>
				40	#include <linux/notifier.h>
				41	#include <linux/cpu.h>
				42	#include <linux/bitops.h>
				43	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	44	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	45
				46	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
				47	static void invalidate_bh_lrus(void);
				48
				49	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				50
				51	inline void
				52	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				53	{
				54	bh->b_end_io = handler;
				55	bh->b_private = private;
				56	}
				57
				58	static int sync_buffer(void *word)
				59	{
				60	struct block_device *bd;
				61	struct buffer_head *bh
				62	= container_of(word, struct buffer_head, b_state);
				63
				64	smp_mb();
				65	bd = bh->b_bdev;
				66	if (bd)
				67	blk_run_address_space(bd->bd_inode->i_mapping);
				68	io_schedule();
				69	return 0;
				70	}
				71
				72	void fastcall __lock_buffer(struct buffer_head *bh)
				73	{
				74	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				75	TASK_UNINTERRUPTIBLE);
				76	}
				77	EXPORT_SYMBOL(__lock_buffer);
				78
				79	void fastcall unlock_buffer(struct buffer_head *bh)
				80	{
				81	clear_buffer_locked(bh);
				82	smp_mb__after_clear_bit();
				83	wake_up_bit(&bh->b_state, BH_Lock);
				84	}
				85
				86	/*
				87	* Block until a buffer comes unlocked. This doesn't stop it
				88	* from becoming locked again - you have to lock it yourself
				89	* if you want to preserve its state.
				90	*/
				91	void __wait_on_buffer(struct buffer_head * bh)
				92	{
				93	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				94	}
				95
				96	static void
				97	__clear_page_buffers(struct page *page)
				98	{
				99	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	100	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	101	page_cache_release(page);
				102	}
				103
				104	static void buffer_io_error(struct buffer_head *bh)
				105	{
				106	char b[BDEVNAME_SIZE];
				107
				108	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				109	bdevname(bh->b_bdev, b),
				110	(unsigned long long)bh->b_blocknr);
				111	}
				112
				113	/*
				114	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				115	* unlock the buffer. This is what ll_rw_block uses too.
				116	*/
				117	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				118	{
				119	if (uptodate) {
				120	set_buffer_uptodate(bh);
				121	} else {
				122	/* This happens, due to failed READA attempts. */
				123	clear_buffer_uptodate(bh);
				124	}
				125	unlock_buffer(bh);
				126	put_bh(bh);
				127	}
				128
				129	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				130	{
				131	char b[BDEVNAME_SIZE];
				132
				133	if (uptodate) {
				134	set_buffer_uptodate(bh);
				135	} else {
				136	if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
				137	buffer_io_error(bh);
				138	printk(KERN_WARNING "lost page write due to "
				139	"I/O error on %s\n",
				140	bdevname(bh->b_bdev, b));
				141	}
				142	set_buffer_write_io_error(bh);
				143	clear_buffer_uptodate(bh);
				144	}
				145	unlock_buffer(bh);
				146	put_bh(bh);
				147	}
				148
				149	/*
				150	* Write out and wait upon all the dirty data associated with a block
				151	* device via its mapping. Does not take the superblock lock.
				152	*/
				153	int sync_blockdev(struct block_device *bdev)
				154	{
				155	int ret = 0;
				156
OGAWA Hirofumi	28fd129	2006-01-08 01:02:14 -0800	[diff] [blame]	157	if (bdev)
				158	ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	159	return ret;
				160	}
				161	EXPORT_SYMBOL(sync_blockdev);
				162
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	163	static void __fsync_super(struct super_block *sb)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164	{
				165	sync_inodes_sb(sb, 0);
				166	DQUOT_SYNC(sb);
				167	lock_super(sb);
				168	if (sb->s_dirt && sb->s_op->write_super)
				169	sb->s_op->write_super(sb);
				170	unlock_super(sb);
				171	if (sb->s_op->sync_fs)
				172	sb->s_op->sync_fs(sb, 1);
				173	sync_blockdev(sb->s_bdev);
				174	sync_inodes_sb(sb, 1);
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	175	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	176
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	177	/*
				178	* Write out and wait upon all dirty data associated with this
				179	* superblock. Filesystem data as well as the underlying block
				180	* device. Takes the superblock lock.
				181	*/
				182	int fsync_super(struct super_block *sb)
				183	{
				184	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	185	return sync_blockdev(sb->s_bdev);
				186	}
				187
				188	/*
				189	* Write out and wait upon all dirty data associated with this
				190	* device. Filesystem data as well as the underlying block
				191	* device. Takes the superblock lock.
				192	*/
				193	int fsync_bdev(struct block_device *bdev)
				194	{
				195	struct super_block *sb = get_super(bdev);
				196	if (sb) {
				197	int res = fsync_super(sb);
				198	drop_super(sb);
				199	return res;
				200	}
				201	return sync_blockdev(bdev);
				202	}
				203
				204	/**
				205	* freeze_bdev -- lock a filesystem and force it into a consistent state
				206	* @bdev: blockdevice to lock
				207	*
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	208	* This takes the block device bd_mount_mutex to make sure no new mounts
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	209	* happen on bdev until thaw_bdev() is called.
				210	* If a superblock is found on this device, we take the s_umount semaphore
				211	* on it to make sure nobody unmounts until the snapshot creation is done.
				212	*/
				213	struct super_block freeze_bdev(struct block_device bdev)
				214	{
				215	struct super_block *sb;
				216
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	217	mutex_lock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	218	sb = get_super(bdev);
				219	if (sb && !(sb->s_flags & MS_RDONLY)) {
				220	sb->s_frozen = SB_FREEZE_WRITE;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	221	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	222
OGAWA Hirofumi	d25b9a1	2006-03-25 03:07:44 -0800	[diff] [blame]	223	__fsync_super(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	224
				225	sb->s_frozen = SB_FREEZE_TRANS;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	226	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	227
				228	sync_blockdev(sb->s_bdev);
				229
				230	if (sb->s_op->write_super_lockfs)
				231	sb->s_op->write_super_lockfs(sb);
				232	}
				233
				234	sync_blockdev(bdev);
				235	return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
				236	}
				237	EXPORT_SYMBOL(freeze_bdev);
				238
				239	/**
				240	* thaw_bdev -- unlock filesystem
				241	* @bdev: blockdevice to unlock
				242	* @sb: associated superblock
				243	*
				244	* Unlocks the filesystem and marks it writeable again after freeze_bdev().
				245	*/
				246	void thaw_bdev(struct block_device bdev, struct super_block sb)
				247	{
				248	if (sb) {
				249	BUG_ON(sb->s_bdev != bdev);
				250
				251	if (sb->s_op->unlockfs)
				252	sb->s_op->unlockfs(sb);
				253	sb->s_frozen = SB_UNFROZEN;
akpm@osdl.org	d59dd46	2005-05-01 08:58:47 -0700	[diff] [blame]	254	smp_wmb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	255	wake_up(&sb->s_wait_unfrozen);
				256	drop_super(sb);
				257	}
				258
Arjan van de Ven	c039e31	2006-03-23 03:00:28 -0800	[diff] [blame]	259	mutex_unlock(&bdev->bd_mount_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	260	}
				261	EXPORT_SYMBOL(thaw_bdev);
				262
				263	/*
				264	* sync everything. Start out by waking pdflush, because that writes back
				265	* all queues in parallel.
				266	*/
				267	static void do_sync(unsigned long wait)
				268	{
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	269	wakeup_pdflush(0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	270	sync_inodes(0); /* All mappings, inodes and their blockdevs */
				271	DQUOT_SYNC(NULL);
				272	sync_supers(); /* Write the superblocks */
				273	sync_filesystems(0); /* Start syncing the filesystems */
				274	sync_filesystems(wait); /* Waitingly sync the filesystems */
				275	sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
				276	if (!wait)
				277	printk("Emergency Sync complete\n");
				278	if (unlikely(laptop_mode))
				279	laptop_sync_completion();
				280	}
				281
				282	asmlinkage long sys_sync(void)
				283	{
				284	do_sync(1);
				285	return 0;
				286	}
				287
				288	void emergency_sync(void)
				289	{
				290	pdflush_operation(do_sync, 0);
				291	}
				292
				293	/*
				294	* Generic function to fsync a file.
				295	*
				296	* filp may be NULL if called via the msync of a vma.
				297	*/
				298
				299	int file_fsync(struct file filp, struct dentry dentry, int datasync)
				300	{
				301	struct inode * inode = dentry->d_inode;
				302	struct super_block * sb;
				303	int ret, err;
				304
				305	/* sync the inode to buffers */
				306	ret = write_inode_now(inode, 0);
				307
				308	/* sync the superblock to buffers */
				309	sb = inode->i_sb;
				310	lock_super(sb);
				311	if (sb->s_op->write_super)
				312	sb->s_op->write_super(sb);
				313	unlock_super(sb);
				314
				315	/* .. finally sync the buffers to disk */
				316	err = sync_blockdev(sb->s_bdev);
				317	if (!ret)
				318	ret = err;
				319	return ret;
				320	}
				321
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	322	long do_fsync(struct file *file, int datasync)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	323	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	324	int ret;
				325	int err;
				326	struct address_space *mapping = file->f_mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	327
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	328	if (!file->f_op \|\| !file->f_op->fsync) {
				329	/* Why? We can still call filemap_fdatawrite */
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	330	ret = -EINVAL;
				331	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	332	}
				333
				334	current->flags \|= PF_SYNCWRITE;
				335	ret = filemap_fdatawrite(mapping);
				336
				337	/*
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	338	* We need to protect against concurrent writers, which could cause
				339	* livelocks in fsync_buffers_list().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	340	*/
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	341	mutex_lock(&mapping->host->i_mutex);
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	342	err = file->f_op->fsync(file, file->f_dentry, datasync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	343	if (!ret)
				344	ret = err;
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	345	mutex_unlock(&mapping->host->i_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	346	err = filemap_fdatawait(mapping);
				347	if (!ret)
				348	ret = err;
				349	current->flags &= ~PF_SYNCWRITE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	350	out:
				351	return ret;
				352	}
				353
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	354	static long __do_fsync(unsigned int fd, int datasync)
				355	{
				356	struct file *file;
				357	int ret = -EBADF;
				358
				359	file = fget(fd);
				360	if (file) {
				361	ret = do_fsync(file, datasync);
				362	fput(file);
				363	}
				364	return ret;
				365	}
				366
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	367	asmlinkage long sys_fsync(unsigned int fd)
				368	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	369	return __do_fsync(fd, 0);
Oleg Nesterov	dfb388b	2005-06-23 00:10:02 -0700	[diff] [blame]	370	}
				371
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	372	asmlinkage long sys_fdatasync(unsigned int fd)
				373	{
Andrew Morton	18e79b4	2006-03-24 03:18:14 -0800	[diff] [blame]	374	return __do_fsync(fd, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	375	}
				376
				377	/*
				378	* Various filesystems appear to want __find_get_block to be non-blocking.
				379	* But it's the page lock which protects the buffers. To get around this,
				380	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				381	* private_lock.
				382	*
				383	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				384	* may be quite high. This code could TryLock the page, and if that
				385	* succeeds, there is no need to take private_lock. (But if
				386	* private_lock is contended then so is mapping->tree_lock).
				387	*/
				388	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	389	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	390	{
				391	struct inode *bd_inode = bdev->bd_inode;
				392	struct address_space *bd_mapping = bd_inode->i_mapping;
				393	struct buffer_head *ret = NULL;
				394	pgoff_t index;
				395	struct buffer_head *bh;
				396	struct buffer_head *head;
				397	struct page *page;
				398	int all_mapped = 1;
				399
				400	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				401	page = find_get_page(bd_mapping, index);
				402	if (!page)
				403	goto out;
				404
				405	spin_lock(&bd_mapping->private_lock);
				406	if (!page_has_buffers(page))
				407	goto out_unlock;
				408	head = page_buffers(page);
				409	bh = head;
				410	do {
				411	if (bh->b_blocknr == block) {
				412	ret = bh;
				413	get_bh(bh);
				414	goto out_unlock;
				415	}
				416	if (!buffer_mapped(bh))
				417	all_mapped = 0;
				418	bh = bh->b_this_page;
				419	} while (bh != head);
				420
				421	/* we might be here because some of the buffers on this page are
				422	* not mapped. This is due to various races between
				423	* file io on the block device and getblk. It gets dealt with
				424	* elsewhere, don't buffer_error if we had some unmapped buffers
				425	*/
				426	if (all_mapped) {
				427	printk("__find_get_block_slow() failed. "
				428	"block=%llu, b_blocknr=%llu\n",
				429	(unsigned long long)block, (unsigned long long)bh->b_blocknr);
				430	printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
				431	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				432	}
				433	out_unlock:
				434	spin_unlock(&bd_mapping->private_lock);
				435	page_cache_release(page);
				436	out:
				437	return ret;
				438	}
				439
				440	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				441	of fs corruption is going on. Trashing dirty data always implies losing
				442	information that was supposed to be just stored on the physical layer
				443	by the user.
				444
				445	Thus invalidate_buffers in general usage is not allowed to trash
				446	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				447	be preserved. These buffers are simply skipped.
				448
				449	We also skip buffers which are still in use. For example this can
				450	happen if a userspace program is reading the block device.
				451
				452	NOTE: In the case where the user removed a removable-media-disk even if
				453	there's still dirty data not synced on disk (due to a bug in the device driver
				454	or due to an error of the user), by not destroying the dirty buffers we could
				455	generate corruption also on the next media inserted, thus a parameter is
				456	necessary to handle this case in the most safe way possible (trying
				457	to not corrupt also the new disk inserted with the data belonging to
				458	the old now corrupted disk). Also for the ramdisk the natural thing
				459	to do in order to release the ramdisk memory is to destroy dirty buffers.
				460
				461	These are two special cases. Normal usage implies the device driver
				462	to issue a sync on the device (without waiting I/O completion) and
				463	then an invalidate_buffers call that doesn't trash dirty buffers.
				464
				465	For handling cache coherency with the blkdev pagecache the 'update' case
				466	has been introduced. It is needed to re-read from disk any pinned
				467	buffer. NOTE: re-reading from disk is destructive so we can do it only
				468	when we assume nobody is changing the buffercache under our I/O and when
				469	we think the disk contains more recent information than the buffercache.
				470	The update == 1 pass marks the buffers we need to update, the update == 2
				471	pass does the actual I/O. */
				472	void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
				473	{
				474	invalidate_bh_lrus();
				475	/*
				476	* FIXME: what about destroy_dirty_buffers?
				477	* We really want to use invalidate_inode_pages2() for
				478	* that, but not until that's cleaned up.
				479	*/
				480	invalidate_inode_pages(bdev->bd_inode->i_mapping);
				481	}
				482
				483	/*
				484	* Kick pdflush then try to free up some ZONE_NORMAL memory.
				485	*/
				486	static void free_more_memory(void)
				487	{
				488	struct zone **zones;
				489	pg_data_t *pgdat;
				490
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	491	wakeup_pdflush(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	492	yield();
				493
				494	for_each_pgdat(pgdat) {
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	495	zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	496	if (*zones)
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	497	try_to_free_pages(zones, GFP_NOFS);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	498	}
				499	}
				500
				501	/*
				502	* I/O completion handler for block_read_full_page() - pages
				503	* which come unlocked at the end of I/O.
				504	*/
				505	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				506	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	507	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	508	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	509	struct buffer_head *tmp;
				510	struct page *page;
				511	int page_uptodate = 1;
				512
				513	BUG_ON(!buffer_async_read(bh));
				514
				515	page = bh->b_page;
				516	if (uptodate) {
				517	set_buffer_uptodate(bh);
				518	} else {
				519	clear_buffer_uptodate(bh);
				520	if (printk_ratelimit())
				521	buffer_io_error(bh);
				522	SetPageError(page);
				523	}
				524
				525	/*
				526	* Be _very_ careful from here on. Bad things can happen if
				527	* two buffer heads end IO at almost the same time and both
				528	* decide that the page is now completely done.
				529	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	530	first = page_buffers(page);
				531	local_irq_save(flags);
				532	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	533	clear_buffer_async_read(bh);
				534	unlock_buffer(bh);
				535	tmp = bh;
				536	do {
				537	if (!buffer_uptodate(tmp))
				538	page_uptodate = 0;
				539	if (buffer_async_read(tmp)) {
				540	BUG_ON(!buffer_locked(tmp));
				541	goto still_busy;
				542	}
				543	tmp = tmp->b_this_page;
				544	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	545	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				546	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	547
				548	/*
				549	* If none of the buffers had errors and they are all
				550	* uptodate then we can set the page uptodate.
				551	*/
				552	if (page_uptodate && !PageError(page))
				553	SetPageUptodate(page);
				554	unlock_page(page);
				555	return;
				556
				557	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	558	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				559	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	560	return;
				561	}
				562
				563	/*
				564	* Completion handler for block_write_full_page() - pages which are unlocked
				565	* during I/O, and which have PageWriteback cleared upon I/O completion.
				566	*/
				567	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
				568	{
				569	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	570	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	571	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	572	struct buffer_head *tmp;
				573	struct page *page;
				574
				575	BUG_ON(!buffer_async_write(bh));
				576
				577	page = bh->b_page;
				578	if (uptodate) {
				579	set_buffer_uptodate(bh);
				580	} else {
				581	if (printk_ratelimit()) {
				582	buffer_io_error(bh);
				583	printk(KERN_WARNING "lost page write due to "
				584	"I/O error on %s\n",
				585	bdevname(bh->b_bdev, b));
				586	}
				587	set_bit(AS_EIO, &page->mapping->flags);
				588	clear_buffer_uptodate(bh);
				589	SetPageError(page);
				590	}
				591
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	592	first = page_buffers(page);
				593	local_irq_save(flags);
				594	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				595
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	596	clear_buffer_async_write(bh);
				597	unlock_buffer(bh);
				598	tmp = bh->b_this_page;
				599	while (tmp != bh) {
				600	if (buffer_async_write(tmp)) {
				601	BUG_ON(!buffer_locked(tmp));
				602	goto still_busy;
				603	}
				604	tmp = tmp->b_this_page;
				605	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	606	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				607	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	608	end_page_writeback(page);
				609	return;
				610
				611	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	612	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				613	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	614	return;
				615	}
				616
				617	/*
				618	* If a page's buffers are under async readin (end_buffer_async_read
				619	* completion) then there is a possibility that another thread of
				620	* control could lock one of the buffers after it has completed
				621	* but while some of the other buffers have not completed. This
				622	* locked buffer would confuse end_buffer_async_read() into not unlocking
				623	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				624	* that this buffer is not under async I/O.
				625	*
				626	* The page comes unlocked when it has no locked buffer_async buffers
				627	* left.
				628	*
				629	* PageLocked prevents anyone starting new async I/O reads against any of
				630	* the buffers.
				631	*
				632	* PageWriteback is used to prevent simultaneous writeout of the same
				633	* page.
				634	*
				635	* PageLocked prevents anyone from starting writeback of a page which is
				636	* under read I/O (PageWriteback is only ever set against a locked page).
				637	*/
				638	static void mark_buffer_async_read(struct buffer_head *bh)
				639	{
				640	bh->b_end_io = end_buffer_async_read;
				641	set_buffer_async_read(bh);
				642	}
				643
				644	void mark_buffer_async_write(struct buffer_head *bh)
				645	{
				646	bh->b_end_io = end_buffer_async_write;
				647	set_buffer_async_write(bh);
				648	}
				649	EXPORT_SYMBOL(mark_buffer_async_write);
				650
				651
				652	/*
				653	* fs/buffer.c contains helper functions for buffer-backed address space's
				654	* fsync functions. A common requirement for buffer-based filesystems is
				655	* that certain data from the backing blockdev needs to be written out for
				656	* a successful fsync(). For example, ext2 indirect blocks need to be
				657	* written back and waited upon before fsync() returns.
				658	*
				659	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				660	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				661	* management of a list of dependent buffers at ->i_mapping->private_list.
				662	*
				663	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				664	* from their controlling inode's queue when they are being freed. But
				665	* try_to_free_buffers() will be operating against the blockdev mapping
				666	* at the time, not against the S_ISREG file which depends on those buffers.
				667	* So the locking for private_list is via the private_lock in the address_space
				668	* which backs the buffers. Which is different from the address_space
				669	* against which the buffers are listed. So for a particular address_space,
				670	* mapping->private_lock does not protect mapping->private_list! In fact,
				671	* mapping->private_list will always be protected by the backing blockdev's
				672	* ->private_lock.
				673	*
				674	* Which introduces a requirement: all buffers on an address_space's
				675	* ->private_list must be from the same address_space: the blockdev's.
				676	*
				677	* address_spaces which do not place buffers at ->private_list via these
				678	* utility functions are free to use private_lock and private_list for
				679	* whatever they want. The only requirement is that list_empty(private_list)
				680	* be true at clear_inode() time.
				681	*
				682	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				683	* filesystems should do that. invalidate_inode_buffers() should just go
				684	* BUG_ON(!list_empty).
				685	*
				686	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				687	* take an address_space, not an inode. And it should be called
				688	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				689	* queued up.
				690	*
				691	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				692	* list if it is already on a list. Because if the buffer is on a list,
				693	* it must already be on the right one. If not, the filesystem is being
				694	* silly. This will save a ton of locking. But first we have to ensure
				695	* that buffers are taken off the old inode's list when they are freed
				696	* (presumably in truncate). That requires careful auditing of all
				697	* filesystems (do it inside bforget()). It could also be done by bringing
				698	* b_inode back.
				699	*/
				700
				701	/*
				702	* The buffer's backing address_space's private_lock must be held
				703	*/
				704	static inline void __remove_assoc_queue(struct buffer_head *bh)
				705	{
				706	list_del_init(&bh->b_assoc_buffers);
				707	}
				708
				709	int inode_has_buffers(struct inode *inode)
				710	{
				711	return !list_empty(&inode->i_data.private_list);
				712	}
				713
				714	/*
				715	* osync is designed to support O_SYNC io. It waits synchronously for
				716	* all already-submitted IO to complete, but does not queue any new
				717	* writes to the disk.
				718	*
				719	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				720	* you dirty the buffers, and then use osync_inode_buffers to wait for
				721	* completion. Any other dirty buffers which are not yet queued for
				722	* write will not be flushed to disk by the osync.
				723	*/
				724	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				725	{
				726	struct buffer_head *bh;
				727	struct list_head *p;
				728	int err = 0;
				729
				730	spin_lock(lock);
				731	repeat:
				732	list_for_each_prev(p, list) {
				733	bh = BH_ENTRY(p);
				734	if (buffer_locked(bh)) {
				735	get_bh(bh);
				736	spin_unlock(lock);
				737	wait_on_buffer(bh);
				738	if (!buffer_uptodate(bh))
				739	err = -EIO;
				740	brelse(bh);
				741	spin_lock(lock);
				742	goto repeat;
				743	}
				744	}
				745	spin_unlock(lock);
				746	return err;
				747	}
				748
				749	/**
				750	* sync_mapping_buffers - write out and wait upon a mapping's "associated"
				751	* buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	752	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	753	*
				754	* Starts I/O against the buffers at mapping->private_list, and waits upon
				755	* that I/O.
				756	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	757	* Basically, this is a convenience function for fsync().
				758	* @mapping is a file or directory which needs those buffers to be written for
				759	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	760	*/
				761	int sync_mapping_buffers(struct address_space *mapping)
				762	{
				763	struct address_space *buffer_mapping = mapping->assoc_mapping;
				764
				765	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				766	return 0;
				767
				768	return fsync_buffers_list(&buffer_mapping->private_lock,
				769	&mapping->private_list);
				770	}
				771	EXPORT_SYMBOL(sync_mapping_buffers);
				772
				773	/*
				774	* Called when we've recently written block `bblock', and it is known that
				775	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				776	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				777	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				778	*/
				779	void write_boundary_block(struct block_device *bdev,
				780	sector_t bblock, unsigned blocksize)
				781	{
				782	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				783	if (bh) {
				784	if (buffer_dirty(bh))
				785	ll_rw_block(WRITE, 1, &bh);
				786	put_bh(bh);
				787	}
				788	}
				789
				790	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				791	{
				792	struct address_space *mapping = inode->i_mapping;
				793	struct address_space *buffer_mapping = bh->b_page->mapping;
				794
				795	mark_buffer_dirty(bh);
				796	if (!mapping->assoc_mapping) {
				797	mapping->assoc_mapping = buffer_mapping;
				798	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame^]	799	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	800	}
				801	if (list_empty(&bh->b_assoc_buffers)) {
				802	spin_lock(&buffer_mapping->private_lock);
				803	list_move_tail(&bh->b_assoc_buffers,
				804	&mapping->private_list);
				805	spin_unlock(&buffer_mapping->private_lock);
				806	}
				807	}
				808	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				809
				810	/*
				811	* Add a page to the dirty page list.
				812	*
				813	* It is a sad fact of life that this function is called from several places
				814	* deeply under spinlocking. It may not sleep.
				815	*
				816	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				817	* dirty-state coherency between the page and the buffers. It the page does
				818	* not have buffers then when they are later attached they will all be set
				819	* dirty.
				820	*
				821	* The buffers are dirtied before the page is dirtied. There's a small race
				822	* window in which a writepage caller may see the page cleanness but not the
				823	* buffer dirtiness. That's fine. If this code were to set the page dirty
				824	* before the buffers, a concurrent writepage caller could clear the page dirty
				825	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				826	* page on the dirty page list.
				827	*
				828	* We use private_lock to lock against try_to_free_buffers while using the
				829	* page's buffer list. Also use this to protect against clean buffers being
				830	* added to the page after it was set dirty.
				831	*
				832	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				833	* address_space though.
				834	*/
				835	int __set_page_dirty_buffers(struct page *page)
				836	{
				837	struct address_space * const mapping = page->mapping;
				838
				839	spin_lock(&mapping->private_lock);
				840	if (page_has_buffers(page)) {
				841	struct buffer_head *head = page_buffers(page);
				842	struct buffer_head *bh = head;
				843
				844	do {
				845	set_buffer_dirty(bh);
				846	bh = bh->b_this_page;
				847	} while (bh != head);
				848	}
				849	spin_unlock(&mapping->private_lock);
				850
				851	if (!TestSetPageDirty(page)) {
				852	write_lock_irq(&mapping->tree_lock);
				853	if (page->mapping) { /* Race with truncate? */
				854	if (mapping_cap_account_dirty(mapping))
				855	inc_page_state(nr_dirty);
				856	radix_tree_tag_set(&mapping->page_tree,
				857	page_index(page),
				858	PAGECACHE_TAG_DIRTY);
				859	}
				860	write_unlock_irq(&mapping->tree_lock);
				861	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Andrew Morton	4741c9f	2006-03-24 03:18:11 -0800	[diff] [blame]	862	return 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	863	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	864	return 0;
				865	}
				866	EXPORT_SYMBOL(__set_page_dirty_buffers);
				867
				868	/*
				869	* Write out and wait upon a list of buffers.
				870	*
				871	* We have conflicting pressures: we want to make sure that all
				872	* initially dirty buffers get waited on, but that any subsequently
				873	* dirtied buffers don't. After all, we don't want fsync to last
				874	* forever if somebody is actively writing to the file.
				875	*
				876	* Do this in two main stages: first we copy dirty buffers to a
				877	* temporary inode list, queueing the writes as we go. Then we clean
				878	* up, waiting for those writes to complete.
				879	*
				880	* During this second stage, any subsequent updates to the file may end
				881	* up refiling the buffer on the original inode's dirty list again, so
				882	* there is a chance we will end up with a buffer queued for write but
				883	* not yet completed on that list. So, as a final cleanup we go through
				884	* the osync code to catch these locked, dirty buffers without requeuing
				885	* any newly dirty buffers for write.
				886	*/
				887	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				888	{
				889	struct buffer_head *bh;
				890	struct list_head tmp;
				891	int err = 0, err2;
				892
				893	INIT_LIST_HEAD(&tmp);
				894
				895	spin_lock(lock);
				896	while (!list_empty(list)) {
				897	bh = BH_ENTRY(list->next);
				898	list_del_init(&bh->b_assoc_buffers);
				899	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				900	list_add(&bh->b_assoc_buffers, &tmp);
				901	if (buffer_dirty(bh)) {
				902	get_bh(bh);
				903	spin_unlock(lock);
				904	/*
				905	* Ensure any pending I/O completes so that
				906	* ll_rw_block() actually writes the current
				907	* contents - it is a noop if I/O is still in
				908	* flight on potentially older contents.
				909	*/
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	910	ll_rw_block(SWRITE, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	911	brelse(bh);
				912	spin_lock(lock);
				913	}
				914	}
				915	}
				916
				917	while (!list_empty(&tmp)) {
				918	bh = BH_ENTRY(tmp.prev);
				919	__remove_assoc_queue(bh);
				920	get_bh(bh);
				921	spin_unlock(lock);
				922	wait_on_buffer(bh);
				923	if (!buffer_uptodate(bh))
				924	err = -EIO;
				925	brelse(bh);
				926	spin_lock(lock);
				927	}
				928
				929	spin_unlock(lock);
				930	err2 = osync_buffers_list(lock, list);
				931	if (err)
				932	return err;
				933	else
				934	return err2;
				935	}
				936
				937	/*
				938	* Invalidate any and all dirty buffers on a given inode. We are
				939	* probably unmounting the fs, but that doesn't mean we have already
				940	* done a sync(). Just drop the buffers from the inode list.
				941	*
				942	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				943	* assumes that all the buffers are against the blockdev. Not true
				944	* for reiserfs.
				945	*/
				946	void invalidate_inode_buffers(struct inode *inode)
				947	{
				948	if (inode_has_buffers(inode)) {
				949	struct address_space *mapping = &inode->i_data;
				950	struct list_head *list = &mapping->private_list;
				951	struct address_space *buffer_mapping = mapping->assoc_mapping;
				952
				953	spin_lock(&buffer_mapping->private_lock);
				954	while (!list_empty(list))
				955	__remove_assoc_queue(BH_ENTRY(list->next));
				956	spin_unlock(&buffer_mapping->private_lock);
				957	}
				958	}
				959
				960	/*
				961	* Remove any clean buffers from the inode's buffer list. This is called
				962	* when we're trying to free the inode itself. Those buffers can pin it.
				963	*
				964	* Returns true if all buffers were removed.
				965	*/
				966	int remove_inode_buffers(struct inode *inode)
				967	{
				968	int ret = 1;
				969
				970	if (inode_has_buffers(inode)) {
				971	struct address_space *mapping = &inode->i_data;
				972	struct list_head *list = &mapping->private_list;
				973	struct address_space *buffer_mapping = mapping->assoc_mapping;
				974
				975	spin_lock(&buffer_mapping->private_lock);
				976	while (!list_empty(list)) {
				977	struct buffer_head *bh = BH_ENTRY(list->next);
				978	if (buffer_dirty(bh)) {
				979	ret = 0;
				980	break;
				981	}
				982	__remove_assoc_queue(bh);
				983	}
				984	spin_unlock(&buffer_mapping->private_lock);
				985	}
				986	return ret;
				987	}
				988
				989	/*
				990	* Create the appropriate buffers when given a page for data area and
				991	* the size of each buffer.. Use the bh->b_this_page linked list to
				992	* follow the buffers created. Return NULL if unable to create more
				993	* buffers.
				994	*
				995	* The retry flag is used to differentiate async IO (paging, swapping)
				996	* which may not fail from ordinary buffer allocations.
				997	*/
				998	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				999	int retry)
				1000	{
				1001	struct buffer_head bh, head;
				1002	long offset;
				1003
				1004	try_again:
				1005	head = NULL;
				1006	offset = PAGE_SIZE;
				1007	while ((offset -= size) >= 0) {
				1008	bh = alloc_buffer_head(GFP_NOFS);
				1009	if (!bh)
				1010	goto no_grow;
				1011
				1012	bh->b_bdev = NULL;
				1013	bh->b_this_page = head;
				1014	bh->b_blocknr = -1;
				1015	head = bh;
				1016
				1017	bh->b_state = 0;
				1018	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	1019	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1020	bh->b_size = size;
				1021
				1022	/* Link the buffer to its page */
				1023	set_bh_page(bh, page, offset);
				1024
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	1025	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1026	}
				1027	return head;
				1028	/*
				1029	* In case anything failed, we just free everything we got.
				1030	*/
				1031	no_grow:
				1032	if (head) {
				1033	do {
				1034	bh = head;
				1035	head = head->b_this_page;
				1036	free_buffer_head(bh);
				1037	} while (head);
				1038	}
				1039
				1040	/*
				1041	* Return failure for non-async IO requests. Async IO requests
				1042	* are not allowed to fail, so we have to wait until buffer heads
				1043	* become available. But we don't want tasks sleeping with
				1044	* partially complete buffers, so all were released above.
				1045	*/
				1046	if (!retry)
				1047	return NULL;
				1048
				1049	/* We're _really_ low on memory. Now we just
				1050	* wait for old buffer heads to become free due to
				1051	* finishing IO. Since this is an async request and
				1052	* the reserve list is empty, we're sure there are
				1053	* async buffer heads in use.
				1054	*/
				1055	free_more_memory();
				1056	goto try_again;
				1057	}
				1058	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				1059
				1060	static inline void
				1061	link_dev_buffers(struct page page, struct buffer_head head)
				1062	{
				1063	struct buffer_head bh, tail;
				1064
				1065	bh = head;
				1066	do {
				1067	tail = bh;
				1068	bh = bh->b_this_page;
				1069	} while (bh);
				1070	tail->b_this_page = head;
				1071	attach_page_buffers(page, head);
				1072	}
				1073
				1074	/*
				1075	* Initialise the state of a blockdev page's buffers.
				1076	*/
				1077	static void
				1078	init_page_buffers(struct page page, struct block_device bdev,
				1079	sector_t block, int size)
				1080	{
				1081	struct buffer_head *head = page_buffers(page);
				1082	struct buffer_head *bh = head;
				1083	int uptodate = PageUptodate(page);
				1084
				1085	do {
				1086	if (!buffer_mapped(bh)) {
				1087	init_buffer(bh, NULL, NULL);
				1088	bh->b_bdev = bdev;
				1089	bh->b_blocknr = block;
				1090	if (uptodate)
				1091	set_buffer_uptodate(bh);
				1092	set_buffer_mapped(bh);
				1093	}
				1094	block++;
				1095	bh = bh->b_this_page;
				1096	} while (bh != head);
				1097	}
				1098
				1099	/*
				1100	* Create the page-cache page that contains the requested block.
				1101	*
				1102	* This is user purely for blockdev mappings.
				1103	*/
				1104	static struct page *
				1105	grow_dev_page(struct block_device *bdev, sector_t block,
				1106	pgoff_t index, int size)
				1107	{
				1108	struct inode *inode = bdev->bd_inode;
				1109	struct page *page;
				1110	struct buffer_head *bh;
				1111
				1112	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
				1113	if (!page)
				1114	return NULL;
				1115
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame^]	1116	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1117
				1118	if (page_has_buffers(page)) {
				1119	bh = page_buffers(page);
				1120	if (bh->b_size == size) {
				1121	init_page_buffers(page, bdev, block, size);
				1122	return page;
				1123	}
				1124	if (!try_to_free_buffers(page))
				1125	goto failed;
				1126	}
				1127
				1128	/*
				1129	* Allocate some buffers for this page
				1130	*/
				1131	bh = alloc_page_buffers(page, size, 0);
				1132	if (!bh)
				1133	goto failed;
				1134
				1135	/*
				1136	* Link the page to the buffers and initialise them. Take the
				1137	* lock to be atomic wrt __find_get_block(), which does not
				1138	* run under the page lock.
				1139	*/
				1140	spin_lock(&inode->i_mapping->private_lock);
				1141	link_dev_buffers(page, bh);
				1142	init_page_buffers(page, bdev, block, size);
				1143	spin_unlock(&inode->i_mapping->private_lock);
				1144	return page;
				1145
				1146	failed:
				1147	BUG();
				1148	unlock_page(page);
				1149	page_cache_release(page);
				1150	return NULL;
				1151	}
				1152
				1153	/*
				1154	* Create buffers for the specified block device block's page. If
				1155	* that page was dirty, the buffers are set dirty also.
				1156	*
				1157	* Except that's a bug. Attaching dirty buffers to a dirty
				1158	* blockdev's page can result in filesystem corruption, because
				1159	* some of those buffers may be aliases of filesystem data.
				1160	* grow_dev_page() will go BUG() if this happens.
				1161	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1162	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1163	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1164	{
				1165	struct page *page;
				1166	pgoff_t index;
				1167	int sizebits;
				1168
				1169	sizebits = -1;
				1170	do {
				1171	sizebits++;
				1172	} while ((size << sizebits) < PAGE_SIZE);
				1173
				1174	index = block >> sizebits;
				1175	block = index << sizebits;
				1176
				1177	/* Create a page with the proper size buffers.. */
				1178	page = grow_dev_page(bdev, block, index, size);
				1179	if (!page)
				1180	return 0;
				1181	unlock_page(page);
				1182	page_cache_release(page);
				1183	return 1;
				1184	}
				1185
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1186	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1187	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1188	{
				1189	/* Size must be multiple of hard sectorsize */
				1190	if (unlikely(size & (bdev_hardsect_size(bdev)-1) \|\|
				1191	(size < 512 \|\| size > PAGE_SIZE))) {
				1192	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1193	size);
				1194	printk(KERN_ERR "hardsect size: %d\n",
				1195	bdev_hardsect_size(bdev));
				1196
				1197	dump_stack();
				1198	return NULL;
				1199	}
				1200
				1201	for (;;) {
				1202	struct buffer_head * bh;
				1203
				1204	bh = __find_get_block(bdev, block, size);
				1205	if (bh)
				1206	return bh;
				1207
				1208	if (!grow_buffers(bdev, block, size))
				1209	free_more_memory();
				1210	}
				1211	}
				1212
				1213	/*
				1214	* The relationship between dirty buffers and dirty pages:
				1215	*
				1216	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1217	* the page is tagged dirty in its radix tree.
				1218	*
				1219	* At all times, the dirtiness of the buffers represents the dirtiness of
				1220	* subsections of the page. If the page has buffers, the page dirty bit is
				1221	* merely a hint about the true dirty state.
				1222	*
				1223	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1224	* (if the page has buffers).
				1225	*
				1226	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1227	* buffers are not.
				1228	*
				1229	* Also. When blockdev buffers are explicitly read with bread(), they
				1230	* individually become uptodate. But their backing page remains not
				1231	* uptodate - even if all of its buffers are uptodate. A subsequent
				1232	* block_read_full_page() against that page will discover all the uptodate
				1233	* buffers, will set the page uptodate and will perform no I/O.
				1234	*/
				1235
				1236	/**
				1237	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1238	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1239	*
				1240	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1241	* backing page dirty, then tag the page as dirty in its address_space's radix
				1242	* tree and then attach the address_space's inode to its superblock's dirty
				1243	* inode list.
				1244	*
				1245	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1246	* mapping->tree_lock and the global inode_lock.
				1247	*/
				1248	void fastcall mark_buffer_dirty(struct buffer_head *bh)
				1249	{
				1250	if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
				1251	__set_page_dirty_nobuffers(bh->b_page);
				1252	}
				1253
				1254	/*
				1255	* Decrement a buffer_head's reference count. If all buffers against a page
				1256	* have zero reference count, are clean and unlocked, and if the page is clean
				1257	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1258	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1259	* a page but it ends up not being freed, and buffers may later be reattached).
				1260	*/
				1261	void __brelse(struct buffer_head * buf)
				1262	{
				1263	if (atomic_read(&buf->b_count)) {
				1264	put_bh(buf);
				1265	return;
				1266	}
				1267	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
				1268	WARN_ON(1);
				1269	}
				1270
				1271	/*
				1272	* bforget() is like brelse(), except it discards any
				1273	* potentially dirty data.
				1274	*/
				1275	void __bforget(struct buffer_head *bh)
				1276	{
				1277	clear_buffer_dirty(bh);
				1278	if (!list_empty(&bh->b_assoc_buffers)) {
				1279	struct address_space *buffer_mapping = bh->b_page->mapping;
				1280
				1281	spin_lock(&buffer_mapping->private_lock);
				1282	list_del_init(&bh->b_assoc_buffers);
				1283	spin_unlock(&buffer_mapping->private_lock);
				1284	}
				1285	__brelse(bh);
				1286	}
				1287
				1288	static struct buffer_head __bread_slow(struct buffer_head bh)
				1289	{
				1290	lock_buffer(bh);
				1291	if (buffer_uptodate(bh)) {
				1292	unlock_buffer(bh);
				1293	return bh;
				1294	} else {
				1295	get_bh(bh);
				1296	bh->b_end_io = end_buffer_read_sync;
				1297	submit_bh(READ, bh);
				1298	wait_on_buffer(bh);
				1299	if (buffer_uptodate(bh))
				1300	return bh;
				1301	}
				1302	brelse(bh);
				1303	return NULL;
				1304	}
				1305
				1306	/*
				1307	* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
				1308	* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
				1309	* refcount elevated by one when they're in an LRU. A buffer can only appear
				1310	* once in a particular CPU's LRU. A single buffer can be present in multiple
				1311	* CPU's LRUs at the same time.
				1312	*
				1313	* This is a transparent caching front-end to sb_bread(), sb_getblk() and
				1314	* sb_find_get_block().
				1315	*
				1316	* The LRUs themselves only need locking against invalidate_bh_lrus. We use
				1317	* a local interrupt disable for that.
				1318	*/
				1319
				1320	#define BH_LRU_SIZE 8
				1321
				1322	struct bh_lru {
				1323	struct buffer_head *bhs[BH_LRU_SIZE];
				1324	};
				1325
				1326	static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
				1327
				1328	#ifdef CONFIG_SMP
				1329	#define bh_lru_lock() local_irq_disable()
				1330	#define bh_lru_unlock() local_irq_enable()
				1331	#else
				1332	#define bh_lru_lock() preempt_disable()
				1333	#define bh_lru_unlock() preempt_enable()
				1334	#endif
				1335
				1336	static inline void check_irqs_on(void)
				1337	{
				1338	#ifdef irqs_disabled
				1339	BUG_ON(irqs_disabled());
				1340	#endif
				1341	}
				1342
				1343	/*
				1344	* The LRU management algorithm is dopey-but-simple. Sorry.
				1345	*/
				1346	static void bh_lru_install(struct buffer_head *bh)
				1347	{
				1348	struct buffer_head *evictee = NULL;
				1349	struct bh_lru *lru;
				1350
				1351	check_irqs_on();
				1352	bh_lru_lock();
				1353	lru = &__get_cpu_var(bh_lrus);
				1354	if (lru->bhs[0] != bh) {
				1355	struct buffer_head *bhs[BH_LRU_SIZE];
				1356	int in;
				1357	int out = 0;
				1358
				1359	get_bh(bh);
				1360	bhs[out++] = bh;
				1361	for (in = 0; in < BH_LRU_SIZE; in++) {
				1362	struct buffer_head *bh2 = lru->bhs[in];
				1363
				1364	if (bh2 == bh) {
				1365	__brelse(bh2);
				1366	} else {
				1367	if (out >= BH_LRU_SIZE) {
				1368	BUG_ON(evictee != NULL);
				1369	evictee = bh2;
				1370	} else {
				1371	bhs[out++] = bh2;
				1372	}
				1373	}
				1374	}
				1375	while (out < BH_LRU_SIZE)
				1376	bhs[out++] = NULL;
				1377	memcpy(lru->bhs, bhs, sizeof(bhs));
				1378	}
				1379	bh_lru_unlock();
				1380
				1381	if (evictee)
				1382	__brelse(evictee);
				1383	}
				1384
				1385	/*
				1386	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1387	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1388	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1389	lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
				1390	{
				1391	struct buffer_head *ret = NULL;
				1392	struct bh_lru *lru;
				1393	int i;
				1394
				1395	check_irqs_on();
				1396	bh_lru_lock();
				1397	lru = &__get_cpu_var(bh_lrus);
				1398	for (i = 0; i < BH_LRU_SIZE; i++) {
				1399	struct buffer_head *bh = lru->bhs[i];
				1400
				1401	if (bh && bh->b_bdev == bdev &&
				1402	bh->b_blocknr == block && bh->b_size == size) {
				1403	if (i) {
				1404	while (i) {
				1405	lru->bhs[i] = lru->bhs[i - 1];
				1406	i--;
				1407	}
				1408	lru->bhs[0] = bh;
				1409	}
				1410	get_bh(bh);
				1411	ret = bh;
				1412	break;
				1413	}
				1414	}
				1415	bh_lru_unlock();
				1416	return ret;
				1417	}
				1418
				1419	/*
				1420	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1421	* it in the LRU and mark it as accessed. If it is not present then return
				1422	* NULL
				1423	*/
				1424	struct buffer_head *
				1425	__find_get_block(struct block_device *bdev, sector_t block, int size)
				1426	{
				1427	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1428
				1429	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1430	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1431	if (bh)
				1432	bh_lru_install(bh);
				1433	}
				1434	if (bh)
				1435	touch_buffer(bh);
				1436	return bh;
				1437	}
				1438	EXPORT_SYMBOL(__find_get_block);
				1439
				1440	/*
				1441	* __getblk will locate (and, if necessary, create) the buffer_head
				1442	* which corresponds to the passed block_device, block and size. The
				1443	* returned buffer has its reference count incremented.
				1444	*
				1445	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1446	* illegal block number, __getblk() will happily return a buffer_head
				1447	* which represents the non-existent block. Very weird.
				1448	*
				1449	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1450	* attempt is failing. FIXME, perhaps?
				1451	*/
				1452	struct buffer_head *
				1453	__getblk(struct block_device *bdev, sector_t block, int size)
				1454	{
				1455	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1456
				1457	might_sleep();
				1458	if (bh == NULL)
				1459	bh = __getblk_slow(bdev, block, size);
				1460	return bh;
				1461	}
				1462	EXPORT_SYMBOL(__getblk);
				1463
				1464	/*
				1465	* Do async read-ahead on a buffer..
				1466	*/
				1467	void __breadahead(struct block_device *bdev, sector_t block, int size)
				1468	{
				1469	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1470	if (likely(bh)) {
				1471	ll_rw_block(READA, 1, &bh);
				1472	brelse(bh);
				1473	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1474	}
				1475	EXPORT_SYMBOL(__breadahead);
				1476
				1477	/**
				1478	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1479	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1480	* @block: number of block
				1481	* @size: size (in bytes) to read
				1482	*
				1483	* Reads a specified block, and returns buffer head that contains it.
				1484	* It returns NULL if the block was unreadable.
				1485	*/
				1486	struct buffer_head *
				1487	__bread(struct block_device *bdev, sector_t block, int size)
				1488	{
				1489	struct buffer_head *bh = __getblk(bdev, block, size);
				1490
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1491	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1492	bh = __bread_slow(bh);
				1493	return bh;
				1494	}
				1495	EXPORT_SYMBOL(__bread);
				1496
				1497	/*
				1498	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1499	* This doesn't race because it runs in each cpu either in irq
				1500	* or with preempt disabled.
				1501	*/
				1502	static void invalidate_bh_lru(void *arg)
				1503	{
				1504	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1505	int i;
				1506
				1507	for (i = 0; i < BH_LRU_SIZE; i++) {
				1508	brelse(b->bhs[i]);
				1509	b->bhs[i] = NULL;
				1510	}
				1511	put_cpu_var(bh_lrus);
				1512	}
				1513
				1514	static void invalidate_bh_lrus(void)
				1515	{
				1516	on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
				1517	}
				1518
				1519	void set_bh_page(struct buffer_head *bh,
				1520	struct page *page, unsigned long offset)
				1521	{
				1522	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame^]	1523	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1524	if (PageHighMem(page))
				1525	/*
				1526	* This catches illegal uses and preserves the offset:
				1527	*/
				1528	bh->b_data = (char *)(0 + offset);
				1529	else
				1530	bh->b_data = page_address(page) + offset;
				1531	}
				1532	EXPORT_SYMBOL(set_bh_page);
				1533
				1534	/*
				1535	* Called when truncating a buffer on a page completely.
				1536	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1537	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1538	{
				1539	lock_buffer(bh);
				1540	clear_buffer_dirty(bh);
				1541	bh->b_bdev = NULL;
				1542	clear_buffer_mapped(bh);
				1543	clear_buffer_req(bh);
				1544	clear_buffer_new(bh);
				1545	clear_buffer_delay(bh);
				1546	unlock_buffer(bh);
				1547	}
				1548
				1549	/**
				1550	* try_to_release_page() - release old fs-specific metadata on a page
				1551	*
				1552	* @page: the page which the kernel is trying to free
				1553	* @gfp_mask: memory allocation flags (and I/O mode)
				1554	*
				1555	* The address_space is to try to release any data against the page
				1556	* (presumably at page->private). If the release was successful, return `1'.
				1557	* Otherwise return zero.
				1558	*
				1559	* The @gfp_mask argument specifies whether I/O may be performed to release
				1560	* this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
				1561	*
				1562	* NOTE: @gfp_mask may go away, and this function may become non-blocking.
				1563	*/
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1564	int try_to_release_page(struct page *page, gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1565	{
				1566	struct address_space * const mapping = page->mapping;
				1567
				1568	BUG_ON(!PageLocked(page));
				1569	if (PageWriteback(page))
				1570	return 0;
				1571
				1572	if (mapping && mapping->a_ops->releasepage)
				1573	return mapping->a_ops->releasepage(page, gfp_mask);
				1574	return try_to_free_buffers(page);
				1575	}
				1576	EXPORT_SYMBOL(try_to_release_page);
				1577
				1578	/**
				1579	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1580	*
				1581	* @page: the page which is affected
				1582	* @offset: the index of the truncation point
				1583	*
				1584	* block_invalidatepage() is called when all or part of the page has become
				1585	* invalidatedby a truncate operation.
				1586	*
				1587	* block_invalidatepage() does not have to release all buffers, but it must
				1588	* ensure that no dirty buffer is left outside @offset and that no I/O
				1589	* is underway against any of the blocks which are outside the truncation
				1590	* point. Because the caller is about to free (and possibly reuse) those
				1591	* blocks on-disk.
				1592	*/
				1593	int block_invalidatepage(struct page *page, unsigned long offset)
				1594	{
				1595	struct buffer_head head, bh, *next;
				1596	unsigned int curr_off = 0;
				1597	int ret = 1;
				1598
				1599	BUG_ON(!PageLocked(page));
				1600	if (!page_has_buffers(page))
				1601	goto out;
				1602
				1603	head = page_buffers(page);
				1604	bh = head;
				1605	do {
				1606	unsigned int next_off = curr_off + bh->b_size;
				1607	next = bh->b_this_page;
				1608
				1609	/*
				1610	* is this block fully invalidated?
				1611	*/
				1612	if (offset <= curr_off)
				1613	discard_buffer(bh);
				1614	curr_off = next_off;
				1615	bh = next;
				1616	} while (bh != head);
				1617
				1618	/*
				1619	* We release buffers only if the entire page is being invalidated.
				1620	* The get_block cached value has been unconditionally invalidated,
				1621	* so real IO is not possible anymore.
				1622	*/
				1623	if (offset == 0)
				1624	ret = try_to_release_page(page, 0);
				1625	out:
				1626	return ret;
				1627	}
				1628	EXPORT_SYMBOL(block_invalidatepage);
				1629
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	1630	int do_invalidatepage(struct page *page, unsigned long offset)
				1631	{
				1632	int (invalidatepage)(struct page , unsigned long);
				1633	invalidatepage = page->mapping->a_ops->invalidatepage;
				1634	if (invalidatepage == NULL)
				1635	invalidatepage = block_invalidatepage;
				1636	return (*invalidatepage)(page, offset);
				1637	}
				1638
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1639	/*
				1640	* We attach and possibly dirty the buffers atomically wrt
				1641	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1642	* is already excluded via the page lock.
				1643	*/
				1644	void create_empty_buffers(struct page *page,
				1645	unsigned long blocksize, unsigned long b_state)
				1646	{
				1647	struct buffer_head bh, head, *tail;
				1648
				1649	head = alloc_page_buffers(page, blocksize, 1);
				1650	bh = head;
				1651	do {
				1652	bh->b_state \|= b_state;
				1653	tail = bh;
				1654	bh = bh->b_this_page;
				1655	} while (bh);
				1656	tail->b_this_page = head;
				1657
				1658	spin_lock(&page->mapping->private_lock);
				1659	if (PageUptodate(page) \|\| PageDirty(page)) {
				1660	bh = head;
				1661	do {
				1662	if (PageDirty(page))
				1663	set_buffer_dirty(bh);
				1664	if (PageUptodate(page))
				1665	set_buffer_uptodate(bh);
				1666	bh = bh->b_this_page;
				1667	} while (bh != head);
				1668	}
				1669	attach_page_buffers(page, head);
				1670	spin_unlock(&page->mapping->private_lock);
				1671	}
				1672	EXPORT_SYMBOL(create_empty_buffers);
				1673
				1674	/*
				1675	* We are taking a block for data and we don't want any output from any
				1676	* buffer-cache aliases starting from return from that function and
				1677	* until the moment when something will explicitly mark the buffer
				1678	* dirty (hopefully that will not happen until we will free that block ;-)
				1679	* We don't even need to mark it not-uptodate - nobody can expect
				1680	* anything from a newly allocated buffer anyway. We used to used
				1681	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1682	* don't want to mark the alias unmapped, for example - it would confuse
				1683	* anyone who might pick it with bread() afterwards...
				1684	*
				1685	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1686	* be writeout I/O going on against recently-freed buffers. We don't
				1687	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1688	* only if we really need to. That happens here.
				1689	*/
				1690	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1691	{
				1692	struct buffer_head *old_bh;
				1693
				1694	might_sleep();
				1695
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1696	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1697	if (old_bh) {
				1698	clear_buffer_dirty(old_bh);
				1699	wait_on_buffer(old_bh);
				1700	clear_buffer_req(old_bh);
				1701	__brelse(old_bh);
				1702	}
				1703	}
				1704	EXPORT_SYMBOL(unmap_underlying_metadata);
				1705
/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */
				1718
				1719	/*
				1720	* While block_write_full_page is writing back the dirty buffers under
				1721	* the page lock, whoever dirtied the buffers may decide to clean them
				1722	* again at any time. We handle that by only looking at the buffer
				1723	* state inside lock_buffer().
				1724	*
				1725	* If block_write_full_page() is called for regular writeback
				1726	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1727	* locked buffer. This only can happen if someone has written the buffer
				1728	* directly, with submit_bh(). At the address_space level PageWriteback
				1729	* prevents this contention from occurring.
				1730	*/
				1731	static int __block_write_full_page(struct inode inode, struct page page,
				1732	get_block_t get_block, struct writeback_control wbc)
				1733	{
				1734	int err;
				1735	sector_t block;
				1736	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1737	struct buffer_head bh, head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1738	int nr_underway = 0;
				1739
				1740	BUG_ON(!PageLocked(page));
				1741
				1742	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1743
				1744	if (!page_has_buffers(page)) {
				1745	create_empty_buffers(page, 1 << inode->i_blkbits,
				1746	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1747	}
				1748
				1749	/*
				1750	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1751	* here, and the (potentially unmapped) buffers may become dirty at
				1752	* any time. If a buffer becomes dirty here after we've inspected it
				1753	* then we just miss that fact, and the page stays dirty.
				1754	*
				1755	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1756	* handle that here by just cleaning them.
				1757	*/
				1758
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1759	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1760	head = page_buffers(page);
				1761	bh = head;
				1762
				1763	/*
				1764	* Get all the dirty buffers mapped to disk addresses and
				1765	* handle any aliases from the underlying blockdev's mapping.
				1766	*/
				1767	do {
				1768	if (block > last_block) {
				1769	/*
				1770	* mapped buffers outside i_size will occur, because
				1771	* this page can be outside i_size when there is a
				1772	* truncate in progress.
				1773	*/
				1774	/*
				1775	* The buffer was zeroed by block_write_full_page()
				1776	*/
				1777	clear_buffer_dirty(bh);
				1778	set_buffer_uptodate(bh);
				1779	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
				1780	err = get_block(inode, block, bh, 1);
				1781	if (err)
				1782	goto recover;
				1783	if (buffer_new(bh)) {
				1784	/* blockdev mappings never come here */
				1785	clear_buffer_new(bh);
				1786	unmap_underlying_metadata(bh->b_bdev,
				1787	bh->b_blocknr);
				1788	}
				1789	}
				1790	bh = bh->b_this_page;
				1791	block++;
				1792	} while (bh != head);
				1793
				1794	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	if (!buffer_mapped(bh))
				1796	continue;
				1797	/*
				1798	* If it's a fully non-blocking write attempt and we cannot
				1799	* lock the buffer then redirty the page. Note that this can
				1800	* potentially cause a busy-wait loop from pdflush and kswapd
				1801	* activity, but those code paths have their own higher-level
				1802	* throttling.
				1803	*/
				1804	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1805	lock_buffer(bh);
				1806	} else if (test_set_buffer_locked(bh)) {
				1807	redirty_page_for_writepage(wbc, page);
				1808	continue;
				1809	}
				1810	if (test_clear_buffer_dirty(bh)) {
				1811	mark_buffer_async_write(bh);
				1812	} else {
				1813	unlock_buffer(bh);
				1814	}
				1815	} while ((bh = bh->b_this_page) != head);
				1816
				1817	/*
				1818	* The page and its buffers are protected by PageWriteback(), so we can
				1819	* drop the bh refcounts early.
				1820	*/
				1821	BUG_ON(PageWriteback(page));
				1822	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1823
				1824	do {
				1825	struct buffer_head *next = bh->b_this_page;
				1826	if (buffer_async_write(bh)) {
				1827	submit_bh(WRITE, bh);
				1828	nr_underway++;
				1829	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1830	bh = next;
				1831	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1832	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1833
				1834	err = 0;
				1835	done:
				1836	if (nr_underway == 0) {
				1837	/*
				1838	* The page was marked dirty, but the buffers were
				1839	* clean. Someone wrote them back by hand with
				1840	* ll_rw_block/submit_bh. A rare case.
				1841	*/
				1842	int uptodate = 1;
				1843	do {
				1844	if (!buffer_uptodate(bh)) {
				1845	uptodate = 0;
				1846	break;
				1847	}
				1848	bh = bh->b_this_page;
				1849	} while (bh != head);
				1850	if (uptodate)
				1851	SetPageUptodate(page);
				1852	end_page_writeback(page);
				1853	/*
				1854	* The page and buffer_heads can be released at any time from
				1855	* here on.
				1856	*/
				1857	wbc->pages_skipped++; /* We didn't write this page */
				1858	}
				1859	return err;
				1860
				1861	recover:
				1862	/*
				1863	* ENOSPC, or some other error. We may already have added some
				1864	* blocks to the file, so we need to write these out to avoid
				1865	* exposing stale data.
				1866	* The page is currently locked and not marked for writeback
				1867	*/
				1868	bh = head;
				1869	/* Recovery: lock and submit the mapped buffers */
				1870	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1871	if (buffer_mapped(bh) && buffer_dirty(bh)) {
				1872	lock_buffer(bh);
				1873	mark_buffer_async_write(bh);
				1874	} else {
				1875	/*
				1876	* The buffer may have been set dirty during
				1877	* attachment to a dirty page.
				1878	*/
				1879	clear_buffer_dirty(bh);
				1880	}
				1881	} while ((bh = bh->b_this_page) != head);
				1882	SetPageError(page);
				1883	BUG_ON(PageWriteback(page));
				1884	set_page_writeback(page);
				1885	unlock_page(page);
				1886	do {
				1887	struct buffer_head *next = bh->b_this_page;
				1888	if (buffer_async_write(bh)) {
				1889	clear_buffer_dirty(bh);
				1890	submit_bh(WRITE, bh);
				1891	nr_underway++;
				1892	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1893	bh = next;
				1894	} while (bh != head);
				1895	goto done;
				1896	}
				1897
				1898	static int __block_prepare_write(struct inode inode, struct page page,
				1899	unsigned from, unsigned to, get_block_t *get_block)
				1900	{
				1901	unsigned block_start, block_end;
				1902	sector_t block;
				1903	int err = 0;
				1904	unsigned blocksize, bbits;
				1905	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1906
				1907	BUG_ON(!PageLocked(page));
				1908	BUG_ON(from > PAGE_CACHE_SIZE);
				1909	BUG_ON(to > PAGE_CACHE_SIZE);
				1910	BUG_ON(from > to);
				1911
				1912	blocksize = 1 << inode->i_blkbits;
				1913	if (!page_has_buffers(page))
				1914	create_empty_buffers(page, blocksize, 0);
				1915	head = page_buffers(page);
				1916
				1917	bbits = inode->i_blkbits;
				1918	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1919
				1920	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1921	block++, block_start=block_end, bh = bh->b_this_page) {
				1922	block_end = block_start + blocksize;
				1923	if (block_end <= from \|\| block_start >= to) {
				1924	if (PageUptodate(page)) {
				1925	if (!buffer_uptodate(bh))
				1926	set_buffer_uptodate(bh);
				1927	}
				1928	continue;
				1929	}
				1930	if (buffer_new(bh))
				1931	clear_buffer_new(bh);
				1932	if (!buffer_mapped(bh)) {
				1933	err = get_block(inode, block, bh, 1);
				1934	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1935	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1936	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1937	unmap_underlying_metadata(bh->b_bdev,
				1938	bh->b_blocknr);
				1939	if (PageUptodate(page)) {
				1940	set_buffer_uptodate(bh);
				1941	continue;
				1942	}
				1943	if (block_end > to \|\| block_start < from) {
				1944	void *kaddr;
				1945
				1946	kaddr = kmap_atomic(page, KM_USER0);
				1947	if (block_end > to)
				1948	memset(kaddr+to, 0,
				1949	block_end-to);
				1950	if (block_start < from)
				1951	memset(kaddr+block_start,
				1952	0, from-block_start);
				1953	flush_dcache_page(page);
				1954	kunmap_atomic(kaddr, KM_USER0);
				1955	}
				1956	continue;
				1957	}
				1958	}
				1959	if (PageUptodate(page)) {
				1960	if (!buffer_uptodate(bh))
				1961	set_buffer_uptodate(bh);
				1962	continue;
				1963	}
				1964	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				1965	(block_start < from \|\| block_end > to)) {
				1966	ll_rw_block(READ, 1, &bh);
				1967	*wait_bh++=bh;
				1968	}
				1969	}
				1970	/*
				1971	* If we issued read requests - let them complete.
				1972	*/
				1973	while(wait_bh > wait) {
				1974	wait_on_buffer(*--wait_bh);
				1975	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1976	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1977	}
Anton Altaparmakov	152becd	2005-06-23 00:10:21 -0700	[diff] [blame]	1978	if (!err) {
				1979	bh = head;
				1980	do {
				1981	if (buffer_new(bh))
				1982	clear_buffer_new(bh);
				1983	} while ((bh = bh->b_this_page) != head);
				1984	return 0;
				1985	}
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1986	/* Error case: */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1987	/*
				1988	* Zero out any newly allocated blocks to avoid exposing stale
				1989	* data. If BH_New is set, we know that the block was newly
				1990	* allocated in the above loop.
				1991	*/
				1992	bh = head;
				1993	block_start = 0;
				1994	do {
				1995	block_end = block_start+blocksize;
				1996	if (block_end <= from)
				1997	goto next_bh;
				1998	if (block_start >= to)
				1999	break;
				2000	if (buffer_new(bh)) {
				2001	void *kaddr;
				2002
				2003	clear_buffer_new(bh);
				2004	kaddr = kmap_atomic(page, KM_USER0);
				2005	memset(kaddr+block_start, 0, bh->b_size);
				2006	kunmap_atomic(kaddr, KM_USER0);
				2007	set_buffer_uptodate(bh);
				2008	mark_buffer_dirty(bh);
				2009	}
				2010	next_bh:
				2011	block_start = block_end;
				2012	bh = bh->b_this_page;
				2013	} while (bh != head);
				2014	return err;
				2015	}
				2016
				2017	static int __block_commit_write(struct inode inode, struct page page,
				2018	unsigned from, unsigned to)
				2019	{
				2020	unsigned block_start, block_end;
				2021	int partial = 0;
				2022	unsigned blocksize;
				2023	struct buffer_head bh, head;
				2024
				2025	blocksize = 1 << inode->i_blkbits;
				2026
				2027	for(bh = head = page_buffers(page), block_start = 0;
				2028	bh != head \|\| !block_start;
				2029	block_start=block_end, bh = bh->b_this_page) {
				2030	block_end = block_start + blocksize;
				2031	if (block_end <= from \|\| block_start >= to) {
				2032	if (!buffer_uptodate(bh))
				2033	partial = 1;
				2034	} else {
				2035	set_buffer_uptodate(bh);
				2036	mark_buffer_dirty(bh);
				2037	}
				2038	}
				2039
				2040	/*
				2041	* If this is a partial write which happened to make all buffers
				2042	* uptodate then we can optimize away a bogus readpage() for
				2043	* the next read(). Here we 'discover' whether the page went
				2044	* uptodate as a result of this (potentially partial) write.
				2045	*/
				2046	if (!partial)
				2047	SetPageUptodate(page);
				2048	return 0;
				2049	}
				2050
				2051	/*
				2052	* Generic "read page" function for block devices that have the normal
				2053	* get_block functionality. This is most of the block device filesystems.
				2054	* Reads the page asynchronously --- the unlock_buffer() and
				2055	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2056	* page struct once IO has completed.
				2057	*/
				2058	int block_read_full_page(struct page page, get_block_t get_block)
				2059	{
				2060	struct inode *inode = page->mapping->host;
				2061	sector_t iblock, lblock;
				2062	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2063	unsigned int blocksize;
				2064	int nr, i;
				2065	int fully_mapped = 1;
				2066
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2067	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2068	blocksize = 1 << inode->i_blkbits;
				2069	if (!page_has_buffers(page))
				2070	create_empty_buffers(page, blocksize, 0);
				2071	head = page_buffers(page);
				2072
				2073	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2074	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2075	bh = head;
				2076	nr = 0;
				2077	i = 0;
				2078
				2079	do {
				2080	if (buffer_uptodate(bh))
				2081	continue;
				2082
				2083	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2084	int err = 0;
				2085
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2086	fully_mapped = 0;
				2087	if (iblock < lblock) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2088	err = get_block(inode, iblock, bh, 0);
				2089	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2090	SetPageError(page);
				2091	}
				2092	if (!buffer_mapped(bh)) {
				2093	void *kaddr = kmap_atomic(page, KM_USER0);
				2094	memset(kaddr + i * blocksize, 0, blocksize);
				2095	flush_dcache_page(page);
				2096	kunmap_atomic(kaddr, KM_USER0);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2097	if (!err)
				2098	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2099	continue;
				2100	}
				2101	/*
				2102	* get_block() might have updated the buffer
				2103	* synchronously
				2104	*/
				2105	if (buffer_uptodate(bh))
				2106	continue;
				2107	}
				2108	arr[nr++] = bh;
				2109	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2110
				2111	if (fully_mapped)
				2112	SetPageMappedToDisk(page);
				2113
				2114	if (!nr) {
				2115	/*
				2116	* All buffers are uptodate - we can set the page uptodate
				2117	* as well. But not if get_block() returned an error.
				2118	*/
				2119	if (!PageError(page))
				2120	SetPageUptodate(page);
				2121	unlock_page(page);
				2122	return 0;
				2123	}
				2124
				2125	/* Stage two: lock the buffers */
				2126	for (i = 0; i < nr; i++) {
				2127	bh = arr[i];
				2128	lock_buffer(bh);
				2129	mark_buffer_async_read(bh);
				2130	}
				2131
				2132	/*
				2133	* Stage 3: start the IO. Check for uptodateness
				2134	* inside the buffer lock in case another process reading
				2135	* the underlying blockdev brought it uptodate (the sct fix).
				2136	*/
				2137	for (i = 0; i < nr; i++) {
				2138	bh = arr[i];
				2139	if (buffer_uptodate(bh))
				2140	end_buffer_async_read(bh, 1);
				2141	else
				2142	submit_bh(READ, bh);
				2143	}
				2144	return 0;
				2145	}
				2146
				2147	/* utility function for filesystems that need to do work on expanding
				2148	* truncates. Uses prepare/commit_write to allow the filesystem to
				2149	* deal with the hole.
				2150	*/
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2151	static int __generic_cont_expand(struct inode *inode, loff_t size,
				2152	pgoff_t index, unsigned int offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2153	{
				2154	struct address_space *mapping = inode->i_mapping;
				2155	struct page *page;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2156	unsigned long limit;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2157	int err;
				2158
				2159	err = -EFBIG;
				2160	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
				2161	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
				2162	send_sig(SIGXFSZ, current, 0);
				2163	goto out;
				2164	}
				2165	if (size > inode->i_sb->s_maxbytes)
				2166	goto out;
				2167
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2168	err = -ENOMEM;
				2169	page = grab_cache_page(mapping, index);
				2170	if (!page)
				2171	goto out;
				2172	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2173	if (err) {
				2174	/*
				2175	* ->prepare_write() may have instantiated a few blocks
				2176	* outside i_size. Trim these off again.
				2177	*/
				2178	unlock_page(page);
				2179	page_cache_release(page);
				2180	vmtruncate(inode, inode->i_size);
				2181	goto out;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2182	}
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2183
				2184	err = mapping->a_ops->commit_write(NULL, page, offset, offset);
				2185
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2186	unlock_page(page);
				2187	page_cache_release(page);
				2188	if (err > 0)
				2189	err = 0;
				2190	out:
				2191	return err;
				2192	}
				2193
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2194	int generic_cont_expand(struct inode *inode, loff_t size)
				2195	{
				2196	pgoff_t index;
				2197	unsigned int offset;
				2198
				2199	offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
				2200
				2201	/* ugh. in prepare/commit_write, if from==to==start of block, we
				2202	** skip the prepare. make sure we never send an offset for the start
				2203	** of a block
				2204	*/
				2205	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				2206	/* caller must handle this extra byte. */
				2207	offset++;
				2208	}
				2209	index = size >> PAGE_CACHE_SHIFT;
				2210
				2211	return __generic_cont_expand(inode, size, index, offset);
				2212	}
				2213
				2214	int generic_cont_expand_simple(struct inode *inode, loff_t size)
				2215	{
				2216	loff_t pos = size - 1;
				2217	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
				2218	unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
				2219
				2220	/* prepare/commit_write can handle even if from==to==start of block. */
				2221	return __generic_cont_expand(inode, size, index, offset);
				2222	}
				2223
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2224	/*
				2225	* For moronic filesystems that do not allow holes in file.
				2226	* We may have to extend the file.
				2227	*/
				2228
				2229	int cont_prepare_write(struct page *page, unsigned offset,
				2230	unsigned to, get_block_t get_block, loff_t bytes)
				2231	{
				2232	struct address_space *mapping = page->mapping;
				2233	struct inode *inode = mapping->host;
				2234	struct page *new_page;
				2235	pgoff_t pgpos;
				2236	long status;
				2237	unsigned zerofrom;
				2238	unsigned blocksize = 1 << inode->i_blkbits;
				2239	void *kaddr;
				2240
				2241	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
				2242	status = -ENOMEM;
				2243	new_page = grab_cache_page(mapping, pgpos);
				2244	if (!new_page)
				2245	goto out;
				2246	/* we might sleep */
				2247	if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
				2248	unlock_page(new_page);
				2249	page_cache_release(new_page);
				2250	continue;
				2251	}
				2252	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2253	if (zerofrom & (blocksize-1)) {
				2254	*bytes \|= (blocksize-1);
				2255	(*bytes)++;
				2256	}
				2257	status = __block_prepare_write(inode, new_page, zerofrom,
				2258	PAGE_CACHE_SIZE, get_block);
				2259	if (status)
				2260	goto out_unmap;
				2261	kaddr = kmap_atomic(new_page, KM_USER0);
				2262	memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
				2263	flush_dcache_page(new_page);
				2264	kunmap_atomic(kaddr, KM_USER0);
				2265	generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
				2266	unlock_page(new_page);
				2267	page_cache_release(new_page);
				2268	}
				2269
				2270	if (page->index < pgpos) {
				2271	/* completely inside the area */
				2272	zerofrom = offset;
				2273	} else {
				2274	/* page covers the boundary, find the boundary offset */
				2275	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2276
				2277	/* if we will expand the thing last block will be filled */
				2278	if (to > zerofrom && (zerofrom & (blocksize-1))) {
				2279	*bytes \|= (blocksize-1);
				2280	(*bytes)++;
				2281	}
				2282
				2283	/* starting below the boundary? Nothing to zero out */
				2284	if (offset <= zerofrom)
				2285	zerofrom = offset;
				2286	}
				2287	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
				2288	if (status)
				2289	goto out1;
				2290	if (zerofrom < offset) {
				2291	kaddr = kmap_atomic(page, KM_USER0);
				2292	memset(kaddr+zerofrom, 0, offset-zerofrom);
				2293	flush_dcache_page(page);
				2294	kunmap_atomic(kaddr, KM_USER0);
				2295	__block_commit_write(inode, page, zerofrom, offset);
				2296	}
				2297	return 0;
				2298	out1:
				2299	ClearPageUptodate(page);
				2300	return status;
				2301
				2302	out_unmap:
				2303	ClearPageUptodate(new_page);
				2304	unlock_page(new_page);
				2305	page_cache_release(new_page);
				2306	out:
				2307	return status;
				2308	}
				2309
				2310	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2311	get_block_t *get_block)
				2312	{
				2313	struct inode *inode = page->mapping->host;
				2314	int err = __block_prepare_write(inode, page, from, to, get_block);
				2315	if (err)
				2316	ClearPageUptodate(page);
				2317	return err;
				2318	}
				2319
				2320	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2321	{
				2322	struct inode *inode = page->mapping->host;
				2323	__block_commit_write(inode,page,from,to);
				2324	return 0;
				2325	}
				2326
				2327	int generic_commit_write(struct file file, struct page page,
				2328	unsigned from, unsigned to)
				2329	{
				2330	struct inode *inode = page->mapping->host;
				2331	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2332	__block_commit_write(inode,page,from,to);
				2333	/*
				2334	* No need to use i_size_read() here, the i_size
Jes Sorensen	1b1dcc1	2006-01-09 15:59:24 -0800	[diff] [blame]	2335	* cannot change under us because we hold i_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2336	*/
				2337	if (pos > inode->i_size) {
				2338	i_size_write(inode, pos);
				2339	mark_inode_dirty(inode);
				2340	}
				2341	return 0;
				2342	}
				2343
				2344
				2345	/*
				2346	* nobh_prepare_write()'s prereads are special: the buffer_heads are freed
				2347	* immediately, while under the page lock. So it needs a special end_io
				2348	* handler which does not touch the bh after unlocking it.
				2349	*
				2350	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				2351	* a race there is benign: unlock_buffer() only use the bh's address for
				2352	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				2353	* itself.
				2354	*/
				2355	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2356	{
				2357	if (uptodate) {
				2358	set_buffer_uptodate(bh);
				2359	} else {
				2360	/* This happens, due to failed READA attempts. */
				2361	clear_buffer_uptodate(bh);
				2362	}
				2363	unlock_buffer(bh);
				2364	}
				2365
				2366	/*
				2367	* On entry, the page is fully not uptodate.
				2368	* On exit the page is fully uptodate in the areas outside (from,to)
				2369	*/
				2370	int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
				2371	get_block_t *get_block)
				2372	{
				2373	struct inode *inode = page->mapping->host;
				2374	const unsigned blkbits = inode->i_blkbits;
				2375	const unsigned blocksize = 1 << blkbits;
				2376	struct buffer_head map_bh;
				2377	struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
				2378	unsigned block_in_page;
				2379	unsigned block_start;
				2380	sector_t block_in_file;
				2381	char *kaddr;
				2382	int nr_reads = 0;
				2383	int i;
				2384	int ret = 0;
				2385	int is_mapped_to_disk = 1;
				2386	int dirtied_it = 0;
				2387
				2388	if (PageMappedToDisk(page))
				2389	return 0;
				2390
				2391	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
				2392	map_bh.b_page = page;
				2393
				2394	/*
				2395	* We loop across all blocks in the page, whether or not they are
				2396	* part of the affected region. This is so we can discover if the
				2397	* page is fully mapped-to-disk.
				2398	*/
				2399	for (block_start = 0, block_in_page = 0;
				2400	block_start < PAGE_CACHE_SIZE;
				2401	block_in_page++, block_start += blocksize) {
				2402	unsigned block_end = block_start + blocksize;
				2403	int create;
				2404
				2405	map_bh.b_state = 0;
				2406	create = 1;
				2407	if (block_start >= to)
				2408	create = 0;
				2409	ret = get_block(inode, block_in_file + block_in_page,
				2410	&map_bh, create);
				2411	if (ret)
				2412	goto failed;
				2413	if (!buffer_mapped(&map_bh))
				2414	is_mapped_to_disk = 0;
				2415	if (buffer_new(&map_bh))
				2416	unmap_underlying_metadata(map_bh.b_bdev,
				2417	map_bh.b_blocknr);
				2418	if (PageUptodate(page))
				2419	continue;
				2420	if (buffer_new(&map_bh) \|\| !buffer_mapped(&map_bh)) {
				2421	kaddr = kmap_atomic(page, KM_USER0);
				2422	if (block_start < from) {
				2423	memset(kaddr+block_start, 0, from-block_start);
				2424	dirtied_it = 1;
				2425	}
				2426	if (block_end > to) {
				2427	memset(kaddr + to, 0, block_end - to);
				2428	dirtied_it = 1;
				2429	}
				2430	flush_dcache_page(page);
				2431	kunmap_atomic(kaddr, KM_USER0);
				2432	continue;
				2433	}
				2434	if (buffer_uptodate(&map_bh))
				2435	continue; /* reiserfs does this */
				2436	if (block_start < from \|\| block_end > to) {
				2437	struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
				2438
				2439	if (!bh) {
				2440	ret = -ENOMEM;
				2441	goto failed;
				2442	}
				2443	bh->b_state = map_bh.b_state;
				2444	atomic_set(&bh->b_count, 0);
				2445	bh->b_this_page = NULL;
				2446	bh->b_page = page;
				2447	bh->b_blocknr = map_bh.b_blocknr;
				2448	bh->b_size = blocksize;
				2449	bh->b_data = (char *)(long)block_start;
				2450	bh->b_bdev = map_bh.b_bdev;
				2451	bh->b_private = NULL;
				2452	read_bh[nr_reads++] = bh;
				2453	}
				2454	}
				2455
				2456	if (nr_reads) {
				2457	struct buffer_head *bh;
				2458
				2459	/*
				2460	* The page is locked, so these buffers are protected from
				2461	* any VM or truncate activity. Hence we don't need to care
				2462	* for the buffer_head refcounts.
				2463	*/
				2464	for (i = 0; i < nr_reads; i++) {
				2465	bh = read_bh[i];
				2466	lock_buffer(bh);
				2467	bh->b_end_io = end_buffer_read_nobh;
				2468	submit_bh(READ, bh);
				2469	}
				2470	for (i = 0; i < nr_reads; i++) {
				2471	bh = read_bh[i];
				2472	wait_on_buffer(bh);
				2473	if (!buffer_uptodate(bh))
				2474	ret = -EIO;
				2475	free_buffer_head(bh);
				2476	read_bh[i] = NULL;
				2477	}
				2478	if (ret)
				2479	goto failed;
				2480	}
				2481
				2482	if (is_mapped_to_disk)
				2483	SetPageMappedToDisk(page);
				2484	SetPageUptodate(page);
				2485
				2486	/*
				2487	* Setting the page dirty here isn't necessary for the prepare_write
				2488	* function - commit_write will do that. But if/when this function is
				2489	* used within the pagefault handler to ensure that all mmapped pages
				2490	* have backing space in the filesystem, we will need to dirty the page
				2491	* if its contents were altered.
				2492	*/
				2493	if (dirtied_it)
				2494	set_page_dirty(page);
				2495
				2496	return 0;
				2497
				2498	failed:
				2499	for (i = 0; i < nr_reads; i++) {
				2500	if (read_bh[i])
				2501	free_buffer_head(read_bh[i]);
				2502	}
				2503
				2504	/*
				2505	* Error recovery is pretty slack. Clear the page and mark it dirty
				2506	* so we'll later zero out any blocks which _were_ allocated.
				2507	*/
				2508	kaddr = kmap_atomic(page, KM_USER0);
				2509	memset(kaddr, 0, PAGE_CACHE_SIZE);
				2510	kunmap_atomic(kaddr, KM_USER0);
				2511	SetPageUptodate(page);
				2512	set_page_dirty(page);
				2513	return ret;
				2514	}
				2515	EXPORT_SYMBOL(nobh_prepare_write);
				2516
				2517	int nobh_commit_write(struct file file, struct page page,
				2518	unsigned from, unsigned to)
				2519	{
				2520	struct inode *inode = page->mapping->host;
				2521	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				2522
				2523	set_page_dirty(page);
				2524	if (pos > inode->i_size) {
				2525	i_size_write(inode, pos);
				2526	mark_inode_dirty(inode);
				2527	}
				2528	return 0;
				2529	}
				2530	EXPORT_SYMBOL(nobh_commit_write);
				2531
				2532	/*
				2533	* nobh_writepage() - based on block_full_write_page() except
				2534	* that it tries to operate without attaching bufferheads to
				2535	* the page.
				2536	*/
				2537	int nobh_writepage(struct page page, get_block_t get_block,
				2538	struct writeback_control *wbc)
				2539	{
				2540	struct inode * const inode = page->mapping->host;
				2541	loff_t i_size = i_size_read(inode);
				2542	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2543	unsigned offset;
				2544	void *kaddr;
				2545	int ret;
				2546
				2547	/* Is the page fully inside i_size? */
				2548	if (page->index < end_index)
				2549	goto out;
				2550
				2551	/* Is the page fully outside i_size? (truncate in progress) */
				2552	offset = i_size & (PAGE_CACHE_SIZE-1);
				2553	if (page->index >= end_index+1 \|\| !offset) {
				2554	/*
				2555	* The page may have dirty, unmapped buffers. For example,
				2556	* they may have been added in ext3_writepage(). Make them
				2557	* freeable here, so the page does not leak.
				2558	*/
				2559	#if 0
				2560	/* Not really sure about this - do we need this ? */
				2561	if (page->mapping->a_ops->invalidatepage)
				2562	page->mapping->a_ops->invalidatepage(page, offset);
				2563	#endif
				2564	unlock_page(page);
				2565	return 0; /* don't care */
				2566	}
				2567
				2568	/*
				2569	* The page straddles i_size. It must be zeroed out on each and every
				2570	* writepage invocation because it may be mmapped. "A file is mapped
				2571	* in multiples of the page size. For a file that is not a multiple of
				2572	* the page size, the remaining memory is zeroed when mapped, and
				2573	* writes to that region are not written out to the file."
				2574	*/
				2575	kaddr = kmap_atomic(page, KM_USER0);
				2576	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2577	flush_dcache_page(page);
				2578	kunmap_atomic(kaddr, KM_USER0);
				2579	out:
				2580	ret = mpage_writepage(page, get_block, wbc);
				2581	if (ret == -EAGAIN)
				2582	ret = __block_write_full_page(inode, page, get_block, wbc);
				2583	return ret;
				2584	}
				2585	EXPORT_SYMBOL(nobh_writepage);
				2586
				2587	/*
				2588	* This function assumes that ->prepare_write() uses nobh_prepare_write().
				2589	*/
				2590	int nobh_truncate_page(struct address_space *mapping, loff_t from)
				2591	{
				2592	struct inode *inode = mapping->host;
				2593	unsigned blocksize = 1 << inode->i_blkbits;
				2594	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2595	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2596	unsigned to;
				2597	struct page *page;
				2598	struct address_space_operations *a_ops = mapping->a_ops;
				2599	char *kaddr;
				2600	int ret = 0;
				2601
				2602	if ((offset & (blocksize - 1)) == 0)
				2603	goto out;
				2604
				2605	ret = -ENOMEM;
				2606	page = grab_cache_page(mapping, index);
				2607	if (!page)
				2608	goto out;
				2609
				2610	to = (offset + blocksize) & ~(blocksize - 1);
				2611	ret = a_ops->prepare_write(NULL, page, offset, to);
				2612	if (ret == 0) {
				2613	kaddr = kmap_atomic(page, KM_USER0);
				2614	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2615	flush_dcache_page(page);
				2616	kunmap_atomic(kaddr, KM_USER0);
				2617	set_page_dirty(page);
				2618	}
				2619	unlock_page(page);
				2620	page_cache_release(page);
				2621	out:
				2622	return ret;
				2623	}
				2624	EXPORT_SYMBOL(nobh_truncate_page);
				2625
				2626	int block_truncate_page(struct address_space *mapping,
				2627	loff_t from, get_block_t *get_block)
				2628	{
				2629	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2630	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2631	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2632	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2633	unsigned length, pos;
				2634	struct inode *inode = mapping->host;
				2635	struct page *page;
				2636	struct buffer_head *bh;
				2637	void *kaddr;
				2638	int err;
				2639
				2640	blocksize = 1 << inode->i_blkbits;
				2641	length = offset & (blocksize - 1);
				2642
				2643	/* Block boundary? Nothing to do */
				2644	if (!length)
				2645	return 0;
				2646
				2647	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2648	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2649
				2650	page = grab_cache_page(mapping, index);
				2651	err = -ENOMEM;
				2652	if (!page)
				2653	goto out;
				2654
				2655	if (!page_has_buffers(page))
				2656	create_empty_buffers(page, blocksize, 0);
				2657
				2658	/* Find the buffer that contains "offset" */
				2659	bh = page_buffers(page);
				2660	pos = blocksize;
				2661	while (offset >= pos) {
				2662	bh = bh->b_this_page;
				2663	iblock++;
				2664	pos += blocksize;
				2665	}
				2666
				2667	err = 0;
				2668	if (!buffer_mapped(bh)) {
				2669	err = get_block(inode, iblock, bh, 0);
				2670	if (err)
				2671	goto unlock;
				2672	/* unmapped? It's a hole - nothing to do */
				2673	if (!buffer_mapped(bh))
				2674	goto unlock;
				2675	}
				2676
				2677	/* Ok, it's mapped. Make sure it's up-to-date */
				2678	if (PageUptodate(page))
				2679	set_buffer_uptodate(bh);
				2680
				2681	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
				2682	err = -EIO;
				2683	ll_rw_block(READ, 1, &bh);
				2684	wait_on_buffer(bh);
				2685	/* Uhhuh. Read error. Complain and punt. */
				2686	if (!buffer_uptodate(bh))
				2687	goto unlock;
				2688	}
				2689
				2690	kaddr = kmap_atomic(page, KM_USER0);
				2691	memset(kaddr + offset, 0, length);
				2692	flush_dcache_page(page);
				2693	kunmap_atomic(kaddr, KM_USER0);
				2694
				2695	mark_buffer_dirty(bh);
				2696	err = 0;
				2697
				2698	unlock:
				2699	unlock_page(page);
				2700	page_cache_release(page);
				2701	out:
				2702	return err;
				2703	}
				2704
				2705	/*
				2706	* The generic ->writepage function for buffer-backed address_spaces
				2707	*/
				2708	int block_write_full_page(struct page page, get_block_t get_block,
				2709	struct writeback_control *wbc)
				2710	{
				2711	struct inode * const inode = page->mapping->host;
				2712	loff_t i_size = i_size_read(inode);
				2713	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2714	unsigned offset;
				2715	void *kaddr;
				2716
				2717	/* Is the page fully inside i_size? */
				2718	if (page->index < end_index)
				2719	return __block_write_full_page(inode, page, get_block, wbc);
				2720
				2721	/* Is the page fully outside i_size? (truncate in progress) */
				2722	offset = i_size & (PAGE_CACHE_SIZE-1);
				2723	if (page->index >= end_index+1 \|\| !offset) {
				2724	/*
				2725	* The page may have dirty, unmapped buffers. For example,
				2726	* they may have been added in ext3_writepage(). Make them
				2727	* freeable here, so the page does not leak.
				2728	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2729	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2730	unlock_page(page);
				2731	return 0; /* don't care */
				2732	}
				2733
				2734	/*
				2735	* The page straddles i_size. It must be zeroed out on each and every
				2736	* writepage invokation because it may be mmapped. "A file is mapped
				2737	* in multiples of the page size. For a file that is not a multiple of
				2738	* the page size, the remaining memory is zeroed when mapped, and
				2739	* writes to that region are not written out to the file."
				2740	*/
				2741	kaddr = kmap_atomic(page, KM_USER0);
				2742	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
				2743	flush_dcache_page(page);
				2744	kunmap_atomic(kaddr, KM_USER0);
				2745	return __block_write_full_page(inode, page, get_block, wbc);
				2746	}
				2747
				2748	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2749	get_block_t *get_block)
				2750	{
				2751	struct buffer_head tmp;
				2752	struct inode *inode = mapping->host;
				2753	tmp.b_state = 0;
				2754	tmp.b_blocknr = 0;
				2755	get_block(inode, block, &tmp, 0);
				2756	return tmp.b_blocknr;
				2757	}
				2758
				2759	static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
				2760	{
				2761	struct buffer_head *bh = bio->bi_private;
				2762
				2763	if (bio->bi_size)
				2764	return 1;
				2765
				2766	if (err == -EOPNOTSUPP) {
				2767	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2768	set_bit(BH_Eopnotsupp, &bh->b_state);
				2769	}
				2770
				2771	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2772	bio_put(bio);
				2773	return 0;
				2774	}
				2775
				2776	int submit_bh(int rw, struct buffer_head * bh)
				2777	{
				2778	struct bio *bio;
				2779	int ret = 0;
				2780
				2781	BUG_ON(!buffer_locked(bh));
				2782	BUG_ON(!buffer_mapped(bh));
				2783	BUG_ON(!bh->b_end_io);
				2784
				2785	if (buffer_ordered(bh) && (rw == WRITE))
				2786	rw = WRITE_BARRIER;
				2787
				2788	/*
				2789	* Only clear out a write error when rewriting, should this
				2790	* include WRITE_SYNC as well?
				2791	*/
				2792	if (test_set_buffer_req(bh) && (rw == WRITE \|\| rw == WRITE_BARRIER))
				2793	clear_buffer_write_io_error(bh);
				2794
				2795	/*
				2796	* from here on down, it's all bio -- do the initial mapping,
				2797	* submit_bio -> generic_make_request may further map this bio around
				2798	*/
				2799	bio = bio_alloc(GFP_NOIO, 1);
				2800
				2801	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2802	bio->bi_bdev = bh->b_bdev;
				2803	bio->bi_io_vec[0].bv_page = bh->b_page;
				2804	bio->bi_io_vec[0].bv_len = bh->b_size;
				2805	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2806
				2807	bio->bi_vcnt = 1;
				2808	bio->bi_idx = 0;
				2809	bio->bi_size = bh->b_size;
				2810
				2811	bio->bi_end_io = end_bio_bh_io_sync;
				2812	bio->bi_private = bh;
				2813
				2814	bio_get(bio);
				2815	submit_bio(rw, bio);
				2816
				2817	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2818	ret = -EOPNOTSUPP;
				2819
				2820	bio_put(bio);
				2821	return ret;
				2822	}
				2823
				2824	/**
				2825	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2826	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2827	* @nr: number of &struct buffer_heads in the array
				2828	* @bhs: array of pointers to &struct buffer_head
				2829	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2830	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				2831	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				2832	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				2833	* are sent to disk. The fourth %READA option is described in the documentation
				2834	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2835	*
				2836	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2837	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				2838	* clean when doing a write request, and any buffer that appears to be
				2839	* up-to-date when doing read request. Further it marks as clean buffers that
				2840	* are processed for writing (the buffer cache won't assume that they are
				2841	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2842	*
				2843	* ll_rw_block sets b_end_io to simple completion handler that marks
				2844	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				2845	* any waiters.
				2846	*
				2847	* All of the buffers must be for the same device, and must also be a
				2848	* multiple of the current approved size for the device.
				2849	*/
				2850	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				2851	{
				2852	int i;
				2853
				2854	for (i = 0; i < nr; i++) {
				2855	struct buffer_head *bh = bhs[i];
				2856
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2857	if (rw == SWRITE)
				2858	lock_buffer(bh);
				2859	else if (test_set_buffer_locked(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2860	continue;
				2861
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	2862	if (rw == WRITE \|\| rw == SWRITE) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2863	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2864	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2865	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2866	submit_bh(WRITE, bh);
				2867	continue;
				2868	}
				2869	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2870	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	2871	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	2872	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2873	submit_bh(rw, bh);
				2874	continue;
				2875	}
				2876	}
				2877	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2878	}
				2879	}
				2880
				2881	/*
				2882	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				2883	* and then start new I/O and then wait upon it. The caller must have a ref on
				2884	* the buffer_head.
				2885	*/
				2886	int sync_dirty_buffer(struct buffer_head *bh)
				2887	{
				2888	int ret = 0;
				2889
				2890	WARN_ON(atomic_read(&bh->b_count) < 1);
				2891	lock_buffer(bh);
				2892	if (test_clear_buffer_dirty(bh)) {
				2893	get_bh(bh);
				2894	bh->b_end_io = end_buffer_write_sync;
				2895	ret = submit_bh(WRITE, bh);
				2896	wait_on_buffer(bh);
				2897	if (buffer_eopnotsupp(bh)) {
				2898	clear_buffer_eopnotsupp(bh);
				2899	ret = -EOPNOTSUPP;
				2900	}
				2901	if (!ret && !buffer_uptodate(bh))
				2902	ret = -EIO;
				2903	} else {
				2904	unlock_buffer(bh);
				2905	}
				2906	return ret;
				2907	}
				2908
				2909	/*
				2910	* try_to_free_buffers() checks if all the buffers on this particular page
				2911	* are unused, and releases them if so.
				2912	*
				2913	* Exclusion against try_to_free_buffers may be obtained by either
				2914	* locking the page or by holding its mapping's private_lock.
				2915	*
				2916	* If the page is dirty but all the buffers are clean then we need to
				2917	* be sure to mark the page clean as well. This is because the page
				2918	* may be against a block device, and a later reattachment of buffers
				2919	* to a dirty page will set all buffers dirty. Which would corrupt
				2920	* filesystem data on the same device.
				2921	*
				2922	* The same applies to regular filesystem pages: if all the buffers are
				2923	* clean then we set the page clean and proceed. To do that, we require
				2924	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				2925	* private_lock.
				2926	*
				2927	* try_to_free_buffers() is non-blocking.
				2928	*/
				2929	static inline int buffer_busy(struct buffer_head *bh)
				2930	{
				2931	return atomic_read(&bh->b_count) \|
				2932	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				2933	}
				2934
				2935	static int
				2936	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				2937	{
				2938	struct buffer_head *head = page_buffers(page);
				2939	struct buffer_head *bh;
				2940
				2941	bh = head;
				2942	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	2943	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2944	set_bit(AS_EIO, &page->mapping->flags);
				2945	if (buffer_busy(bh))
				2946	goto failed;
				2947	bh = bh->b_this_page;
				2948	} while (bh != head);
				2949
				2950	do {
				2951	struct buffer_head *next = bh->b_this_page;
				2952
				2953	if (!list_empty(&bh->b_assoc_buffers))
				2954	__remove_assoc_queue(bh);
				2955	bh = next;
				2956	} while (bh != head);
				2957	*buffers_to_free = head;
				2958	__clear_page_buffers(page);
				2959	return 1;
				2960	failed:
				2961	return 0;
				2962	}
				2963
				2964	int try_to_free_buffers(struct page *page)
				2965	{
				2966	struct address_space * const mapping = page->mapping;
				2967	struct buffer_head *buffers_to_free = NULL;
				2968	int ret = 0;
				2969
				2970	BUG_ON(!PageLocked(page));
				2971	if (PageWriteback(page))
				2972	return 0;
				2973
				2974	if (mapping == NULL) { /* can this still happen? */
				2975	ret = drop_buffers(page, &buffers_to_free);
				2976	goto out;
				2977	}
				2978
				2979	spin_lock(&mapping->private_lock);
				2980	ret = drop_buffers(page, &buffers_to_free);
				2981	if (ret) {
				2982	/*
				2983	* If the filesystem writes its buffers by hand (eg ext3)
				2984	* then we can have clean buffers against a dirty page. We
				2985	* clean the page here; otherwise later reattachment of buffers
				2986	* could encounter a non-uptodate page, which is unresolvable.
				2987	* This only applies in the rare case where try_to_free_buffers
				2988	* succeeds but the page is not freed.
				2989	*/
				2990	clear_page_dirty(page);
				2991	}
				2992	spin_unlock(&mapping->private_lock);
				2993	out:
				2994	if (buffers_to_free) {
				2995	struct buffer_head *bh = buffers_to_free;
				2996
				2997	do {
				2998	struct buffer_head *next = bh->b_this_page;
				2999	free_buffer_head(bh);
				3000	bh = next;
				3001	} while (bh != buffers_to_free);
				3002	}
				3003	return ret;
				3004	}
				3005	EXPORT_SYMBOL(try_to_free_buffers);
				3006
				3007	int block_sync_page(struct page *page)
				3008	{
				3009	struct address_space *mapping;
				3010
				3011	smp_mb();
				3012	mapping = page_mapping(page);
				3013	if (mapping)
				3014	blk_run_backing_dev(mapping->backing_dev_info, page);
				3015	return 0;
				3016	}
				3017
				3018	/*
				3019	* There are no bdflush tunables left. But distributions are
				3020	* still running obsolete flush daemons, so we terminate them here.
				3021	*
				3022	* Use of bdflush() is deprecated and will be removed in a future kernel.
				3023	* The `pdflush' kernel threads fully replace bdflush daemons and this call.
				3024	*/
				3025	asmlinkage long sys_bdflush(int func, long data)
				3026	{
				3027	static int msg_count;
				3028
				3029	if (!capable(CAP_SYS_ADMIN))
				3030	return -EPERM;
				3031
				3032	if (msg_count < 5) {
				3033	msg_count++;
				3034	printk(KERN_INFO
				3035	"warning: process `%s' used the obsolete bdflush"
				3036	" system call\n", current->comm);
				3037	printk(KERN_INFO "Fix your initscripts?\n");
				3038	}
				3039
				3040	if (func == 1)
				3041	do_exit(0);
				3042	return 0;
				3043	}
				3044
				3045	/*
				3046	* Buffer-head allocation
				3047	*/
				3048	static kmem_cache_t *bh_cachep;
				3049
				3050	/*
				3051	* Once the number of bh's in the machine exceeds this level, we start
				3052	* stripping them in writeback.
				3053	*/
				3054	static int max_buffer_heads;
				3055
				3056	int buffer_heads_over_limit;
				3057
				3058	struct bh_accounting {
				3059	int nr; /* Number of live bh's */
				3060	int ratelimit; /* Limit cacheline bouncing */
				3061	};
				3062
				3063	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3064
				3065	static void recalc_bh_state(void)
				3066	{
				3067	int i;
				3068	int tot = 0;
				3069
				3070	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3071	return;
				3072	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3073	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3074	tot += per_cpu(bh_accounting, i).nr;
				3075	buffer_heads_over_limit = (tot > max_buffer_heads);
				3076	}
				3077
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3078	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3079	{
				3080	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
				3081	if (ret) {
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3082	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3083	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3084	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3085	}
				3086	return ret;
				3087	}
				3088	EXPORT_SYMBOL(alloc_buffer_head);
				3089
				3090	void free_buffer_head(struct buffer_head *bh)
				3091	{
				3092	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3093	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3094	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3095	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3096	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3097	}
				3098	EXPORT_SYMBOL(free_buffer_head);
				3099
				3100	static void
				3101	init_buffer_head(void data, kmem_cache_t cachep, unsigned long flags)
				3102	{
				3103	if ((flags & (SLAB_CTOR_VERIFY\|SLAB_CTOR_CONSTRUCTOR)) ==
				3104	SLAB_CTOR_CONSTRUCTOR) {
				3105	struct buffer_head * bh = (struct buffer_head *)data;
				3106
				3107	memset(bh, 0, sizeof(*bh));
				3108	INIT_LIST_HEAD(&bh->b_assoc_buffers);
				3109	}
				3110	}
				3111
				3112	#ifdef CONFIG_HOTPLUG_CPU
				3113	static void buffer_exit_cpu(int cpu)
				3114	{
				3115	int i;
				3116	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3117
				3118	for (i = 0; i < BH_LRU_SIZE; i++) {
				3119	brelse(b->bhs[i]);
				3120	b->bhs[i] = NULL;
				3121	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3122	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3123	per_cpu(bh_accounting, cpu).nr = 0;
				3124	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3125	}
				3126
				3127	static int buffer_cpu_notify(struct notifier_block *self,
				3128	unsigned long action, void *hcpu)
				3129	{
				3130	if (action == CPU_DEAD)
				3131	buffer_exit_cpu((unsigned long)hcpu);
				3132	return NOTIFY_OK;
				3133	}
				3134	#endif /* CONFIG_HOTPLUG_CPU */
				3135
				3136	void __init buffer_init(void)
				3137	{
				3138	int nrpages;
				3139
				3140	bh_cachep = kmem_cache_create("buffer_head",
Paul Jackson	b019600	2006-03-24 03:16:09 -0800	[diff] [blame]	3141	sizeof(struct buffer_head), 0,
				3142	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3143	SLAB_MEM_SPREAD),
				3144	init_buffer_head,
				3145	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3146
				3147	/*
				3148	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3149	*/
				3150	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3151	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3152	hotcpu_notifier(buffer_cpu_notify, 0);
				3153	}
				3154
				3155	EXPORT_SYMBOL(__bforget);
				3156	EXPORT_SYMBOL(__brelse);
				3157	EXPORT_SYMBOL(__wait_on_buffer);
				3158	EXPORT_SYMBOL(block_commit_write);
				3159	EXPORT_SYMBOL(block_prepare_write);
				3160	EXPORT_SYMBOL(block_read_full_page);
				3161	EXPORT_SYMBOL(block_sync_page);
				3162	EXPORT_SYMBOL(block_truncate_page);
				3163	EXPORT_SYMBOL(block_write_full_page);
				3164	EXPORT_SYMBOL(cont_prepare_write);
				3165	EXPORT_SYMBOL(end_buffer_async_write);
				3166	EXPORT_SYMBOL(end_buffer_read_sync);
				3167	EXPORT_SYMBOL(end_buffer_write_sync);
				3168	EXPORT_SYMBOL(file_fsync);
				3169	EXPORT_SYMBOL(fsync_bdev);
				3170	EXPORT_SYMBOL(generic_block_bmap);
				3171	EXPORT_SYMBOL(generic_commit_write);
				3172	EXPORT_SYMBOL(generic_cont_expand);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	3173	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3174	EXPORT_SYMBOL(init_buffer);
				3175	EXPORT_SYMBOL(invalidate_bdev);
				3176	EXPORT_SYMBOL(ll_rw_block);
				3177	EXPORT_SYMBOL(mark_buffer_dirty);
				3178	EXPORT_SYMBOL(submit_bh);
				3179	EXPORT_SYMBOL(sync_dirty_buffer);
				3180	EXPORT_SYMBOL(unlock_buffer);