MD: raid5 avoid unnecessary zero page for trim

We want to avoid zeroing the discarded dev pages, because the zeroing is useless
work for a discard. But if we don't zero them, a later read or write that hits
such a page in the stripe cache will see inconsistent data.

To avoid zeroing the pages, we don't set the R5_UPTODATE flag after
reconstruction is done. This way the discard write request is still issued and
completed, but a read will not hit the stale page. If the stripe is accessed
again soon, we have to reread it, but since the chance of that is low, the
reread isn't a big deal.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
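
For reference, here is a minimal user-space sketch (not kernel code; the struct
and helpers are hypothetical, only the flag names mirror raid5) of the policy
the patch adopts: a completed discard leaves the cached stripe page stale, so
R5_UPTODATE is deliberately not set and a later read must go back to the device
instead of serving the stale page.

#include <stdbool.h>
#include <stdio.h>

enum r5dev_flag {
	R5_UPTODATE = 1 << 0,	/* cached page holds valid data */
	R5_Discard  = 1 << 1,	/* a discard write was issued for this block */
};

struct fake_r5dev {
	unsigned int flags;
};

/* Write completion: only non-discard writes leave a valid page behind. */
static void complete_write(struct fake_r5dev *dev)
{
	if (!(dev->flags & R5_Discard))
		dev->flags |= R5_UPTODATE;	/* page was filled by the write */
	/* discard case: page content is stale, leave R5_UPTODATE clear */
	dev->flags &= ~R5_Discard;
}

/* Read path: serve from the cache only if the page is marked up to date. */
static void read_block(const struct fake_r5dev *dev)
{
	if (dev->flags & R5_UPTODATE)
		printf("serve read from cached stripe page\n");
	else
		printf("page not up to date: reread block from the device\n");
}

int main(void)
{
	struct fake_r5dev normal  = { .flags = 0 };
	struct fake_r5dev trimmed = { .flags = R5_Discard };

	complete_write(&normal);	/* regular write: page now valid */
	complete_write(&trimmed);	/* discard: page deliberately left stale */

	read_block(&normal);	/* hits the cache */
	read_block(&trimmed);	/* forces a reread, as the patch intends */
	return 0;
}
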
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 74dcf19..758b772 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,7 +547,7 @@
 				rw = WRITE_FUA;
 			else
 				rw = WRITE;
-			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+			if (test_bit(R5_Discard, &sh->dev[i].flags))
 				rw |= REQ_DISCARD;
 		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 			rw = READ;
@@ -1172,11 +1172,9 @@
 					set_bit(R5_WantFUA, &dev->flags);
 				if (wbi->bi_rw & REQ_SYNC)
 					set_bit(R5_SyncIO, &dev->flags);
-				if (wbi->bi_rw & REQ_DISCARD) {
-					memset(page_address(dev->page), 0,
-						STRIPE_SECTORS << 9);
+				if (wbi->bi_rw & REQ_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
-				} else
+				else
 					tx = async_copy_data(1, wbi, dev->page,
 						dev->sector, tx);
 				wbi = r5_next_bio(wbi, dev->sector);
@@ -1194,7 +1192,7 @@
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	int i;
-	bool fua = false, sync = false;
+	bool fua = false, sync = false, discard = false;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -1202,13 +1200,15 @@
 	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
 		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
 	}
 
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			set_bit(R5_UPTODATE, &dev->flags);
+			if (!discard)
+				set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
 			if (sync)
@@ -1252,8 +1252,6 @@
 	}
 	if (i >= sh->disks) {
 		atomic_inc(&sh->count);
-		memset(page_address(sh->dev[pd_idx].page), 0,
-			STRIPE_SECTORS << 9);
 		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
 		ops_complete_reconstruct(sh);
 		return;
@@ -1314,10 +1312,6 @@
 	}
 	if (i >= sh->disks) {
 		atomic_inc(&sh->count);
-		memset(page_address(sh->dev[sh->pd_idx].page), 0,
-			STRIPE_SECTORS << 9);
-		memset(page_address(sh->dev[sh->qd_idx].page), 0,
-			STRIPE_SECTORS << 9);
 		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
 		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
 		ops_complete_reconstruct(sh);
@@ -2775,7 +2769,8 @@
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-				test_bit(R5_UPTODATE, &dev->flags)) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_and_clear_bit(R5_Discard, &dev->flags))) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				pr_debug("Return write for disc %d\n", i);
@@ -3493,10 +3488,12 @@
 	if (s.written &&
 	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
 			     && !test_bit(R5_LOCKED, &pdev->flags)
-			     && test_bit(R5_UPTODATE, &pdev->flags)))) &&
+			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
+				 test_bit(R5_Discard, &pdev->flags))))) &&
 	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
-			     && test_bit(R5_UPTODATE, &qdev->flags)))))
+			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
+				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -3523,9 +3520,11 @@
 		/* All the 'written' buffers and the parity block are ready to
 		 * be written back to disk
 		 */
-		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
+		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
+		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
 		BUG_ON(sh->qd_idx >= 0 &&
-		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
+		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
+		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&