md/raid5: avoid reading from known bad blocks. There are two times that we might read in raid5: 1/ when a read request fits within a chunk on a single working device. In this case, if there is any bad block in the range of the read, we simply fail the cache-bypass read and perform the read though the stripe cache. 2/ when reading into the stripe cache. In this case we mark as failed any device which has a bad block in that strip (1 page wide). Note that we will both avoid reading and avoid writing. This is correct (as we will never read from the block, there is no point writing), but not optimal (as writing could 'fix' the error) - that will be addressed later. If we have not seen any write errors on the device yet, we treat a bad block like a recent read error. This will encourage an attempt to fix the read error which will either generate a write error, or will ensure good data is stored there. We don't yet forget the bad block in that case. That comes later. Now that we honour bad blocks when reading we can allow devices with bad blocks into the array. Signed-off-by: NeilBrown <neilb@suse.de>

commit: 31c176ecdf3563140e6395249eda51a18130d9f6 [log] [tgz]
author: NeilBrown <neilb@suse.de> Thu Jul 28 11:39:22 2011 +1000
committer: NeilBrown <neilb@suse.de> Thu Jul 28 11:39:22 2011 +1000
tree: 5ad7dba363214e9d36fa921c221316d2597078ef
parent: 62096bce231b3760882ed91205fc84682d6b0529 [diff] [blame]
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 304389b..a2d6838 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c

@@ -2923,6 +2923,9 @@
 	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		mdk_rdev_t *rdev;
+		sector_t first_bad;
+		int bad_sectors;
+		int is_bad = 0;
 
 		dev = &sh->dev[i];
 
@@ -2959,15 +2962,32 @@
 		if (dev->written)
 			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
-		if (s->blocked_rdev == NULL &&
-		    rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			s->blocked_rdev = rdev;
-			atomic_inc(&rdev->nr_pending);
+		if (rdev) {
+			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
+					     &first_bad, &bad_sectors);
+			if (s->blocked_rdev == NULL
+			    && (test_bit(Blocked, &rdev->flags)
+				|| is_bad < 0)) {
+				if (is_bad < 0)
+					set_bit(BlockedBadBlocks,
+						&rdev->flags);
+				s->blocked_rdev = rdev;
+				atomic_inc(&rdev->nr_pending);
+			}
 		}
 		clear_bit(R5_Insync, &dev->flags);
 		if (!rdev)
 			/* Not in-sync */;
-		else if (test_bit(In_sync, &rdev->flags))
+		else if (is_bad) {
+			/* also not in-sync */
+			if (!test_bit(WriteErrorSeen, &rdev->flags)) {
+				/* treat as in-sync, but with a read error
+				 * which we can now try to correct
+				 */
+				set_bit(R5_Insync, &dev->flags);
+				set_bit(R5_ReadError, &dev->flags);
+			}
+		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
 		else {
 			/* in sync if before recovery_offset */
@@ -3471,6 +3491,9 @@
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
 	if (rdev && test_bit(In_sync, &rdev->flags)) {
+		sector_t first_bad;
+		int bad_sectors;
+
 		atomic_inc(&rdev->nr_pending);
 		rcu_read_unlock();
 		raid_bio->bi_next = (void*)rdev;
@@ -3478,8 +3501,10 @@
 		align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 		align_bi->bi_sector += rdev->data_offset;
 
-		if (!bio_fits_rdev(align_bi)) {
-			/* too big in some way */
+		if (!bio_fits_rdev(align_bi) ||
+		    is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+				&first_bad, &bad_sectors)) {
+			/* too big in some way, or has a known bad block */
 			bio_put(align_bi);
 			rdev_dec_pending(rdev, mddev);
 			return 0;
@@ -4671,10 +4696,6 @@
 	 * 0 for a fully functional array, 1 or 2 for a degraded array.
 	 */
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
-		if (rdev->badblocks.count) {
-			printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n");
-			goto abort;
-		}
 		if (rdev->raid_disk < 0)
 			continue;
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -4983,9 +5004,6 @@
 	int first = 0;
 	int last = conf->raid_disks - 1;
 
-	if (rdev->badblocks.count)
-		return -EINVAL;
-
 	if (has_failed(conf))
 		/* no point adding a device */
 		return -EINVAL;
commit	31c176ecdf3563140e6395249eda51a18130d9f6	[log] [tgz]
author	NeilBrown <neilb@suse.de>	Thu Jul 28 11:39:22 2011 +1000
committer	NeilBrown <neilb@suse.de>	Thu Jul 28 11:39:22 2011 +1000
tree	5ad7dba363214e9d36fa921c221316d2597078ef
parent	62096bce231b3760882ed91205fc84682d6b0529 [diff] [blame]