blob: 9fa98fc74b058dc2685f0449f2e2b8cde2a9d305 [file] [log] [blame]
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
34
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070056
#ifndef MODULE
static void autostart_arrays(int part);
#endif

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static void md_print_devices(void);

/* Woken whenever a resync makes progress or completes. */
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
/* md_wq runs flush work; md_misc_wq runs delayed mddev deletion. */
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;

#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 * or /sys/block/mdX/md/sync_speed_{min,max}
 */

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
97static int sysctl_speed_limit_max = 200000;
NeilBrownfd01b882011-10-11 16:47:53 +110098static inline int speed_min(struct mddev *mddev)
NeilBrown88202a02006-01-06 00:21:36 -080099{
100 return mddev->sync_speed_min ?
101 mddev->sync_speed_min : sysctl_speed_limit_min;
102}
103
NeilBrownfd01b882011-10-11 16:47:53 +1100104static inline int speed_max(struct mddev *mddev)
NeilBrown88202a02006-01-06 00:21:36 -0800105{
106 return mddev->sync_speed_max ?
107 mddev->sync_speed_max : sysctl_speed_limit_max;
108}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109
static struct ctl_table_header *raid_table_header;

/* /proc/sys/dev/raid/{speed_limit_min,speed_limit_max} */
static ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

/* /proc/sys/dev/raid directory */
static ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

/* /proc/sys/dev directory */
static ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

/* Module parameter: when set, assembled arrays start read-only. */
static int start_readonly;
153
NeilBrowna167f662010-10-26 18:31:13 +1100154/* bio_clone_mddev
155 * like bio_clone, but with a local bio set
156 */
157
158static void mddev_bio_destructor(struct bio *bio)
159{
NeilBrownfd01b882011-10-11 16:47:53 +1100160 struct mddev *mddev, **mddevp;
NeilBrowna167f662010-10-26 18:31:13 +1100161
162 mddevp = (void*)bio;
163 mddev = mddevp[-1];
164
165 bio_free(bio, mddev->bio_set);
166}
167
168struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
NeilBrownfd01b882011-10-11 16:47:53 +1100169 struct mddev *mddev)
NeilBrowna167f662010-10-26 18:31:13 +1100170{
171 struct bio *b;
NeilBrownfd01b882011-10-11 16:47:53 +1100172 struct mddev **mddevp;
NeilBrowna167f662010-10-26 18:31:13 +1100173
174 if (!mddev || !mddev->bio_set)
175 return bio_alloc(gfp_mask, nr_iovecs);
176
177 b = bio_alloc_bioset(gfp_mask, nr_iovecs,
178 mddev->bio_set);
179 if (!b)
180 return NULL;
181 mddevp = (void*)b;
182 mddevp[-1] = mddev;
183 b->bi_destructor = mddev_bio_destructor;
184 return b;
185}
186EXPORT_SYMBOL_GPL(bio_alloc_mddev);
187
188struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
NeilBrownfd01b882011-10-11 16:47:53 +1100189 struct mddev *mddev)
NeilBrowna167f662010-10-26 18:31:13 +1100190{
191 struct bio *b;
NeilBrownfd01b882011-10-11 16:47:53 +1100192 struct mddev **mddevp;
NeilBrowna167f662010-10-26 18:31:13 +1100193
194 if (!mddev || !mddev->bio_set)
195 return bio_clone(bio, gfp_mask);
196
197 b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
198 mddev->bio_set);
199 if (!b)
200 return NULL;
201 mddevp = (void*)b;
202 mddevp[-1] = mddev;
203 b->bi_destructor = mddev_bio_destructor;
204 __bio_clone(b, bio);
205 if (bio_integrity(bio)) {
206 int ret;
207
208 ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
209
210 if (ret < 0) {
211 bio_put(b);
212 return NULL;
213 }
214 }
215
216 return b;
217}
218EXPORT_SYMBOL_GPL(bio_clone_mddev);
219
/*
 * md_trim_bio - restrict a cloned bio to a sub-range of its payload.
 * @bio:    cloned bio to trim (modified in place)
 * @offset: number of sectors to skip from the current start
 * @size:   number of sectors the bio should cover after trimming
 */
void md_trim_bio(struct bio *bio, int offset, int size)
{
	/* 'bio' is a cloned bio which we need to trim to match
	 * the given offset and size.
	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
	 */
	int i;
	struct bio_vec *bvec;
	int sofar = 0;

	size <<= 9;		/* sectors -> bytes */
	if (offset == 0 && size == bio->bi_size)
		return;		/* nothing to trim */

	bio->bi_sector += offset;
	bio->bi_size = size;
	offset <<= 9;		/* sectors -> bytes */
	clear_bit(BIO_SEG_VALID, &bio->bi_flags);

	/* Skip whole bio_vecs that lie entirely before 'offset'. */
	while (bio->bi_idx < bio->bi_vcnt &&
	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
		/* remove this whole bio_vec */
		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
		bio->bi_idx++;
	}
	/* The first remaining vec may start partway in; shift it. */
	if (bio->bi_idx < bio->bi_vcnt) {
		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
	}
	/* avoid any complications with bi_idx being non-zero*/
	if (bio->bi_idx) {
		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
		bio->bi_vcnt -= bio->bi_idx;
		bio->bi_idx = 0;
	}
	/* Make sure vcnt and last bv are not too big */
	bio_for_each_segment(bvec, bio, i) {
		if (sofar + bvec->bv_len > size)
			bvec->bv_len = size - sofar;
		if (bvec->bv_len == 0) {
			bio->bi_vcnt = i;
			break;
		}
		sofar += bvec->bv_len;
	}
}
EXPORT_SYMBOL_GPL(md_trim_bio);
268
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
/* Record an 'interesting' event and wake any /proc/mdstat pollers.
 * 'mddev' identifies the array the event relates to; it is not
 * otherwise used here.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
NeilBrownd7603b72006-01-06 00:20:30 -0800287
/* Alternate version that can be called from interrupts
 * when calling sysfs_notify isn't needed.
 * NOTE(review): body is currently identical to md_new_event().
 */
static void md_new_event_inintr(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
296
/*
 * Allows iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

304
/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock); 				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
326
327
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static void md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	struct mddev *mddev = q->queuedata;
	int cpu;
	unsigned int sectors;

	/* Reject I/O to an array with no personality or not yet ready. */
	if (mddev == NULL || mddev->pers == NULL
	    || !mddev->ready) {
		bio_io_error(bio);
		return;
	}
	smp_rmb(); /* Ensure implications of  'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
		/* Block until the array is resumed; drop the RCU read
		 * lock while sleeping so mddev_suspend()'s
		 * synchronize_rcu() can make progress.
		 */
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!mddev->suspended)
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	mddev->pers->make_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
	part_stat_unlock();

	/* If a suspend is waiting for in-flight I/O to drain, wake it. */
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
380
/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once ->stop is called and completes, the module will be completely
 * unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	BUG_ON(mddev->suspended);
	mddev->suspended = 1;
	/* Wait for md_make_request() callers inside their RCU read
	 * section to observe 'suspended', then drain in-flight I/O.
	 */
	synchronize_rcu();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);
NeilBrown409c57f2009-03-31 14:39:39 +1100398
/* Reverse of mddev_suspend: allow new I/O again, unquiesce the
 * personality, and wake the per-array threads.
 */
void mddev_resume(struct mddev *mddev)
{
	mddev->suspended = 0;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);
NeilBrown409c57f2009-03-31 14:39:39 +1100409
/* Report the array as congested while it is suspended.
 * 'bits' is accepted for the congested_fn signature but not consulted.
 */
int mddev_congested(struct mddev *mddev, int bits)
{
	return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
415
/*
 * Generic flush handling for md
 */

/* Completion callback for the per-rdev flush bios issued by
 * submit_flushes().  Drops the rdev reference taken there and,
 * when the last flush completes, schedules md_submit_flush_data().
 */
static void md_end_flush(struct bio *bio, int err)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}
433
static void md_submit_flush_data(struct work_struct *ws);

/* Send an empty FLUSH bio to every active, non-faulty member device.
 * flush_pending starts at 1 so the final dec_and_test below cannot
 * fire before all bios have been submitted; when the last flush
 * completes, flush_work (re-initialised here to md_submit_flush_data)
 * forwards the original bio.
 */
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bi->bi_bdev = rdev->bdev;
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
468
/* Runs after all per-device flushes have completed: pass the original
 * bio (minus its REQ_FLUSH flag) to the personality, or complete it
 * directly if it carried no data.  Clearing flush_bio lets the next
 * md_flush_request() proceed.
 */
static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	if (bio->bi_size == 0)
		/* an empty barrier - all done */
		bio_endio(bio, 0);
	else {
		bio->bi_rw &= ~REQ_FLUSH;
		mddev->pers->make_request(mddev, bio);
	}

	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);
}
485
/* Entry point for REQ_FLUSH bios: only one flush may be in progress
 * per array, so wait until flush_bio is free, claim it, and kick off
 * submit_flushes() on the md workqueue.
 */
void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->write_lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->write_lock, /*nothing*/);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->write_lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);
NeilBrown409c57f2009-03-31 14:39:39 +1100499
/* Support for plugging.
 * This mirrors the plugging support in request_queue, but does not
 * require having a whole queue or request structures.
 * We allocate an md_plug_cb for each md device and each thread it gets
 * plugged on.  This links to the private plug_handle structure in the
 * personality data where we keep a count of the number of outstanding
 * plugs so other code can see if a plug is active.
 */
struct md_plug_cb {
	struct blk_plug_cb cb;	/* must be first: container_of in plugger_unplug */
	struct mddev *mddev;
};
512
/* Called when a blk_plug is flushed: drop this callback's plug count
 * and, when it was the last outstanding plug, wake the md thread.
 */
static void plugger_unplug(struct blk_plug_cb *cb)
{
	struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
	if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
		md_wakeup_thread(mdcb->mddev->thread);
	kfree(mdcb);
}
520
/* Check that an unplug wakeup will come shortly.
 * If not, wakeup the md thread immediately
 * Returns 1 when a plug callback for this mddev is (now) registered
 * on the current task's plug list, 0 when the task is not plugged or
 * the callback could not be allocated.
 */
int mddev_check_plugged(struct mddev *mddev)
{
	struct blk_plug *plug = current->plug;
	struct md_plug_cb *mdcb;

	if (!plug)
		return 0;

	list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
		if (mdcb->cb.callback == plugger_unplug &&
		    mdcb->mddev == mddev) {
			/* Already on the list, move to top */
			if (mdcb != list_first_entry(&plug->cb_list,
						    struct md_plug_cb,
						    cb.list))
				list_move(&mdcb->cb.list, &plug->cb_list);
			return 1;
		}
	}
	/* Not currently on the callback list */
	mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
	if (!mdcb)
		return 0;

	mdcb->mddev = mddev;
	mdcb->cb.callback = plugger_unplug;
	atomic_inc(&mddev->plug_cnt);
	list_add(&mdcb->cb.list, &plug->cb_list);
	return 1;
}
EXPORT_SYMBOL_GPL(mddev_check_plugged);
NeilBrown2ac87402010-06-01 19:37:29 +1000555
/* Take an additional reference on 'mddev' and return it. */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
561
static void mddev_delayed_delete(struct work_struct *ws);

/* Drop a reference on 'mddev'.  When the last reference goes and the
 * array is completely unconfigured and not held active, tear it down:
 * either via the md_misc_wq work item (when a gendisk was probed) or
 * by freeing it directly.  The bio_set is freed after dropping
 * all_mddevs_lock, since bioset_free() may sleep.
 */
static void mddev_put(struct mddev *mddev)
{
	struct bio_set *bs = NULL;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);
		bs = mddev->bio_set;
		mddev->bio_set = NULL;
		if (mddev->gendisk) {
			/* We did a probe so need to clean up.  Call
			 * queue_work inside the spinlock so that
			 * flush_workqueue() after mddev_find will
			 * succeed in waiting for the work to be done.
			 */
			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
			queue_work(md_misc_wq, &mddev->del_work);
		} else
			kfree(mddev);
	}
	spin_unlock(&all_mddevs_lock);
	if (bs)
		bioset_free(bs);
}
592
/* Initialise a freshly-allocated (zeroed) mddev: locks, lists, timers,
 * counters, and the default resync/reshape bounds.  The initial
 * refcount is 1 (the caller's reference).
 */
void mddev_init(struct mddev *mddev)
{
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	init_timer(&mddev->safemode_timer);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	atomic_set(&mddev->plug_cnt, 0);
	spin_lock_init(&mddev->write_lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);
NeilBrownfafd7fb2010-04-01 15:55:30 +1100616
/* Find the mddev for device number 'unit', creating one if needed.
 * 'unit' == 0 means allocate an unused unit number instead.
 * Allocation happens outside all_mddevs_lock, so after allocating we
 * 'goto retry' and re-check the list in case someone else created the
 * same unit meanwhile.  Returns the mddev with a reference held, or
 * NULL on allocation failure / no free unit numbers.
 */
static struct mddev * mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	/* Round partitionable-device numbers down to the whole device. */
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);	/* lost the race; discard ours */
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}
690
/* Take the reconfiguration mutex; interruptible, so this returns
 * mutex_lock_interruptible()'s result (0 or a negative errno).
 */
static inline int mddev_lock(struct mddev * mddev)
{
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
695
/* Non-zero when the reconfiguration mutex is currently held. */
static inline int mddev_is_locked(struct mddev *mddev)
{
	return mutex_is_locked(&mddev->reconfig_mutex);
}
700
/* Try to take the reconfiguration mutex without sleeping;
 * returns mutex_trylock()'s result (1 on success, 0 on contention).
 */
static inline int mddev_trylock(struct mddev * mddev)
{
	return mutex_trylock(&mddev->reconfig_mutex);
}
705
static struct attribute_group md_redundancy_group;

/* Release the reconfiguration mutex, performing any deferred sysfs
 * group removal first (it must happen with the mutex dropped).
 */
static void mddev_unlock(struct mddev * mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So hold sysfs_active set while the remove is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	spin_unlock(&pers_lock);
}
750
NeilBrownfd01b882011-10-11 16:47:53 +1100751static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752{
NeilBrown3cb03002011-10-11 16:45:26 +1100753 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700754
NeilBrowndafb20f2012-03-19 12:46:39 +1100755 rdev_for_each(rdev, mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700756 if (rdev->desc_nr == nr)
757 return rdev;
Cheng Renquan159ec1f2009-01-09 08:31:08 +1100758
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 return NULL;
760}
761
NeilBrownfd01b882011-10-11 16:47:53 +1100762static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763{
NeilBrown3cb03002011-10-11 16:45:26 +1100764 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700765
NeilBrowndafb20f2012-03-19 12:46:39 +1100766 rdev_for_each(rdev, mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700767 if (rdev->bdev->bd_dev == dev)
768 return rdev;
Cheng Renquan159ec1f2009-01-09 08:31:08 +1100769
Linus Torvalds1da177e2005-04-16 15:20:36 -0700770 return NULL;
771}
772
NeilBrown84fc4b52011-10-11 16:49:58 +1100773static struct md_personality *find_pers(int level, char *clevel)
NeilBrown2604b702006-01-06 00:20:36 -0800774{
NeilBrown84fc4b52011-10-11 16:49:58 +1100775 struct md_personality *pers;
NeilBrownd9d166c2006-01-06 00:20:51 -0800776 list_for_each_entry(pers, &pers_list, list) {
777 if (level != LEVEL_NONE && pers->level == level)
NeilBrown2604b702006-01-06 00:20:36 -0800778 return pers;
NeilBrownd9d166c2006-01-06 00:20:51 -0800779 if (strcmp(pers->name, clevel)==0)
780 return pers;
781 }
NeilBrown2604b702006-01-06 00:20:36 -0800782 return NULL;
783}
784
/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	/* device size in 512-byte sectors; MD_NEW_SIZE_SECTORS places the
	 * v0.90 superblock near the end of the device */
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}
791
NeilBrown3cb03002011-10-11 16:45:26 +1100792static int alloc_disk_sb(struct md_rdev * rdev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700793{
794 if (rdev->sb_page)
795 MD_BUG();
796
797 rdev->sb_page = alloc_page(GFP_KERNEL);
798 if (!rdev->sb_page) {
799 printk(KERN_ALERT "md: out of memory.\n");
Andre Nollebc24332008-07-11 22:02:20 +1000800 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700801 }
802
803 return 0;
804}
805
/*
 * Drop the cached superblock page and bad-blocks page for @rdev and
 * reset the associated bookkeeping (location, size, loaded flag).
 */
static void free_disk_sb(struct md_rdev * rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
}
820
821
/*
 * bio completion callback for superblock writes issued by
 * md_super_write().  On failure the error is recorded against the
 * device via md_error(); in every case the array's pending-writes
 * count is dropped, waking md_super_wait() when it reaches zero.
 */
static void super_written(struct bio *bio, int error)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		printk("md: super_written gets error=%d, uptodate=%d\n",
		       error, test_bit(BIO_UPTODATE, &bio->bi_flags));
		WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
		md_error(mddev, rdev);
	}

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	bio_put(bio);
}
838
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		   sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

	/* prefer the separate metadata device when one is configured */
	bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	/* count must be raised before submission; super_written() drops it */
	atomic_inc(&mddev->pending_writes);
	/* FLUSH+FUA: superblock must be durable before dependent updates */
	submit_bio(WRITE_FLUSH_FUA, bio);
}
859
void md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	DEFINE_WAIT(wq);
	for(;;) {
		/* prepare_to_wait before the check avoids missing a wake-up
		 * from super_written() between the test and the sleep */
		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
		if (atomic_read(&mddev->pending_writes)==0)
			break;
		schedule();
	}
	finish_wait(&mddev->sb_wait, &wq);
}
872
/* bio completion callback for sync_page_io(): signal the waiting caller. */
static void bi_complete(struct bio *bio, int error)
{
	complete((struct completion*)bio->bi_private);
}
877
/*
 * Synchronously read or write @size bytes of @page at @sector of @rdev.
 * @metadata_op selects the metadata device (if any) and offsets the
 * sector by sb_start instead of data_offset.  Blocks until the I/O
 * completes.  Returns 1 on success, 0 on failure.
 */
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int rw, bool metadata_op)
{
	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
	struct completion event;
	int ret;

	rw |= REQ_SYNC;

	bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
		rdev->meta_bdev : rdev->bdev;
	if (metadata_op)
		bio->bi_sector = sector + rdev->sb_start;
	else
		bio->bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);
	init_completion(&event);
	bio->bi_private = &event;
	bio->bi_end_io = bi_complete;
	submit_bio(rw, bio);
	wait_for_completion(&event);

	/* success iff the bio completed up-to-date */
	ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700905
NeilBrown3cb03002011-10-11 16:45:26 +1100906static int read_disk_sb(struct md_rdev * rdev, int size)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700907{
908 char b[BDEVNAME_SIZE];
909 if (!rdev->sb_page) {
910 MD_BUG();
911 return -EINVAL;
912 }
913 if (rdev->sb_loaded)
914 return 0;
915
916
Jonathan Brassowccebd4c2011-01-14 09:14:33 +1100917 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700918 goto fail;
919 rdev->sb_loaded = 1;
920 return 0;
921
922fail:
923 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
924 bdevname(rdev->bdev,b));
925 return -EINVAL;
926}
927
928static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
929{
Andre Noll05710462008-07-11 22:02:20 +1000930 return sb1->set_uuid0 == sb2->set_uuid0 &&
931 sb1->set_uuid1 == sb2->set_uuid1 &&
932 sb1->set_uuid2 == sb2->set_uuid2 &&
933 sb1->set_uuid3 == sb2->set_uuid3;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700934}
935
Linus Torvalds1da177e2005-04-16 15:20:36 -0700936static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
937{
938 int ret;
939 mdp_super_t *tmp1, *tmp2;
940
941 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
942 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
943
944 if (!tmp1 || !tmp2) {
945 ret = 0;
Andre Noll35020f12008-03-23 15:10:33 +0100946 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700947 goto abort;
948 }
949
950 *tmp1 = *sb1;
951 *tmp2 = *sb2;
952
953 /*
954 * nr_disks is not constant
955 */
956 tmp1->nr_disks = 0;
957 tmp2->nr_disks = 0;
958
Andre Nollce0c8e02008-07-11 22:02:20 +1000959 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960abort:
Jesper Juhl990a8ba2005-06-21 17:17:30 -0700961 kfree(tmp1);
962 kfree(tmp2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700963 return ret;
964}
965
NeilBrown4d167f02007-05-09 02:35:37 -0700966
967static u32 md_csum_fold(u32 csum)
968{
969 csum = (csum & 0xffff) + (csum >> 16);
970 return (csum & 0xffff) + (csum >> 16);
971}
972
/*
 * Compute the checksum of a v0.90 superblock: 32-bit sum over
 * MD_SB_BYTES with end-around carry.  sb->sb_csum is zeroed for the
 * summation and then restored (folded on Alpha, see below).
 */
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	/* fold the 64-bit sum back to 32 bits */
	csum = (newcsum & 0xffffffff) + (newcsum>>32);


#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures. It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences). However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}
1003
1004
1005/*
1006 * Handle superblock details.
1007 * We want to be able to handle multiple superblock formats
1008 * so we have a common interface to them all, and an array of
1009 * different handlers.
1010 * We rely on user-space to write the initial superblock, and support
1011 * reading and updating of superblocks.
1012 * Interface methods are:
NeilBrown3cb03002011-10-11 16:45:26 +11001013 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001014 * loads and validates a superblock on dev.
1015 * if refdev != NULL, compare superblocks on both devices
1016 * Return:
1017 * 0 - dev has a superblock that is compatible with refdev
1018 * 1 - dev has a superblock that is compatible and newer than refdev
1019 * so dev should be used as the refdev in future
1020 * -EINVAL superblock incompatible or invalid
1021 * -othererror e.g. -EIO
1022 *
NeilBrownfd01b882011-10-11 16:47:53 +11001023 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024 * Verify that dev is acceptable into mddev.
1025 * The first time, mddev->raid_disks will be 0, and data from
1026 * dev should be merged in. Subsequent calls check that dev
1027 * is new enough. Return 0 or -EINVAL
1028 *
NeilBrownfd01b882011-10-11 16:47:53 +11001029 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030 * Update the superblock for rdev with data in mddev
1031 * This does not write to disc.
1032 *
1033 */
1034
/* Per-format operations table; one instance exists for each supported
 * on-disk superblock format (0.90 and 1.x).  See the comment above for
 * the load/validate/sync contracts. */
struct super_type {
	char *name;			/* format name, e.g. "0.90.0" */
	struct module *owner;
	int (*load_super)(struct md_rdev *rdev,
			  struct md_rdev *refdev,
			  int minor_version);
	int (*validate_super)(struct mddev *mddev,
			      struct md_rdev *rdev);
	void (*sync_super)(struct mddev *mddev,
			   struct md_rdev *rdev);
	/* grow/shrink the space usable for data; returns new size or 0 */
	unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
					       sector_t num_sectors);
	/* can the data_offset be moved to new_offset under this format? */
	int (*allow_new_offset)(struct md_rdev *rdev,
				unsigned long long new_offset);
};
1050
1051/*
Andre Noll0894cc32009-06-18 08:49:23 +10001052 * Check that the given mddev has no bitmap.
1053 *
1054 * This function is called from the run method of all personalities that do not
1055 * support bitmaps. It prints an error message and returns non-zero if mddev
1056 * has a bitmap. Otherwise, it returns 0.
1057 *
1058 */
NeilBrownfd01b882011-10-11 16:47:53 +11001059int md_check_no_bitmap(struct mddev *mddev)
Andre Noll0894cc32009-06-18 08:49:23 +10001060{
NeilBrownc3d97142009-12-14 12:49:52 +11001061 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
Andre Noll0894cc32009-06-18 08:49:23 +10001062 return 0;
1063 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
1064 mdname(mddev), mddev->pers->name);
1065 return 1;
1066}
1067EXPORT_SYMBOL(md_check_no_bitmap);
1068
1069/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001070 * load_super for 0.90.0
1071 */
/*
 * load_super for 0.90.0: read and sanity-check the v0.90 superblock on
 * @rdev (magic, version, checksum), initialise the rdev's metadata
 * fields from it, and if @refdev is given compare UUID/contents and
 * event counters.  Returns 1 if @rdev should become the new refdev,
 * 0 if compatible but older, -EINVAL/-EIO on error.
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret) return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
		       b);
		goto abort;
	}

	/* only 0.90 and the reshape-capable 0.91 variant are understood */
	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version,
			b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	/* checksums are compared folded to 16 bits (see calc_sb_csum) */
	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
			b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;		/* v0.90 data always starts at 0 */
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;	/* v0.90 has no bad-block log */

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		/* first device seen: it becomes the reference */
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!uuid_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!sb_equal(refsb, sb)) {
			printk(KERN_WARNING "md: %s has same UUID"
			       " but different superblock to %s\n",
			       b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		/* the device with the newer event count wins */
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that */
	if (rdev->sectors >= (2ULL << 32))
		rdev->sectors = (2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}
1164
1165/*
1166 * validate_super for 0.90.0
1167 */
/*
 * validate_super for 0.90.0: accept @rdev into @mddev.
 * On the first call (mddev->raid_disks == 0) the array-wide fields are
 * copied from the superblock; on later calls the device's event count
 * is checked against the array's.  Finally the device's role (faulty /
 * in-sync / recovering / write-mostly) is derived from its descriptor.
 * Returns 0 on acceptance, -EINVAL if the device is too old.
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	/* start from a clean slate; role is re-derived below */
	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* first device: populate the mddev from this superblock */
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			/* 0.91 means a reshape was in progress */
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			/* checkpoint only trusted if recorded at current events */
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL)
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		/* NOTE(review): indexes sb->disks[rdev->desc_nr]; desc_nr is
		 * -1 for LEVEL_MULTIPATH per super_90_load — presumably that
		 * path cannot reach here; confirm against callers. */
		if (sb->disks[rdev->desc_nr].state & (
			    (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position. We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}
1279
1280/*
1281 * sync_super for 0.90.0
1282 */
/*
 * sync_super for 0.90.0: rebuild @rdev's in-memory superblock page
 * from the current @mddev state (does not write to disk).
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;


	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = mddev->ctime;
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;	/* sectors -> KB */
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = mddev->utime;
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	/* an active reshape is recorded as minor version 91 */
	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		/* record a resync checkpoint tied to the current event count */
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		/* active members keep their raid_disk slot; others get
		 * the next free descriptor number */
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}
1422
/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	/* Returns the accepted size in sectors, or 0 if the change is
	 * not possible.  Re-places and rewrites the superblock at the
	 * end of the (new) device and waits for the write to finish. */
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if (num_sectors >= (2ULL << 32))
		num_sectors = (2ULL << 32) - 2;
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;
}
1446
/*
 * v0.90 metadata keeps the data area at the very start of the device,
 * so the only permissible "new" data offset is the existing one: zero.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	if (new_offset != 0)
		return 0;
	return 1;
}
Chris Webb0cd17fe2008-06-28 08:31:46 +10001453
1454/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 * version 1 superblock
1456 */
1457
/*
 * Compute the checksum of a v1.x superblock.
 *
 * The superblock is summed as little-endian 32-bit words (with the
 * sb_csum field itself treated as zero), the 64-bit total is folded
 * into 32 bits, and the result is returned in little-endian form.
 * sb->sb_csum is restored before returning, so the caller's buffer
 * is left unmodified.
 */
static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	/* 256 bytes of fixed header plus 2 bytes per device role */
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;
	int i;

	/* the checksum field must not contribute to its own value */
	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (i=0; size>=4; size -= 4 )
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		/* odd max_dev leaves a trailing 16-bit half-word */
		newcsum += le16_to_cpu(*(__le16*) isuper);

	/* fold carries from the upper 32 bits back into the sum */
	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}
1480
NeilBrown2699b672011-07-28 11:31:47 +10001481static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1482 int acknowledged);
/*
 * Load and sanity-check a v1.x superblock from @rdev.
 *
 * @minor_version selects where the superblock lives (see the switch
 * below).  If @refdev is non-NULL its superblock must describe the
 * same array (uuid/level/layout/chunksize); the return value is then
 * 1 if @rdev's event count is newer than @refdev's, else 0.  With no
 * @refdev the return is 1.  A negative errno is returned on any
 * validation failure.
 *
 * Side effects on success: rdev->sb_start, sb_size, data_offset,
 * new_data_offset, desc_nr, sectors, corrected_errors and the
 * bad-block list are initialised from the on-disk superblock.
 */
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;


	sb = page_address(rdev->sb_page);

	/* Basic sanity: magic, version, plausible max_dev, superblock
	 * self-reports the location we read it from, and no feature
	 * flags we do not understand. */
	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		printk("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		printk("md: data_size too small on %s\n",
		       bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	/* A reshape in progress may record a different (new) data offset */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	/* Round the superblock size up to the device's logical block size */
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	/* for minor versions 1 and 2 the sb precedes the data area and
	 * must not overlap it (for either the old or new data offset) */
	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, READ, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		/* Each on-disk entry packs (sector << 10 | length);
		 * an all-ones entry terminates the list. */
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			if (bb + 1 == 0)
				break;
			if (md_set_badblocks(&rdev->badblocks,
					     sector, count, 1) == 0)
				return -EINVAL;
		}
	} else if (sb->bblog_offset == 0)
		/* no bad-block log on this device: disable the list */
		rdev->badblocks.shift = -1;

	if (!refdev) {
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		/* refdev must describe the same array */
		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			printk(KERN_WARNING "md: %s has strangely different"
				" superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		/* report whether this device's metadata is the newer */
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* verify the device is big enough for the recorded data size */
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}
1649
/*
 * Validate @rdev's already-loaded v1.x superblock against @mddev and
 * initialise rdev->raid_disk/desc_nr and the Faulty/In_sync/
 * WriteMostly/Replacement flags.  If this is the first device seen
 * (mddev->raid_disks == 0) the array geometry is taken wholesale from
 * this superblock.  Returns 0 on success (including "accept as spare"),
 * -EINVAL if the device's event count is too old to assemble.
 */
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	/* start from a clean slate; the switch below fills these in */
	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* first device: adopt the array description from this sb */
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		/* internal bitmap location, unless a bitmap file is used */
		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL )
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			/* resume an interrupted reshape */
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			/* shrinking always reshapes backwards; otherwise
			 * the sb records the direction explicitly */
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on a good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		/* desc_nr outside the role table means "spare" */
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = 0xffff;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case 0xffff: /* spare */
			break;
		case 0xfffe: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		default:
			/* active device; may still be part-way through
			 * recovery rather than fully in-sync */
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET))
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
			else
				set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}
1758
NeilBrownfd01b882011-10-11 16:47:53 +11001759static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760{
1761 struct mdp_superblock_1 *sb;
NeilBrown3cb03002011-10-11 16:45:26 +11001762 struct md_rdev *rdev2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763 int max_dev, i;
1764 /* make rdev->sb match mddev and rdev data. */
1765
Namhyung Kim65a06f062011-07-27 11:00:36 +10001766 sb = page_address(rdev->sb_page);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767
1768 sb->feature_map = 0;
1769 sb->pad0 = 0;
NeilBrown5fd6c1d2006-06-26 00:27:40 -07001770 sb->recovery_offset = cpu_to_le64(0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001771 memset(sb->pad3, 0, sizeof(sb->pad3));
1772
1773 sb->utime = cpu_to_le64((__u64)mddev->utime);
1774 sb->events = cpu_to_le64(mddev->events);
1775 if (mddev->in_sync)
1776 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1777 else
1778 sb->resync_offset = cpu_to_le64(0);
1779
NeilBrown1c05b4b2006-10-21 10:24:08 -07001780 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
NeilBrown4dbcdc72006-01-06 00:20:52 -08001781
NeilBrownf0ca3402006-02-02 14:28:04 -08001782 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
Andre Noll58c0fed2009-03-31 14:33:13 +11001783 sb->size = cpu_to_le64(mddev->dev_sectors);
Andre Noll9d8f0362009-06-18 08:45:01 +10001784 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
NeilBrown62e1e382009-05-26 09:40:59 +10001785 sb->level = cpu_to_le32(mddev->level);
1786 sb->layout = cpu_to_le32(mddev->layout);
NeilBrownf0ca3402006-02-02 14:28:04 -08001787
NeilBrownaeb9b2112011-08-25 14:43:08 +10001788 if (test_bit(WriteMostly, &rdev->flags))
1789 sb->devflags |= WriteMostly1;
1790 else
1791 sb->devflags &= ~WriteMostly1;
NeilBrownc6563a82012-05-21 09:27:00 +10001792 sb->data_offset = cpu_to_le64(rdev->data_offset);
1793 sb->data_size = cpu_to_le64(rdev->sectors);
NeilBrownaeb9b2112011-08-25 14:43:08 +10001794
NeilBrownc3d97142009-12-14 12:49:52 +11001795 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1796 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
NeilBrown71c08052005-09-09 16:23:51 -07001797 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
NeilBrowna654b9d82005-06-21 17:17:27 -07001798 }
NeilBrown5fd6c1d2006-06-26 00:27:40 -07001799
1800 if (rdev->raid_disk >= 0 &&
NeilBrown97e4f422009-03-31 14:33:13 +11001801 !test_bit(In_sync, &rdev->flags)) {
NeilBrown93be75f2009-12-14 12:50:06 +11001802 sb->feature_map |=
1803 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1804 sb->recovery_offset =
1805 cpu_to_le64(rdev->recovery_offset);
NeilBrown5fd6c1d2006-06-26 00:27:40 -07001806 }
NeilBrown2d78f8c2011-12-23 10:17:51 +11001807 if (test_bit(Replacement, &rdev->flags))
1808 sb->feature_map |=
1809 cpu_to_le32(MD_FEATURE_REPLACEMENT);
NeilBrown5fd6c1d2006-06-26 00:27:40 -07001810
NeilBrownf6705572006-03-27 01:18:11 -08001811 if (mddev->reshape_position != MaxSector) {
1812 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1813 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1814 sb->new_layout = cpu_to_le32(mddev->new_layout);
1815 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1816 sb->new_level = cpu_to_le32(mddev->new_level);
Andre Noll664e7c42009-06-18 08:45:27 +10001817 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
NeilBrown2c810cd2012-05-21 09:27:00 +10001818 if (mddev->delta_disks == 0 &&
1819 mddev->reshape_backwards)
1820 sb->feature_map
1821 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
NeilBrownc6563a82012-05-21 09:27:00 +10001822 if (rdev->new_data_offset != rdev->data_offset) {
1823 sb->feature_map
1824 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1825 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1826 - rdev->data_offset));
1827 }
NeilBrownf6705572006-03-27 01:18:11 -08001828 }
NeilBrowna654b9d82005-06-21 17:17:27 -07001829
NeilBrown2699b672011-07-28 11:31:47 +10001830 if (rdev->badblocks.count == 0)
1831 /* Nothing to do for bad blocks*/ ;
1832 else if (sb->bblog_offset == 0)
1833 /* Cannot record bad blocks on this device */
1834 md_error(mddev, rdev);
1835 else {
1836 struct badblocks *bb = &rdev->badblocks;
1837 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1838 u64 *p = bb->page;
1839 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1840 if (bb->changed) {
1841 unsigned seq;
1842
1843retry:
1844 seq = read_seqbegin(&bb->lock);
1845
1846 memset(bbp, 0xff, PAGE_SIZE);
1847
1848 for (i = 0 ; i < bb->count ; i++) {
1849 u64 internal_bb = *p++;
1850 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1851 | BB_LEN(internal_bb));
1852 *bbp++ = cpu_to_le64(store_bb);
1853 }
NeilBrownd0962932012-03-19 12:46:41 +11001854 bb->changed = 0;
NeilBrown2699b672011-07-28 11:31:47 +10001855 if (read_seqretry(&bb->lock, seq))
1856 goto retry;
1857
1858 bb->sector = (rdev->sb_start +
1859 (int)le32_to_cpu(sb->bblog_offset));
1860 bb->size = le16_to_cpu(sb->bblog_size);
NeilBrown2699b672011-07-28 11:31:47 +10001861 }
1862 }
1863
Linus Torvalds1da177e2005-04-16 15:20:36 -07001864 max_dev = 0;
NeilBrowndafb20f2012-03-19 12:46:39 +11001865 rdev_for_each(rdev2, mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866 if (rdev2->desc_nr+1 > max_dev)
1867 max_dev = rdev2->desc_nr+1;
NeilBrowna778b732007-05-23 13:58:10 -07001868
NeilBrown70471da2009-08-03 10:59:57 +10001869 if (max_dev > le32_to_cpu(sb->max_dev)) {
1870 int bmask;
NeilBrowna778b732007-05-23 13:58:10 -07001871 sb->max_dev = cpu_to_le32(max_dev);
NeilBrown70471da2009-08-03 10:59:57 +10001872 rdev->sb_size = max_dev * 2 + 256;
1873 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1874 if (rdev->sb_size & bmask)
1875 rdev->sb_size = (rdev->sb_size | bmask) + 1;
NeilBrownddcf3522010-09-08 16:48:17 +10001876 } else
1877 max_dev = le32_to_cpu(sb->max_dev);
1878
Linus Torvalds1da177e2005-04-16 15:20:36 -07001879 for (i=0; i<max_dev;i++)
1880 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1881
NeilBrowndafb20f2012-03-19 12:46:39 +11001882 rdev_for_each(rdev2, mddev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001883 i = rdev2->desc_nr;
NeilBrownb2d444d2005-11-08 21:39:31 -08001884 if (test_bit(Faulty, &rdev2->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885 sb->dev_roles[i] = cpu_to_le16(0xfffe);
NeilBrownb2d444d2005-11-08 21:39:31 -08001886 else if (test_bit(In_sync, &rdev2->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001887 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
NeilBrown93be75f2009-12-14 12:50:06 +11001888 else if (rdev2->raid_disk >= 0)
NeilBrown5fd6c1d2006-06-26 00:27:40 -07001889 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 else
1891 sb->dev_roles[i] = cpu_to_le16(0xffff);
1892 }
1893
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 sb->sb_csum = calc_sb_1_csum(sb);
1895}
1896
/*
 * rdev_size_change for v1.x metadata.
 *
 * Resize the usable data area of @rdev to @num_sectors (0 means "as
 * large as possible"), updating and rewriting the superblock to record
 * the new size.  For minor version 0 the superblock itself must be
 * relocated to stay at the end of the device.  Returns the resulting
 * size in sectors, or 0 if the size cannot be changed.
 */
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		/* place sb 8K from the end, aligned to 4K */
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	/* record the new size and (possibly moved) location, re-checksum,
	 * then write the superblock out synchronously */
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = rdev->sb_start;
	sb->sb_csum = calc_sb_1_csum(sb);
	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
		       rdev->sb_page);
	md_super_wait(rdev->mddev);
	return num_sectors;

}
1935
1936static int
1937super_1_allow_new_offset(struct md_rdev *rdev,
1938 unsigned long long new_offset)
1939{
1940 /* All necessary checks on new >= old have been done */
1941 struct bitmap *bitmap;
1942 if (new_offset >= rdev->data_offset)
1943 return 1;
1944
1945 /* with 1.0 metadata, there is no metadata to tread on
1946 * so we can always move back */
1947 if (rdev->mddev->minor_version == 0)
1948 return 1;
1949
1950 /* otherwise we must be sure not to step on
1951 * any metadata, so stay:
1952 * 36K beyond start of superblock
1953 * beyond end of badblocks
1954 * beyond write-intent bitmap
1955 */
1956 if (rdev->sb_start + (32+4)*2 > new_offset)
1957 return 0;
1958 bitmap = rdev->mddev->bitmap;
1959 if (bitmap && !rdev->mddev->bitmap_info.file &&
1960 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1961 bitmap->file_pages * (PAGE_SIZE>>9) > new_offset)
1962 return 0;
1963 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1964 return 0;
1965
1966 return 1;
Chris Webb0cd17fe2008-06-28 08:31:46 +10001967}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001968
/*
 * Dispatch table for the supported on-disk metadata formats.
 * mddev->major_version indexes this array: 0 selects the legacy
 * 0.90 superblock, 1 selects the v1.x superblock family.
 */
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989
NeilBrownfd01b882011-10-11 16:47:53 +11001990static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
Jonathan Brassow076f9682011-06-07 17:51:30 -05001991{
1992 if (mddev->sync_super) {
1993 mddev->sync_super(mddev, rdev);
1994 return;
1995 }
1996
1997 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1998
1999 super_types[mddev->major_version].sync_super(mddev, rdev);
2000}
2001
NeilBrownfd01b882011-10-11 16:47:53 +11002002static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002003{
NeilBrown3cb03002011-10-11 16:45:26 +11002004 struct md_rdev *rdev, *rdev2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002005
NeilBrown4b809912008-07-21 17:05:25 +10002006 rcu_read_lock();
2007 rdev_for_each_rcu(rdev, mddev1)
2008 rdev_for_each_rcu(rdev2, mddev2)
NeilBrown7dd5e7c32007-02-28 20:11:35 -08002009 if (rdev->bdev->bd_contains ==
NeilBrown4b809912008-07-21 17:05:25 +10002010 rdev2->bdev->bd_contains) {
2011 rcu_read_unlock();
NeilBrown7dd5e7c32007-02-28 20:11:35 -08002012 return 1;
NeilBrown4b809912008-07-21 17:05:25 +10002013 }
2014 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002015 return 0;
2016}
2017
2018static LIST_HEAD(pending_raid_disks);
2019
Andre Nollac5e7112009-08-03 10:59:47 +10002020/*
2021 * Try to register data integrity profile for an mddev
2022 *
2023 * This is called when an array is started and after a disk has been kicked
2024 * from the array. It only succeeds if all working and active component devices
2025 * are integrity capable with matching profiles.
2026 */
int md_integrity_register(struct mddev *mddev)
{
	struct md_rdev *rdev, *reference = NULL;

	if (list_empty(&mddev->disks))
		return 0; /* nothing to do */
	if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
		return 0; /* shouldn't register, or already is */
	rdev_for_each(rdev, mddev) {
		/* skip spares and non-functional disks */
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (rdev->raid_disk < 0)
			continue;
		if (!reference) {
			/* Use the first rdev as the reference */
			reference = rdev;
			continue;
		}
		/* does this rdev's profile match the reference profile?
		 * any mismatch means we cannot offer integrity for the
		 * array as a whole */
		if (blk_integrity_compare(reference->bdev->bd_disk,
				rdev->bdev->bd_disk) < 0)
			return -EINVAL;
	}
	/* no usable member, or reference has no integrity profile:
	 * silently skip registration */
	if (!reference || !bdev_get_integrity(reference->bdev))
		return 0;
	/*
	 * All component devices are integrity capable and have matching
	 * profiles, register the common profile for the md device.
	 */
	if (blk_integrity_register(mddev->gendisk,
			bdev_get_integrity(reference->bdev)) != 0) {
		printk(KERN_ERR "md: failed to register integrity for %s\n",
			mdname(mddev));
		return -EINVAL;
	}
	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
	/* integrity bios need their own pool of integrity payloads */
	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
		       mdname(mddev));
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(md_integrity_register);
2072
2073/* Disable data integrity if non-capable/non-matching disk is being added */
NeilBrownfd01b882011-10-11 16:47:53 +11002074void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
Andre Nollac5e7112009-08-03 10:59:47 +10002075{
2076 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
2077 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
2078
2079 if (!bi_mddev) /* nothing to do */
2080 return;
2081 if (rdev->raid_disk < 0) /* skip spares */
2082 return;
2083 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2084 rdev->bdev->bd_disk) >= 0)
2085 return;
2086 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2087 blk_integrity_unregister(mddev->gendisk);
2088}
2089EXPORT_SYMBOL(md_integrity_add_rdev);
Martin K. Petersen3f9d99c2009-03-31 14:27:02 +11002090
/*
 * bind_rdev_to_array() - attach a component device to an md array.
 *
 * Validates the device (not already bound, not a duplicate, large
 * enough, unique descriptor number), registers its kobject and sysfs
 * links, then publishes it on mddev->disks under RCU and links it as a
 * holder of the array's gendisk.
 *
 * Returns 0 on success, or -EINVAL (already bound), -EEXIST
 * (duplicate), -ENOSPC (too small for a started array), -EBUSY
 * (desc_nr clash or too many devices), or the kobject_add() error.
 */
static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	char *s;
	int err;

	if (rdev->mddev) {
		MD_BUG();
		return -EINVAL;
	}

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (rdev->sectors && (mddev->dev_sectors == 0 ||
			rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			/* Array not started yet: shrink it to fit. */
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	if (rdev->desc_nr < 0) {
		int choice = 0;
		/* On a running array, spares get numbers past the members. */
		if (mddev->pers) choice = mddev->raid_disks;
		while (find_rdev_nr(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (find_rdev_nr(mddev, rdev->desc_nr))
			return -EBUSY;
	}
	if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
		       mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	/* '/' is not allowed in a kobject name ("dev-%s" below). */
	bdevname(rdev->bdev,b);
	while ( (s=strchr(b, '/')) != NULL)
		*s = '!';

	rdev->mddev = mddev;
	printk(KERN_INFO "md: bind<%s>\n", b);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");

	/* Publish on the member list; readers traverse it under RCU. */
	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
	       b, mdname(mddev));
	return err;
}
2168
/* Deferred (md_misc_wq) removal of an rdev's kobject.  Doing the
 * kobject_del() here instead of in unbind_rdev_from_array() avoids
 * deadlocking when the removal was triggered by a write to the rdev's
 * own sysfs "state" file; also drops the reference taken when the
 * work was queued. */
static void md_delayed_delete(struct work_struct *ws)
{
	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
	kobject_del(&rdev->kobj);
	kobject_put(&rdev->kobj);
}
2175
/*
 * unbind_rdev_from_array() - detach @rdev from its array.
 *
 * Reverses bind_rdev_to_array(): unlinks the gendisk holder
 * relationship, removes the device from mddev->disks (RCU list),
 * tears down its sysfs links and bad-block table, then — after an
 * RCU grace period — schedules the final kobject removal on
 * md_misc_wq (see md_delayed_delete()).
 */
static void unbind_rdev_from_array(struct md_rdev * rdev)
{
	char b[BDEVNAME_SIZE];
	if (!rdev->mddev) {
		MD_BUG();
		return;
	}
	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	kfree(rdev->badblocks.page);
	rdev->badblocks.count = 0;
	rdev->badblocks.page = NULL;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state". We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);	/* dropped by md_delayed_delete() */
	queue_work(md_misc_wq, &rdev->del_work);
}
2202
2203/*
2204 * prevent the device from being mounted, repartitioned or
2205 * otherwise reused by a RAID array (or any other kernel
2206 * subsystem), by bd_claiming the device.
2207 */
NeilBrown3cb03002011-10-11 16:45:26 +11002208static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002209{
2210 int err = 0;
2211 struct block_device *bdev;
2212 char b[BDEVNAME_SIZE];
2213
Tejun Heod4d77622010-11-13 11:55:18 +01002214 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
NeilBrown3cb03002011-10-11 16:45:26 +11002215 shared ? (struct md_rdev *)lock_rdev : rdev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216 if (IS_ERR(bdev)) {
2217 printk(KERN_ERR "md: could not open %s.\n",
2218 __bdevname(dev, b));
2219 return PTR_ERR(bdev);
2220 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 rdev->bdev = bdev;
2222 return err;
2223}
2224
/* Release the exclusive bdev claim taken by lock_rdev().  Clears
 * rdev->bdev first; a NULL bdev here means the rdev was never locked
 * (or was unlocked twice) and is flagged as a bug. */
static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	rdev->bdev = NULL;
	if (!bdev)
		MD_BUG();
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
2233
void md_autodetect_dev(dev_t dev);

/* Fully release an rdev that is no longer bound to any array: free its
 * in-memory superblock, hand the device back to md_autodetect_dev()
 * (built-in kernels only, and only if it came from autodetection),
 * drop the exclusive bdev claim and the final kobject reference. */
static void export_rdev(struct md_rdev * rdev)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_INFO "md: export_rdev(%s)\n",
		bdevname(rdev->bdev,b));
	if (rdev->mddev)
		MD_BUG();	/* must be unbound before export */
	free_disk_sb(rdev);
#ifndef MODULE
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}
2251
/* Remove @rdev from its array and release it completely: first detach
 * it (unbind), then free all of its resources (export). */
static void kick_rdev_from_array(struct md_rdev * rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
2257
/* Kick every component device out of @mddev, leaving an empty,
 * version-less array. */
static void export_array(struct mddev *mddev)
{
	struct md_rdev *rdev, *tmp;

	/* _safe iteration: kick_rdev_from_array() unlinks rdev. */
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (!rdev->mddev) {
			MD_BUG();
			continue;
		}
		kick_rdev_from_array(rdev);
	}
	if (!list_empty(&mddev->disks))
		MD_BUG();
	mddev->raid_disks = 0;
	mddev->major_version = 0;
}
2274
/* Debug helper: dump one 0.90 superblock disk descriptor. */
static void print_desc(mdp_disk_t *desc)
{
	printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
	       desc->major,desc->minor,desc->raid_disk,desc->state);
}
2280
/* Debug helper: dump a version-0.90 superblock and its interesting
 * disk descriptors to the kernel log. */
static void print_sb_90(mdp_super_t *sb)
{
	int i;

	printk(KERN_INFO
	       "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
	       sb->major_version, sb->minor_version, sb->patch_version,
	       sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
	       sb->ctime);
	printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
	       sb->level, sb->size, sb->nr_disks, sb->raid_disks,
	       sb->md_minor, sb->layout, sb->chunk_size);
	printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
	       " FD:%d SD:%d CSUM:%08x E:%08lx\n",
	       sb->utime, sb->state, sb->active_disks, sb->working_disks,
	       sb->failed_disks, sb->spare_disks,
	       sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;

		desc = sb->disks + i;
		/* Skip all-zero slots and those whose only information is
		 * state 4 — presumably the plain in-sync case; confirm
		 * against MD_DISK_* bit definitions. */
		if (desc->number || desc->major || desc->minor ||
		    desc->raid_disk || (desc->state && (desc->state != 4))) {
			printk(" D %2d: ", i);
			print_desc(desc);
		}
	}
	printk(KERN_INFO "md: THIS: ");
	print_desc(&sb->this_disk);
}
2313
/* Debug helper: dump a version-1.x superblock to the kernel log.
 * All multi-byte fields are stored little-endian on disk, hence the
 * le32/le64_to_cpu conversions. */
static void print_sb_1(struct mdp_superblock_1 *sb)
{
	__u8 *uuid;

	uuid = sb->set_uuid;
	printk(KERN_INFO
	       "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
	       "md: Name: \"%s\" CT:%llu\n",
	       le32_to_cpu(sb->major_version),
	       le32_to_cpu(sb->feature_map),
	       uuid,
	       sb->set_name,
	       (unsigned long long)le64_to_cpu(sb->ctime)
	       & MD_SUPERBLOCK_1_TIME_SEC_MASK);

	uuid = sb->device_uuid;
	printk(KERN_INFO
	       "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
	       " RO:%llu\n"
	       "md: Dev:%08x UUID: %pU\n"
	       "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
	       "md: (MaxDev:%u) \n",
	       le32_to_cpu(sb->level),
	       (unsigned long long)le64_to_cpu(sb->size),
	       le32_to_cpu(sb->raid_disks),
	       le32_to_cpu(sb->layout),
	       le32_to_cpu(sb->chunksize),
	       (unsigned long long)le64_to_cpu(sb->data_offset),
	       (unsigned long long)le64_to_cpu(sb->data_size),
	       (unsigned long long)le64_to_cpu(sb->super_offset),
	       (unsigned long long)le64_to_cpu(sb->recovery_offset),
	       le32_to_cpu(sb->dev_number),
	       uuid,
	       sb->devflags,
	       (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
	       (unsigned long long)le64_to_cpu(sb->events),
	       (unsigned long long)le64_to_cpu(sb->resync_offset),
	       le32_to_cpu(sb->sb_csum),
	       le32_to_cpu(sb->max_dev)
		);
}
2355
/* Debug helper: dump an rdev's in-core state and, if its superblock
 * page has been loaded, the superblock itself in the format selected
 * by @major_version (0 => 0.90, 1 => 1.x). */
static void print_rdev(struct md_rdev *rdev, int major_version)
{
	char b[BDEVNAME_SIZE];
	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
	       bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
	       test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
	       rdev->desc_nr);
	if (rdev->sb_loaded) {
		printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
		switch (major_version) {
		case 0:
			print_sb_90(page_address(rdev->sb_page));
			break;
		case 1:
			print_sb_1(page_address(rdev->sb_page));
			break;
		}
	} else
		printk(KERN_INFO "md: no rdev superblock!\n");
}
2376
/* Debug helper: dump the complete RAID state — every array, its
 * bitmap superblock (if any), its member list, and each member's
 * superblock — to the kernel log. */
static void md_print_devices(void)
{
	struct list_head *tmp;
	struct md_rdev *rdev;
	struct mddev *mddev;
	char b[BDEVNAME_SIZE];

	printk("\n");
	printk("md: **********************************\n");
	printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md: **********************************\n");
	for_each_mddev(mddev, tmp) {

		if (mddev->bitmap)
			bitmap_print_sb(mddev->bitmap);
		else
			printk("%s: ", mdname(mddev));
		rdev_for_each(rdev, mddev)
			printk("<%s>", bdevname(rdev->bdev,b));
		printk("\n");

		rdev_for_each(rdev, mddev)
			print_rdev(rdev, mddev->major_version);
	}
	printk("md: **********************************\n");
	printk("\n");
}
2404
2405
/* Refresh the in-memory superblock image of every member device in
 * preparation for md_update_sb() writing them out. */
static void sync_sbs(struct mddev * mddev, int nospares)
{
	/* Update each superblock (in-memory image), but
	 * if we are allowed to, skip spares which already
	 * have the right event counter, or have one earlier
	 * (which would mean they aren't being marked as dirty
	 * with the rest of the array)
	 */
	struct md_rdev *rdev;
	rdev_for_each(rdev, mddev) {
		if (rdev->sb_events == mddev->events ||
		    (nospares &&
		     rdev->raid_disk < 0 &&
		     rdev->sb_events+1 == mddev->events)) {
			/* Don't update this superblock.
			 * sb_loaded == 2 marks it "current but not to be
			 * written": md_update_sb() only writes devices
			 * with sb_loaded == 1. */
			rdev->sb_loaded = 2;
		} else {
			sync_super(mddev, rdev);
			rdev->sb_loaded = 1;
		}
	}
}
2428
/*
 * md_update_sb() - write the array's superblocks to all members.
 *
 * Brings per-device recovery offsets up to date, adjusts the event
 * counter (it may roll *back* by one for a pure clean<->dirty
 * transition so spare superblocks need not be rewritten), refreshes
 * every in-memory superblock via sync_sbs(), writes them — plus any
 * pending bad-block lists — and waits for the I/O.  If the array
 * state changed meanwhile, the whole sequence repeats.  For
 * non-persistent arrays only in-memory flags are updated.
 *
 * @force_change: non-zero forces a full update including spares
 * (defeats the "nospares" optimisation).
 */
static void md_update_sb(struct mddev * mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;

repeat:
	/* First make sure individual recovery_offsets are correct */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
			rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		/* Nothing on disk to update: just clear the change flags
		 * and acknowledge bad blocks in memory. */
		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
		if (!mddev->external) {
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					md_ack_all_badblocks(&rdev->badblocks);
					/* no metadata to record the bad
					 * blocks in, so fail the device */
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock_irq(&mddev->write_lock);

	mddev->utime = get_seconds();

	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
		force_change = 1;
	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	if (!mddev->events) {
		/*
		 * oops, this 64-bit counter should never wrap.
		 * Either we are in around ~1 trillion A.C., assuming
		 * 1 reboot per second, or we have a bug:
		 */
		MD_BUG();
		mddev->events --;
	}

	/* Record which devices have badblock/fault state to clean up
	 * after the superblocks have been written. */
	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock_irq(&mddev->write_lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags) &&
		    rdev->saved_raid_disk == -1) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			/* Flush any pending bad-block list as well. */
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else if (test_bit(Faulty, &rdev->flags))
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));
		else
			pr_debug("(skipping incremental s/r ");

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	md_super_wait(mddev);
	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync != sync_req ||
	    test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
		/* have to write it out again */
		spin_unlock_irq(&mddev->write_lock);
		goto repeat;
	}
	clear_bit(MD_CHANGE_PENDING, &mddev->flags);
	spin_unlock_irq(&mddev->write_lock);
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	/* The new state is safely on disk: unblock waiters and
	 * acknowledge the recorded bad blocks. */
	rdev_for_each(rdev, mddev) {
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			md_ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
2591
/* Strings written to sysfs files may or may not be '\n' terminated.
 * cmd_match() treats both forms as equal, so e.g. "remove" and
 * "remove\n" both match the command "remove".
 */
static int cmd_match(const char *cmd, const char *str)
{
	/* Walk both strings past their common prefix.  (*cmd == *str
	 * already implies *str != '\0' whenever *cmd != '\0'.) */
	for (; *cmd && *cmd == *str; cmd++, str++)
		;
	/* Tolerate a single trailing newline on the sysfs input. */
	if (*cmd == '\n')
		cmd++;
	/* Match only if both strings were consumed completely. */
	return !*cmd && !*str;
}
2611
/* One sysfs attribute of a member device: like kobj_attribute, but
 * show/store receive the md_rdev directly instead of a raw kobject. */
struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};
2617
/* sysfs "state" show: format the rdev's state flags as a
 * comma-separated, newline-terminated list (e.g. "in_sync",
 * "faulty,blocked").  Returns the number of bytes written to @page. */
static ssize_t
state_show(struct md_rdev *rdev, char *page)
{
	char *sep = "";
	size_t len = 0;

	/* Unacknowledged bad blocks are reported as "faulty" too. */
	if (test_bit(Faulty, &rdev->flags) ||
	    rdev->badblocks.unacked_exist) {
		len+= sprintf(page+len, "%sfaulty",sep);
		sep = ",";
	}
	if (test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sin_sync",sep);
		sep = ",";
	}
	if (test_bit(WriteMostly, &rdev->flags)) {
		len += sprintf(page+len, "%swrite_mostly",sep);
		sep = ",";
	}
	if (test_bit(Blocked, &rdev->flags) ||
	    (rdev->badblocks.unacked_exist
	     && !test_bit(Faulty, &rdev->flags))) {
		len += sprintf(page+len, "%sblocked", sep);
		sep = ",";
	}
	/* Neither faulty nor in-sync => the device is a spare. */
	if (!test_bit(Faulty, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sspare", sep);
		sep = ",";
	}
	if (test_bit(WriteErrorSeen, &rdev->flags)) {
		len += sprintf(page+len, "%swrite_error", sep);
		sep = ",";
	}
	if (test_bit(WantReplacement, &rdev->flags)) {
		len += sprintf(page+len, "%swant_replacement", sep);
		sep = ",";
	}
	if (test_bit(Replacement, &rdev->flags)) {
		len += sprintf(page+len, "%sreplacement", sep);
		sep = ",";
	}

	return len+sprintf(page+len, "\n");
}
2663
/* sysfs "state" store: execute one device-state command.
 * Returns @len on success, or a negative errno (-EINVAL for an
 * unrecognised command, -EBUSY when the requested transition is not
 * currently allowed). */
static ssize_t
state_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	/* can write
	 * faulty - simulates an error
	 * remove - disconnects the device
	 * writemostly - sets write_mostly
	 * -writemostly - clears write_mostly
	 * blocked - sets the Blocked flags
	 * -blocked - clears the Blocked and possibly simulates an error
	 * insync - sets Insync providing device isn't active
	 * write_error - sets WriteErrorSeen
	 * -write_error - clears WriteErrorSeen
	 * want_replacement - marks an active device for replacement
	 * -want_replacement - cancels a replacement request
	 * replacement - marks device as a replacement (array stopped only)
	 * -replacement - clears Replacement (array stopped only)
	 */
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else
			err = -EBUSY;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->raid_disk >= 0)
			err = -EBUSY;
		else {
			struct mddev *mddev = rdev->mddev;
			kick_rdev_from_array(rdev);
			if (mddev->pers)
				md_update_sb(mddev, 1);
			md_new_event(mddev);
			err = 0;
		}
	} else if (cmd_match(buf, "writemostly")) {
		set_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
		if (!test_bit(Faulty, &rdev->flags) &&
		    rdev->badblocks.unacked_exist) {
			/* metadata handler doesn't understand badblocks,
			 * so we need to fail the device
			 */
			md_error(rdev->mddev, rdev);
		}
		clear_bit(Blocked, &rdev->flags);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

		err = 0;
	} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
		set_bit(In_sync, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "write_error")) {
		set_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started. Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	}
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	return err ? err : len;
}
static struct rdev_sysfs_entry rdev_state =
__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
NeilBrown86e6ffd2005-11-08 21:39:24 -08002772
2773static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002774errors_show(struct md_rdev *rdev, char *page)
NeilBrown4dbcdc72006-01-06 00:20:52 -08002775{
2776 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2777}
2778
2779static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002780errors_store(struct md_rdev *rdev, const char *buf, size_t len)
NeilBrown4dbcdc72006-01-06 00:20:52 -08002781{
2782 char *e;
2783 unsigned long n = simple_strtoul(buf, &e, 10);
2784 if (*buf && (*e == 0 || *e == '\n')) {
2785 atomic_set(&rdev->corrected_errors, n);
2786 return len;
2787 }
2788 return -EINVAL;
2789}
2790static struct rdev_sysfs_entry rdev_errors =
NeilBrown80ca3a42006-07-10 04:44:18 -07002791__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
NeilBrown4dbcdc72006-01-06 00:20:52 -08002792
NeilBrown014236d2006-01-06 00:20:55 -08002793static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002794slot_show(struct md_rdev *rdev, char *page)
NeilBrown014236d2006-01-06 00:20:55 -08002795{
2796 if (rdev->raid_disk < 0)
2797 return sprintf(page, "none\n");
2798 else
2799 return sprintf(page, "%d\n", rdev->raid_disk);
2800}
2801
/* sysfs 'slot' store: assign this device to a raid slot, or "none"
 * to detach it.  Behaviour depends on whether the array is active
 * (rdev->mddev->pers != NULL):
 *  - active + "none": hot-remove the device via the personality;
 *  - active + number: hot-add (activate a spare) into that slot;
 *  - inactive:        just record the slot and mark the device In_sync.
 */
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	char *e;
	int err;
	int slot = simple_strtoul(buf, &e, 10);
	if (strncmp(buf, "none", 4)==0)
		slot = -1;
	else if (e==buf || (*e && *e!= '\n'))
		return -EINVAL;
	if (rdev->mddev->pers && slot == -1) {
		/* Setting 'slot' on an active array requires also
		 * updating the 'rd%d' link, and communicating
		 * with the personality with ->hot_*_disk.
		 * For now we only support removing
		 * failed/spare devices. This normally happens automatically,
		 * but not when the metadata is externally managed.
		 */
		if (rdev->raid_disk == -1)
			return -EEXIST;
		/* personality does all needed checks */
		if (rdev->mddev->pers->hot_remove_disk == NULL)
			return -EINVAL;
		err = rdev->mddev->pers->
			hot_remove_disk(rdev->mddev, rdev);
		if (err)
			return err;
		/* Drop the sysfs rd%d link before forgetting the slot,
		 * then let md_check_recovery() tidy up.
		 */
		sysfs_unlink_rdev(rdev->mddev, rdev);
		rdev->raid_disk = -1;
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
	} else if (rdev->mddev->pers) {
		/* Activating a spare .. or possibly reactivating
		 * if we ever get bitmaps working here.
		 */

		if (rdev->raid_disk != -1)
			return -EBUSY;

		if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
			return -EBUSY;

		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

		/* Slot must be valid for the final geometry of any
		 * reshape in progress (delta_disks may be negative).
		 */
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;

		rdev->raid_disk = slot;
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = slot;
		else
			rdev->saved_raid_disk = -1;
		clear_bit(In_sync, &rdev->flags);
		err = rdev->mddev->pers->
			hot_add_disk(rdev->mddev, rdev);
		if (err) {
			/* hot_add failed: the device keeps no role */
			rdev->raid_disk = -1;
			return err;
		} else
			sysfs_notify_dirent_safe(rdev->sysfs_state);
		if (sysfs_link_rdev(rdev->mddev, rdev))
			/* failure here is OK */;
		/* don't wakeup anyone, leave that to userspace. */
	} else {
		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;
		rdev->raid_disk = slot;
		/* assume it is working */
		clear_bit(Faulty, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
		set_bit(In_sync, &rdev->flags);
		sysfs_notify_dirent_safe(rdev->sysfs_state);
	}
	return len;
}


static struct rdev_sysfs_entry rdev_slot =
__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
NeilBrown014236d2006-01-06 00:20:55 -08002884
NeilBrown93c8cad2006-01-06 00:20:56 -08002885static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002886offset_show(struct md_rdev *rdev, char *page)
NeilBrown93c8cad2006-01-06 00:20:56 -08002887{
Andrew Morton6961ece2006-01-06 00:20:59 -08002888 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
NeilBrown93c8cad2006-01-06 00:20:56 -08002889}
2890
2891static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002892offset_store(struct md_rdev *rdev, const char *buf, size_t len)
NeilBrown93c8cad2006-01-06 00:20:56 -08002893{
NeilBrownc6563a82012-05-21 09:27:00 +10002894 unsigned long long offset;
2895 if (strict_strtoull(buf, 10, &offset) < 0)
NeilBrown93c8cad2006-01-06 00:20:56 -08002896 return -EINVAL;
Neil Brown8ed0a522008-06-28 08:31:29 +10002897 if (rdev->mddev->pers && rdev->raid_disk >= 0)
NeilBrown93c8cad2006-01-06 00:20:56 -08002898 return -EBUSY;
Andre Nolldd8ac332009-03-31 14:33:13 +11002899 if (rdev->sectors && rdev->mddev->external)
NeilBrownc5d79ad2008-02-06 01:39:54 -08002900 /* Must set offset before size, so overlap checks
2901 * can be sane */
2902 return -EBUSY;
NeilBrown93c8cad2006-01-06 00:20:56 -08002903 rdev->data_offset = offset;
2904 return len;
2905}
2906
2907static struct rdev_sysfs_entry rdev_offset =
NeilBrown80ca3a42006-07-10 04:44:18 -07002908__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
NeilBrown93c8cad2006-01-06 00:20:56 -08002909
NeilBrownc6563a82012-05-21 09:27:00 +10002910static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2911{
2912 return sprintf(page, "%llu\n",
2913 (unsigned long long)rdev->new_data_offset);
2914}
2915
/* sysfs 'new_offset' store: set the data offset this device will have
 * after a reshape.  Moving the offset forward implies a 'backwards'
 * reshape and vice-versa; the direction flag is updated to match.
 */
static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	unsigned long long new_offset;
	struct mddev *mddev = rdev->mddev;

	if (strict_strtoull(buf, 10, &new_offset) < 0)
		return -EINVAL;

	/* Cannot change while a resync/reshape thread is active. */
	if (mddev->sync_thread)
		return -EBUSY;
	if (new_offset == rdev->data_offset)
		/* reset is always permitted */
		;
	else if (new_offset > rdev->data_offset) {
		/* must not push array size beyond rdev_sectors */
		if (new_offset - rdev->data_offset
		    + mddev->dev_sectors > rdev->sectors)
			return -E2BIG;
	}
	/* Metadata worries about other space details. */

	/* decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (new_offset < rdev->data_offset &&
	    mddev->reshape_backwards)
		return -EINVAL;
	/* Increasing offset is inconsistent with forwards
	 * reshape. reshape_direction should be set to
	 * 'backwards' first.
	 */
	if (new_offset > rdev->data_offset &&
	    !mddev->reshape_backwards)
		return -EINVAL;

	/* Persistent metadata must confirm it can describe the new offset. */
	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;
	rdev->new_data_offset = new_offset;
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2966
NeilBrown83303b62006-01-06 00:21:06 -08002967static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11002968rdev_size_show(struct md_rdev *rdev, char *page)
NeilBrown83303b62006-01-06 00:21:06 -08002969{
Andre Nolldd8ac332009-03-31 14:33:13 +11002970 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
NeilBrown83303b62006-01-06 00:21:06 -08002971}
2972
NeilBrownc5d79ad2008-02-06 01:39:54 -08002973static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2974{
2975 /* check if two start/length pairs overlap */
2976 if (s1+l1 <= s2)
2977 return 0;
2978 if (s2+l2 <= s1)
2979 return 0;
2980 return 1;
2981}
2982
/* Parse a decimal count of 1K blocks and convert it to 512-byte
 * sectors, rejecting any value that would overflow either the
 * doubling or the (possibly narrower) sector_t type.
 * Returns 0 on success, -EINVAL on parse error or overflow.
 */
static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
{
	unsigned long long blocks;
	sector_t new;

	if (strict_strtoull(buf, 10, &blocks) < 0)
		return -EINVAL;

	/* top bit set would make blocks*2 wrap in unsigned long long */
	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
		return -EINVAL; /* sector conversion overflow */

	new = blocks * 2;
	/* round-trip check catches truncation when sector_t is 32-bit */
	if (new != blocks * 2)
		return -EINVAL; /* unsigned long long to sector_t overflow */

	*sectors = new;
	return 0;
}
3001
NeilBrown83303b62006-01-06 00:21:06 -08003002static ssize_t
NeilBrown3cb03002011-10-11 16:45:26 +11003003rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
NeilBrown83303b62006-01-06 00:21:06 -08003004{
NeilBrownfd01b882011-10-11 16:47:53 +11003005 struct mddev *my_mddev = rdev->mddev;
Andre Nolldd8ac332009-03-31 14:33:13 +11003006 sector_t oldsectors = rdev->sectors;
Dan Williamsb522adc2009-03-31 15:00:31 +11003007 sector_t sectors;
NeilBrown27c529b2008-03-04 14:29:33 -08003008
Dan Williamsb522adc2009-03-31 15:00:31 +11003009 if (strict_blocks_to_sectors(buf, &sectors) < 0)
Neil Brownd7027452008-07-12 10:37:50 +10003010 return -EINVAL;
NeilBrownc6563a82012-05-21 09:27:00 +10003011 if (rdev->data_offset != rdev->new_data_offset)
3012 return -EINVAL; /* too confusing */
Chris Webb0cd17fe2008-06-28 08:31:46 +10003013 if (my_mddev->pers && rdev->raid_disk >= 0) {
Neil Brownd7027452008-07-12 10:37:50 +10003014 if (my_mddev->persistent) {
Andre Nolldd8ac332009-03-31 14:33:13 +11003015 sectors = super_types[my_mddev->major_version].
3016 rdev_size_change(rdev, sectors);
3017 if (!sectors)
Chris Webb0cd17fe2008-06-28 08:31:46 +10003018 return -EBUSY;
Andre Nolldd8ac332009-03-31 14:33:13 +11003019 } else if (!sectors)
Mike Snitzer77304d22010-11-08 14:39:12 +01003020 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
Andre Nolldd8ac332009-03-31 14:33:13 +11003021 rdev->data_offset;
Chris Webb0cd17fe2008-06-28 08:31:46 +10003022 }
Andre Nolldd8ac332009-03-31 14:33:13 +11003023 if (sectors < my_mddev->dev_sectors)
Chris Webb7d3c6f82008-10-13 11:55:11 +11003024 return -EINVAL; /* component must fit device */
Chris Webb0cd17fe2008-06-28 08:31:46 +10003025
Andre Nolldd8ac332009-03-31 14:33:13 +11003026 rdev->sectors = sectors;
3027 if (sectors > oldsectors && my_mddev->external) {
NeilBrownc5d79ad2008-02-06 01:39:54 -08003028 /* need to check that all other rdevs with the same ->bdev
3029 * do not overlap. We need to unlock the mddev to avoid
Andre Nolldd8ac332009-03-31 14:33:13 +11003030 * a deadlock. We have already changed rdev->sectors, and if
NeilBrownc5d79ad2008-02-06 01:39:54 -08003031 * we have to change it back, we will have the lock again.
3032 */
NeilBrownfd01b882011-10-11 16:47:53 +11003033 struct mddev *mddev;
NeilBrownc5d79ad2008-02-06 01:39:54 -08003034 int overlap = 0;
Cheng Renquan159ec1f2009-01-09 08:31:08 +11003035 struct list_head *tmp;
NeilBrownc5d79ad2008-02-06 01:39:54 -08003036
NeilBrown27c529b2008-03-04 14:29:33 -08003037 mddev_unlock(my_mddev);
NeilBrown29ac4aa2008-02-06 01:39:58 -08003038 for_each_mddev(mddev, tmp) {
NeilBrown3cb03002011-10-11 16:45:26 +11003039 struct md_rdev *rdev2;
NeilBrownc5d79ad2008-02-06 01:39:54 -08003040
3041 mddev_lock(mddev);
NeilBrowndafb20f2012-03-19 12:46:39 +11003042 rdev_for_each(rdev2, mddev)
NeilBrownf21e9ff2011-01-31 12:10:09 +11003043 if (rdev->bdev == rdev2->bdev &&
3044 rdev != rdev2 &&
3045 overlaps(rdev->data_offset, rdev->sectors,
3046 rdev2->data_offset,
3047 rdev2->sectors)) {
NeilBrownc5d79ad2008-02-06 01:39:54 -08003048 overlap = 1;
3049 break;
3050 }
3051 mddev_unlock(mddev);
3052 if (overlap) {
3053 mddev_put(mddev);
3054 break;
3055 }
3056 }
NeilBrown27c529b2008-03-04 14:29:33 -08003057 mddev_lock(my_mddev);
NeilBrownc5d79ad2008-02-06 01:39:54 -08003058 if (overlap) {
3059 /* Someone else could have slipped in a size
3060 * change here, but doing so is just silly.
Andre Nolldd8ac332009-03-31 14:33:13 +11003061 * We put oldsectors back because we *know* it is
NeilBrownc5d79ad2008-02-06 01:39:54 -08003062 * safe, and trust userspace not to race with
3063 * itself
3064 */
Andre Nolldd8ac332009-03-31 14:33:13 +11003065 rdev->sectors = oldsectors;
NeilBrownc5d79ad2008-02-06 01:39:54 -08003066 return -EBUSY;
3067 }
3068 }
NeilBrown83303b62006-01-06 00:21:06 -08003069 return len;
3070}
3071
3072static struct rdev_sysfs_entry rdev_size =
NeilBrown80ca3a42006-07-10 04:44:18 -07003073__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
NeilBrown83303b62006-01-06 00:21:06 -08003074
Dan Williams06e3c812009-12-12 21:17:12 -07003075
NeilBrown3cb03002011-10-11 16:45:26 +11003076static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
Dan Williams06e3c812009-12-12 21:17:12 -07003077{
3078 unsigned long long recovery_start = rdev->recovery_offset;
3079
3080 if (test_bit(In_sync, &rdev->flags) ||
3081 recovery_start == MaxSector)
3082 return sprintf(page, "none\n");
3083
3084 return sprintf(page, "%llu\n", recovery_start);
3085}
3086
/* sysfs 'recovery_start' store: record how far this device has been
 * recovered ("none" = fully recovered).  The In_sync flag is kept
 * consistent with the new checkpoint.
 */
static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
{
	unsigned long long recovery_start;

	if (cmd_match(buf, "none"))
		recovery_start = MaxSector;
	else if (strict_strtoull(buf, 10, &recovery_start))
		return -EINVAL;

	/* cannot rewrite the checkpoint of an active array member */
	if (rdev->mddev->pers &&
	    rdev->raid_disk >= 0)
		return -EBUSY;

	rdev->recovery_offset = recovery_start;
	if (recovery_start == MaxSector)
		set_bit(In_sync, &rdev->flags);
	else
		clear_bit(In_sync, &rdev->flags);
	return len;
}

static struct rdev_sysfs_entry rdev_recovery_start =
__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3110
NeilBrown16c791a2011-07-28 11:31:47 +10003111
3112static ssize_t
3113badblocks_show(struct badblocks *bb, char *page, int unack);
3114static ssize_t
3115badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3116
/* sysfs 'bad_blocks' show: list all bad blocks (unack=0 -> include
 * both acknowledged and unacknowledged entries).
 */
static ssize_t bb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 0);
}
/* sysfs 'bad_blocks' store: add or acknowledge bad blocks.  Writers
 * blocked on an unacknowledged bad block are woken if the write
 * cleared BlockedBadBlocks.
 */
static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	int rv = badblocks_store(&rdev->badblocks, page, len, 0);
	/* Maybe that ack was all we needed */
	if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
		wake_up(&rdev->blocked_wait);
	return rv;
}
static struct rdev_sysfs_entry rdev_bad_blocks =
__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3131
3132
/* sysfs 'unacknowledged_bad_blocks': same as 'bad_blocks' but
 * restricted to entries not yet acknowledged by the metadata
 * handler (unack=1).
 */
static ssize_t ubb_show(struct md_rdev *rdev, char *page)
{
	return badblocks_show(&rdev->badblocks, page, 1);
}
static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
{
	return badblocks_store(&rdev->badblocks, page, len, 1);
}
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3143
/* Attributes created for every rdev kobject (NULL-terminated). */
static struct attribute *rdev_default_attrs[] = {
	&rdev_state.attr,
	&rdev_errors.attr,
	&rdev_slot.attr,
	&rdev_offset.attr,
	&rdev_new_offset.attr,
	&rdev_size.attr,
	&rdev_recovery_start.attr,
	&rdev_bad_blocks.attr,
	&rdev_unack_bad_blocks.attr,
	NULL,
};
/* Common sysfs show dispatcher for rdev attributes.  Takes the mddev
 * lock so the attribute sees a stable device, and re-checks that the
 * rdev is still attached (rdev->mddev may be cleared concurrently).
 */
static ssize_t
rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
	struct mddev *mddev = rdev->mddev;
	ssize_t rv;

	if (!entry->show)
		return -EIO;

	rv = mddev ? mddev_lock(mddev) : -EBUSY;
	if (!rv) {
		/* device may have been detached while we waited for the lock */
		if (rdev->mddev == NULL)
			rv = -EBUSY;
		else
			rv = entry->show(rdev, page);
		mddev_unlock(mddev);
	}
	return rv;
}
3177
/* Common sysfs store dispatcher for rdev attributes.  Requires
 * CAP_SYS_ADMIN, then mirrors rdev_attr_show(): take the mddev lock
 * and re-check the rdev is still attached before calling the handler.
 */
static ssize_t
rdev_attr_store(struct kobject *kobj, struct attribute *attr,
	      const char *page, size_t length)
{
	struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
	struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
	ssize_t rv;
	struct mddev *mddev = rdev->mddev;

	if (!entry->store)
		return -EIO;
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	rv = mddev ? mddev_lock(mddev): -EBUSY;
	if (!rv) {
		/* device may have been detached while we waited for the lock */
		if (rdev->mddev == NULL)
			rv = -EBUSY;
		else
			rv = entry->store(rdev, page, length);
		mddev_unlock(mddev);
	}
	return rv;
}
3201
/* kobject release: frees the rdev once its last reference is dropped. */
static void rdev_free(struct kobject *ko)
{
	struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
	kfree(rdev);
}
static const struct sysfs_ops rdev_sysfs_ops = {
	.show = rdev_attr_show,
	.store = rdev_attr_store,
};
/* kobject type for per-device sysfs directories under an md array. */
static struct kobj_type rdev_ktype = {
	.release = rdev_free,
	.sysfs_ops = &rdev_sysfs_ops,
	.default_attrs = rdev_default_attrs,
};
3216
/* Initialise a freshly allocated md_rdev to its "no role, no metadata"
 * state and allocate its bad-block table.
 * Returns 0, or -ENOMEM if the bad-block page cannot be allocated
 * (in which case the caller still owns and must free the rdev).
 */
int md_rdev_init(struct md_rdev *rdev)
{
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	rdev->raid_disk = -1;
	rdev->flags = 0;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_events = 0;
	rdev->last_read_error.tv_sec = 0;
	rdev->last_read_error.tv_nsec = 0;
	rdev->sb_loaded = 0;
	rdev->bb_page = NULL;
	atomic_set(&rdev->nr_pending, 0);
	atomic_set(&rdev->read_errors, 0);
	atomic_set(&rdev->corrected_errors, 0);

	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);

	/* Add space to store bad block list.
	 * This reserves the space even on arrays where it cannot
	 * be used - I wonder if that matters
	 */
	rdev->badblocks.count = 0;
	rdev->badblocks.shift = 0;
	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
	seqlock_init(&rdev->badblocks.lock);
	if (rdev->badblocks.page == NULL)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL_GPL(md_rdev_init);
/*
 * Import a device. If 'super_format' >= 0, then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 *
 * Returns the new rdev on success or ERR_PTR(-errno) on failure;
 * on failure everything allocated here is released via abort_free.
 */
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;
	sector_t size;

	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
		printk(KERN_ERR "md: could not alloc mem for new device!\n");
		return ERR_PTR(-ENOMEM);
	}

	/* md_rdev_init() also allocates the bad-block page */
	err = md_rdev_init(rdev);
	if (err)
		goto abort_free;
	err = alloc_disk_sb(rdev);
	if (err)
		goto abort_free;

	/* super_format == -2 means "share the bdev" (non-exclusive claim) */
	err = lock_rdev(rdev, newdev, super_format == -2);
	if (err)
		goto abort_free;

	kobject_init(&rdev->kobj, &rdev_ktype);

	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
	if (!size) {
		printk(KERN_WARNING
			"md: %s has zero or unknown size, marking faulty!\n",
			bdevname(rdev->bdev,b));
		err = -EINVAL;
		goto abort_free;
	}

	if (super_format >= 0) {
		err = super_types[super_format].
			load_super(rdev, NULL, super_minor);
		if (err == -EINVAL) {
			printk(KERN_WARNING
				"md: %s does not have a valid v%d.%d "
				"superblock, not importing!\n",
				bdevname(rdev->bdev,b),
				super_format, super_minor);
			goto abort_free;
		}
		if (err < 0) {
			printk(KERN_WARNING
				"md: could not read %s's sb, not importing!\n",
				bdevname(rdev->bdev,b));
			goto abort_free;
		}
	}
	if (super_format == -1)
		/* hot-add for 0.90, or non-persistent: so no badblocks */
		rdev->badblocks.shift = -1;

	return rdev;

abort_free:
	/* rdev->bdev is only set once lock_rdev() has succeeded */
	if (rdev->bdev)
		unlock_rdev(rdev);
	free_disk_sb(rdev);
	kfree(rdev->badblocks.page);
	kfree(rdev);
	return ERR_PTR(err);
}
3328
3329/*
3330 * Check a full RAID array for plausibility
3331 */
3332
3333
/* Examine the superblocks of every member device, pick the freshest
 * one as authoritative, and kick out devices whose superblock is
 * inconsistent or stale.  Remaining devices get their role assigned
 * (or cleared) from the validated metadata.
 */
static void analyze_sbs(struct mddev * mddev)
{
	int i;
	struct md_rdev *rdev, *freshest, *tmp;
	char b[BDEVNAME_SIZE];

	/* First pass: load_super() returns 1 when rdev is newer than
	 * 'freshest' so far, 0 when it is not, and <0 on corruption.
	 */
	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev)
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			printk( KERN_ERR \
				"md: fatal superblock inconsistency in %s"
				" -- removing from array\n",
				bdevname(rdev->bdev,b));
			kick_rdev_from_array(rdev);
		}


	/* the freshest superblock defines the array-wide geometry */
	super_types[mddev->major_version].
		validate_super(mddev, freshest);

	i = 0;
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (mddev->max_disks &&
		    (rdev->desc_nr >= mddev->max_disks ||
		     i > mddev->max_disks)) {
			printk(KERN_WARNING
			       "md: %s: %s: only %d devices permitted\n",
			       mdname(mddev), bdevname(rdev->bdev, b),
			       mddev->max_disks);
			kick_rdev_from_array(rdev);
			continue;
		}
		if (rdev != freshest)
			if (super_types[mddev->major_version].
			    validate_super(mddev, rdev)) {
				printk(KERN_WARNING "md: kicking non-fresh %s"
					" from array!\n",
					bdevname(rdev->bdev,b));
				kick_rdev_from_array(rdev);
				continue;
			}
		if (mddev->level == LEVEL_MULTIPATH) {
			/* multipath has no per-device metadata roles:
			 * just number the devices sequentially */
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			set_bit(In_sync, &rdev->flags);
		} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
			/* slot is beyond the array (allowing for a
			 * shrinking reshape): treat as a spare */
			rdev->raid_disk = -1;
			clear_bit(In_sync, &rdev->flags);
		}
	}
}
3392
/* Read a fixed-point number.
 * Numbers in sysfs attributes should be in "standard" units where
 * possible, so time should be in seconds.
 * However we internally use a much smaller unit such as
 * milliseconds or jiffies.
 * This function takes a decimal number with an optional fractional
 * component and stores in *res that number multiplied by 10^'scale',
 * all without any floating-point arithmetic.  Fractional digits
 * beyond 'scale' are discarded.  Returns 0 on success, -EINVAL if
 * the string is not a plain decimal (a trailing newline is allowed).
 */
int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
{
	unsigned long value = 0;
	long frac_digits = -1;	/* -1 until a '.' has been seen */

	for (; isdigit(*cp) || (*cp == '.' && frac_digits < 0); cp++) {
		if (*cp == '.') {
			frac_digits = 0;
			continue;
		}
		if (frac_digits >= scale)
			continue;	/* excess precision is discarded */
		value = value * 10 + (*cp - '0');
		if (frac_digits >= 0)
			frac_digits++;
	}
	if (*cp == '\n')
		cp++;
	if (*cp)
		return -EINVAL;
	if (frac_digits < 0)
		frac_digits = 0;
	/* pad with implicit trailing zeros up to the requested scale */
	while (frac_digits++ < scale)
		value *= 10;
	*res = value;
	return 0;
}
3432
3433
NeilBrown19052c02008-08-05 15:54:13 +10003434static void md_safemode_timeout(unsigned long data);
3435
NeilBrowneae17012005-11-08 21:39:23 -08003436static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003437safe_delay_show(struct mddev *mddev, char *page)
NeilBrown16f17b32006-06-26 00:27:37 -07003438{
3439 int msec = (mddev->safemode_delay*1000)/HZ;
3440 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3441}
/* sysfs 'safe_mode_delay' store: accept a delay in seconds (with up
 * to millisecond precision) and convert it to jiffies.  A non-zero
 * request is clamped up to at least one jiffy; shortening the delay
 * re-arms the safemode timer immediately.
 */
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
	unsigned long msec;

	if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
		return -EINVAL;
	if (msec == 0)
		mddev->safemode_delay = 0;
	else {
		unsigned long old_delay = mddev->safemode_delay;
		mddev->safemode_delay = (msec*HZ)/1000;
		if (mddev->safemode_delay == 0)
			mddev->safemode_delay = 1;
		if (mddev->safemode_delay < old_delay)
			md_safemode_timeout((unsigned long)mddev);
	}
	return len;
}
static struct md_sysfs_entry md_safe_delay =
__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
NeilBrown16f17b32006-06-26 00:27:37 -07003463
3464static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003465level_show(struct mddev *mddev, char *page)
NeilBrowneae17012005-11-08 21:39:23 -08003466{
NeilBrown84fc4b52011-10-11 16:49:58 +11003467 struct md_personality *p = mddev->pers;
NeilBrownd9d166c2006-01-06 00:20:51 -08003468 if (p)
NeilBrowneae17012005-11-08 21:39:23 -08003469 return sprintf(page, "%s\n", p->name);
NeilBrownd9d166c2006-01-06 00:20:51 -08003470 else if (mddev->clevel[0])
3471 return sprintf(page, "%s\n", mddev->clevel);
3472 else if (mddev->level != LEVEL_NONE)
3473 return sprintf(page, "%d\n", mddev->level);
3474 else
3475 return 0;
NeilBrowneae17012005-11-08 21:39:23 -08003476}
3477
NeilBrownd9d166c2006-01-06 00:20:51 -08003478static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003479level_store(struct mddev *mddev, const char *buf, size_t len)
NeilBrownd9d166c2006-01-06 00:20:51 -08003480{
Dan Williamsf2859af2010-05-02 10:04:16 -07003481 char clevel[16];
NeilBrown20a49ff2008-02-06 01:39:57 -08003482 ssize_t rv = len;
NeilBrown84fc4b52011-10-11 16:49:58 +11003483 struct md_personality *pers;
Dan Williamsf2859af2010-05-02 10:04:16 -07003484 long level;
NeilBrown245f46c2009-03-31 14:39:39 +11003485 void *priv;
NeilBrown3cb03002011-10-11 16:45:26 +11003486 struct md_rdev *rdev;
NeilBrown245f46c2009-03-31 14:39:39 +11003487
3488 if (mddev->pers == NULL) {
3489 if (len == 0)
3490 return 0;
3491 if (len >= sizeof(mddev->clevel))
3492 return -ENOSPC;
3493 strncpy(mddev->clevel, buf, len);
3494 if (mddev->clevel[len-1] == '\n')
3495 len--;
3496 mddev->clevel[len] = 0;
3497 mddev->level = LEVEL_NONE;
3498 return rv;
3499 }
3500
3501 /* request to change the personality. Need to ensure:
3502 * - array is not engaged in resync/recovery/reshape
3503 * - old personality can be suspended
3504 * - new personality will access other array.
3505 */
3506
NeilBrownbb4f1e92010-08-08 21:18:03 +10003507 if (mddev->sync_thread ||
3508 mddev->reshape_position != MaxSector ||
3509 mddev->sysfs_active)
NeilBrownd9d166c2006-01-06 00:20:51 -08003510 return -EBUSY;
NeilBrown245f46c2009-03-31 14:39:39 +11003511
3512 if (!mddev->pers->quiesce) {
3513 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3514 mdname(mddev), mddev->pers->name);
3515 return -EINVAL;
3516 }
3517
3518 /* Now find the new personality */
Dan Williamsf2859af2010-05-02 10:04:16 -07003519 if (len == 0 || len >= sizeof(clevel))
NeilBrown245f46c2009-03-31 14:39:39 +11003520 return -EINVAL;
Dan Williamsf2859af2010-05-02 10:04:16 -07003521 strncpy(clevel, buf, len);
3522 if (clevel[len-1] == '\n')
NeilBrownd9d166c2006-01-06 00:20:51 -08003523 len--;
Dan Williamsf2859af2010-05-02 10:04:16 -07003524 clevel[len] = 0;
3525 if (strict_strtol(clevel, 10, &level))
3526 level = LEVEL_NONE;
NeilBrown245f46c2009-03-31 14:39:39 +11003527
Dan Williamsf2859af2010-05-02 10:04:16 -07003528 if (request_module("md-%s", clevel) != 0)
3529 request_module("md-level-%s", clevel);
NeilBrown245f46c2009-03-31 14:39:39 +11003530 spin_lock(&pers_lock);
Dan Williamsf2859af2010-05-02 10:04:16 -07003531 pers = find_pers(level, clevel);
NeilBrown245f46c2009-03-31 14:39:39 +11003532 if (!pers || !try_module_get(pers->owner)) {
3533 spin_unlock(&pers_lock);
Dan Williamsf2859af2010-05-02 10:04:16 -07003534 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
NeilBrown245f46c2009-03-31 14:39:39 +11003535 return -EINVAL;
3536 }
3537 spin_unlock(&pers_lock);
3538
3539 if (pers == mddev->pers) {
3540 /* Nothing to do! */
3541 module_put(pers->owner);
3542 return rv;
3543 }
3544 if (!pers->takeover) {
3545 module_put(pers->owner);
3546 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
Dan Williamsf2859af2010-05-02 10:04:16 -07003547 mdname(mddev), clevel);
NeilBrown245f46c2009-03-31 14:39:39 +11003548 return -EINVAL;
3549 }
3550
NeilBrowndafb20f2012-03-19 12:46:39 +11003551 rdev_for_each(rdev, mddev)
NeilBrowne93f68a2010-06-15 09:36:03 +01003552 rdev->new_raid_disk = rdev->raid_disk;
3553
NeilBrown245f46c2009-03-31 14:39:39 +11003554 /* ->takeover must set new_* and/or delta_disks
3555 * if it succeeds, and may set them when it fails.
3556 */
3557 priv = pers->takeover(mddev);
3558 if (IS_ERR(priv)) {
3559 mddev->new_level = mddev->level;
3560 mddev->new_layout = mddev->layout;
Andre Noll664e7c42009-06-18 08:45:27 +10003561 mddev->new_chunk_sectors = mddev->chunk_sectors;
NeilBrown245f46c2009-03-31 14:39:39 +11003562 mddev->raid_disks -= mddev->delta_disks;
3563 mddev->delta_disks = 0;
NeilBrown2c810cd2012-05-21 09:27:00 +10003564 mddev->reshape_backwards = 0;
NeilBrown245f46c2009-03-31 14:39:39 +11003565 module_put(pers->owner);
3566 printk(KERN_WARNING "md: %s: %s would not accept array\n",
Dan Williamsf2859af2010-05-02 10:04:16 -07003567 mdname(mddev), clevel);
NeilBrown245f46c2009-03-31 14:39:39 +11003568 return PTR_ERR(priv);
3569 }
3570
3571 /* Looks like we have a winner */
3572 mddev_suspend(mddev);
3573 mddev->pers->stop(mddev);
NeilBrowna64c8762010-04-14 17:15:37 +10003574
3575 if (mddev->pers->sync_request == NULL &&
3576 pers->sync_request != NULL) {
3577 /* need to add the md_redundancy_group */
3578 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3579 printk(KERN_WARNING
3580 "md: cannot register extra attributes for %s\n",
3581 mdname(mddev));
NeilBrown19fdb9e2010-05-22 08:31:36 +10003582 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
NeilBrowna64c8762010-04-14 17:15:37 +10003583 }
3584 if (mddev->pers->sync_request != NULL &&
3585 pers->sync_request == NULL) {
3586 /* need to remove the md_redundancy_group */
3587 if (mddev->to_remove == NULL)
3588 mddev->to_remove = &md_redundancy_group;
3589 }
3590
Trela Maciej54071b32010-03-08 16:02:42 +11003591 if (mddev->pers->sync_request == NULL &&
3592 mddev->external) {
3593 /* We are converting from a no-redundancy array
3594 * to a redundancy array and metadata is managed
3595 * externally so we need to be sure that writes
3596 * won't block due to a need to transition
3597 * clean->dirty
3598 * until external management is started.
3599 */
3600 mddev->in_sync = 0;
3601 mddev->safemode_delay = 0;
3602 mddev->safemode = 0;
3603 }
3604
NeilBrowndafb20f2012-03-19 12:46:39 +11003605 rdev_for_each(rdev, mddev) {
NeilBrowne93f68a2010-06-15 09:36:03 +01003606 if (rdev->raid_disk < 0)
3607 continue;
NeilBrownbf2cb0d2011-01-14 09:14:34 +11003608 if (rdev->new_raid_disk >= mddev->raid_disks)
NeilBrowne93f68a2010-06-15 09:36:03 +01003609 rdev->new_raid_disk = -1;
3610 if (rdev->new_raid_disk == rdev->raid_disk)
3611 continue;
Namhyung Kim36fad852011-07-27 11:00:36 +10003612 sysfs_unlink_rdev(mddev, rdev);
NeilBrowne93f68a2010-06-15 09:36:03 +01003613 }
NeilBrowndafb20f2012-03-19 12:46:39 +11003614 rdev_for_each(rdev, mddev) {
NeilBrowne93f68a2010-06-15 09:36:03 +01003615 if (rdev->raid_disk < 0)
3616 continue;
3617 if (rdev->new_raid_disk == rdev->raid_disk)
3618 continue;
3619 rdev->raid_disk = rdev->new_raid_disk;
3620 if (rdev->raid_disk < 0)
NeilBrown3a981b03f2009-08-03 10:59:55 +10003621 clear_bit(In_sync, &rdev->flags);
NeilBrowne93f68a2010-06-15 09:36:03 +01003622 else {
Namhyung Kim36fad852011-07-27 11:00:36 +10003623 if (sysfs_link_rdev(mddev, rdev))
3624 printk(KERN_WARNING "md: cannot register rd%d"
3625 " for %s after level change\n",
3626 rdev->raid_disk, mdname(mddev));
NeilBrown3a981b03f2009-08-03 10:59:55 +10003627 }
NeilBrowne93f68a2010-06-15 09:36:03 +01003628 }
3629
3630 module_put(mddev->pers->owner);
NeilBrown245f46c2009-03-31 14:39:39 +11003631 mddev->pers = pers;
3632 mddev->private = priv;
3633 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3634 mddev->level = mddev->new_level;
3635 mddev->layout = mddev->new_layout;
Andre Noll664e7c42009-06-18 08:45:27 +10003636 mddev->chunk_sectors = mddev->new_chunk_sectors;
NeilBrown245f46c2009-03-31 14:39:39 +11003637 mddev->delta_disks = 0;
NeilBrown2c810cd2012-05-21 09:27:00 +10003638 mddev->reshape_backwards = 0;
Krzysztof Wojcikfee68722011-04-20 15:39:53 +10003639 mddev->degraded = 0;
Trela, Maciej9af204c2010-03-08 16:02:44 +11003640 if (mddev->pers->sync_request == NULL) {
3641 /* this is now an array without redundancy, so
3642 * it must always be in_sync
3643 */
3644 mddev->in_sync = 1;
3645 del_timer_sync(&mddev->safemode_timer);
3646 }
NeilBrown245f46c2009-03-31 14:39:39 +11003647 pers->run(mddev);
3648 mddev_resume(mddev);
3649 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3650 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3651 md_wakeup_thread(mddev->thread);
Maciej Trela5cac78612010-04-14 17:17:39 +10003652 sysfs_notify(&mddev->kobj, NULL, "level");
Dan Williamsbb7f8d22010-05-01 18:14:57 -07003653 md_new_event(mddev);
NeilBrownd9d166c2006-01-06 00:20:51 -08003654 return rv;
3655}
3656
3657static struct md_sysfs_entry md_level =
NeilBrown80ca3a42006-07-10 04:44:18 -07003658__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
NeilBrowneae17012005-11-08 21:39:23 -08003659
NeilBrownd4dbd022006-06-26 00:27:59 -07003660
3661static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003662layout_show(struct mddev *mddev, char *page)
NeilBrownd4dbd022006-06-26 00:27:59 -07003663{
3664 /* just a number, not meaningful for all levels */
NeilBrown08a02ec2007-05-09 02:35:38 -07003665 if (mddev->reshape_position != MaxSector &&
3666 mddev->layout != mddev->new_layout)
3667 return sprintf(page, "%d (%d)\n",
3668 mddev->new_layout, mddev->layout);
NeilBrownd4dbd022006-06-26 00:27:59 -07003669 return sprintf(page, "%d\n", mddev->layout);
3670}
3671
3672static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003673layout_store(struct mddev *mddev, const char *buf, size_t len)
NeilBrownd4dbd022006-06-26 00:27:59 -07003674{
3675 char *e;
3676 unsigned long n = simple_strtoul(buf, &e, 10);
NeilBrownd4dbd022006-06-26 00:27:59 -07003677
3678 if (!*buf || (*e && *e != '\n'))
3679 return -EINVAL;
3680
NeilBrownb3546032009-03-31 14:56:41 +11003681 if (mddev->pers) {
3682 int err;
NeilBrown50ac1682009-06-18 08:47:55 +10003683 if (mddev->pers->check_reshape == NULL)
NeilBrownb3546032009-03-31 14:56:41 +11003684 return -EBUSY;
NeilBrown597a7112009-06-18 08:47:42 +10003685 mddev->new_layout = n;
NeilBrown50ac1682009-06-18 08:47:55 +10003686 err = mddev->pers->check_reshape(mddev);
NeilBrown597a7112009-06-18 08:47:42 +10003687 if (err) {
3688 mddev->new_layout = mddev->layout;
NeilBrownb3546032009-03-31 14:56:41 +11003689 return err;
NeilBrown597a7112009-06-18 08:47:42 +10003690 }
NeilBrownb3546032009-03-31 14:56:41 +11003691 } else {
NeilBrown08a02ec2007-05-09 02:35:38 -07003692 mddev->new_layout = n;
NeilBrownb3546032009-03-31 14:56:41 +11003693 if (mddev->reshape_position == MaxSector)
3694 mddev->layout = n;
3695 }
NeilBrownd4dbd022006-06-26 00:27:59 -07003696 return len;
3697}
3698static struct md_sysfs_entry md_layout =
NeilBrown80ca3a42006-07-10 04:44:18 -07003699__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
NeilBrownd4dbd022006-06-26 00:27:59 -07003700
3701
NeilBrowneae17012005-11-08 21:39:23 -08003702static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003703raid_disks_show(struct mddev *mddev, char *page)
NeilBrowneae17012005-11-08 21:39:23 -08003704{
NeilBrownbb636542005-11-08 21:39:45 -08003705 if (mddev->raid_disks == 0)
3706 return 0;
NeilBrown08a02ec2007-05-09 02:35:38 -07003707 if (mddev->reshape_position != MaxSector &&
3708 mddev->delta_disks != 0)
3709 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3710 mddev->raid_disks - mddev->delta_disks);
NeilBrowneae17012005-11-08 21:39:23 -08003711 return sprintf(page, "%d\n", mddev->raid_disks);
3712}
3713
NeilBrownfd01b882011-10-11 16:47:53 +11003714static int update_raid_disks(struct mddev *mddev, int raid_disks);
NeilBrownda943b992006-01-06 00:20:54 -08003715
/*
 * raid_disks_store() - sysfs handler for writing md/raid_disks.
 *
 * On a running array this requests an online change via
 * update_raid_disks().  On an inactive array the count is recorded
 * directly; if a reshape position is already set, delta_disks is
 * recomputed instead, after sanity-checking each rdev's pending
 * data_offset change against the requested direction of growth.
 */
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *e;
	int rv = 0;
	unsigned long n = simple_strtoul(buf, &e, 10);

	/* require a plain decimal number, optionally newline terminated */
	if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	if (mddev->pers)
		rv = update_raid_disks(mddev, n);
	else if (mddev->reshape_position != MaxSector) {
		struct md_rdev *rdev;
		int olddisks = mddev->raid_disks - mddev->delta_disks;

		/* reject if any rdev's staged data_offset move conflicts
		 * with growing (olddisks < n) or shrinking (olddisks > n) */
		rdev_for_each(rdev, mddev) {
			if (olddisks < n &&
			    rdev->data_offset < rdev->new_data_offset)
				return -EINVAL;
			if (olddisks > n &&
			    rdev->data_offset > rdev->new_data_offset)
				return -EINVAL;
		}
		mddev->delta_disks = n - olddisks;
		mddev->raid_disks = n;
		mddev->reshape_backwards = (mddev->delta_disks < 0);
	} else
		mddev->raid_disks = n;
	return rv ? rv : len;
}
static struct md_sysfs_entry md_raid_disks =
__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
NeilBrowneae17012005-11-08 21:39:23 -08003749
NeilBrown24dd4692005-11-08 21:39:26 -08003750static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003751chunk_size_show(struct mddev *mddev, char *page)
NeilBrown3b343802006-01-06 00:20:47 -08003752{
NeilBrown08a02ec2007-05-09 02:35:38 -07003753 if (mddev->reshape_position != MaxSector &&
Andre Noll664e7c42009-06-18 08:45:27 +10003754 mddev->chunk_sectors != mddev->new_chunk_sectors)
3755 return sprintf(page, "%d (%d)\n",
3756 mddev->new_chunk_sectors << 9,
Andre Noll9d8f0362009-06-18 08:45:01 +10003757 mddev->chunk_sectors << 9);
3758 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
NeilBrown3b343802006-01-06 00:20:47 -08003759}
3760
3761static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003762chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
NeilBrown3b343802006-01-06 00:20:47 -08003763{
NeilBrown3b343802006-01-06 00:20:47 -08003764 char *e;
3765 unsigned long n = simple_strtoul(buf, &e, 10);
3766
NeilBrown3b343802006-01-06 00:20:47 -08003767 if (!*buf || (*e && *e != '\n'))
3768 return -EINVAL;
3769
NeilBrownb3546032009-03-31 14:56:41 +11003770 if (mddev->pers) {
3771 int err;
NeilBrown50ac1682009-06-18 08:47:55 +10003772 if (mddev->pers->check_reshape == NULL)
NeilBrownb3546032009-03-31 14:56:41 +11003773 return -EBUSY;
NeilBrown597a7112009-06-18 08:47:42 +10003774 mddev->new_chunk_sectors = n >> 9;
NeilBrown50ac1682009-06-18 08:47:55 +10003775 err = mddev->pers->check_reshape(mddev);
NeilBrown597a7112009-06-18 08:47:42 +10003776 if (err) {
3777 mddev->new_chunk_sectors = mddev->chunk_sectors;
NeilBrownb3546032009-03-31 14:56:41 +11003778 return err;
NeilBrown597a7112009-06-18 08:47:42 +10003779 }
NeilBrownb3546032009-03-31 14:56:41 +11003780 } else {
Andre Noll664e7c42009-06-18 08:45:27 +10003781 mddev->new_chunk_sectors = n >> 9;
NeilBrownb3546032009-03-31 14:56:41 +11003782 if (mddev->reshape_position == MaxSector)
Andre Noll9d8f0362009-06-18 08:45:01 +10003783 mddev->chunk_sectors = n >> 9;
NeilBrownb3546032009-03-31 14:56:41 +11003784 }
NeilBrown3b343802006-01-06 00:20:47 -08003785 return len;
3786}
3787static struct md_sysfs_entry md_chunk_size =
NeilBrown80ca3a42006-07-10 04:44:18 -07003788__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
NeilBrown3b343802006-01-06 00:20:47 -08003789
NeilBrowna94213b2006-06-26 00:28:00 -07003790static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11003791resync_start_show(struct mddev *mddev, char *page)
NeilBrowna94213b2006-06-26 00:28:00 -07003792{
NeilBrownd1a7c502009-03-31 15:24:32 +11003793 if (mddev->recovery_cp == MaxSector)
3794 return sprintf(page, "none\n");
NeilBrowna94213b2006-06-26 00:28:00 -07003795 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3796}
3797
/*
 * resync_start_store() - sysfs handler for writing md/resync_start.
 *
 * Sets the resync checkpoint (recovery_cp).  "none" maps to MaxSector,
 * meaning no resync is needed.  On a running array this is only
 * permitted while recovery is frozen.
 */
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *e;
	unsigned long long n = simple_strtoull(buf, &e, 10);

	/* refuse while the array is running unless recovery is frozen */
	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
		return -EBUSY;
	if (cmd_match(buf, "none"))
		n = MaxSector;
	else if (!*buf || (*e && *e != '\n'))
		return -EINVAL;

	mddev->recovery_cp = n;
	return len;
}
static struct md_sysfs_entry md_resync_start =
__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
NeilBrowna94213b2006-06-26 00:28:00 -07003816
NeilBrown9e653b62006-06-26 00:27:58 -07003817/*
3818 * The array state can be:
3819 *
3820 * clear
3821 * No devices, no size, no level
3822 * Equivalent to STOP_ARRAY ioctl
3823 * inactive
3824 * May have some settings, but array is not active
3825 * all IO results in error
3826 * When written, doesn't tear down array, but just stops it
3827 * suspended (not supported yet)
3828 * All IO requests will block. The array can be reconfigured.
Andre Noll910d8cb2008-03-25 21:00:53 +01003829 * Writing this, if accepted, will block until array is quiescent
NeilBrown9e653b62006-06-26 00:27:58 -07003830 * readonly
3831 * no resync can happen. no superblocks get written.
3832 * write requests fail
3833 * read-auto
3834 * like readonly, but behaves like 'clean' on a write request.
3835 *
3836 * clean - no pending writes, but otherwise active.
3837 * When written to inactive array, starts without resync
3838 * If a write request arrives then
3839 * if metadata is known, mark 'dirty' and switch to 'active'.
3840 * if not known, block and switch to write-pending
3841 * If written to an active array that has pending writes, then fails.
3842 * active
3843 * fully active: IO and resync can be happening.
3844 * When written to inactive array, starts with resync
3845 *
3846 * write-pending
3847 * clean, but writes are blocked waiting for 'active' to be written.
3848 *
3849 * active-idle
3850 * like active, but no writes have been seen for a while (100msec).
3851 *
3852 */
/* Array states; must stay in step with the array_states[] name table
 * below.  bad_word marks "no match" as returned by match_word(). */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, bad_word};
static char *array_states[] = {
	"clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
	"write-pending", "active-idle", NULL };
3858
3859static int match_word(const char *word, char **list)
3860{
3861 int n;
3862 for (n=0; list[n]; n++)
3863 if (cmd_match(word, list[n]))
3864 break;
3865 return n;
3866}
3867
/*
 * array_state_show() - report the current state for md/array_state.
 *
 * For a running array the state is derived from mddev->ro (readonly /
 * read-auto) and, when read-write, from in_sync, the pending-superblock
 * flag and safemode.  For a stopped array, "clear" (no devices, no
 * size, no level) is distinguished from "inactive".
 */
static ssize_t
array_state_show(struct mddev *mddev, char *page)
{
	enum array_state st = inactive;

	if (mddev->pers)
		switch(mddev->ro) {
		case 1:
			st = readonly;
			break;
		case 2:
			st = read_auto;
			break;
		case 0:
			if (mddev->in_sync)
				st = clean;
			else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
				st = write_pending;
			else if (mddev->safemode)
				st = active_idle;
			else
				st = active;
		}
	else {
		if (list_empty(&mddev->disks) &&
		    mddev->raid_disks == 0 &&
		    mddev->dev_sectors == 0)
			st = clear;
		else
			st = inactive;
	}
	return sprintf(page, "%s\n", array_states[st]);
}
3901
NeilBrownfd01b882011-10-11 16:47:53 +11003902static int do_md_stop(struct mddev * mddev, int ro, int is_open);
3903static int md_set_readonly(struct mddev * mddev, int is_open);
3904static int do_md_run(struct mddev * mddev);
3905static int restart_array(struct mddev *mddev);
NeilBrown9e653b62006-06-26 00:27:58 -07003906
/*
 * array_state_store() - sysfs handler for writing md/array_state.
 *
 * Drives the array state machine documented in the comment block
 * above: stopping ("clear"/"inactive"), switching read-only modes
 * ("readonly"/"read-auto"), and forcing clean/active transitions.
 * "write-pending" and "active-idle" are reported states only and
 * cannot be written.  On success notifies sysfs_state and returns
 * len; otherwise returns a negative errno (-EINVAL for unknown or
 * unwritable states).
 */
static ssize_t
array_state_store(struct mddev *mddev, const char *buf, size_t len)
{
	int err = -EINVAL;
	enum array_state st = match_word(buf, array_states);
	switch(st) {
	case bad_word:
		break;
	case clear:
		/* stopping an active array */
		if (atomic_read(&mddev->openers) > 0)
			return -EBUSY;
		err = do_md_stop(mddev, 0, 0);
		break;
	case inactive:
		/* stopping an active array */
		if (mddev->pers) {
			if (atomic_read(&mddev->openers) > 0)
				return -EBUSY;
			err = do_md_stop(mddev, 2, 0);
		} else
			err = 0; /* already inactive */
		break;
	case suspended:
		break; /* not supported yet */
	case readonly:
		if (mddev->pers)
			err = md_set_readonly(mddev, 0);
		else {
			/* not running yet: start the array read-only */
			mddev->ro = 1;
			set_disk_ro(mddev->gendisk, 1);
			err = do_md_run(mddev);
		}
		break;
	case read_auto:
		if (mddev->pers) {
			if (mddev->ro == 0)
				err = md_set_readonly(mddev, 0);
			else if (mddev->ro == 1)
				err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		} else {
			mddev->ro = 2;
			err = do_md_run(mddev);
		}
		break;
	case clean:
		if (mddev->pers) {
			restart_array(mddev);
			/* only mark clean if no writes are in flight */
			spin_lock_irq(&mddev->write_lock);
			if (atomic_read(&mddev->writes_pending) == 0) {
				if (mddev->in_sync == 0) {
					mddev->in_sync = 1;
					if (mddev->safemode == 1)
						mddev->safemode = 0;
					set_bit(MD_CHANGE_CLEAN, &mddev->flags);
				}
				err = 0;
			} else
				err = -EBUSY;
			spin_unlock_irq(&mddev->write_lock);
		} else
			err = -EINVAL;
		break;
	case active:
		if (mddev->pers) {
			restart_array(mddev);
			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
			/* release anyone waiting for the superblock write */
			wake_up(&mddev->sb_wait);
			err = 0;
		} else {
			mddev->ro = 0;
			set_disk_ro(mddev->gendisk, 0);
			err = do_md_run(mddev);
		}
		break;
	case write_pending:
	case active_idle:
		/* these cannot be set */
		break;
	}
	if (err)
		return err;
	else {
		if (mddev->hold_active == UNTIL_IOCTL)
			mddev->hold_active = 0;
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		return len;
	}
}
static struct md_sysfs_entry md_array_state =
__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
NeilBrown9e653b62006-06-26 00:27:58 -07004002
NeilBrown6d7ff7382006-01-06 00:21:16 -08004003static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004004max_corrected_read_errors_show(struct mddev *mddev, char *page) {
Robert Becker1e509152009-12-14 12:49:58 +11004005 return sprintf(page, "%d\n",
4006 atomic_read(&mddev->max_corr_read_errors));
4007}
4008
4009static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004010max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
Robert Becker1e509152009-12-14 12:49:58 +11004011{
4012 char *e;
4013 unsigned long n = simple_strtoul(buf, &e, 10);
4014
4015 if (*buf && (*e == 0 || *e == '\n')) {
4016 atomic_set(&mddev->max_corr_read_errors, n);
4017 return len;
4018 }
4019 return -EINVAL;
4020}
4021
4022static struct md_sysfs_entry max_corr_read_errors =
4023__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4024 max_corrected_read_errors_store);
4025
/* ->show handler shared by write-only attributes (e.g. new_dev,
 * bitmap_set_bits): reading them is always an error. */
static ssize_t
null_show(struct mddev *mddev, char *page)
{
	return -EINVAL;
}
4031
/*
 * new_dev_store() - sysfs handler for writing md/new_dev.
 *
 * Adds the block device identified by "major:minor" to the array.
 * Returns len on success or a negative errno.
 */
static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* buf must be %d:%d\n? giving major and minor numbers */
	/* The new device is added to the array.
	 * If the array has a persistent superblock, we read the
	 * superblock to initialise info and check validity.
	 * Otherwise, only checking done is that in bind_rdev_to_array,
	 * which mainly checks size.
	 */
	char *e;
	int major = simple_strtoul(buf, &e, 10);
	int minor;
	dev_t dev;
	struct md_rdev *rdev;
	int err;

	if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
		return -EINVAL;
	minor = simple_strtoul(e+1, &e, 10);
	if (*e && *e != '\n')
		return -EINVAL;
	dev = MKDEV(major, minor);
	/* MKDEV truncates out-of-range numbers; detect that here */
	if (major != MAJOR(dev) ||
	    minor != MINOR(dev))
		return -EOVERFLOW;


	if (mddev->persistent) {
		rdev = md_import_device(dev, mddev->major_version,
					mddev->minor_version);
		if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
			/* validate the superblock against the first
			 * device already in the array */
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0)
				goto out;
		}
	} else if (mddev->external)
		rdev = md_import_device(dev, -2, -1);
	else
		rdev = md_import_device(dev, -1, -1);

	if (IS_ERR(rdev))
		return PTR_ERR(rdev);
	err = bind_rdev_to_array(rdev, mddev);
 out:
	/* on any failure after a successful import, release the rdev */
	if (err)
		export_rdev(rdev);
	return err ? err : len;
}

static struct md_sysfs_entry md_new_device =
__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
NeilBrown3b343802006-01-06 00:20:47 -08004088
/*
 * bitmap_store() - sysfs handler for md/bitmap_set_bits.
 *
 * Marks bitmap chunks dirty.  Input is a whitespace-separated list of
 * chunk numbers or first-last ranges.  Note: the whole write is
 * reported as consumed (len) even when parsing stops early or the
 * array has no bitmap - presumably deliberate best-effort behaviour.
 */
static ssize_t
bitmap_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *end;
	unsigned long chunk, end_chunk;

	if (!mddev->bitmap)
		goto out;
	/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
	while (*buf) {
		chunk = end_chunk = simple_strtoul(buf, &end, 0);
		if (buf == end) break;
		if (*end == '-') { /* range */
			buf = end + 1;
			end_chunk = simple_strtoul(buf, &end, 0);
			if (buf == end) break;
		}
		/* each token must end at whitespace or end-of-string */
		if (*end && !isspace(*end)) break;
		bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
		buf = skip_spaces(end);
	}
	bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
out:
	return len;
}

static struct md_sysfs_entry md_bitmap =
__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4117
4118static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004119size_show(struct mddev *mddev, char *page)
NeilBrowna35b0d692006-01-06 00:20:49 -08004120{
Andre Noll58c0fed2009-03-31 14:33:13 +11004121 return sprintf(page, "%llu\n",
4122 (unsigned long long)mddev->dev_sectors / 2);
NeilBrowna35b0d692006-01-06 00:20:49 -08004123}
4124
NeilBrownfd01b882011-10-11 16:47:53 +11004125static int update_size(struct mddev *mddev, sector_t num_sectors);
NeilBrowna35b0d692006-01-06 00:20:49 -08004126
/*
 * size_store() - sysfs handler for writing md/component_size.
 *
 * Input is a size in 1K blocks (converted to sectors by
 * strict_blocks_to_sectors()).  Returns len on success or a
 * negative errno.
 *
 * NOTE(review): when the array is active, md_update_sb() runs even
 * if update_size() failed - confirm that is intentional.
 */
static ssize_t
size_store(struct mddev *mddev, const char *buf, size_t len)
{
	/* If array is inactive, we can reduce the component size, but
	 * not increase it (except from 0).
	 * If array is active, we can try an on-line resize
	 */
	sector_t sectors;
	int err = strict_blocks_to_sectors(buf, &sectors);

	if (err < 0)
		return err;
	if (mddev->pers) {
		err = update_size(mddev, sectors);
		md_update_sb(mddev, 1);
	} else {
		if (mddev->dev_sectors == 0 ||
		    mddev->dev_sectors > sectors)
			mddev->dev_sectors = sectors;
		else
			err = -ENOSPC;
	}
	return err ? err : len;
}

static struct md_sysfs_entry md_size =
__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
NeilBrowna35b0d692006-01-06 00:20:49 -08004154
NeilBrown8bb93aa2006-01-06 00:20:50 -08004155
/* Metadata version.
NeilBrowne6910632008-02-06 01:39:51 -08004157 * This is one of
4158 * 'none' for arrays with no metadata (good luck...)
4159 * 'external' for arrays with externally managed metadata,
NeilBrown8bb93aa2006-01-06 00:20:50 -08004160 * or N.M for internally known formats
4161 */
4162static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004163metadata_show(struct mddev *mddev, char *page)
NeilBrown8bb93aa2006-01-06 00:20:50 -08004164{
4165 if (mddev->persistent)
4166 return sprintf(page, "%d.%d\n",
4167 mddev->major_version, mddev->minor_version);
NeilBrowne6910632008-02-06 01:39:51 -08004168 else if (mddev->external)
4169 return sprintf(page, "external:%s\n", mddev->metadata_type);
NeilBrown8bb93aa2006-01-06 00:20:50 -08004170 else
4171 return sprintf(page, "none\n");
4172}
4173
/*
 * metadata_store() - sysfs handler for writing md/metadata_version.
 *
 * Accepts "none", "external:<type>", or "major.minor" for internally
 * known superblock formats.  Returns len on success, -EBUSY if devices
 * are attached (except when re-writing external details), -EINVAL on
 * parse failure, or -ENOENT for an unknown internal version.
 */
static ssize_t
metadata_store(struct mddev *mddev, const char *buf, size_t len)
{
	int major, minor;
	char *e;
	/* Changing the details of 'external' metadata is
	 * always permitted. Otherwise there must be
	 * no devices attached to the array.
	 */
	if (mddev->external && strncmp(buf, "external:", 9) == 0)
		;
	else if (!list_empty(&mddev->disks))
		return -EBUSY;

	if (cmd_match(buf, "none")) {
		mddev->persistent = 0;
		mddev->external = 0;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		return len;
	}
	if (strncmp(buf, "external:", 9) == 0) {
		size_t namelen = len-9;
		/* copy the type name, bounded and NUL-terminated,
		 * stripping any trailing newline */
		if (namelen >= sizeof(mddev->metadata_type))
			namelen = sizeof(mddev->metadata_type)-1;
		strncpy(mddev->metadata_type, buf+9, namelen);
		mddev->metadata_type[namelen] = 0;
		if (namelen && mddev->metadata_type[namelen-1] == '\n')
			mddev->metadata_type[--namelen] = 0;
		mddev->persistent = 0;
		mddev->external = 1;
		mddev->major_version = 0;
		mddev->minor_version = 90;
		return len;
	}
	/* otherwise expect "major.minor" for an internal format */
	major = simple_strtoul(buf, &e, 10);
	if (e==buf || *e != '.')
		return -EINVAL;
	buf = e+1;
	minor = simple_strtoul(buf, &e, 10);
	if (e==buf || (*e && *e != '\n') )
		return -EINVAL;
	if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
		return -ENOENT;
	mddev->major_version = major;
	mddev->minor_version = minor;
	mddev->persistent = 1;
	mddev->external = 0;
	return len;
}

static struct md_sysfs_entry md_metadata =
__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
NeilBrown8bb93aa2006-01-06 00:20:50 -08004227
NeilBrowna35b0d692006-01-06 00:20:49 -08004228static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004229action_show(struct mddev *mddev, char *page)
NeilBrown24dd4692005-11-08 21:39:26 -08004230{
NeilBrown7eec3142005-11-08 21:39:44 -08004231 char *type = "idle";
NeilBrownb6a9ce62009-05-26 09:41:17 +10004232 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4233 type = "frozen";
4234 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
NeilBrown2b12ab62007-10-16 23:30:53 -07004235 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
NeilBrownccfcc3c2006-03-27 01:18:09 -08004236 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4237 type = "reshape";
4238 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
NeilBrown24dd4692005-11-08 21:39:26 -08004239 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4240 type = "resync";
4241 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4242 type = "check";
4243 else
4244 type = "repair";
Neil Brown72a23c22008-06-28 08:31:41 +10004245 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
NeilBrown24dd4692005-11-08 21:39:26 -08004246 type = "recover";
4247 }
4248 return sprintf(page, "%s\n", type);
4249}
4250
NeilBrownfd01b882011-10-11 16:47:53 +11004251static void reap_sync_thread(struct mddev *mddev);
NeilBrown7ebc0be2011-01-14 09:14:33 +11004252
NeilBrown24dd4692005-11-08 21:39:26 -08004253static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004254action_store(struct mddev *mddev, const char *page, size_t len)
NeilBrown24dd4692005-11-08 21:39:26 -08004255{
NeilBrown7eec3142005-11-08 21:39:44 -08004256 if (!mddev->pers || !mddev->pers->sync_request)
4257 return -EINVAL;
4258
NeilBrownb6a9ce62009-05-26 09:41:17 +10004259 if (cmd_match(page, "frozen"))
4260 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4261 else
4262 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4263
4264 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
NeilBrown7eec3142005-11-08 21:39:44 -08004265 if (mddev->sync_thread) {
4266 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
NeilBrown7ebc0be2011-01-14 09:14:33 +11004267 reap_sync_thread(mddev);
NeilBrown7eec3142005-11-08 21:39:44 -08004268 }
NeilBrown03c902e2006-01-06 00:20:46 -08004269 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4270 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
NeilBrown24dd4692005-11-08 21:39:26 -08004271 return -EBUSY;
Neil Brown72a23c22008-06-28 08:31:41 +10004272 else if (cmd_match(page, "resync"))
NeilBrown7eec3142005-11-08 21:39:44 -08004273 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
Neil Brown72a23c22008-06-28 08:31:41 +10004274 else if (cmd_match(page, "recover")) {
4275 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4276 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4277 } else if (cmd_match(page, "reshape")) {
NeilBrown16484bf2006-03-27 01:18:13 -08004278 int err;
4279 if (mddev->pers->start_reshape == NULL)
4280 return -EINVAL;
4281 err = mddev->pers->start_reshape(mddev);
4282 if (err)
4283 return err;
Neil Browna99ac972008-06-28 08:31:43 +10004284 sysfs_notify(&mddev->kobj, NULL, "degraded");
NeilBrown16484bf2006-03-27 01:18:13 -08004285 } else {
NeilBrownbce74da2006-01-06 00:20:41 -08004286 if (cmd_match(page, "check"))
NeilBrown7eec3142005-11-08 21:39:44 -08004287 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
NeilBrown2adc7d42006-05-20 14:59:57 -07004288 else if (!cmd_match(page, "repair"))
NeilBrown7eec3142005-11-08 21:39:44 -08004289 return -EINVAL;
4290 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4291 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
NeilBrown7eec3142005-11-08 21:39:44 -08004292 }
NeilBrown03c902e2006-01-06 00:20:46 -08004293 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
NeilBrown24dd4692005-11-08 21:39:26 -08004294 md_wakeup_thread(mddev->thread);
NeilBrown00bcb4a2010-06-01 19:37:23 +10004295 sysfs_notify_dirent_safe(mddev->sysfs_action);
NeilBrown24dd4692005-11-08 21:39:26 -08004296 return len;
4297}
4298
NeilBrown9d888832005-11-08 21:39:26 -08004299static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004300mismatch_cnt_show(struct mddev *mddev, char *page)
NeilBrown9d888832005-11-08 21:39:26 -08004301{
4302 return sprintf(page, "%llu\n",
4303 (unsigned long long) mddev->resync_mismatches);
4304}
4305
NeilBrown80ca3a42006-07-10 04:44:18 -07004306static struct md_sysfs_entry md_scan_mode =
4307__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
NeilBrown24dd4692005-11-08 21:39:26 -08004308
NeilBrown96de1e62005-11-08 21:39:39 -08004309
NeilBrown80ca3a42006-07-10 04:44:18 -07004310static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
NeilBrown9d888832005-11-08 21:39:26 -08004311
NeilBrown88202a02006-01-06 00:21:36 -08004312static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004313sync_min_show(struct mddev *mddev, char *page)
NeilBrown88202a02006-01-06 00:21:36 -08004314{
4315 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4316 mddev->sync_speed_min ? "local": "system");
4317}
4318
4319static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004320sync_min_store(struct mddev *mddev, const char *buf, size_t len)
NeilBrown88202a02006-01-06 00:21:36 -08004321{
4322 int min;
4323 char *e;
4324 if (strncmp(buf, "system", 6)==0) {
4325 mddev->sync_speed_min = 0;
4326 return len;
4327 }
4328 min = simple_strtoul(buf, &e, 10);
4329 if (buf == e || (*e && *e != '\n') || min <= 0)
4330 return -EINVAL;
4331 mddev->sync_speed_min = min;
4332 return len;
4333}
4334
4335static struct md_sysfs_entry md_sync_min =
4336__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4337
4338static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004339sync_max_show(struct mddev *mddev, char *page)
NeilBrown88202a02006-01-06 00:21:36 -08004340{
4341 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4342 mddev->sync_speed_max ? "local": "system");
4343}
4344
4345static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004346sync_max_store(struct mddev *mddev, const char *buf, size_t len)
NeilBrown88202a02006-01-06 00:21:36 -08004347{
4348 int max;
4349 char *e;
4350 if (strncmp(buf, "system", 6)==0) {
4351 mddev->sync_speed_max = 0;
4352 return len;
4353 }
4354 max = simple_strtoul(buf, &e, 10);
4355 if (buf == e || (*e && *e != '\n') || max <= 0)
4356 return -EINVAL;
4357 mddev->sync_speed_max = max;
4358 return len;
4359}
4360
4361static struct md_sysfs_entry md_sync_max =
4362__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4363
Iustin Popd7f3d292007-10-16 23:30:54 -07004364static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004365degraded_show(struct mddev *mddev, char *page)
Iustin Popd7f3d292007-10-16 23:30:54 -07004366{
4367 return sprintf(page, "%d\n", mddev->degraded);
4368}
4369static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
NeilBrown88202a02006-01-06 00:21:36 -08004370
4371static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004372sync_force_parallel_show(struct mddev *mddev, char *page)
Bernd Schubert90b08712008-05-23 13:04:38 -07004373{
4374 return sprintf(page, "%d\n", mddev->parallel_resync);
4375}
4376
4377static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004378sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
Bernd Schubert90b08712008-05-23 13:04:38 -07004379{
4380 long n;
4381
4382 if (strict_strtol(buf, 10, &n))
4383 return -EINVAL;
4384
4385 if (n != 0 && n != 1)
4386 return -EINVAL;
4387
4388 mddev->parallel_resync = n;
4389
4390 if (mddev->sync_thread)
4391 wake_up(&resync_wait);
4392
4393 return len;
4394}
4395
4396/* force parallel resync, even with shared block devices */
4397static struct md_sysfs_entry md_sync_force_parallel =
4398__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4399 sync_force_parallel_show, sync_force_parallel_store);
4400
/*
 * sync_speed_show - report the current resync rate in KiB/sec.
 *
 * The rate is computed from the sectors completed since the last
 * rate-mark (resync_mark / resync_mark_cnt), excluding requests that
 * are still in flight (recovery_active), over the elapsed wall time.
 * Prints "none" when no resync is in progress.
 */
static ssize_t
sync_speed_show(struct mddev *mddev, char *page)
{
	unsigned long resync, dt, db;
	if (mddev->curr_resync == 0)
		return sprintf(page, "none\n");
	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
	/* elapsed seconds since the mark; clamp to 1 to avoid div by 0 */
	dt = (jiffies - mddev->resync_mark) / HZ;
	if (!dt) dt++;
	db = resync - mddev->resync_mark_cnt;
	return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
}
4413
NeilBrown80ca3a42006-07-10 04:44:18 -07004414static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
NeilBrown88202a02006-01-06 00:21:36 -08004415
/*
 * sync_completed_show - report resync progress as "<done> / <total>"
 * in sectors, or "none" when no recovery is running.
 *
 * A resync covers resync_max_sectors; other actions (recovery etc.)
 * cover dev_sectors.
 */
static ssize_t
sync_completed_show(struct mddev *mddev, char *page)
{
	unsigned long long max_sectors, resync;

	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return sprintf(page, "none\n");

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		max_sectors = mddev->resync_max_sectors;
	else
		max_sectors = mddev->dev_sectors;

	resync = mddev->curr_resync_completed;
	return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
4432
NeilBrown80ca3a42006-07-10 04:44:18 -07004433static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
NeilBrown88202a02006-01-06 00:21:36 -08004434
NeilBrowne464eaf2006-03-27 01:18:14 -08004435static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004436min_sync_show(struct mddev *mddev, char *page)
Neil Brown5e96ee62008-06-28 08:31:24 +10004437{
4438 return sprintf(page, "%llu\n",
4439 (unsigned long long)mddev->resync_min);
4440}
/*
 * min_sync_store - set the sector offset at which resync should start.
 *
 * The value must not exceed resync_max, must be chunk-aligned for
 * chunked arrays, and cannot be changed while a recovery is running
 * (-EBUSY).  Returns @len on success.
 */
static ssize_t
min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long long min;
	if (strict_strtoull(buf, 10, &min))
		return -EINVAL;
	if (min > mddev->resync_max)
		return -EINVAL;
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	/* Must be a multiple of chunk_size */
	if (mddev->chunk_sectors) {
		sector_t temp = min;
		/* sector_div() returns the remainder */
		if (sector_div(temp, mddev->chunk_sectors))
			return -EINVAL;
	}
	mddev->resync_min = min;

	return len;
}
4462
4463static struct md_sysfs_entry md_min_sync =
4464__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4465
4466static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004467max_sync_show(struct mddev *mddev, char *page)
NeilBrownc6207272008-02-06 01:39:52 -08004468{
4469 if (mddev->resync_max == MaxSector)
4470 return sprintf(page, "max\n");
4471 else
4472 return sprintf(page, "%llu\n",
4473 (unsigned long long)mddev->resync_max);
4474}
/*
 * max_sync_store - set the sector offset at which resync should stop.
 *
 * "max" removes the limit.  A numeric value must be >= resync_min and
 * chunk-aligned for chunked arrays; lowering it while a recovery is
 * running on a writable array returns -EBUSY.  Waiters on
 * recovery_wait are woken so a paused resync can notice a raised
 * limit.  Returns @len on success.
 */
static ssize_t
max_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	if (strncmp(buf, "max", 3) == 0)
		mddev->resync_max = MaxSector;
	else {
		unsigned long long max;
		if (strict_strtoull(buf, 10, &max))
			return -EINVAL;
		if (max < mddev->resync_min)
			return -EINVAL;
		/* cannot shrink the window under an active recovery */
		if (max < mddev->resync_max &&
		    mddev->ro == 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
			return -EBUSY;

		/* Must be a multiple of chunk_size */
		if (mddev->chunk_sectors) {
			sector_t temp = max;
			/* sector_div() returns the remainder */
			if (sector_div(temp, mddev->chunk_sectors))
				return -EINVAL;
		}
		mddev->resync_max = max;
	}
	wake_up(&mddev->recovery_wait);
	return len;
}
4502
4503static struct md_sysfs_entry md_max_sync =
4504__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4505
4506static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004507suspend_lo_show(struct mddev *mddev, char *page)
NeilBrowne464eaf2006-03-27 01:18:14 -08004508{
4509 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4510}
4511
/*
 * suspend_lo_store - move the lower bound of the suspended I/O region.
 *
 * Requires a personality with a quiesce method.  The new bound is
 * published before quiescing: raising suspend_lo shrinks the region
 * (quiesce(2) flushes without a full stop), while lowering it expands
 * the region and needs a full quiesce/resume cycle so in-flight
 * requests in the newly covered range drain first.
 */
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *e;
	unsigned long long new = simple_strtoull(buf, &e, 10);
	unsigned long long old = mddev->suspend_lo;

	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		return -EINVAL;
	if (buf == e || (*e && *e != '\n'))
		return -EINVAL;

	mddev->suspend_lo = new;
	if (new >= old)
		/* Shrinking suspended region */
		mddev->pers->quiesce(mddev, 2);
	else {
		/* Expanding suspended region - need to wait */
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	return len;
}
4536static struct md_sysfs_entry md_suspend_lo =
4537__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4538
4539
4540static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004541suspend_hi_show(struct mddev *mddev, char *page)
NeilBrowne464eaf2006-03-27 01:18:14 -08004542{
4543 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4544}
4545
/*
 * suspend_hi_store - move the upper bound of the suspended I/O region.
 *
 * Mirror image of suspend_lo_store: lowering suspend_hi shrinks the
 * region (quiesce(2)), raising it expands the region and needs a full
 * quiesce/resume cycle so in-flight requests in the newly covered
 * range drain first.
 */
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
	char *e;
	unsigned long long new = simple_strtoull(buf, &e, 10);
	unsigned long long old = mddev->suspend_hi;

	if (mddev->pers == NULL ||
	    mddev->pers->quiesce == NULL)
		return -EINVAL;
	if (buf == e || (*e && *e != '\n'))
		return -EINVAL;

	mddev->suspend_hi = new;
	if (new <= old)
		/* Shrinking suspended region */
		mddev->pers->quiesce(mddev, 2);
	else {
		/* Expanding suspended region - need to wait */
		mddev->pers->quiesce(mddev, 1);
		mddev->pers->quiesce(mddev, 0);
	}
	return len;
}
4570static struct md_sysfs_entry md_suspend_hi =
4571__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4572
NeilBrown08a02ec2007-05-09 02:35:38 -07004573static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004574reshape_position_show(struct mddev *mddev, char *page)
NeilBrown08a02ec2007-05-09 02:35:38 -07004575{
4576 if (mddev->reshape_position != MaxSector)
4577 return sprintf(page, "%llu\n",
4578 (unsigned long long)mddev->reshape_position);
4579 strcpy(page, "none\n");
4580 return 5;
4581}
4582
/*
 * reshape_position_store - set the reshape checkpoint on an inactive
 * array (used when assembling an array mid-reshape).
 *
 * Rejected with -EBUSY while the array is active.  Besides storing the
 * position, all "new_*" geometry fields are reset to the current
 * geometry and each rdev's new_data_offset to its data_offset, so a
 * subsequent geometry change starts from a clean baseline.
 */
static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct md_rdev *rdev;
	char *e;
	unsigned long long new = simple_strtoull(buf, &e, 10);
	if (mddev->pers)
		return -EBUSY;
	if (buf == e || (*e && *e != '\n'))
		return -EINVAL;
	mddev->reshape_position = new;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
	mddev->new_level = mddev->level;
	mddev->new_layout = mddev->layout;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	return len;
}
4603
4604static struct md_sysfs_entry md_reshape_position =
4605__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4606 reshape_position_store);
4607
Dan Williamsb522adc2009-03-31 15:00:31 +11004608static ssize_t
NeilBrown2c810cd2012-05-21 09:27:00 +10004609reshape_direction_show(struct mddev *mddev, char *page)
4610{
4611 return sprintf(page, "%s\n",
4612 mddev->reshape_backwards ? "backwards" : "forwards");
4613}
4614
4615static ssize_t
4616reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4617{
4618 int backwards = 0;
4619 if (cmd_match(buf, "forwards"))
4620 backwards = 0;
4621 else if (cmd_match(buf, "backwards"))
4622 backwards = 1;
4623 else
4624 return -EINVAL;
4625 if (mddev->reshape_backwards == backwards)
4626 return len;
4627
4628 /* check if we are allowed to change */
4629 if (mddev->delta_disks)
4630 return -EBUSY;
4631
4632 if (mddev->persistent &&
4633 mddev->major_version == 0)
4634 return -EINVAL;
4635
4636 mddev->reshape_backwards = backwards;
4637 return len;
4638}
4639
4640static struct md_sysfs_entry md_reshape_direction =
4641__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4642 reshape_direction_store);
4643
4644static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004645array_size_show(struct mddev *mddev, char *page)
Dan Williamsb522adc2009-03-31 15:00:31 +11004646{
4647 if (mddev->external_size)
4648 return sprintf(page, "%llu\n",
4649 (unsigned long long)mddev->array_sectors/2);
4650 else
4651 return sprintf(page, "default\n");
4652}
4653
4654static ssize_t
NeilBrownfd01b882011-10-11 16:47:53 +11004655array_size_store(struct mddev *mddev, const char *buf, size_t len)
Dan Williamsb522adc2009-03-31 15:00:31 +11004656{
4657 sector_t sectors;
4658
4659 if (strncmp(buf, "default", 7) == 0) {
4660 if (mddev->pers)
4661 sectors = mddev->pers->size(mddev, 0, 0);
4662 else
4663 sectors = mddev->array_sectors;
4664
4665 mddev->external_size = 0;
4666 } else {
4667 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4668 return -EINVAL;
4669 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
NeilBrown2b69c832009-05-26 09:41:17 +10004670 return -E2BIG;
Dan Williamsb522adc2009-03-31 15:00:31 +11004671
4672 mddev->external_size = 1;
4673 }
4674
4675 mddev->array_sectors = sectors;
NeilBrowncbe6ef12011-02-16 13:58:38 +11004676 if (mddev->pers) {
4677 set_capacity(mddev->gendisk, mddev->array_sectors);
NeilBrown449aad32009-08-03 10:59:58 +10004678 revalidate_disk(mddev->gendisk);
NeilBrowncbe6ef12011-02-16 13:58:38 +11004679 }
Dan Williamsb522adc2009-03-31 15:00:31 +11004680 return len;
4681}
4682
4683static struct md_sysfs_entry md_array_size =
4684__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4685 array_size_store);
NeilBrowne464eaf2006-03-27 01:18:14 -08004686
NeilBrowneae17012005-11-08 21:39:23 -08004687static struct attribute *md_default_attrs[] = {
4688 &md_level.attr,
NeilBrownd4dbd022006-06-26 00:27:59 -07004689 &md_layout.attr,
NeilBrowneae17012005-11-08 21:39:23 -08004690 &md_raid_disks.attr,
NeilBrown3b343802006-01-06 00:20:47 -08004691 &md_chunk_size.attr,
NeilBrowna35b0d692006-01-06 00:20:49 -08004692 &md_size.attr,
NeilBrowna94213b2006-06-26 00:28:00 -07004693 &md_resync_start.attr,
NeilBrown8bb93aa2006-01-06 00:20:50 -08004694 &md_metadata.attr,
NeilBrown6d7ff7382006-01-06 00:21:16 -08004695 &md_new_device.attr,
NeilBrown16f17b32006-06-26 00:27:37 -07004696 &md_safe_delay.attr,
NeilBrown9e653b62006-06-26 00:27:58 -07004697 &md_array_state.attr,
NeilBrown08a02ec2007-05-09 02:35:38 -07004698 &md_reshape_position.attr,
NeilBrown2c810cd2012-05-21 09:27:00 +10004699 &md_reshape_direction.attr,
Dan Williamsb522adc2009-03-31 15:00:31 +11004700 &md_array_size.attr,
Robert Becker1e509152009-12-14 12:49:58 +11004701 &max_corr_read_errors.attr,
NeilBrown411036f2005-11-08 21:39:40 -08004702 NULL,
4703};
4704
4705static struct attribute *md_redundancy_attrs[] = {
NeilBrown24dd4692005-11-08 21:39:26 -08004706 &md_scan_mode.attr,
NeilBrown9d888832005-11-08 21:39:26 -08004707 &md_mismatches.attr,
NeilBrown88202a02006-01-06 00:21:36 -08004708 &md_sync_min.attr,
4709 &md_sync_max.attr,
4710 &md_sync_speed.attr,
Bernd Schubert90b08712008-05-23 13:04:38 -07004711 &md_sync_force_parallel.attr,
NeilBrown88202a02006-01-06 00:21:36 -08004712 &md_sync_completed.attr,
Neil Brown5e96ee62008-06-28 08:31:24 +10004713 &md_min_sync.attr,
NeilBrownc6207272008-02-06 01:39:52 -08004714 &md_max_sync.attr,
NeilBrowne464eaf2006-03-27 01:18:14 -08004715 &md_suspend_lo.attr,
4716 &md_suspend_hi.attr,
Paul Clements9b1d1da2006-10-03 01:15:49 -07004717 &md_bitmap.attr,
Iustin Popd7f3d292007-10-16 23:30:54 -07004718 &md_degraded.attr,
NeilBrowneae17012005-11-08 21:39:23 -08004719 NULL,
4720};
NeilBrown411036f2005-11-08 21:39:40 -08004721static struct attribute_group md_redundancy_group = {
4722 .name = NULL,
4723 .attrs = md_redundancy_attrs,
4724};
4725
NeilBrowneae17012005-11-08 21:39:23 -08004726
/*
 * md_attr_show - sysfs ->show dispatcher for md attributes.
 *
 * Takes a reference on the mddev (under all_mddevs_lock, bailing out
 * with -EBUSY if the device is already being torn down) and the mddev
 * mutex before calling the per-attribute show method.
 */
static ssize_t
md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
	struct mddev *mddev = container_of(kobj, struct mddev, kobj);
	ssize_t rv;

	if (!entry->show)
		return -EIO;
	spin_lock(&all_mddevs_lock);
	if (list_empty(&mddev->all_mddevs)) {
		/* mddev has been removed from the global list: dying */
		spin_unlock(&all_mddevs_lock);
		return -EBUSY;
	}
	mddev_get(mddev);
	spin_unlock(&all_mddevs_lock);

	rv = mddev_lock(mddev);
	if (!rv) {
		rv = entry->show(mddev, page);
		mddev_unlock(mddev);
	}
	mddev_put(mddev);
	return rv;
}
4752
4753static ssize_t
4754md_attr_store(struct kobject *kobj, struct attribute *attr,
4755 const char *page, size_t length)
4756{
4757 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
NeilBrownfd01b882011-10-11 16:47:53 +11004758 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
NeilBrown96de1e62005-11-08 21:39:39 -08004759 ssize_t rv;
NeilBrowneae17012005-11-08 21:39:23 -08004760
4761 if (!entry->store)
4762 return -EIO;
NeilBrown67463ac2006-07-10 04:44:19 -07004763 if (!capable(CAP_SYS_ADMIN))
4764 return -EACCES;
NeilBrownaf8a2432011-12-08 15:49:46 +11004765 spin_lock(&all_mddevs_lock);
4766 if (list_empty(&mddev->all_mddevs)) {
4767 spin_unlock(&all_mddevs_lock);
4768 return -EBUSY;
4769 }
4770 mddev_get(mddev);
4771 spin_unlock(&all_mddevs_lock);
Ingo Molnar5dc5cf72006-04-20 02:43:23 -07004772 rv = mddev_lock(mddev);
4773 if (!rv) {
4774 rv = entry->store(mddev, page, length);
4775 mddev_unlock(mddev);
4776 }
NeilBrownaf8a2432011-12-08 15:49:46 +11004777 mddev_put(mddev);
NeilBrown96de1e62005-11-08 21:39:39 -08004778 return rv;
NeilBrowneae17012005-11-08 21:39:23 -08004779}
4780
/*
 * md_free - kobject release method: final teardown of an mddev.
 *
 * Runs when the last reference to mddev->kobj is dropped; releases the
 * sysfs dirent, gendisk and request queue before freeing the mddev.
 */
static void md_free(struct kobject *ko)
{
	struct mddev *mddev = container_of(ko, struct mddev, kobj);

	if (mddev->sysfs_state)
		sysfs_put(mddev->sysfs_state);

	if (mddev->gendisk) {
		del_gendisk(mddev->gendisk);
		put_disk(mddev->gendisk);
	}
	if (mddev->queue)
		blk_cleanup_queue(mddev->queue);

	kfree(mddev);
}
4797
Emese Revfy52cf25d2010-01-19 02:58:23 +01004798static const struct sysfs_ops md_sysfs_ops = {
NeilBrowneae17012005-11-08 21:39:23 -08004799 .show = md_attr_show,
4800 .store = md_attr_store,
4801};
4802static struct kobj_type md_ktype = {
4803 .release = md_free,
4804 .sysfs_ops = &md_sysfs_ops,
4805 .default_attrs = md_default_attrs,
4806};
4807
Linus Torvalds1da177e2005-04-16 15:20:36 -07004808int mdp_major = 0;
4809
Dan Williams5fd3a172009-03-04 00:57:25 -07004810static void mddev_delayed_delete(struct work_struct *ws)
4811{
NeilBrownfd01b882011-10-11 16:47:53 +11004812 struct mddev *mddev = container_of(ws, struct mddev, del_work);
Dan Williams5fd3a172009-03-04 00:57:25 -07004813
NeilBrown43a70502009-12-14 12:49:55 +11004814 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
Dan Williams5fd3a172009-03-04 00:57:25 -07004815 kobject_del(&mddev->kobj);
4816 kobject_put(&mddev->kobj);
4817}
4818
NeilBrownefeb53c2009-01-09 08:31:10 +11004819static int md_alloc(dev_t dev, char *name)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004820{
Arjan van de Ven48c9c272006-03-27 01:18:20 -08004821 static DEFINE_MUTEX(disks_mutex);
NeilBrownfd01b882011-10-11 16:47:53 +11004822 struct mddev *mddev = mddev_find(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004823 struct gendisk *disk;
NeilBrownefeb53c2009-01-09 08:31:10 +11004824 int partitioned;
4825 int shift;
4826 int unit;
Greg Kroah-Hartman3830c622007-12-17 15:54:39 -04004827 int error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004828
4829 if (!mddev)
NeilBrownefeb53c2009-01-09 08:31:10 +11004830 return -ENODEV;
4831
4832 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4833 shift = partitioned ? MdpMinorShift : 0;
4834 unit = MINOR(mddev->unit) >> shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004835
Tejun Heoe804ac72010-10-15 15:36:08 +02004836 /* wait for any previous instance of this device to be
4837 * completely removed (mddev_delayed_delete).
NeilBrownd3374822009-01-09 08:31:10 +11004838 */
Tejun Heoe804ac72010-10-15 15:36:08 +02004839 flush_workqueue(md_misc_wq);
NeilBrownd3374822009-01-09 08:31:10 +11004840
Arjan van de Ven48c9c272006-03-27 01:18:20 -08004841 mutex_lock(&disks_mutex);
NeilBrown0909dc42009-07-01 12:27:21 +10004842 error = -EEXIST;
4843 if (mddev->gendisk)
4844 goto abort;
NeilBrownefeb53c2009-01-09 08:31:10 +11004845
4846 if (name) {
4847 /* Need to ensure that 'name' is not a duplicate.
4848 */
NeilBrownfd01b882011-10-11 16:47:53 +11004849 struct mddev *mddev2;
NeilBrownefeb53c2009-01-09 08:31:10 +11004850 spin_lock(&all_mddevs_lock);
4851
4852 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4853 if (mddev2->gendisk &&
4854 strcmp(mddev2->gendisk->disk_name, name) == 0) {
4855 spin_unlock(&all_mddevs_lock);
NeilBrown0909dc42009-07-01 12:27:21 +10004856 goto abort;
NeilBrownefeb53c2009-01-09 08:31:10 +11004857 }
4858 spin_unlock(&all_mddevs_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004859 }
NeilBrown8b765392009-01-09 08:31:08 +11004860
NeilBrown0909dc42009-07-01 12:27:21 +10004861 error = -ENOMEM;
NeilBrown8b765392009-01-09 08:31:08 +11004862 mddev->queue = blk_alloc_queue(GFP_KERNEL);
NeilBrown0909dc42009-07-01 12:27:21 +10004863 if (!mddev->queue)
4864 goto abort;
NeilBrown409c57f2009-03-31 14:39:39 +11004865 mddev->queue->queuedata = mddev;
4866
NeilBrown409c57f2009-03-31 14:39:39 +11004867 blk_queue_make_request(mddev->queue, md_make_request);
Martin K. Petersenb1bd0552012-01-11 16:27:11 +01004868 blk_set_stacking_limits(&mddev->queue->limits);
NeilBrown8b765392009-01-09 08:31:08 +11004869
Linus Torvalds1da177e2005-04-16 15:20:36 -07004870 disk = alloc_disk(1 << shift);
4871 if (!disk) {
NeilBrown8b765392009-01-09 08:31:08 +11004872 blk_cleanup_queue(mddev->queue);
4873 mddev->queue = NULL;
NeilBrown0909dc42009-07-01 12:27:21 +10004874 goto abort;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004875 }
NeilBrownefeb53c2009-01-09 08:31:10 +11004876 disk->major = MAJOR(mddev->unit);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004877 disk->first_minor = unit << shift;
NeilBrownefeb53c2009-01-09 08:31:10 +11004878 if (name)
4879 strcpy(disk->disk_name, name);
4880 else if (partitioned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004881 sprintf(disk->disk_name, "md_d%d", unit);
Greg Kroah-Hartmance7b0f462005-06-20 21:15:16 -07004882 else
Linus Torvalds1da177e2005-04-16 15:20:36 -07004883 sprintf(disk->disk_name, "md%d", unit);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004884 disk->fops = &md_fops;
4885 disk->private_data = mddev;
4886 disk->queue = mddev->queue;
NeilBrownb0140892011-05-10 17:49:01 +10004887 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
NeilBrown92850bb2008-10-21 13:25:32 +11004888 /* Allow extended partitions. This makes the
NeilBrownd3374822009-01-09 08:31:10 +11004889 * 'mdp' device redundant, but we can't really
NeilBrown92850bb2008-10-21 13:25:32 +11004890 * remove it now.
4891 */
4892 disk->flags |= GENHD_FL_EXT_DEVT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004893 mddev->gendisk = disk;
NeilBrownb0140892011-05-10 17:49:01 +10004894 /* As soon as we call add_disk(), another thread could get
4895 * through to md_open, so make sure it doesn't get too far
4896 */
4897 mutex_lock(&mddev->open_mutex);
4898 add_disk(disk);
4899
Tejun Heoed9e1982008-08-25 19:56:05 +09004900 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4901 &disk_to_dev(disk)->kobj, "%s", "md");
NeilBrown0909dc42009-07-01 12:27:21 +10004902 if (error) {
4903 /* This isn't possible, but as kobject_init_and_add is marked
4904 * __must_check, we must do something with the result
4905 */
NeilBrown5e55e2f2007-03-26 21:32:14 -08004906 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4907 disk->disk_name);
NeilBrown0909dc42009-07-01 12:27:21 +10004908 error = 0;
4909 }
NeilBrown00bcb4a2010-06-01 19:37:23 +10004910 if (mddev->kobj.sd &&
4911 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
NeilBrown43a70502009-12-14 12:49:55 +11004912 printk(KERN_DEBUG "pointless warning\n");
NeilBrownb0140892011-05-10 17:49:01 +10004913 mutex_unlock(&mddev->open_mutex);
NeilBrown0909dc42009-07-01 12:27:21 +10004914 abort:
4915 mutex_unlock(&disks_mutex);
NeilBrown00bcb4a2010-06-01 19:37:23 +10004916 if (!error && mddev->kobj.sd) {
Greg Kroah-Hartman3830c622007-12-17 15:54:39 -04004917 kobject_uevent(&mddev->kobj, KOBJ_ADD);
NeilBrown00bcb4a2010-06-01 19:37:23 +10004918 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
NeilBrownb62b7592008-10-21 13:25:21 +11004919 }
NeilBrownd3374822009-01-09 08:31:10 +11004920 mddev_put(mddev);
NeilBrown0909dc42009-07-01 12:27:21 +10004921 return error;
NeilBrownefeb53c2009-01-09 08:31:10 +11004922}
4923
/*
 * md_probe - block-region probe callback: ensure the md device for
 * @dev exists.  The disk is published by md_alloc() itself, so there
 * is no kobject to hand back here.
 */
static struct kobject *md_probe(dev_t dev, int *part, void *data)
{
	md_alloc(dev, NULL);
	return NULL;
}
4929
NeilBrownefeb53c2009-01-09 08:31:10 +11004930static int add_named_array(const char *val, struct kernel_param *kp)
4931{
4932 /* val must be "md_*" where * is not all digits.
4933 * We allocate an array with a large free minor number, and
4934 * set the name to val. val must not already be an active name.
4935 */
4936 int len = strlen(val);
4937 char buf[DISK_NAME_LEN];
4938
4939 while (len && val[len-1] == '\n')
4940 len--;
4941 if (len >= DISK_NAME_LEN)
4942 return -E2BIG;
4943 strlcpy(buf, val, len+1);
4944 if (strncmp(buf, "md_", 3) != 0)
4945 return -EINVAL;
4946 return md_alloc(0, buf);
4947}
4948
Linus Torvalds1da177e2005-04-16 15:20:36 -07004949static void md_safemode_timeout(unsigned long data)
4950{
NeilBrownfd01b882011-10-11 16:47:53 +11004951 struct mddev *mddev = (struct mddev *) data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004952
Neil Brown0fd62b82008-06-28 08:31:36 +10004953 if (!atomic_read(&mddev->writes_pending)) {
4954 mddev->safemode = 1;
4955 if (mddev->external)
NeilBrown00bcb4a2010-06-01 19:37:23 +10004956 sysfs_notify_dirent_safe(mddev->sysfs_state);
Neil Brown0fd62b82008-06-28 08:31:36 +10004957 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004958 md_wakeup_thread(mddev->thread);
4959}
4960
NeilBrown6ff8d8ec2006-01-06 00:20:15 -08004961static int start_dirty_degraded;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962
NeilBrownfd01b882011-10-11 16:47:53 +11004963int md_run(struct mddev *mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004964{
NeilBrown2604b702006-01-06 00:20:36 -08004965 int err;
NeilBrown3cb03002011-10-11 16:45:26 +11004966 struct md_rdev *rdev;
NeilBrown84fc4b52011-10-11 16:49:58 +11004967 struct md_personality *pers;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004968
NeilBrowna757e642005-04-16 15:26:42 -07004969 if (list_empty(&mddev->disks))
4970 /* cannot run an array with no devices.. */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004971 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972
4973 if (mddev->pers)
4974 return -EBUSY;
NeilBrownbb4f1e92010-08-08 21:18:03 +10004975 /* Cannot run until previous stop completes properly */
4976 if (mddev->sysfs_active)
4977 return -EBUSY;
NeilBrownb6eb1272010-04-15 10:13:47 +10004978
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979 /*
4980 * Analyze all RAID superblock(s)
4981 */
NeilBrown1ec4a932008-02-06 01:39:53 -08004982 if (!mddev->raid_disks) {
4983 if (!mddev->persistent)
4984 return -EINVAL;
NeilBrowna757e642005-04-16 15:26:42 -07004985 analyze_sbs(mddev);
NeilBrown1ec4a932008-02-06 01:39:53 -08004986 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004987
NeilBrownd9d166c2006-01-06 00:20:51 -08004988 if (mddev->level != LEVEL_NONE)
4989 request_module("md-level-%d", mddev->level);
4990 else if (mddev->clevel[0])
4991 request_module("md-%s", mddev->clevel);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992
4993 /*
4994 * Drop all container device buffers, from now on
4995 * the only valid external interface is through the md
4996 * device.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004997 */
NeilBrowndafb20f2012-03-19 12:46:39 +11004998 rdev_for_each(rdev, mddev) {
NeilBrownb2d444d2005-11-08 21:39:31 -08004999 if (test_bit(Faulty, &rdev->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005000 continue;
5001 sync_blockdev(rdev->bdev);
Peter Zijlstraf98393a2007-05-06 14:49:54 -07005002 invalidate_bdev(rdev->bdev);
NeilBrownf0d76d72007-07-17 04:06:12 -07005003
5004 /* perform some consistency tests on the device.
5005 * We don't want the data to overlap the metadata,
Andre Noll58c0fed2009-03-31 14:33:13 +11005006 * Internal Bitmap issues have been handled elsewhere.
NeilBrownf0d76d72007-07-17 04:06:12 -07005007 */
Jonathan Brassowa6ff7e02011-01-14 09:14:34 +11005008 if (rdev->meta_bdev) {
5009 /* Nothing to check */;
5010 } else if (rdev->data_offset < rdev->sb_start) {
Andre Noll58c0fed2009-03-31 14:33:13 +11005011 if (mddev->dev_sectors &&
5012 rdev->data_offset + mddev->dev_sectors
Andre Noll0f420352008-07-11 22:02:23 +10005013 > rdev->sb_start) {
NeilBrownf0d76d72007-07-17 04:06:12 -07005014 printk("md: %s: data overlaps metadata\n",
5015 mdname(mddev));
5016 return -EINVAL;
5017 }
5018 } else {
Andre Noll0f420352008-07-11 22:02:23 +10005019 if (rdev->sb_start + rdev->sb_size/512
NeilBrownf0d76d72007-07-17 04:06:12 -07005020 > rdev->data_offset) {
5021 printk("md: %s: metadata overlaps data\n",
5022 mdname(mddev));
5023 return -EINVAL;
5024 }
5025 }
NeilBrown00bcb4a2010-06-01 19:37:23 +10005026 sysfs_notify_dirent_safe(rdev->sysfs_state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005027 }
5028
NeilBrowna167f662010-10-26 18:31:13 +11005029 if (mddev->bio_set == NULL)
NeilBrowna519b262011-07-28 07:56:24 +10005030 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
NeilBrownfd01b882011-10-11 16:47:53 +11005031 sizeof(struct mddev *));
NeilBrowna167f662010-10-26 18:31:13 +11005032
Linus Torvalds1da177e2005-04-16 15:20:36 -07005033 spin_lock(&pers_lock);
NeilBrownd9d166c2006-01-06 00:20:51 -08005034 pers = find_pers(mddev->level, mddev->clevel);
NeilBrown2604b702006-01-06 00:20:36 -08005035 if (!pers || !try_module_get(pers->owner)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005036 spin_unlock(&pers_lock);
NeilBrownd9d166c2006-01-06 00:20:51 -08005037 if (mddev->level != LEVEL_NONE)
5038 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5039 mddev->level);
5040 else
5041 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5042 mddev->clevel);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005043 return -EINVAL;
5044 }
NeilBrown2604b702006-01-06 00:20:36 -08005045 mddev->pers = pers;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046 spin_unlock(&pers_lock);
NeilBrown34817e82009-03-31 14:39:38 +11005047 if (mddev->level != pers->level) {
5048 mddev->level = pers->level;
5049 mddev->new_level = pers->level;
5050 }
NeilBrownd9d166c2006-01-06 00:20:51 -08005051 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005052
NeilBrownf6705572006-03-27 01:18:11 -08005053 if (mddev->reshape_position != MaxSector &&
NeilBrown63c70c42006-03-27 01:18:13 -08005054 pers->start_reshape == NULL) {
NeilBrownf6705572006-03-27 01:18:11 -08005055 /* This personality cannot handle reshaping... */
5056 mddev->pers = NULL;
5057 module_put(pers->owner);
5058 return -EINVAL;
5059 }
5060
NeilBrown7dd5e7c32007-02-28 20:11:35 -08005061 if (pers->sync_request) {
5062 /* Warn if this is a potentially silly
5063 * configuration.
5064 */
5065 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
NeilBrown3cb03002011-10-11 16:45:26 +11005066 struct md_rdev *rdev2;
NeilBrown7dd5e7c32007-02-28 20:11:35 -08005067 int warned = 0;
Cheng Renquan159ec1f2009-01-09 08:31:08 +11005068
NeilBrowndafb20f2012-03-19 12:46:39 +11005069 rdev_for_each(rdev, mddev)
5070 rdev_for_each(rdev2, mddev) {
NeilBrown7dd5e7c32007-02-28 20:11:35 -08005071 if (rdev < rdev2 &&
5072 rdev->bdev->bd_contains ==
5073 rdev2->bdev->bd_contains) {
5074 printk(KERN_WARNING
5075 "%s: WARNING: %s appears to be"
5076 " on the same physical disk as"
5077 " %s.\n",
5078 mdname(mddev),
5079 bdevname(rdev->bdev,b),
5080 bdevname(rdev2->bdev,b2));
5081 warned = 1;
5082 }
5083 }
Cheng Renquan159ec1f2009-01-09 08:31:08 +11005084
NeilBrown7dd5e7c32007-02-28 20:11:35 -08005085 if (warned)
5086 printk(KERN_WARNING
5087 "True protection against single-disk"
5088 " failure might be compromised.\n");
5089 }
5090
NeilBrown657390d2005-08-26 18:34:16 -07005091 mddev->recovery = 0;
Andre Noll58c0fed2009-03-31 14:33:13 +11005092 /* may be over-ridden by personality */
5093 mddev->resync_max_sectors = mddev->dev_sectors;
5094
NeilBrown6ff8d8ec2006-01-06 00:20:15 -08005095 mddev->ok_start_degraded = start_dirty_degraded;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005096
NeilBrown0f9552b52009-12-30 12:08:50 +11005097 if (start_readonly && mddev->ro == 0)
NeilBrownf91de922005-11-08 21:39:36 -08005098 mddev->ro = 2; /* read-only, but switch on first write */
5099
NeilBrownb15c2e52006-01-06 00:20:16 -08005100 err = mddev->pers->run(mddev);
Andre Noll13e53df2008-03-26 00:07:03 +01005101 if (err)
5102 printk(KERN_ERR "md: pers->run() failed ...\n");
Dan Williamsb522adc2009-03-31 15:00:31 +11005103 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5104 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5105 " but 'external_size' not in effect?\n", __func__);
5106 printk(KERN_ERR
5107 "md: invalid array_size %llu > default size %llu\n",
5108 (unsigned long long)mddev->array_sectors / 2,
5109 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5110 err = -EINVAL;
5111 mddev->pers->stop(mddev);
5112 }
5113 if (err == 0 && mddev->pers->sync_request) {
NeilBrownb15c2e52006-01-06 00:20:16 -08005114 err = bitmap_create(mddev);
5115 if (err) {
5116 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5117 mdname(mddev), err);
5118 mddev->pers->stop(mddev);
5119 }
5120 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005121 if (err) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005122 module_put(mddev->pers->owner);
5123 mddev->pers = NULL;
NeilBrown32a76272005-06-21 17:17:14 -07005124 bitmap_destroy(mddev);
5125 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005126 }
NeilBrown5e55e2f2007-03-26 21:32:14 -08005127 if (mddev->pers->sync_request) {
NeilBrown00bcb4a2010-06-01 19:37:23 +10005128 if (mddev->kobj.sd &&
5129 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
NeilBrown5e55e2f2007-03-26 21:32:14 -08005130 printk(KERN_WARNING
5131 "md: cannot register extra attributes for %s\n",
5132 mdname(mddev));
NeilBrown00bcb4a2010-06-01 19:37:23 +10005133 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
NeilBrown5e55e2f2007-03-26 21:32:14 -08005134 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
NeilBrownfd9d49c2005-11-08 21:39:42 -08005135 mddev->ro = 0;
5136
Linus Torvalds1da177e2005-04-16 15:20:36 -07005137 atomic_set(&mddev->writes_pending,0);
Robert Becker1e509152009-12-14 12:49:58 +11005138 atomic_set(&mddev->max_corr_read_errors,
5139 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005140 mddev->safemode = 0;
5141 mddev->safemode_timer.function = md_safemode_timeout;
5142 mddev->safemode_timer.data = (unsigned long) mddev;
NeilBrown16f17b32006-06-26 00:27:37 -07005143 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005144 mddev->in_sync = 1;
NeilBrown0ca69882011-01-14 09:14:33 +11005145 smp_wmb();
5146 mddev->ready = 1;
NeilBrowndafb20f2012-03-19 12:46:39 +11005147 rdev_for_each(rdev, mddev)
Namhyung Kim36fad852011-07-27 11:00:36 +10005148 if (rdev->raid_disk >= 0)
5149 if (sysfs_link_rdev(mddev, rdev))
NeilBrown00bcb4a2010-06-01 19:37:23 +10005150 /* failure here is OK */;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005151
5152 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5153
NeilBrown850b2b422006-10-03 01:15:46 -07005154 if (mddev->flags)
5155 md_update_sb(mddev, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156
NeilBrownd7603b72006-01-06 00:20:30 -08005157 md_new_event(mddev);
NeilBrown00bcb4a2010-06-01 19:37:23 +10005158 sysfs_notify_dirent_safe(mddev->sysfs_state);
5159 sysfs_notify_dirent_safe(mddev->sysfs_action);
Neil Browna99ac972008-06-28 08:31:43 +10005160 sysfs_notify(&mddev->kobj, NULL, "degraded");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005161 return 0;
5162}
NeilBrown390ee602010-06-01 19:37:27 +10005163EXPORT_SYMBOL_GPL(md_run);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005164
NeilBrownfd01b882011-10-11 16:47:53 +11005165static int do_md_run(struct mddev *mddev)
NeilBrownfe60b012010-03-29 11:10:42 +11005166{
5167 int err;
5168
5169 err = md_run(mddev);
5170 if (err)
5171 goto out;
NeilBrown69e51b42010-06-01 19:37:35 +10005172 err = bitmap_load(mddev);
5173 if (err) {
5174 bitmap_destroy(mddev);
5175 goto out;
5176 }
Jonathan Brassow0fd018a2011-06-07 17:49:36 -05005177
5178 md_wakeup_thread(mddev->thread);
5179 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5180
NeilBrownfe60b012010-03-29 11:10:42 +11005181 set_capacity(mddev->gendisk, mddev->array_sectors);
5182 revalidate_disk(mddev->gendisk);
NeilBrownf0b4f7e2011-02-24 17:26:41 +11005183 mddev->changed = 1;
NeilBrownfe60b012010-03-29 11:10:42 +11005184 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5185out:
5186 return err;
5187}
5188
NeilBrownfd01b882011-10-11 16:47:53 +11005189static int restart_array(struct mddev *mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005190{
5191 struct gendisk *disk = mddev->gendisk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192
Andre Noll80fab1d2008-07-11 22:02:21 +10005193 /* Complain if it has no devices */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194 if (list_empty(&mddev->disks))
Andre Noll80fab1d2008-07-11 22:02:21 +10005195 return -ENXIO;
5196 if (!mddev->pers)
5197 return -EINVAL;
5198 if (!mddev->ro)
5199 return -EBUSY;
5200 mddev->safemode = 0;
5201 mddev->ro = 0;
5202 set_disk_ro(disk, 0);
5203 printk(KERN_INFO "md: %s switched to read-write mode.\n",
5204 mdname(mddev));
5205 /* Kick recovery or resync if necessary */
5206 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5207 md_wakeup_thread(mddev->thread);
5208 md_wakeup_thread(mddev->sync_thread);
NeilBrown00bcb4a2010-06-01 19:37:23 +10005209 sysfs_notify_dirent_safe(mddev->sysfs_state);
Andre Noll80fab1d2008-07-11 22:02:21 +10005210 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005211}
5212
NeilBrownacc55e22006-06-26 00:27:47 -07005213/* similar to deny_write_access, but accounts for our holding a reference
5214 * to the file ourselves */
5215static int deny_bitmap_write_access(struct file * file)
5216{
5217 struct inode *inode = file->f_mapping->host;
5218
5219 spin_lock(&inode->i_lock);
5220 if (atomic_read(&inode->i_writecount) > 1) {
5221 spin_unlock(&inode->i_lock);
5222 return -ETXTBSY;
5223 }
5224 atomic_set(&inode->i_writecount, -1);
5225 spin_unlock(&inode->i_lock);
5226
5227 return 0;
5228}
5229
NeilBrown43a70502009-12-14 12:49:55 +11005230void restore_bitmap_write_access(struct file *file)
NeilBrownacc55e22006-06-26 00:27:47 -07005231{
5232 struct inode *inode = file->f_mapping->host;
5233
5234 spin_lock(&inode->i_lock);
5235 atomic_set(&inode->i_writecount, 1);
5236 spin_unlock(&inode->i_lock);
5237}
5238
NeilBrownfd01b882011-10-11 16:47:53 +11005239static void md_clean(struct mddev *mddev)
NeilBrown6177b472010-03-29 11:37:13 +11005240{
5241 mddev->array_sectors = 0;
5242 mddev->external_size = 0;
5243 mddev->dev_sectors = 0;
5244 mddev->raid_disks = 0;
5245 mddev->recovery_cp = 0;
5246 mddev->resync_min = 0;
5247 mddev->resync_max = MaxSector;
5248 mddev->reshape_position = MaxSector;
5249 mddev->external = 0;
5250 mddev->persistent = 0;
5251 mddev->level = LEVEL_NONE;
5252 mddev->clevel[0] = 0;
5253 mddev->flags = 0;
5254 mddev->ro = 0;
5255 mddev->metadata_type[0] = 0;
5256 mddev->chunk_sectors = 0;
5257 mddev->ctime = mddev->utime = 0;
5258 mddev->layout = 0;
5259 mddev->max_disks = 0;
5260 mddev->events = 0;
NeilBrowna8707c02010-05-18 09:28:43 +10005261 mddev->can_decrease_events = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005262 mddev->delta_disks = 0;
NeilBrown2c810cd2012-05-21 09:27:00 +10005263 mddev->reshape_backwards = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005264 mddev->new_level = LEVEL_NONE;
5265 mddev->new_layout = 0;
5266 mddev->new_chunk_sectors = 0;
5267 mddev->curr_resync = 0;
5268 mddev->resync_mismatches = 0;
5269 mddev->suspend_lo = mddev->suspend_hi = 0;
5270 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5271 mddev->recovery = 0;
5272 mddev->in_sync = 0;
NeilBrownf0b4f7e2011-02-24 17:26:41 +11005273 mddev->changed = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005274 mddev->degraded = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005275 mddev->safemode = 0;
NeilBrown050b6612012-03-19 12:46:39 +11005276 mddev->merge_check_needed = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005277 mddev->bitmap_info.offset = 0;
5278 mddev->bitmap_info.default_offset = 0;
5279 mddev->bitmap_info.chunksize = 0;
5280 mddev->bitmap_info.daemon_sleep = 0;
5281 mddev->bitmap_info.max_write_behind = 0;
5282}
5283
NeilBrownfd01b882011-10-11 16:47:53 +11005284static void __md_stop_writes(struct mddev *mddev)
NeilBrowna047e122010-03-29 12:07:53 +11005285{
5286 if (mddev->sync_thread) {
5287 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5288 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
NeilBrown7ebc0be2011-01-14 09:14:33 +11005289 reap_sync_thread(mddev);
NeilBrowna047e122010-03-29 12:07:53 +11005290 }
5291
5292 del_timer_sync(&mddev->safemode_timer);
5293
5294 bitmap_flush(mddev);
5295 md_super_wait(mddev);
5296
5297 if (!mddev->in_sync || mddev->flags) {
5298 /* mark array as shutdown cleanly */
5299 mddev->in_sync = 1;
5300 md_update_sb(mddev, 1);
5301 }
5302}
NeilBrowndefad612011-01-14 09:14:33 +11005303
NeilBrownfd01b882011-10-11 16:47:53 +11005304void md_stop_writes(struct mddev *mddev)
NeilBrowndefad612011-01-14 09:14:33 +11005305{
5306 mddev_lock(mddev);
5307 __md_stop_writes(mddev);
5308 mddev_unlock(mddev);
5309}
NeilBrown390ee602010-06-01 19:37:27 +10005310EXPORT_SYMBOL_GPL(md_stop_writes);
NeilBrowna047e122010-03-29 12:07:53 +11005311
NeilBrownfd01b882011-10-11 16:47:53 +11005312void md_stop(struct mddev *mddev)
NeilBrown6177b472010-03-29 11:37:13 +11005313{
NeilBrown0ca69882011-01-14 09:14:33 +11005314 mddev->ready = 0;
NeilBrown6177b472010-03-29 11:37:13 +11005315 mddev->pers->stop(mddev);
5316 if (mddev->pers->sync_request && mddev->to_remove == NULL)
5317 mddev->to_remove = &md_redundancy_group;
5318 module_put(mddev->pers->owner);
5319 mddev->pers = NULL;
NeilBrowncca9cf92010-04-01 12:08:16 +11005320 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
NeilBrown6177b472010-03-29 11:37:13 +11005321}
NeilBrown390ee602010-06-01 19:37:27 +10005322EXPORT_SYMBOL_GPL(md_stop);
NeilBrown6177b472010-03-29 11:37:13 +11005323
NeilBrownfd01b882011-10-11 16:47:53 +11005324static int md_set_readonly(struct mddev *mddev, int is_open)
NeilBrowna4bd82d02010-03-29 13:23:10 +11005325{
5326 int err = 0;
5327 mutex_lock(&mddev->open_mutex);
5328 if (atomic_read(&mddev->openers) > is_open) {
5329 printk("md: %s still in use.\n",mdname(mddev));
5330 err = -EBUSY;
5331 goto out;
5332 }
5333 if (mddev->pers) {
NeilBrowndefad612011-01-14 09:14:33 +11005334 __md_stop_writes(mddev);
NeilBrowna4bd82d02010-03-29 13:23:10 +11005335
5336 err = -ENXIO;
5337 if (mddev->ro==1)
5338 goto out;
5339 mddev->ro = 1;
5340 set_disk_ro(mddev->gendisk, 1);
5341 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
NeilBrown00bcb4a2010-06-01 19:37:23 +10005342 sysfs_notify_dirent_safe(mddev->sysfs_state);
NeilBrowna4bd82d02010-03-29 13:23:10 +11005343 err = 0;
5344 }
5345out:
5346 mutex_unlock(&mddev->open_mutex);
5347 return err;
5348}
5349
NeilBrown9e653b62006-06-26 00:27:58 -07005350/* mode:
5351 * 0 - completely stop and dis-assemble array
NeilBrown9e653b62006-06-26 00:27:58 -07005352 * 2 - stop but do not disassemble array
5353 */
NeilBrownfd01b882011-10-11 16:47:53 +11005354static int do_md_stop(struct mddev * mddev, int mode, int is_open)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005355{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005356 struct gendisk *disk = mddev->gendisk;
NeilBrown3cb03002011-10-11 16:45:26 +11005357 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005358
NeilBrownc8c00a62009-08-10 12:50:52 +10005359 mutex_lock(&mddev->open_mutex);
NeilBrownbb4f1e92010-08-08 21:18:03 +10005360 if (atomic_read(&mddev->openers) > is_open ||
5361 mddev->sysfs_active) {
Neil Browndf5b20c2008-07-11 22:02:22 +10005362 printk("md: %s still in use.\n",mdname(mddev));
NeilBrown6e17b022010-08-07 21:41:19 +10005363 mutex_unlock(&mddev->open_mutex);
5364 return -EBUSY;
5365 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005366
NeilBrown6e17b022010-08-07 21:41:19 +10005367 if (mddev->pers) {
NeilBrowna4bd82d02010-03-29 13:23:10 +11005368 if (mddev->ro)
5369 set_disk_ro(disk, 0);
NeilBrown409c57f2009-03-31 14:39:39 +11005370
NeilBrowndefad612011-01-14 09:14:33 +11005371 __md_stop_writes(mddev);
NeilBrowna4bd82d02010-03-29 13:23:10 +11005372 md_stop(mddev);
5373 mddev->queue->merge_bvec_fn = NULL;
NeilBrowna4bd82d02010-03-29 13:23:10 +11005374 mddev->queue->backing_dev_info.congested_fn = NULL;
NeilBrown6177b472010-03-29 11:37:13 +11005375
NeilBrowna4bd82d02010-03-29 13:23:10 +11005376 /* tell userspace to handle 'inactive' */
NeilBrown00bcb4a2010-06-01 19:37:23 +10005377 sysfs_notify_dirent_safe(mddev->sysfs_state);
NeilBrown0d4ca602006-12-10 02:20:44 -08005378
NeilBrowndafb20f2012-03-19 12:46:39 +11005379 rdev_for_each(rdev, mddev)
Namhyung Kim36fad852011-07-27 11:00:36 +10005380 if (rdev->raid_disk >= 0)
5381 sysfs_unlink_rdev(mddev, rdev);
NeilBrownc4647292009-05-07 12:51:06 +10005382
NeilBrowna4bd82d02010-03-29 13:23:10 +11005383 set_capacity(disk, 0);
NeilBrown6e17b022010-08-07 21:41:19 +10005384 mutex_unlock(&mddev->open_mutex);
NeilBrownf0b4f7e2011-02-24 17:26:41 +11005385 mddev->changed = 1;
NeilBrowna4bd82d02010-03-29 13:23:10 +11005386 revalidate_disk(disk);
NeilBrown0d4ca602006-12-10 02:20:44 -08005387
NeilBrowna4bd82d02010-03-29 13:23:10 +11005388 if (mddev->ro)
5389 mddev->ro = 0;
NeilBrown6e17b022010-08-07 21:41:19 +10005390 } else
5391 mutex_unlock(&mddev->open_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005392 /*
5393 * Free resources if final stop
5394 */
NeilBrown9e653b62006-06-26 00:27:58 -07005395 if (mode == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5397
NeilBrown978f9462006-02-02 14:28:05 -08005398 bitmap_destroy(mddev);
NeilBrownc3d97142009-12-14 12:49:52 +11005399 if (mddev->bitmap_info.file) {
5400 restore_bitmap_write_access(mddev->bitmap_info.file);
5401 fput(mddev->bitmap_info.file);
5402 mddev->bitmap_info.file = NULL;
NeilBrown978f9462006-02-02 14:28:05 -08005403 }
NeilBrownc3d97142009-12-14 12:49:52 +11005404 mddev->bitmap_info.offset = 0;
NeilBrown978f9462006-02-02 14:28:05 -08005405
Linus Torvalds1da177e2005-04-16 15:20:36 -07005406 export_array(mddev);
5407
NeilBrown6177b472010-03-29 11:37:13 +11005408 md_clean(mddev);
NeilBrown934d9c22008-10-28 17:01:23 +11005409 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
NeilBrownefeb53c2009-01-09 08:31:10 +11005410 if (mddev->hold_active == UNTIL_STOP)
5411 mddev->hold_active = 0;
NeilBrowna4bd82d02010-03-29 13:23:10 +11005412 }
Martin K. Petersen3f9d99c2009-03-31 14:27:02 +11005413 blk_integrity_unregister(disk);
NeilBrownd7603b72006-01-06 00:20:30 -08005414 md_new_event(mddev);
NeilBrown00bcb4a2010-06-01 19:37:23 +10005415 sysfs_notify_dirent_safe(mddev->sysfs_state);
NeilBrown6e17b022010-08-07 21:41:19 +10005416 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005417}
5418
Jeff Garzikfdee8ae2006-12-10 02:20:50 -08005419#ifndef MODULE
NeilBrownfd01b882011-10-11 16:47:53 +11005420static void autorun_array(struct mddev *mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005421{
NeilBrown3cb03002011-10-11 16:45:26 +11005422 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005423 int err;
5424
NeilBrowna757e642005-04-16 15:26:42 -07005425 if (list_empty(&mddev->disks))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005426 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005427
5428 printk(KERN_INFO "md: running: ");
5429
NeilBrowndafb20f2012-03-19 12:46:39 +11005430 rdev_for_each(rdev, mddev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005431 char b[BDEVNAME_SIZE];
5432 printk("<%s>", bdevname(rdev->bdev,b));
5433 }
5434 printk("\n");
5435
NeilBrownd710e132008-10-13 11:55:12 +11005436 err = do_md_run(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005437 if (err) {
5438 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
NeilBrownd710e132008-10-13 11:55:12 +11005439 do_md_stop(mddev, 0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005440 }
5441}
5442
5443/*
5444 * lets try to run arrays based on all disks that have arrived
5445 * until now. (those are in pending_raid_disks)
5446 *
5447 * the method: pick the first pending disk, collect all disks with
5448 * the same UUID, remove all from the pending list and put them into
5449 * the 'same_array' list. Then order this list based on superblock
5450 * update time (freshest comes first), kick out 'old' disks and
5451 * compare superblocks. If everything's fine then run it.
5452 *
5453 * If "unit" is allocated, then bump its reference count
5454 */
static void autorun_devices(int part)
{
	struct md_rdev *rdev0, *rdev, *tmp;
	struct mddev *mddev;
	char b[BDEVNAME_SIZE];

	printk(KERN_INFO "md: autorun ...\n");
	while (!list_empty(&pending_raid_disks)) {
		int unit;
		dev_t dev;
		LIST_HEAD(candidates);
		/* seed device: everything matching its superblock joins it */
		rdev0 = list_entry(pending_raid_disks.next,
				   struct md_rdev, same_set);

		printk(KERN_INFO "md: considering %s ...\n",
		       bdevname(rdev0->bdev,b));
		INIT_LIST_HEAD(&candidates);
		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
			if (super_90_load(rdev, rdev0, 0) >= 0) {
				printk(KERN_INFO "md: adding %s ...\n",
				       bdevname(rdev->bdev,b));
				list_move(&rdev->same_set, &candidates);
			}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
		if (part) {
			/* partitionable arrays live on mdp_major */
			dev = MKDEV(mdp_major,
				    rdev0->preferred_minor << MdpMinorShift);
			unit = MINOR(dev) >> MdpMinorShift;
		} else {
			dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
			unit = MINOR(dev);
		}
		/* preferred_minor must round-trip through MKDEV/MINOR */
		if (rdev0->preferred_minor != unit) {
			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
			break;
		}

		md_probe(dev, NULL, NULL);
		mddev = mddev_find(dev);
		if (!mddev || !mddev->gendisk) {
			if (mddev)
				mddev_put(mddev);
			printk(KERN_ERR
			       "md: cannot allocate memory for md drive.\n");
			break;
		}
		if (mddev_lock(mddev))
			printk(KERN_WARNING "md: %s locked, cannot run\n",
			       mdname(mddev));
		else if (mddev->raid_disks || mddev->major_version
			 || !list_empty(&mddev->disks)) {
			printk(KERN_WARNING
			       "md: %s already running, cannot run %s\n",
			       mdname(mddev), bdevname(rdev0->bdev,b));
			mddev_unlock(mddev);
		} else {
			printk(KERN_INFO "md: created %s\n", mdname(mddev));
			mddev->persistent = 1;
			/* hand each candidate to the array; on bind failure
			 * the rdev is released back to the system */
			rdev_for_each_list(rdev, tmp, &candidates) {
				list_del_init(&rdev->same_set);
				if (bind_rdev_to_array(rdev, mddev))
					export_rdev(rdev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
		}
		/* on success, candidates will be empty, on error
		 * it won't...
		 */
		rdev_for_each_list(rdev, tmp, &candidates) {
			list_del_init(&rdev->same_set);
			export_rdev(rdev);
		}
		mddev_put(mddev);
	}
	printk(KERN_INFO "md: ... autorun DONE.\n");
}
Jeff Garzikfdee8ae2006-12-10 02:20:50 -08005537#endif /* !MODULE */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005538
Linus Torvalds1da177e2005-04-16 15:20:36 -07005539static int get_version(void __user * arg)
5540{
5541 mdu_version_t ver;
5542
5543 ver.major = MD_MAJOR_VERSION;
5544 ver.minor = MD_MINOR_VERSION;
5545 ver.patchlevel = MD_PATCHLEVEL_VERSION;
5546
5547 if (copy_to_user(arg, &ver, sizeof(ver)))
5548 return -EFAULT;
5549
5550 return 0;
5551}
5552
NeilBrownfd01b882011-10-11 16:47:53 +11005553static int get_array_info(struct mddev * mddev, void __user * arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005554{
5555 mdu_array_info_t info;
NeilBrowna9f326e2009-09-23 18:06:41 +10005556 int nr,working,insync,failed,spare;
NeilBrown3cb03002011-10-11 16:45:26 +11005557 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005558
NeilBrowna9f326e2009-09-23 18:06:41 +10005559 nr=working=insync=failed=spare=0;
NeilBrowndafb20f2012-03-19 12:46:39 +11005560 rdev_for_each(rdev, mddev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005561 nr++;
NeilBrownb2d444d2005-11-08 21:39:31 -08005562 if (test_bit(Faulty, &rdev->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005563 failed++;
5564 else {
5565 working++;
NeilBrownb2d444d2005-11-08 21:39:31 -08005566 if (test_bit(In_sync, &rdev->flags))
NeilBrowna9f326e2009-09-23 18:06:41 +10005567 insync++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005568 else
5569 spare++;
5570 }
5571 }
5572
5573 info.major_version = mddev->major_version;
5574 info.minor_version = mddev->minor_version;
5575 info.patch_version = MD_PATCHLEVEL_VERSION;
5576 info.ctime = mddev->ctime;
5577 info.level = mddev->level;
Andre Noll58c0fed2009-03-31 14:33:13 +11005578 info.size = mddev->dev_sectors / 2;
5579 if (info.size != mddev->dev_sectors / 2) /* overflow */
NeilBrown284ae7c2006-02-03 03:03:40 -08005580 info.size = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005581 info.nr_disks = nr;
5582 info.raid_disks = mddev->raid_disks;
5583 info.md_minor = mddev->md_minor;
5584 info.not_persistent= !mddev->persistent;
5585
5586 info.utime = mddev->utime;
5587 info.state = 0;
5588 if (mddev->in_sync)
5589 info.state = (1<<MD_SB_CLEAN);
NeilBrownc3d97142009-12-14 12:49:52 +11005590 if (mddev->bitmap && mddev->bitmap_info.offset)
NeilBrown36fa3062005-09-09 16:23:45 -07005591 info.state = (1<<MD_SB_BITMAP_PRESENT);
NeilBrowna9f326e2009-09-23 18:06:41 +10005592 info.active_disks = insync;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005593 info.working_disks = working;
5594 info.failed_disks = failed;
5595 info.spare_disks = spare;
5596
5597 info.layout = mddev->layout;
Andre Noll9d8f0362009-06-18 08:45:01 +10005598 info.chunk_size = mddev->chunk_sectors << 9;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005599
5600 if (copy_to_user(arg, &info, sizeof(info)))
5601 return -EFAULT;
5602
5603 return 0;
5604}
5605
NeilBrownfd01b882011-10-11 16:47:53 +11005606static int get_bitmap_file(struct mddev * mddev, void __user * arg)
NeilBrown32a76272005-06-21 17:17:14 -07005607{
5608 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5609 char *ptr, *buf = NULL;
5610 int err = -ENOMEM;
5611
Dan Williamsb5470dc2008-06-27 21:44:04 -07005612 if (md_allow_write(mddev))
5613 file = kmalloc(sizeof(*file), GFP_NOIO);
5614 else
5615 file = kmalloc(sizeof(*file), GFP_KERNEL);
NeilBrown2a2275d2007-01-26 00:57:11 -08005616
NeilBrown32a76272005-06-21 17:17:14 -07005617 if (!file)
5618 goto out;
5619
5620 /* bitmap disabled, zero the first byte and copy out */
5621 if (!mddev->bitmap || !mddev->bitmap->file) {
5622 file->pathname[0] = '\0';
5623 goto copy_out;
5624 }
5625
5626 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5627 if (!buf)
5628 goto out;
5629
Christoph Hellwig6bcfd602008-05-23 13:04:34 -07005630 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
5631 if (IS_ERR(ptr))
NeilBrown32a76272005-06-21 17:17:14 -07005632 goto out;
5633
5634 strcpy(file->pathname, ptr);
5635
5636copy_out:
5637 err = 0;
5638 if (copy_to_user(arg, file, sizeof(*file)))
5639 err = -EFAULT;
5640out:
5641 kfree(buf);
5642 kfree(file);
5643 return err;
5644}
5645
NeilBrownfd01b882011-10-11 16:47:53 +11005646static int get_disk_info(struct mddev * mddev, void __user * arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005647{
5648 mdu_disk_info_t info;
NeilBrown3cb03002011-10-11 16:45:26 +11005649 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005650
5651 if (copy_from_user(&info, arg, sizeof(info)))
5652 return -EFAULT;
5653
Andre Noll26ef3792008-07-11 22:02:21 +10005654 rdev = find_rdev_nr(mddev, info.number);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005655 if (rdev) {
5656 info.major = MAJOR(rdev->bdev->bd_dev);
5657 info.minor = MINOR(rdev->bdev->bd_dev);
5658 info.raid_disk = rdev->raid_disk;
5659 info.state = 0;
NeilBrownb2d444d2005-11-08 21:39:31 -08005660 if (test_bit(Faulty, &rdev->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005661 info.state |= (1<<MD_DISK_FAULTY);
NeilBrownb2d444d2005-11-08 21:39:31 -08005662 else if (test_bit(In_sync, &rdev->flags)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005663 info.state |= (1<<MD_DISK_ACTIVE);
5664 info.state |= (1<<MD_DISK_SYNC);
5665 }
NeilBrown8ddf9ef2005-09-09 16:23:45 -07005666 if (test_bit(WriteMostly, &rdev->flags))
5667 info.state |= (1<<MD_DISK_WRITEMOSTLY);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005668 } else {
5669 info.major = info.minor = 0;
5670 info.raid_disk = -1;
5671 info.state = (1<<MD_DISK_REMOVED);
5672 }
5673
5674 if (copy_to_user(arg, &info, sizeof(info)))
5675 return -EFAULT;
5676
5677 return 0;
5678}
5679
/* ADD_NEW_DISK ioctl handler.  Three distinct modes depending on array state:
 *  1. mddev->raid_disks == 0: array is being assembled — the device must
 *     carry a superblock, which is checked for a matching UUID against any
 *     already-bound member.
 *  2. mddev->pers set: array is running — the device is hot-added as a
 *     spare (or added immediately for personalities without hot_remove_disk,
 *     e.g. geometry changes).
 *  3. otherwise: building a new v0.90 array piece by piece before RUN_ARRAY.
 * Returns 0 on success or a negative errno.
 */
static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	/* Reject major/minor values that don't survive the MKDEV round-trip. */
	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* Compare against the first already-imported member:
			 * all devices of one array must share a UUID.
			 */
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares". They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			/* No on-disk superblock to trust: take role and
			 * In_sync state directly from the caller's info.
			 */
			if (info->state & (1<<MD_DISK_SYNC) &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				set_bit(In_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		    (!test_bit(In_sync, &rdev->flags) ||
		     rdev->raid_disk != info->raid_disk)) {
			/* This was a hot-add request, but events doesn't
			 * match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

		/* Remember the old slot so recovery can try to re-use it. */
		if (test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = rdev->raid_disk;
		else
			rdev->saved_raid_disk = -1;

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);

		/* Bind as a spare; recovery (kicked below) assigns a slot. */
		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (!err && !mddev->pers->hot_remove_disk) {
			/* If there is hot_add_disk but no hot_remove_disk
			 * then added disks for geometry changes,
			 * and should be added immediately.
			 */
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
			err = mddev->pers->hot_add_disk(mddev, rdev);
			if (err)
				unbind_rdev_from_array(rdev);
		}
		if (err)
			export_rdev(rdev);
		else
			sysfs_notify_dirent_safe(rdev->sysfs_state);

		md_update_sb(mddev, 1);
		if (mddev->degraded)
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		if (!err)
			md_new_event(mddev);
		md_wakeup_thread(mddev->thread);
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		/* Work out where the superblock lives (or would live). */
		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}
5851
NeilBrownfd01b882011-10-11 16:47:53 +11005852static int hot_remove_disk(struct mddev * mddev, dev_t dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005853{
5854 char b[BDEVNAME_SIZE];
NeilBrown3cb03002011-10-11 16:45:26 +11005855 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005856
Linus Torvalds1da177e2005-04-16 15:20:36 -07005857 rdev = find_rdev(mddev, dev);
5858 if (!rdev)
5859 return -ENXIO;
5860
5861 if (rdev->raid_disk >= 0)
5862 goto busy;
5863
5864 kick_rdev_from_array(rdev);
NeilBrown850b2b422006-10-03 01:15:46 -07005865 md_update_sb(mddev, 1);
NeilBrownd7603b72006-01-06 00:20:30 -08005866 md_new_event(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005867
5868 return 0;
5869busy:
Nick Andrewfdefa4d2008-04-21 22:42:58 +00005870 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07005871 bdevname(rdev->bdev,b), mdname(mddev));
5872 return -EBUSY;
5873}
5874
/* HOT_ADD_DISK ioctl: add a bare device (no valid metadata expected) as a
 * spare to a running v0.90 array, then kick recovery so it can be used.
 * Only supported with version-0 superblocks; newer arrays use ADD_NEW_DISK.
 * Returns 0 on success or a negative errno; the rdev is exported (released)
 * on any failure after import.
 */
static int hot_add_disk(struct mddev * mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
			" version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING
			"%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	/* Import with version -1/0: don't try to read a superblock. */
	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		printk(KERN_WARNING
			"md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	/* Decide where the superblock will be written before binding. */
	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		printk(KERN_WARNING
			"md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	/* Enter the array as an unassigned spare. */
	rdev->raid_disk = -1;

	md_update_sb(mddev, 1);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}
5948
/* SET_BITMAP_FILE ioctl: attach (fd >= 0) or detach (fd < 0) a file-backed
 * write-intent bitmap.  If the array is running it is quiesced around the
 * bitmap create/load or destroy.  On any failure while adding, the code
 * falls through to the removal path (fd forced to -1) so the file reference
 * taken with fget() is always released.  Returns 0 or a negative errno.
 */
static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err;

	if (mddev->pers) {
		if (!mddev->pers->quiesce)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}


	if (fd >= 0) {
		if (mddev->bitmap)
			return -EEXIST; /* cannot add when bitmap is present */
		mddev->bitmap_info.file = fget(fd);

		if (mddev->bitmap_info.file == NULL) {
			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			       mdname(mddev));
			return -EBADF;
		}

		/* Lock out ordinary writes to the bitmap file while md owns it. */
		err = deny_bitmap_write_access(mddev->bitmap_info.file);
		if (err) {
			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			       mdname(mddev));
			fput(mddev->bitmap_info.file);
			mddev->bitmap_info.file = NULL;
			return err;
		}
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		mddev->pers->quiesce(mddev, 1);
		if (fd >= 0) {
			err = bitmap_create(mddev);
			if (!err)
				err = bitmap_load(mddev);
		}
		if (fd < 0 || err) {
			bitmap_destroy(mddev);
			fd = -1; /* make sure to put the file */
		}
		mddev->pers->quiesce(mddev, 0);
	}
	if (fd < 0) {
		/* Removal (or failed add): restore access and drop our ref. */
		if (mddev->bitmap_info.file) {
			restore_bitmap_write_access(mddev->bitmap_info.file);
			fput(mddev->bitmap_info.file);
		}
		mddev->bitmap_info.file = NULL;
	}

	return err;
}
6008
Linus Torvalds1da177e2005-04-16 15:20:36 -07006009/*
6010 * set_array_info is used two different ways
6011 * The original usage is when creating a new array.
6012 * In this usage, raid_disks is > 0 and it together with
6013 * level, size, not_persistent,layout,chunksize determine the
6014 * shape of the array.
6015 * This will always create an array with a type-0.90.0 superblock.
6016 * The newer usage is when assembling an array.
6017 * In this case raid_disks will be 0, and the major_version field is
 * used to determine which style super-blocks are to be found on the devices.
 * The minor and patch _version numbers are also kept in case the
 * super_block handler wishes to interpret them.
6021 */
/* SET_ARRAY_INFO ioctl body: see the comment block above for the two usage
 * modes.  With raid_disks == 0 only the superblock version (for subsequent
 * ADD_NEW_DISK assembly) is recorded; otherwise a fresh v0.90 array shape
 * is initialised from the caller-supplied geometry.  Always returns 0 or
 * -EINVAL.
 */
static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime         = get_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime         = get_seconds();

	mddev->level         = info->level;
	mddev->clevel[0]     = 0;
	/* info->size is in KiB; dev_sectors is in 512-byte sectors. */
	mddev->dev_sectors   = 2 * (sector_t)info->size;
	mddev->raid_disks    = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent    = ! info->not_persistent;
	mddev->external	     = 0;

	mddev->layout        = info->layout;
	mddev->chunk_sectors = info->chunk_size >> 9;

	mddev->max_disks     = MD_SB_DISKS;

	if (mddev->persistent)
		mddev->flags         = 0;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.offset = 0;

	/* No reshape in progress on a brand-new array. */
	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	/* new_* mirror the current geometry until a reshape changes them. */
	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}
6092
NeilBrownfd01b882011-10-11 16:47:53 +11006093void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
Dan Williams1f403622009-03-31 14:59:03 +11006094{
Dan Williamsb522adc2009-03-31 15:00:31 +11006095 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6096
6097 if (mddev->external_size)
6098 return;
6099
Dan Williams1f403622009-03-31 14:59:03 +11006100 mddev->array_sectors = array_sectors;
6101}
6102EXPORT_SYMBOL(md_set_array_sectors);
6103
/* Change the amount of each member device used by the array to
 * num_sectors, or to the largest size that fits when num_sectors == 0.
 * Delegates the actual resize to the personality after validating that
 * every member has enough space.  Returns 0 or a negative errno.
 */
static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	/* fit != 0 means "grow to the largest common size". */
	int fit = (num_sectors == 0);

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used.  This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device.  If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (mddev->sync_thread)
		return -EBUSY;
	if (mddev->bitmap)
		/* Sorry, cannot grow a bitmap yet, just remove it,
		 * grow, and re-add.
		 */
		return -EBUSY;
	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		/* In "fit" mode, shrink the target to each device's space. */
		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv)
		/* Size changed: tell the block layer to re-read it. */
		revalidate_disk(mddev->gendisk);
	return rv;
}
6141
/* Request a change in the number of member devices (a reshape).  Performs
 * sanity checks, records delta_disks and the reshape direction, then asks
 * the personality via check_reshape; on failure the recorded state is
 * rolled back.  Returns 0 or a negative errno.
 */
static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
		return -EBUSY;

	/* A grow needs data to move forwards, a shrink backwards; reject
	 * members whose data_offset already moved the wrong way.
	 */
	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		/* Personality refused: undo the recorded reshape intent. */
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}
6177
6178
Linus Torvalds1da177e2005-04-16 15:20:36 -07006179/*
6180 * update_array_info is used to change the configuration of an
6181 * on-line array.
6182 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6183 * fields in the info are checked against the array.
6184 * Any differences that cannot be handled will cause an error.
6185 * Normally, only one change can be managed at a time.
6186 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;		/* number of requested changes; only one allowed */
	int state = 0;		/* expected 'state' bits, computed from mddev */

	/* calculate expected state,ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	/* Any mismatch in the immutable fields means the caller's view of
	 * the array is stale or wrong — refuse outright.
	 */
	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/* mddev->patch_version != info->patch_version || */
	    mddev->ctime != info->ctime         ||
	    mddev->level != info->level         ||
/* mddev->layout != info->layout || */
	    !mddev->persistent	 != info->not_persistent||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state^info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				/* rejected: revert to the current layout */
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks    != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL)
			return -EINVAL;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			/* add the bitmap */
			if (mddev->bitmap)
				return -EEXIST;
			if (mddev->bitmap_info.default_offset == 0)
				return -EINVAL;
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->pers->quiesce(mddev, 1);
			rv = bitmap_create(mddev);
			if (!rv)
				rv = bitmap_load(mddev);
			if (rv)
				bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap)
				return -ENOENT;
			/* file-backed bitmaps are removed via SET_BITMAP_FILE */
			if (mddev->bitmap->file)
				return -EINVAL;
			mddev->pers->quiesce(mddev, 1);
			bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
}
6279
NeilBrownfd01b882011-10-11 16:47:53 +11006280static int set_disk_faulty(struct mddev *mddev, dev_t dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006281{
NeilBrown3cb03002011-10-11 16:45:26 +11006282 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006283
6284 if (mddev->pers == NULL)
6285 return -ENODEV;
6286
6287 rdev = find_rdev(mddev, dev);
6288 if (!rdev)
6289 return -ENODEV;
6290
6291 md_error(mddev, rdev);
NeilBrown5ef56c82011-08-25 14:42:51 +10006292 if (!test_bit(Faulty, &rdev->flags))
6293 return -EBUSY;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006294 return 0;
6295}
6296
Andre Noll2f9618c2008-04-25 18:57:58 +02006297/*
6298 * We have a problem here : there is no easy way to give a CHS
6299 * virtual geometry. We currently pretend that we have a 2 heads
6300 * 4 sectors (with a BIG number of cylinders...). This drives
6301 * dosfs just mad... ;-)
6302 */
Christoph Hellwiga885c8c2006-01-08 01:02:50 -08006303static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6304{
NeilBrownfd01b882011-10-11 16:47:53 +11006305 struct mddev *mddev = bdev->bd_disk->private_data;
Christoph Hellwiga885c8c2006-01-08 01:02:50 -08006306
6307 geo->heads = 2;
6308 geo->sectors = 4;
NeilBrown49ce6ce2010-03-29 10:51:42 +11006309 geo->cylinders = mddev->array_sectors / 8;
Christoph Hellwiga885c8c2006-01-08 01:02:50 -08006310 return 0;
6311}
6312
Al Viroa39907f2008-03-02 10:31:15 -05006313static int md_ioctl(struct block_device *bdev, fmode_t mode,
Linus Torvalds1da177e2005-04-16 15:20:36 -07006314 unsigned int cmd, unsigned long arg)
6315{
6316 int err = 0;
6317 void __user *argp = (void __user *)arg;
NeilBrownfd01b882011-10-11 16:47:53 +11006318 struct mddev *mddev = NULL;
Dan Williamse2218352010-05-12 08:25:37 +10006319 int ro;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006320
NeilBrown506c9e42011-12-23 10:17:26 +11006321 switch (cmd) {
6322 case RAID_VERSION:
6323 case GET_ARRAY_INFO:
6324 case GET_DISK_INFO:
6325 break;
6326 default:
6327 if (!capable(CAP_SYS_ADMIN))
6328 return -EACCES;
6329 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006330
6331 /*
6332 * Commands dealing with the RAID driver but not any
6333 * particular array:
6334 */
6335 switch (cmd)
6336 {
6337 case RAID_VERSION:
6338 err = get_version(argp);
6339 goto done;
6340
6341 case PRINT_RAID_DEBUG:
6342 err = 0;
6343 md_print_devices();
6344 goto done;
6345
6346#ifndef MODULE
6347 case RAID_AUTORUN:
6348 err = 0;
6349 autostart_arrays(arg);
6350 goto done;
6351#endif
6352 default:;
6353 }
6354
6355 /*
6356 * Commands creating/starting a new array:
6357 */
6358
Al Viroa39907f2008-03-02 10:31:15 -05006359 mddev = bdev->bd_disk->private_data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006360
6361 if (!mddev) {
6362 BUG();
6363 goto abort;
6364 }
6365
Linus Torvalds1da177e2005-04-16 15:20:36 -07006366 err = mddev_lock(mddev);
6367 if (err) {
6368 printk(KERN_INFO
6369 "md: ioctl lock interrupted, reason %d, cmd %d\n",
6370 err, cmd);
6371 goto abort;
6372 }
6373
6374 switch (cmd)
6375 {
6376 case SET_ARRAY_INFO:
6377 {
6378 mdu_array_info_t info;
6379 if (!arg)
6380 memset(&info, 0, sizeof(info));
6381 else if (copy_from_user(&info, argp, sizeof(info))) {
6382 err = -EFAULT;
6383 goto abort_unlock;
6384 }
6385 if (mddev->pers) {
6386 err = update_array_info(mddev, &info);
6387 if (err) {
6388 printk(KERN_WARNING "md: couldn't update"
6389 " array info. %d\n", err);
6390 goto abort_unlock;
6391 }
6392 goto done_unlock;
6393 }
6394 if (!list_empty(&mddev->disks)) {
6395 printk(KERN_WARNING
6396 "md: array %s already has disks!\n",
6397 mdname(mddev));
6398 err = -EBUSY;
6399 goto abort_unlock;
6400 }
6401 if (mddev->raid_disks) {
6402 printk(KERN_WARNING
6403 "md: array %s already initialised!\n",
6404 mdname(mddev));
6405 err = -EBUSY;
6406 goto abort_unlock;
6407 }
6408 err = set_array_info(mddev, &info);
6409 if (err) {
6410 printk(KERN_WARNING "md: couldn't set"
6411 " array info. %d\n", err);
6412 goto abort_unlock;
6413 }
6414 }
6415 goto done_unlock;
6416
6417 default:;
6418 }
6419
6420 /*
6421 * Commands querying/configuring an existing array:
6422 */
NeilBrown32a76272005-06-21 17:17:14 -07006423 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
NeilBrown3f9d7b02006-12-22 01:11:41 -08006424 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
NeilBrowna17184a2008-02-06 01:39:55 -08006425 if ((!mddev->raid_disks && !mddev->external)
6426 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6427 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6428 && cmd != GET_BITMAP_FILE) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006429 err = -ENODEV;
6430 goto abort_unlock;
6431 }
6432
6433 /*
6434 * Commands even a read-only array can execute:
6435 */
6436 switch (cmd)
6437 {
6438 case GET_ARRAY_INFO:
6439 err = get_array_info(mddev, argp);
6440 goto done_unlock;
6441
NeilBrown32a76272005-06-21 17:17:14 -07006442 case GET_BITMAP_FILE:
viro@ZenIV.linux.org.uk87162a22005-09-09 20:36:43 +01006443 err = get_bitmap_file(mddev, argp);
NeilBrown32a76272005-06-21 17:17:14 -07006444 goto done_unlock;
6445
Linus Torvalds1da177e2005-04-16 15:20:36 -07006446 case GET_DISK_INFO:
6447 err = get_disk_info(mddev, argp);
6448 goto done_unlock;
6449
6450 case RESTART_ARRAY_RW:
6451 err = restart_array(mddev);
6452 goto done_unlock;
6453
6454 case STOP_ARRAY:
NeilBrownd710e132008-10-13 11:55:12 +11006455 err = do_md_stop(mddev, 0, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006456 goto done_unlock;
6457
6458 case STOP_ARRAY_RO:
NeilBrowna4bd82d02010-03-29 13:23:10 +11006459 err = md_set_readonly(mddev, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006460 goto done_unlock;
6461
Dan Williamse2218352010-05-12 08:25:37 +10006462 case BLKROSET:
6463 if (get_user(ro, (int __user *)(arg))) {
6464 err = -EFAULT;
6465 goto done_unlock;
6466 }
6467 err = -EINVAL;
6468
6469 /* if the bdev is going readonly the value of mddev->ro
6470 * does not matter, no writes are coming
6471 */
6472 if (ro)
6473 goto done_unlock;
6474
6475 /* are we are already prepared for writes? */
6476 if (mddev->ro != 1)
6477 goto done_unlock;
6478
6479 /* transitioning to readauto need only happen for
6480 * arrays that call md_write_start
6481 */
6482 if (mddev->pers) {
6483 err = restart_array(mddev);
6484 if (err == 0) {
6485 mddev->ro = 2;
6486 set_disk_ro(mddev->gendisk, 0);
6487 }
6488 }
6489 goto done_unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006490 }
6491
6492 /*
6493 * The remaining ioctls are changing the state of the
NeilBrownf91de922005-11-08 21:39:36 -08006494 * superblock, so we do not allow them on read-only arrays.
6495 * However non-MD ioctls (e.g. get-size) will still come through
6496 * here and hit the 'default' below, so only disallow
6497 * 'md' ioctls, and switch to rw mode if started auto-readonly.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006498 */
Andre Nollbb57fc62008-04-25 19:06:35 +02006499 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
NeilBrownf91de922005-11-08 21:39:36 -08006500 if (mddev->ro == 2) {
6501 mddev->ro = 0;
NeilBrown00bcb4a2010-06-01 19:37:23 +10006502 sysfs_notify_dirent_safe(mddev->sysfs_state);
Neil Brown0fd62b82008-06-28 08:31:36 +10006503 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6504 md_wakeup_thread(mddev->thread);
NeilBrownf91de922005-11-08 21:39:36 -08006505 } else {
6506 err = -EROFS;
6507 goto abort_unlock;
6508 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006509 }
6510
6511 switch (cmd)
6512 {
6513 case ADD_NEW_DISK:
6514 {
6515 mdu_disk_info_t info;
6516 if (copy_from_user(&info, argp, sizeof(info)))
6517 err = -EFAULT;
6518 else
6519 err = add_new_disk(mddev, &info);
6520 goto done_unlock;
6521 }
6522
6523 case HOT_REMOVE_DISK:
6524 err = hot_remove_disk(mddev, new_decode_dev(arg));
6525 goto done_unlock;
6526
6527 case HOT_ADD_DISK:
6528 err = hot_add_disk(mddev, new_decode_dev(arg));
6529 goto done_unlock;
6530
6531 case SET_DISK_FAULTY:
6532 err = set_disk_faulty(mddev, new_decode_dev(arg));
6533 goto done_unlock;
6534
6535 case RUN_ARRAY:
NeilBrownd710e132008-10-13 11:55:12 +11006536 err = do_md_run(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006537 goto done_unlock;
6538
NeilBrown32a76272005-06-21 17:17:14 -07006539 case SET_BITMAP_FILE:
6540 err = set_bitmap_file(mddev, (int)arg);
6541 goto done_unlock;
6542
Linus Torvalds1da177e2005-04-16 15:20:36 -07006543 default:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006544 err = -EINVAL;
6545 goto abort_unlock;
6546 }
6547
6548done_unlock:
6549abort_unlock:
NeilBrownd3374822009-01-09 08:31:10 +11006550 if (mddev->hold_active == UNTIL_IOCTL &&
6551 err != -EINVAL)
6552 mddev->hold_active = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006553 mddev_unlock(mddev);
6554
6555 return err;
6556done:
6557 if (err)
6558 MD_BUG();
6559abort:
6560 return err;
6561}
Arnd Bergmannaa98aa32009-12-14 12:50:05 +11006562#ifdef CONFIG_COMPAT
6563static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6564 unsigned int cmd, unsigned long arg)
6565{
6566 switch (cmd) {
6567 case HOT_REMOVE_DISK:
6568 case HOT_ADD_DISK:
6569 case SET_DISK_FAULTY:
6570 case SET_BITMAP_FILE:
6571 /* These take in integer arg, do not convert */
6572 break;
6573 default:
6574 arg = (unsigned long)compat_ptr(arg);
6575 break;
6576 }
6577
6578 return md_ioctl(bdev, mode, cmd, arg);
6579}
6580#endif /* CONFIG_COMPAT */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006581
Al Viroa39907f2008-03-02 10:31:15 -05006582static int md_open(struct block_device *bdev, fmode_t mode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006583{
6584 /*
6585 * Succeed if we can lock the mddev, which confirms that
6586 * it isn't being stopped right now.
6587 */
NeilBrownfd01b882011-10-11 16:47:53 +11006588 struct mddev *mddev = mddev_find(bdev->bd_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006589 int err;
6590
NeilBrownd3374822009-01-09 08:31:10 +11006591 if (mddev->gendisk != bdev->bd_disk) {
6592 /* we are racing with mddev_put which is discarding this
6593 * bd_disk.
6594 */
6595 mddev_put(mddev);
6596 /* Wait until bdev->bd_disk is definitely gone */
Tejun Heoe804ac72010-10-15 15:36:08 +02006597 flush_workqueue(md_misc_wq);
NeilBrownd3374822009-01-09 08:31:10 +11006598 /* Then retry the open from the top */
6599 return -ERESTARTSYS;
6600 }
6601 BUG_ON(mddev != bdev->bd_disk->private_data);
6602
NeilBrownc8c00a62009-08-10 12:50:52 +10006603 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07006604 goto out;
6605
6606 err = 0;
NeilBrownf2ea68c2008-07-21 17:05:25 +10006607 atomic_inc(&mddev->openers);
NeilBrownc8c00a62009-08-10 12:50:52 +10006608 mutex_unlock(&mddev->open_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006609
NeilBrownf0b4f7e2011-02-24 17:26:41 +11006610 check_disk_change(bdev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006611 out:
6612 return err;
6613}
6614
Al Viroa39907f2008-03-02 10:31:15 -05006615static int md_release(struct gendisk *disk, fmode_t mode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006616{
NeilBrownfd01b882011-10-11 16:47:53 +11006617 struct mddev *mddev = disk->private_data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006618
Eric Sesterhenn52e5f9d2006-10-03 23:33:23 +02006619 BUG_ON(!mddev);
NeilBrownf2ea68c2008-07-21 17:05:25 +10006620 atomic_dec(&mddev->openers);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006621 mddev_put(mddev);
6622
6623 return 0;
6624}
NeilBrownf0b4f7e2011-02-24 17:26:41 +11006625
6626static int md_media_changed(struct gendisk *disk)
6627{
NeilBrownfd01b882011-10-11 16:47:53 +11006628 struct mddev *mddev = disk->private_data;
NeilBrownf0b4f7e2011-02-24 17:26:41 +11006629
6630 return mddev->changed;
6631}
6632
6633static int md_revalidate(struct gendisk *disk)
6634{
NeilBrownfd01b882011-10-11 16:47:53 +11006635 struct mddev *mddev = disk->private_data;
NeilBrownf0b4f7e2011-02-24 17:26:41 +11006636
6637 mddev->changed = 0;
6638 return 0;
6639}
Alexey Dobriyan83d5cde2009-09-21 17:01:13 -07006640static const struct block_device_operations md_fops =
Linus Torvalds1da177e2005-04-16 15:20:36 -07006641{
6642 .owner = THIS_MODULE,
Al Viroa39907f2008-03-02 10:31:15 -05006643 .open = md_open,
6644 .release = md_release,
NeilBrownb492b852009-05-26 12:57:36 +10006645 .ioctl = md_ioctl,
Arnd Bergmannaa98aa32009-12-14 12:50:05 +11006646#ifdef CONFIG_COMPAT
6647 .compat_ioctl = md_compat_ioctl,
6648#endif
Christoph Hellwiga885c8c2006-01-08 01:02:50 -08006649 .getgeo = md_getgeo,
NeilBrownf0b4f7e2011-02-24 17:26:41 +11006650 .media_changed = md_media_changed,
6651 .revalidate_disk= md_revalidate,
Linus Torvalds1da177e2005-04-16 15:20:36 -07006652};
6653
Adrian Bunk75c96f82005-05-05 16:16:09 -07006654static int md_thread(void * arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006655{
NeilBrown2b8bf342011-10-11 16:48:23 +11006656 struct md_thread *thread = arg;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006657
Linus Torvalds1da177e2005-04-16 15:20:36 -07006658 /*
6659 * md_thread is a 'system-thread', it's priority should be very
6660 * high. We avoid resource deadlocks individually in each
6661 * raid personality. (RAID5 does preallocation) We also use RR and
6662 * the very same RT priority as kswapd, thus we will never get
6663 * into a priority inversion deadlock.
6664 *
6665 * we definitely have to have equal or higher priority than
6666 * bdflush, otherwise bdflush will deadlock if there are too
6667 * many dirty RAID5 blocks.
6668 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006669
NeilBrown6985c432005-10-19 21:23:47 -07006670 allow_signal(SIGKILL);
NeilBrowna6fb0932005-09-09 16:23:56 -07006671 while (!kthread_should_stop()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006672
NeilBrown93588e22005-11-15 00:09:12 -08006673 /* We need to wait INTERRUPTIBLE so that
6674 * we don't add to the load-average.
6675 * That means we need to be sure no signals are
6676 * pending
6677 */
6678 if (signal_pending(current))
6679 flush_signals(current);
6680
6681 wait_event_interruptible_timeout
6682 (thread->wqueue,
6683 test_bit(THREAD_WAKEUP, &thread->flags)
6684 || kthread_should_stop(),
6685 thread->timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006686
NeilBrown6c987912011-01-14 09:13:53 +11006687 clear_bit(THREAD_WAKEUP, &thread->flags);
6688 if (!kthread_should_stop())
NeilBrown589a5942010-12-09 17:02:14 +11006689 thread->run(thread->mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006690 }
NeilBrowna6fb0932005-09-09 16:23:56 -07006691
Linus Torvalds1da177e2005-04-16 15:20:36 -07006692 return 0;
6693}
6694
NeilBrown2b8bf342011-10-11 16:48:23 +11006695void md_wakeup_thread(struct md_thread *thread)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006696{
6697 if (thread) {
NeilBrown36a4e1f2011-10-07 14:23:17 +11006698 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006699 set_bit(THREAD_WAKEUP, &thread->flags);
6700 wake_up(&thread->wqueue);
6701 }
6702}
6703
NeilBrown2b8bf342011-10-11 16:48:23 +11006704struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev,
Linus Torvalds1da177e2005-04-16 15:20:36 -07006705 const char *name)
6706{
NeilBrown2b8bf342011-10-11 16:48:23 +11006707 struct md_thread *thread;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006708
NeilBrown2b8bf342011-10-11 16:48:23 +11006709 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006710 if (!thread)
6711 return NULL;
6712
Linus Torvalds1da177e2005-04-16 15:20:36 -07006713 init_waitqueue_head(&thread->wqueue);
6714
Linus Torvalds1da177e2005-04-16 15:20:36 -07006715 thread->run = run;
6716 thread->mddev = mddev;
NeilBrown32a76272005-06-21 17:17:14 -07006717 thread->timeout = MAX_SCHEDULE_TIMEOUT;
NeilBrown0da3c612009-09-23 18:09:45 +10006718 thread->tsk = kthread_run(md_thread, thread,
6719 "%s_%s",
6720 mdname(thread->mddev),
6721 name ?: mddev->pers->name);
NeilBrowna6fb0932005-09-09 16:23:56 -07006722 if (IS_ERR(thread->tsk)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006723 kfree(thread);
6724 return NULL;
6725 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006726 return thread;
6727}
6728
NeilBrown2b8bf342011-10-11 16:48:23 +11006729void md_unregister_thread(struct md_thread **threadp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006730{
NeilBrown2b8bf342011-10-11 16:48:23 +11006731 struct md_thread *thread = *threadp;
NeilBrowne0cf8f02009-03-31 14:39:39 +11006732 if (!thread)
6733 return;
NeilBrown36a4e1f2011-10-07 14:23:17 +11006734 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
NeilBrown01f96c02011-09-21 15:30:20 +10006735 /* Locking ensures that mddev_unlock does not wake_up a
6736 * non-existent thread
6737 */
6738 spin_lock(&pers_lock);
6739 *threadp = NULL;
6740 spin_unlock(&pers_lock);
NeilBrowna6fb0932005-09-09 16:23:56 -07006741
6742 kthread_stop(thread->tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006743 kfree(thread);
6744}
6745
NeilBrownfd01b882011-10-11 16:47:53 +11006746void md_error(struct mddev *mddev, struct md_rdev *rdev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006747{
6748 if (!mddev) {
6749 MD_BUG();
6750 return;
6751 }
6752
NeilBrownb2d444d2005-11-08 21:39:31 -08006753 if (!rdev || test_bit(Faulty, &rdev->flags))
Linus Torvalds1da177e2005-04-16 15:20:36 -07006754 return;
Dan Williams6bfe0b42008-04-30 00:52:32 -07006755
NeilBrownde393cd2011-07-28 11:31:48 +10006756 if (!mddev->pers || !mddev->pers->error_handler)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006757 return;
6758 mddev->pers->error_handler(mddev,rdev);
Neil Brown72a23c22008-06-28 08:31:41 +10006759 if (mddev->degraded)
6760 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
NeilBrown00bcb4a2010-06-01 19:37:23 +10006761 sysfs_notify_dirent_safe(rdev->sysfs_state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006762 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6763 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6764 md_wakeup_thread(mddev->thread);
NeilBrown768a4182010-07-26 11:49:55 +10006765 if (mddev->event_work.func)
Tejun Heoe804ac72010-10-15 15:36:08 +02006766 queue_work(md_misc_wq, &mddev->event_work);
NeilBrownc331eb02006-05-30 21:27:13 -07006767 md_new_event_inintr(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006768}
6769
6770/* seq_file implementation /proc/mdstat */
6771
6772static void status_unused(struct seq_file *seq)
6773{
6774 int i = 0;
NeilBrown3cb03002011-10-11 16:45:26 +11006775 struct md_rdev *rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006776
6777 seq_printf(seq, "unused devices: ");
6778
Cheng Renquan159ec1f2009-01-09 08:31:08 +11006779 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006780 char b[BDEVNAME_SIZE];
6781 i++;
6782 seq_printf(seq, "%s ",
6783 bdevname(rdev->bdev,b));
6784 }
6785 if (!i)
6786 seq_printf(seq, "<none>");
6787
6788 seq_printf(seq, "\n");
6789}
6790
6791
NeilBrownfd01b882011-10-11 16:47:53 +11006792static void status_resync(struct seq_file *seq, struct mddev * mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006793{
NeilBrowndd71cf62009-05-07 12:49:35 +10006794 sector_t max_sectors, resync, res;
6795 unsigned long dt, db;
6796 sector_t rt;
NeilBrown4588b422006-03-27 01:18:04 -08006797 int scale;
6798 unsigned int per_milli;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006799
NeilBrowndd71cf62009-05-07 12:49:35 +10006800 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006801
6802 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
NeilBrowndd71cf62009-05-07 12:49:35 +10006803 max_sectors = mddev->resync_max_sectors;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006804 else
NeilBrowndd71cf62009-05-07 12:49:35 +10006805 max_sectors = mddev->dev_sectors;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006806
6807 /*
6808 * Should not happen.
6809 */
NeilBrowndd71cf62009-05-07 12:49:35 +10006810 if (!max_sectors) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006811 MD_BUG();
6812 return;
6813 }
NeilBrown4588b422006-03-27 01:18:04 -08006814 /* Pick 'scale' such that (resync>>scale)*1000 will fit
NeilBrowndd71cf62009-05-07 12:49:35 +10006815 * in a sector_t, and (max_sectors>>scale) will fit in a
NeilBrown4588b422006-03-27 01:18:04 -08006816 * u32, as those are the requirements for sector_div.
6817 * Thus 'scale' must be at least 10
6818 */
6819 scale = 10;
6820 if (sizeof(sector_t) > sizeof(unsigned long)) {
NeilBrowndd71cf62009-05-07 12:49:35 +10006821 while ( max_sectors/2 > (1ULL<<(scale+32)))
NeilBrown4588b422006-03-27 01:18:04 -08006822 scale++;
6823 }
6824 res = (resync>>scale)*1000;
NeilBrowndd71cf62009-05-07 12:49:35 +10006825 sector_div(res, (u32)((max_sectors>>scale)+1));
NeilBrown4588b422006-03-27 01:18:04 -08006826
6827 per_milli = res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006828 {
NeilBrown4588b422006-03-27 01:18:04 -08006829 int i, x = per_milli/50, y = 20-x;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006830 seq_printf(seq, "[");
6831 for (i = 0; i < x; i++)
6832 seq_printf(seq, "=");
6833 seq_printf(seq, ">");
6834 for (i = 0; i < y; i++)
6835 seq_printf(seq, ".");
6836 seq_printf(seq, "] ");
6837 }
NeilBrown4588b422006-03-27 01:18:04 -08006838 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
NeilBrownccfcc3c2006-03-27 01:18:09 -08006839 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6840 "reshape" :
NeilBrown61df9d92006-10-03 01:15:57 -07006841 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6842 "check" :
6843 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6844 "resync" : "recovery"))),
6845 per_milli/10, per_milli % 10,
NeilBrowndd71cf62009-05-07 12:49:35 +10006846 (unsigned long long) resync/2,
6847 (unsigned long long) max_sectors/2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006848
6849 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07006850 * dt: time from mark until now
6851 * db: blocks written from mark until now
6852 * rt: remaining time
NeilBrowndd71cf62009-05-07 12:49:35 +10006853 *
6854 * rt is a sector_t, so could be 32bit or 64bit.
6855 * So we divide before multiply in case it is 32bit and close
6856 * to the limit.
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006857 * We scale the divisor (db) by 32 to avoid losing precision
NeilBrowndd71cf62009-05-07 12:49:35 +10006858 * near the end of resync when the number of remaining sectors
6859 * is close to 'db'.
6860 * We then divide rt by 32 after multiplying by db to compensate.
6861 * The '+1' avoids division by zero if db is very small.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006862 */
6863 dt = ((jiffies - mddev->resync_mark) / HZ);
6864 if (!dt) dt++;
NeilBrownff4e8d92006-07-10 04:44:16 -07006865 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6866 - mddev->resync_mark_cnt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006867
NeilBrowndd71cf62009-05-07 12:49:35 +10006868 rt = max_sectors - resync; /* number of remaining sectors */
6869 sector_div(rt, db/32+1);
6870 rt *= dt;
6871 rt >>= 5;
6872
6873 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6874 ((unsigned long)rt % 60)/6);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006875
NeilBrownff4e8d92006-07-10 04:44:16 -07006876 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006877}
6878
/*
 * md_seq_start - seq_file ->start for /proc/mdstat.
 *
 * Iterator positions: (void*)1 is the header, (void*)2 the trailing
 * "unused devices" line, and anything else is an mddev with a reference
 * held (dropped in md_seq_next/md_seq_stop).
 */
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	struct mddev *mddev;

	if (l >= 0x10000)
		return NULL;
	if (!l--)
		/* header */
		return (void*)1;

	spin_lock(&all_mddevs_lock);
	/* walk to the l-th array; grab a reference before dropping the lock */
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, struct mddev, all_mddevs);
			mddev_get(mddev);
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}
6904
6905static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6906{
6907 struct list_head *tmp;
NeilBrownfd01b882011-10-11 16:47:53 +11006908 struct mddev *next_mddev, *mddev = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006909
6910 ++*pos;
6911 if (v == (void*)2)
6912 return NULL;
6913
6914 spin_lock(&all_mddevs_lock);
6915 if (v == (void*)1)
6916 tmp = all_mddevs.next;
6917 else
6918 tmp = mddev->all_mddevs.next;
6919 if (tmp != &all_mddevs)
NeilBrownfd01b882011-10-11 16:47:53 +11006920 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
Linus Torvalds1da177e2005-04-16 15:20:36 -07006921 else {
6922 next_mddev = (void*)2;
6923 *pos = 0x10000;
6924 }
6925 spin_unlock(&all_mddevs_lock);
6926
6927 if (v != (void*)1)
6928 mddev_put(mddev);
6929 return next_mddev;
6930
6931}
6932
static void md_seq_stop(struct seq_file *seq, void *v)
{
	/* Release the mddev reference unless v is a sentinel (header/tail). */
	struct mddev *md = v;

	if (md && v != (void*)1 && v != (void*)2)
		mddev_put(md);
}
6940
/*
 * md_seq_show - seq_file ->show for /proc/mdstat.
 *
 * (void*)1 prints the "Personalities" header, (void*)2 the trailing
 * unused-devices line; any other v is an mddev whose status line(s) are
 * printed under mddev_lock.  Returns 0, or -EINTR if the lock wait is
 * interrupted.
 */
static int md_seq_show(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;
	sector_t sectors;
	struct md_rdev *rdev;

	if (v == (void*)1) {
		struct md_personality *pers;
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);

		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		/* snapshot the event count so poll() can detect changes */
		seq->poll_event = atomic_read(&md_event_count);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}

	if (mddev_lock(mddev) < 0)
		return -EINTR;

	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev),
			   mddev->pers ? "" : "in");
		if (mddev->pers) {
			if (mddev->ro==1)
				seq_printf(seq, " (read-only)");
			if (mddev->ro==2)
				seq_printf(seq, " (auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->name);
		}

		/* per-device markers: (W)rite-mostly, (F)aulty, (S)pare,
		 * (R)eplacement; faulty devices don't contribute sectors */
		sectors = 0;
		rdev_for_each(rdev, mddev) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				   bdevname(rdev->bdev,b), rdev->desc_nr);
			if (test_bit(WriteMostly, &rdev->flags))
				seq_printf(seq, "(W)");
			if (test_bit(Faulty, &rdev->flags)) {
				seq_printf(seq, "(F)");
				continue;
			}
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				seq_printf(seq, "\n      %llu blocks",
					   (unsigned long long)
					   mddev->array_sectors / 2);
			else
				seq_printf(seq, "\n      %llu blocks",
					   (unsigned long long)sectors / 2);
		}
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
			    mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					   mddev->major_version,
					   mddev->minor_version);
			}
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				   mddev->metadata_type);
		else
			seq_printf(seq, " super non-persistent");

		if (mddev->pers) {
			mddev->pers->status(seq, mddev);
			seq_printf(seq, "\n      ");
			if (mddev->pers->sync_request) {
				if (mddev->curr_resync > 2) {
					status_resync(seq, mddev);
					seq_printf(seq, "\n      ");
				} else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
					seq_printf(seq, "\tresync=DELAYED\n      ");
				else if (mddev->recovery_cp < MaxSector)
					seq_printf(seq, "\tresync=PENDING\n      ");
			}
		} else
			seq_printf(seq, "\n       ");

		bitmap_status(seq, mddev->bitmap);

		seq_printf(seq, "\n");
	}
	mddev_unlock(mddev);

	return 0;
}
7041
Jan Engelhardt110518b2009-05-07 12:49:37 +10007042static const struct seq_operations md_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07007043 .start = md_seq_start,
7044 .next = md_seq_next,
7045 .stop = md_seq_stop,
7046 .show = md_seq_show,
7047};
7048
7049static int md_seq_open(struct inode *inode, struct file *file)
7050{
Kay Sieversf1514632011-07-12 20:48:39 +02007051 struct seq_file *seq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007052 int error;
7053
7054 error = seq_open(file, &md_seq_ops);
NeilBrownd7603b72006-01-06 00:20:30 -08007055 if (error)
Kay Sieversf1514632011-07-12 20:48:39 +02007056 return error;
7057
7058 seq = file->private_data;
7059 seq->poll_event = atomic_read(&md_event_count);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007060 return error;
7061}
7062
NeilBrownd7603b72006-01-06 00:20:30 -08007063static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7064{
Kay Sieversf1514632011-07-12 20:48:39 +02007065 struct seq_file *seq = filp->private_data;
NeilBrownd7603b72006-01-06 00:20:30 -08007066 int mask;
7067
7068 poll_wait(filp, &md_event_waiters, wait);
7069
7070 /* always allow read */
7071 mask = POLLIN | POLLRDNORM;
7072
Kay Sieversf1514632011-07-12 20:48:39 +02007073 if (seq->poll_event != atomic_read(&md_event_count))
NeilBrownd7603b72006-01-06 00:20:30 -08007074 mask |= POLLERR | POLLPRI;
7075 return mask;
7076}
7077
Arjan van de Venfa027c22007-02-12 00:55:33 -08007078static const struct file_operations md_seq_fops = {
Akinobu Mitae24650c2006-10-17 00:09:38 -07007079 .owner = THIS_MODULE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07007080 .open = md_seq_open,
7081 .read = seq_read,
7082 .llseek = seq_lseek,
Martin Peschkec3f94b402007-05-09 02:35:35 -07007083 .release = seq_release_private,
NeilBrownd7603b72006-01-06 00:20:30 -08007084 .poll = mdstat_poll,
Linus Torvalds1da177e2005-04-16 15:20:36 -07007085};
7086
NeilBrown84fc4b52011-10-11 16:49:58 +11007087int register_md_personality(struct md_personality *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007088{
Linus Torvalds1da177e2005-04-16 15:20:36 -07007089 spin_lock(&pers_lock);
NeilBrown2604b702006-01-06 00:20:36 -08007090 list_add_tail(&p->list, &pers_list);
7091 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007092 spin_unlock(&pers_lock);
7093 return 0;
7094}
7095
NeilBrown84fc4b52011-10-11 16:49:58 +11007096int unregister_md_personality(struct md_personality *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007097{
NeilBrown2604b702006-01-06 00:20:36 -08007098 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007099 spin_lock(&pers_lock);
NeilBrown2604b702006-01-06 00:20:36 -08007100 list_del_init(&p->list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007101 spin_unlock(&pers_lock);
7102 return 0;
7103}
7104
NeilBrownfd01b882011-10-11 16:47:53 +11007105static int is_mddev_idle(struct mddev *mddev, int init)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007106{
NeilBrown3cb03002011-10-11 16:45:26 +11007107 struct md_rdev * rdev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007108 int idle;
NeilBrowneea1bf32009-03-31 14:27:02 +11007109 int curr_events;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007110
7111 idle = 1;
NeilBrown4b809912008-07-21 17:05:25 +10007112 rcu_read_lock();
7113 rdev_for_each_rcu(rdev, mddev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07007114 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
NeilBrowneea1bf32009-03-31 14:27:02 +11007115 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7116 (int)part_stat_read(&disk->part0, sectors[1]) -
7117 atomic_read(&disk->sync_io);
NeilBrown713f6ab2007-07-17 04:06:12 -07007118 /* sync IO will cause sync_io to increase before the disk_stats
7119 * as sync_io is counted when a request starts, and
7120 * disk_stats is counted when it completes.
7121 * So resync activity will cause curr_events to be smaller than
7122 * when there was no such activity.
7123 * non-sync IO will cause disk_stat to increase without
7124 * increasing sync_io so curr_events will (eventually)
7125 * be larger than it was before. Once it becomes
7126 * substantially larger, the test below will cause
7127 * the array to appear non-idle, and resync will slow
7128 * down.
7129 * If there is a lot of outstanding resync activity when
7130 * we set last_event to curr_events, then all that activity
7131 * completing might cause the array to appear non-idle
7132 * and resync will be slowed down even though there might
7133 * not have been non-resync activity. This will only
7134 * happen once though. 'last_events' will soon reflect
7135 * the state where there is little or no outstanding
7136 * resync requests, and further resync activity will
7137 * always make curr_events less than last_events.
NeilBrownc0e48522005-11-18 01:11:01 -08007138 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07007139 */
NeilBrowneea1bf32009-03-31 14:27:02 +11007140 if (init || curr_events - rdev->last_events > 64) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07007141 rdev->last_events = curr_events;
7142 idle = 0;
7143 }
7144 }
NeilBrown4b809912008-07-21 17:05:25 +10007145 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07007146 return idle;
7147}
7148
NeilBrownfd01b882011-10-11 16:47:53 +11007149void md_done_sync(struct mddev *mddev, int blocks, int ok)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007150{
7151 /* another "blocks" (512byte) blocks have been synced */
7152 atomic_sub(blocks, &mddev->recovery_active);
7153 wake_up(&mddev->recovery_wait);
7154 if (!ok) {
NeilBrowndfc70642008-05-23 13:04:39 -07007155 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007156 md_wakeup_thread(mddev->thread);
7157 // stop recovery, signal do_sync ....
7158 }
7159}
7160
7161
NeilBrown06d91a52005-06-21 17:17:12 -07007162/* md_write_start(mddev, bi)
7163 * If we need to update some array metadata (e.g. 'active' flag
NeilBrown3d310eb2005-06-21 17:17:26 -07007164 * in superblock) before writing, schedule a superblock update
7165 * and wait for it to complete.
NeilBrown06d91a52005-06-21 17:17:12 -07007166 */
NeilBrownfd01b882011-10-11 16:47:53 +11007167void md_write_start(struct mddev *mddev, struct bio *bi)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007168{
Neil Brown0fd62b82008-06-28 08:31:36 +10007169 int did_change = 0;
NeilBrown06d91a52005-06-21 17:17:12 -07007170 if (bio_data_dir(bi) != WRITE)
NeilBrown3d310eb2005-06-21 17:17:26 -07007171 return;
NeilBrown06d91a52005-06-21 17:17:12 -07007172
NeilBrownf91de922005-11-08 21:39:36 -08007173 BUG_ON(mddev->ro == 1);
7174 if (mddev->ro == 2) {
7175 /* need to switch to read/write */
7176 mddev->ro = 0;
7177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7178 md_wakeup_thread(mddev->thread);
NeilBrown25156192008-03-04 14:29:32 -08007179 md_wakeup_thread(mddev->sync_thread);
Neil Brown0fd62b82008-06-28 08:31:36 +10007180 did_change = 1;
NeilBrownf91de922005-11-08 21:39:36 -08007181 }
NeilBrown06d91a52005-06-21 17:17:12 -07007182 atomic_inc(&mddev->writes_pending);
NeilBrown31a59e32008-04-30 00:52:30 -07007183 if (mddev->safemode == 1)
7184 mddev->safemode = 0;
NeilBrown06d91a52005-06-21 17:17:12 -07007185 if (mddev->in_sync) {
NeilBrowna9701a32005-11-08 21:39:34 -08007186 spin_lock_irq(&mddev->write_lock);
NeilBrown3d310eb2005-06-21 17:17:26 -07007187 if (mddev->in_sync) {
7188 mddev->in_sync = 0;
NeilBrown850b2b422006-10-03 01:15:46 -07007189 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
NeilBrown070dc6d2010-08-30 17:33:34 +10007190 set_bit(MD_CHANGE_PENDING, &mddev->flags);
NeilBrown3d310eb2005-06-21 17:17:26 -07007191 md_wakeup_thread(mddev->thread);
Neil Brown0fd62b82008-06-28 08:31:36 +10007192 did_change = 1;
NeilBrown3d310eb2005-06-21 17:17:26 -07007193 }
NeilBrowna9701a32005-11-08 21:39:34 -08007194 spin_unlock_irq(&mddev->write_lock);
NeilBrown06d91a52005-06-21 17:17:12 -07007195 }
Neil Brown0fd62b82008-06-28 08:31:36 +10007196 if (did_change)
NeilBrown00bcb4a2010-06-01 19:37:23 +10007197 sysfs_notify_dirent_safe(mddev->sysfs_state);
NeilBrown09a44cc2008-05-23 13:04:36 -07007198 wait_event(mddev->sb_wait,
NeilBrown09a44cc2008-05-23 13:04:36 -07007199 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
Linus Torvalds1da177e2005-04-16 15:20:36 -07007200}
7201
NeilBrownfd01b882011-10-11 16:47:53 +11007202void md_write_end(struct mddev *mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007203{
7204 if (atomic_dec_and_test(&mddev->writes_pending)) {
7205 if (mddev->safemode == 2)
7206 md_wakeup_thread(mddev->thread);
NeilBrown16f17b32006-06-26 00:27:37 -07007207 else if (mddev->safemode_delay)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007208 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7209 }
7210}
7211
/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking. It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 *
 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
 * is dropped, so return -EAGAIN after notifying userspace.
 */
int md_allow_write(struct mddev *mddev)
{
	if (!mddev->pers)		/* no personality: array not active */
		return 0;
	if (mddev->ro)			/* read-only arrays never go 'active' */
		return 0;
	if (!mddev->pers->sync_request)	/* personality keeps no clean/dirty state */
		return 0;

	spin_lock_irq(&mddev->write_lock);
	if (mddev->in_sync) {
		/* mark dirty and schedule a superblock update */
		mddev->in_sync = 0;
		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
		set_bit(MD_CHANGE_PENDING, &mddev->flags);
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
		spin_unlock_irq(&mddev->write_lock);
		/* write the superblock outside the spinlock */
		md_update_sb(mddev, 0);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	} else
		spin_unlock_irq(&mddev->write_lock);

	/* still pending (externally-managed metadata): caller must retry
	 * after dropping the mddev lock.
	 */
	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
		return -EAGAIN;
	else
		return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
7250
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
/*
 * md_do_sync - main loop of the per-array resync/recovery/reshape thread.
 *
 * Runs as a kthread (started via md_register_thread from
 * md_check_recovery).  Decides what kind of operation is wanted from
 * mddev->recovery bits, serialises against other arrays that share
 * physical devices, then repeatedly calls the personality's
 * ->sync_request() while throttling to the configured speed limits and
 * periodically checkpointing progress into curr_resync_completed.
 */
void md_do_sync(struct mddev *mddev)
{
	struct mddev *mddev2;
	unsigned int currspeed = 0,	/* current throughput, KB/sec */
		 window;
	sector_t max_sectors,j, io_sectors;
	unsigned long mark[SYNC_MARKS];		/* timestamps for speed calc */
	sector_t mark_cnt[SYNC_MARKS];		/* sector counts at each mark */
	int last_mark,m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	char *desc;	/* human-readable operation name for printk */

	/* just incase thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		return;
	if (mddev->ro) /* never try to sync a read-only array */
		return;

	/* pick a description of the requested operation for log messages */
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
			desc = "data-check";
		else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			desc = "requested-resync";
		else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commense
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 *
	 */

	do {
		mddev->curr_resync = 2;

	try_again:
		if (kthread_should_stop())
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		/* scan all other arrays for one sharing physical units */
		for_each_mddev(mddev2, tmp) {
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!kthread_should_stop() &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					printk(KERN_INFO "md: delaying %s of %s"
					       " until %s has finished (they"
					       " share one or more physical units)\n",
					       desc, mdname(mddev), mdname(mddev2));
					mddev_put(mddev2);
					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	/* decide the sector range [j, max_sectors) to operate on */
	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		mddev->resync_mismatches = 0;
		/* we don't use the checkpoint if there's a bitmap */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
			j = mddev->recovery_cp;

	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		max_sectors = mddev->dev_sectors;
	else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->dev_sectors;
		j = MaxSector;
		/* start from the lowest recovery_offset of any device
		 * still being rebuilt
		 */
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
		rcu_read_unlock();
	}

	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
	printk(KERN_INFO "md: minimum _guaranteed_  speed:"
		" %d KB/sec/disk.\n", speed_min(mddev));
	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
	       "(but not more than %d KB/sec) for %s.\n",
	       speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32*(PAGE_SIZE/512);
	printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
		window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j>2) {
		printk(KERN_INFO
		       "md: resuming %s of %s from checkpoint.\n",
		       desc, mdname(mddev));
		mddev->curr_resync = j;
	}
	mddev->curr_resync_completed = j;

	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		/* checkpoint curr_resync_completed when enough progress has
		 * been made (1/16th of the range) or resync_max is close
		 */
		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max && !kthread_should_stop()) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || kthread_should_stop());
		}

		if (kthread_should_stop())
			goto interrupted;

		/* ask the personality to sync the next chunk; the last
		 * argument requests "go faster" while below speed_min
		 */
		sectors = mddev->pers->sync_request(mddev, j, &skipped,
						  currspeed < speed_min(mddev));
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			goto out;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j>1) mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}


		if (kthread_should_stop())
			goto interrupted;


		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		/* KB/sec since the oldest mark, computed over sliding window */
		currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if ((currspeed > speed_max(mddev)) ||
					!is_mddev_idle(mddev, 0)) {
				/* throttle: back off and re-check */
				msleep(500);
				goto repeat;
			}
		}
	}
	printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
 out:
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);

	/* record how far we got, so an interrupted operation can resume */
	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 2) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					printk(KERN_INFO
					       "md: checkpointing %s of %s.\n",
					       desc, mdname(mddev));
					mddev->recovery_cp =
						mddev->curr_resync_completed;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			/* push recovery progress into each rebuilding device */
			rcu_read_lock();
			rdev_for_each_rcu(rdev, mddev)
				if (rdev->raid_disk >= 0 &&
				    mddev->delta_disks >= 0 &&
				    !test_bit(Faulty, &rdev->flags) &&
				    !test_bit(In_sync, &rdev->flags) &&
				    rdev->recovery_offset < mddev->curr_resync)
					rdev->recovery_offset = mddev->curr_resync;
			rcu_read_unlock();
		}
	}
 skip:
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	mddev->curr_resync = 0;
	wake_up(&resync_wait);
	/* hand back to md_check_recovery, which reaps this thread */
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	return;

 interrupted:
	/*
	 * got a signal, exit.
	 */
	printk(KERN_INFO
	       "md: md_do_sync() got signal ... exiting\n");
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	goto out;

}
EXPORT_SYMBOL_GPL(md_do_sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007580
/*
 * remove_and_add_spares - hot-remove failed devices, then hot-add spares.
 *
 * Pass 1 removes any device that is Faulty or not In_sync, is not
 * Blocked, and has no pending I/O; userspace is told via the
 * "degraded" sysfs attribute if anything was removed.
 * Pass 2 counts devices still being rebuilt and hot-adds any unused,
 * non-faulty device as a spare.
 *
 * Returns the number of devices that need (or are undergoing) recovery,
 * which the caller uses to decide whether to start a sync thread.
 */
static int remove_and_add_spares(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;

	mddev->curr_resync_completed = 0;

	/* pass 1: detach failed/out-of-sync devices with no I/O in flight */
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    (test_bit(Faulty, &rdev->flags) ||
		     ! test_bit(In_sync, &rdev->flags)) &&
		    atomic_read(&rdev->nr_pending)==0) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->raid_disk = -1;
				removed++;
			}
		}
	if (removed)
		sysfs_notify(&mddev->kobj, NULL,
			     "degraded");


	/* pass 2: count rebuilding devices and add idle ones as spares */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk < 0
		    && !test_bit(Faulty, &rdev->flags)) {
			rdev->recovery_offset = 0;
			if (mddev->pers->
			    hot_add_disk(mddev, rdev) == 0) {
				if (sysfs_link_rdev(mddev, rdev))
					/* failure here is OK */;
				spares++;
				md_new_event(mddev);
				set_bit(MD_CHANGE_DEVS, &mddev->flags);
			}
		}
	}
	return spares;
}
NeilBrown7ebc0be2011-01-14 09:14:33 +11007627
/*
 * reap_sync_thread - clean up after md_do_sync has finished.
 *
 * Called from md_check_recovery (with the mddev lock held by the
 * caller) once MD_RECOVERY_DONE has been set.  Unregisters the sync
 * thread, activates spares on success, finishes a reshape if one was
 * running, updates the superblock, and clears the recovery state bits.
 */
static void reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev))
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape)
		mddev->pers->finish_reshape(mddev);

	/* If the array is no longer degraded, any saved_raid_disk
	 * information must be scrapped.  Also, if any device is now
	 * In_sync, we must clear saved_raid_disk for that device so
	 * the superblock for an incrementally recovered device gets
	 * written out.
	 */
	rdev_for_each(rdev, mddev)
		if (!mddev->degraded ||
		    test_bit(In_sync, &rdev->flags))
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* clear all the per-operation recovery state bits */
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
7670
Linus Torvalds1da177e2005-04-16 15:20:36 -07007671/*
7672 * This routine is regularly called by all per-raid-array threads to
7673 * deal with generic issues like resync and super-block update.
7674 * Raid personalities that don't have a thread (linear/raid0) do not
7675 * need this as they never do any recovery or update the superblock.
7676 *
7677 * It does not do any resync itself, but rather "forks" off other threads
7678 * to do that as needed.
7679 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7680 * "->recovery" and create a thread at ->sync_thread.
NeilBrowndfc70642008-05-23 13:04:39 -07007681 * When the thread finishes it sets MD_RECOVERY_DONE
Linus Torvalds1da177e2005-04-16 15:20:36 -07007682 * and wakeups up this thread which will reap the thread and finish up.
7683 * This thread also removes any faulty devices (with nr_pending == 0).
7684 *
7685 * The overall approach is:
7686 * 1/ if the superblock needs updating, update it.
7687 * 2/ If a recovery thread is running, don't do anything else.
7688 * 3/ If recovery has finished, clean up, possibly marking spares active.
7689 * 4/ If there are any faulty devices, remove them.
7690 * 5/ If array is degraded, try to add spares devices
7691 * 6/ If array has spares or is not in-sync, start a resync thread.
7692 */
NeilBrownfd01b882011-10-11 16:47:53 +11007693void md_check_recovery(struct mddev *mddev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007694{
Jonathan Brassow68866e42011-06-08 15:10:08 +10007695 if (mddev->suspended)
7696 return;
7697
NeilBrown5f404022005-06-21 17:17:16 -07007698 if (mddev->bitmap)
NeilBrownaa5cbd12009-12-14 12:49:46 +11007699 bitmap_daemon_work(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007700
NeilBrownfca4d842005-06-21 17:17:11 -07007701 if (signal_pending(current)) {
NeilBrown31a59e32008-04-30 00:52:30 -07007702 if (mddev->pers->sync_request && !mddev->external) {
NeilBrownfca4d842005-06-21 17:17:11 -07007703 printk(KERN_INFO "md: %s in immediate safe mode\n",
7704 mdname(mddev));
7705 mddev->safemode = 2;
7706 }
7707 flush_signals(current);
7708 }
7709
NeilBrownc89a8ee2008-08-05 15:54:13 +10007710 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7711 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007712 if ( ! (
NeilBrown126925c2010-09-07 17:02:47 +10007713 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07007714 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
NeilBrownfca4d842005-06-21 17:17:11 -07007715 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
NeilBrown31a59e32008-04-30 00:52:30 -07007716 (mddev->external == 0 && mddev->safemode == 1) ||
NeilBrownfca4d842005-06-21 17:17:11 -07007717 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7718 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007719 ))
7720 return;
NeilBrownfca4d842005-06-21 17:17:11 -07007721
NeilBrowndf5b89b2006-03-27 01:18:20 -08007722 if (mddev_trylock(mddev)) {
NeilBrownb4c4c7b2007-02-28 20:11:48 -08007723 int spares = 0;
NeilBrownfca4d842005-06-21 17:17:11 -07007724
NeilBrownc89a8ee2008-08-05 15:54:13 +10007725 if (mddev->ro) {
7726 /* Only thing we do on a ro array is remove
7727 * failed devices.
7728 */
NeilBrown3cb03002011-10-11 16:45:26 +11007729 struct md_rdev *rdev;
NeilBrowndafb20f2012-03-19 12:46:39 +11007730 rdev_for_each(rdev, mddev)
NeilBrowna8c42c72011-01-31 13:47:13 +11007731 if (rdev->raid_disk >= 0 &&
7732 !test_bit(Blocked, &rdev->flags) &&
7733 test_bit(Faulty, &rdev->flags) &&
7734 atomic_read(&rdev->nr_pending)==0) {
7735 if (mddev->pers->hot_remove_disk(
NeilBrownb8321b62011-12-23 10:17:51 +11007736 mddev, rdev) == 0) {
Namhyung Kim36fad852011-07-27 11:00:36 +10007737 sysfs_unlink_rdev(mddev, rdev);
NeilBrowna8c42c72011-01-31 13:47:13 +11007738 rdev->raid_disk = -1;
7739 }
7740 }
NeilBrownc89a8ee2008-08-05 15:54:13 +10007741 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7742 goto unlock;
7743 }
7744
NeilBrown31a59e32008-04-30 00:52:30 -07007745 if (!mddev->external) {
Neil Brown0fd62b82008-06-28 08:31:36 +10007746 int did_change = 0;
NeilBrown31a59e32008-04-30 00:52:30 -07007747 spin_lock_irq(&mddev->write_lock);
7748 if (mddev->safemode &&
7749 !atomic_read(&mddev->writes_pending) &&
7750 !mddev->in_sync &&
7751 mddev->recovery_cp == MaxSector) {
7752 mddev->in_sync = 1;
Neil Brown0fd62b82008-06-28 08:31:36 +10007753 did_change = 1;
NeilBrown070dc6d2010-08-30 17:33:34 +10007754 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
NeilBrown31a59e32008-04-30 00:52:30 -07007755 }
7756 if (mddev->safemode == 1)
7757 mddev->safemode = 0;
7758 spin_unlock_irq(&mddev->write_lock);
Neil Brown0fd62b82008-06-28 08:31:36 +10007759 if (did_change)
NeilBrown00bcb4a2010-06-01 19:37:23 +10007760 sysfs_notify_dirent_safe(mddev->sysfs_state);
NeilBrownfca4d842005-06-21 17:17:11 -07007761 }
NeilBrownfca4d842005-06-21 17:17:11 -07007762
NeilBrown850b2b422006-10-03 01:15:46 -07007763 if (mddev->flags)
7764 md_update_sb(mddev, 0);
NeilBrown06d91a52005-06-21 17:17:12 -07007765
Linus Torvalds1da177e2005-04-16 15:20:36 -07007766 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7767 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7768 /* resync/recovery still happening */
7769 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7770 goto unlock;
7771 }
7772 if (mddev->sync_thread) {
NeilBrown7ebc0be2011-01-14 09:14:33 +11007773 reap_sync_thread(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007774 goto unlock;
7775 }
Neil Brown72a23c22008-06-28 08:31:41 +10007776 /* Set RUNNING before clearing NEEDED to avoid
7777 * any transients in the value of "sync_action".
7778 */
7779 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
NeilBrown24dd4692005-11-08 21:39:26 -08007780 /* Clear some bits that don't mean anything, but
7781 * might be left set
7782 */
NeilBrown24dd4692005-11-08 21:39:26 -08007783 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7784 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007785
NeilBrowned209582012-04-24 10:23:14 +10007786 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7787 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
NeilBrown5fd6c1d2006-06-26 00:27:40 -07007788 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007789 /* no recovery is running.
7790 * remove any failed drives, then
7791 * add spares if possible.
7792 * Spare are also removed and re-added, to allow
7793 * the personality to fail the re-add.
7794 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07007795
NeilBrownb4c4c7b2007-02-28 20:11:48 -08007796 if (mddev->reshape_position != MaxSector) {
NeilBrown50ac1682009-06-18 08:47:55 +10007797 if (mddev->pers->check_reshape == NULL ||
7798 mddev->pers->check_reshape(mddev) != 0)
NeilBrownb4c4c7b2007-02-28 20:11:48 -08007799 /* Cannot proceed */
7800 goto unlock;
7801 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
Neil Brown72a23c22008-06-28 08:31:41 +10007802 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
NeilBrownb4c4c7b2007-02-28 20:11:48 -08007803 } else if ((spares = remove_and_add_spares(mddev))) {
NeilBrown24dd4692005-11-08 21:39:26 -08007804 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7805 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
Dan Williams56ac36d2008-08-07 10:02:47 -07007806 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
Neil Brown72a23c22008-06-28 08:31:41 +10007807 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
NeilBrown24dd4692005-11-08 21:39:26 -08007808 } else if (mddev->recovery_cp < MaxSector) {
7809 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
Neil Brown72a23c22008-06-28 08:31:41 +10007810 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
NeilBrown24dd4692005-11-08 21:39:26 -08007811 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7812 /* nothing to be done ... */
Linus Torvalds1da177e2005-04-16 15:20:36 -07007813 goto unlock;
NeilBrown24dd4692005-11-08 21:39:26 -08007814
Linus Torvalds1da177e2005-04-16 15:20:36 -07007815 if (mddev->pers->sync_request) {
NeilBrowna654b9d82005-06-21 17:17:27 -07007816 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
7817 /* We are adding a device or devices to an array
7818 * which has the bitmap stored on all devices.
7819 * So make sure all bitmap pages get written
7820 */
7821 bitmap_write_all(mddev->bitmap);
7822 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007823 mddev->sync_thread = md_register_thread(md_do_sync,
7824 mddev,
NeilBrown0da3c612009-09-23 18:09:45 +10007825 "resync");
Linus Torvalds1da177e2005-04-16 15:20:36 -07007826 if (!mddev->sync_thread) {
7827 printk(KERN_ERR "%s: could not start resync"
7828 " thread...\n",
7829 mdname(mddev));
7830 /* leave the spares where they are, it shouldn't hurt */
NeilBrown7ebc0be2011-01-14 09:14:33 +11007831 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7832 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7833 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7834 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7835 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
NeilBrownd7603b72006-01-06 00:20:30 -08007836 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07007837 md_wakeup_thread(mddev->sync_thread);
NeilBrown00bcb4a2010-06-01 19:37:23 +10007838 sysfs_notify_dirent_safe(mddev->sysfs_action);
NeilBrownd7603b72006-01-06 00:20:30 -08007839 md_new_event(mddev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007840 }
7841 unlock:
Neil Brown72a23c22008-06-28 08:31:41 +10007842 if (!mddev->sync_thread) {
7843 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7844 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7845 &mddev->recovery))
NeilBrown0c3573f2009-01-09 08:31:05 +11007846 if (mddev->sysfs_action)
NeilBrown00bcb4a2010-06-01 19:37:23 +10007847 sysfs_notify_dirent_safe(mddev->sysfs_action);
Neil Brown72a23c22008-06-28 08:31:41 +10007848 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007849 mddev_unlock(mddev);
7850 }
7851}
7852
/*
 * Wait for a blocked rdev to become unblocked, or for a timeout.
 *
 * Pokes the rdev's sysfs 'state' entry first so userspace (which may
 * be responsible for clearing the block) is notified, then waits up
 * to 5 seconds for both Blocked and BlockedBadBlocks to clear, and
 * finally drops a pending reference (the caller presumably holds one
 * — see rdev_dec_pending; confirm against call sites).
 */
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7863
NeilBrownc6563a82012-05-21 09:27:00 +10007864void md_finish_reshape(struct mddev *mddev)
7865{
7866 /* called be personality module when reshape completes. */
7867 struct md_rdev *rdev;
7868
7869 rdev_for_each(rdev, mddev) {
7870 if (rdev->data_offset > rdev->new_data_offset)
7871 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7872 else
7873 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7874 rdev->data_offset = rdev->new_data_offset;
7875 }
7876}
7877EXPORT_SYMBOL(md_finish_reshape);
NeilBrown2230dfe2011-07-28 11:31:46 +10007878
/* Bad block management.
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64bits wide. This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 * A 'shift' can be set so that larger blocks are tracked and
 * consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so md_is_badblock
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad. So we binary-search
 * to the last range that starts at-or-before the given endpoint,
 * (or "before the sector after the target range")
 * then see if it ends after the given start.
 * We return
 *  0 if there are no known bad blocks in the range
 *  1 if there are known bad block which are all acknowledged
 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
		   sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo = 0;
	u64 *p = bb->page;
	int rv = 0;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}
	/* 'target' is now the first block after the bad range */

retry:
	seq = read_seqbegin(&bb->lock);

	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not. */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all range that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				/* rv sticks at -1 once any unacknowledged
				 * overlap is seen; otherwise it becomes 1.
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				/* The scan walks downward, so the values
				 * left here describe the overlapping range
				 * with the lowest start.
				 */
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	/* A concurrent writer changed the table mid-read: start over. */
	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(md_is_badblock);
7976
7977/*
7978 * Add a range of bad blocks to the table.
7979 * This might extend the table, or might contract it
7980 * if two adjacent ranges can be merged.
7981 * We binary-search to find the 'insertion' point, then
7982 * decide how best to handle it.
7983 */
7984static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7985 int acknowledged)
7986{
7987 u64 *p;
7988 int lo, hi;
7989 int rv = 1;
7990
7991 if (bb->shift < 0)
7992 /* badblocks are disabled */
7993 return 0;
7994
7995 if (bb->shift) {
7996 /* round the start down, and the end up */
7997 sector_t next = s + sectors;
7998 s >>= bb->shift;
7999 next += (1<<bb->shift) - 1;
8000 next >>= bb->shift;
8001 sectors = next - s;
8002 }
8003
8004 write_seqlock_irq(&bb->lock);
8005
8006 p = bb->page;
8007 lo = 0;
8008 hi = bb->count;
8009 /* Find the last range that starts at-or-before 's' */
8010 while (hi - lo > 1) {
8011 int mid = (lo + hi) / 2;
8012 sector_t a = BB_OFFSET(p[mid]);
8013 if (a <= s)
8014 lo = mid;
8015 else
8016 hi = mid;
8017 }
8018 if (hi > lo && BB_OFFSET(p[lo]) > s)
8019 hi = lo;
8020
8021 if (hi > lo) {
8022 /* we found a range that might merge with the start
8023 * of our new range
8024 */
8025 sector_t a = BB_OFFSET(p[lo]);
8026 sector_t e = a + BB_LEN(p[lo]);
8027 int ack = BB_ACK(p[lo]);
8028 if (e >= s) {
8029 /* Yes, we can merge with a previous range */
8030 if (s == a && s + sectors >= e)
8031 /* new range covers old */
8032 ack = acknowledged;
8033 else
8034 ack = ack && acknowledged;
8035
8036 if (e < s + sectors)
8037 e = s + sectors;
8038 if (e - a <= BB_MAX_LEN) {
8039 p[lo] = BB_MAKE(a, e-a, ack);
8040 s = e;
8041 } else {
8042 /* does not all fit in one range,
8043 * make p[lo] maximal
8044 */
8045 if (BB_LEN(p[lo]) != BB_MAX_LEN)
8046 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8047 s = a + BB_MAX_LEN;
8048 }
8049 sectors = e - s;
8050 }
8051 }
8052 if (sectors && hi < bb->count) {
8053 /* 'hi' points to the first range that starts after 's'.
8054 * Maybe we can merge with the start of that range */
8055 sector_t a = BB_OFFSET(p[hi]);
8056 sector_t e = a + BB_LEN(p[hi]);
8057 int ack = BB_ACK(p[hi]);
8058 if (a <= s + sectors) {
8059 /* merging is possible */
8060 if (e <= s + sectors) {
8061 /* full overlap */
8062 e = s + sectors;
8063 ack = acknowledged;
8064 } else
8065 ack = ack && acknowledged;
8066
8067 a = s;
8068 if (e - a <= BB_MAX_LEN) {
8069 p[hi] = BB_MAKE(a, e-a, ack);
8070 s = e;
8071 } else {
8072 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8073 s = a + BB_MAX_LEN;
8074 }
8075 sectors = e - s;
8076 lo = hi;
8077 hi++;
8078 }
8079 }
8080 if (sectors == 0 && hi < bb->count) {
8081 /* we might be able to combine lo and hi */
8082 /* Note: 's' is at the end of 'lo' */
8083 sector_t a = BB_OFFSET(p[hi]);
8084 int lolen = BB_LEN(p[lo]);
8085 int hilen = BB_LEN(p[hi]);
8086 int newlen = lolen + hilen - (s - a);
8087 if (s >= a && newlen < BB_MAX_LEN) {
8088 /* yes, we can combine them */
8089 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8090 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8091 memmove(p + hi, p + hi + 1,
8092 (bb->count - hi - 1) * 8);
8093 bb->count--;
8094 }
8095 }
8096 while (sectors) {
8097 /* didn't merge (it all).
8098 * Need to add a range just before 'hi' */
8099 if (bb->count >= MD_MAX_BADBLOCKS) {
8100 /* No room for more */
8101 rv = 0;
8102 break;
8103 } else {
8104 int this_sectors = sectors;
8105 memmove(p + hi + 1, p + hi,
8106 (bb->count - hi) * 8);
8107 bb->count++;
8108
8109 if (this_sectors > BB_MAX_LEN)
8110 this_sectors = BB_MAX_LEN;
8111 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8112 sectors -= this_sectors;
8113 s += this_sectors;
8114 }
8115 }
8116
8117 bb->changed = 1;
NeilBrownde393cd2011-07-28 11:31:48 +10008118 if (!acknowledged)
8119 bb->unacked_exist = 1;
NeilBrown2230dfe2011-07-28 11:31:46 +10008120 write_sequnlock_irq(&bb->lock);
8121
8122 return rv;
8123}
8124
NeilBrown3cb03002011-10-11 16:45:26 +11008125int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
NeilBrownc6563a82012-05-21 09:27:00 +10008126 int is_new)
NeilBrown2230dfe2011-07-28 11:31:46 +10008127{
NeilBrownc6563a82012-05-21 09:27:00 +10008128 int rv;
8129 if (is_new)
8130 s += rdev->new_data_offset;
8131 else
8132 s += rdev->data_offset;
8133 rv = md_set_badblocks(&rdev->badblocks,
8134 s, sectors, 0);
NeilBrown2230dfe2011-07-28 11:31:46 +10008135 if (rv) {
8136 /* Make sure they get written out promptly */
NeilBrown8bd2f0a2011-12-08 16:26:08 +11008137 sysfs_notify_dirent_safe(rdev->sysfs_state);
NeilBrown2230dfe2011-07-28 11:31:46 +10008138 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8139 md_wakeup_thread(rdev->mddev->thread);
8140 }
8141 return rv;
8142}
8143EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8144
/*
 * Remove a range of bad blocks from the table.
 * This may involve extending the table if we split a region,
 * but it must not fail. So if the table becomes full, we just
 * drop the remove request.
 */
static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	/* NOTE(review): rv starts at 0 and is also set to 0 on the
	 * "table full, drop the request" path, so callers cannot
	 * distinguish success from a dropped clear — confirm intended.
	 */
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);
		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range. Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MD_MAX_BADBLOCKS) {
					rv = 0;
					goto out;
				}
				/* open a slot for the tail entry */
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			/* keep only the part beyond 'target' */
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		/* walk backwards trimming or dropping earlier
		 * overlapping ranges
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);
				p[lo] = BB_MAKE(start, s - start, ack);
				/* now low doesn't overlap, so.. */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
8239
NeilBrownc6563a82012-05-21 09:27:00 +10008240int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8241 int is_new)
NeilBrown2230dfe2011-07-28 11:31:46 +10008242{
NeilBrownc6563a82012-05-21 09:27:00 +10008243 if (is_new)
8244 s += rdev->new_data_offset;
8245 else
8246 s += rdev->data_offset;
NeilBrown2230dfe2011-07-28 11:31:46 +10008247 return md_clear_badblocks(&rdev->badblocks,
NeilBrownc6563a82012-05-21 09:27:00 +10008248 s, sectors);
NeilBrown2230dfe2011-07-28 11:31:46 +10008249}
8250EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8251
8252/*
8253 * Acknowledge all bad blocks in a list.
8254 * This only succeeds if ->changed is clear. It is used by
8255 * in-kernel metadata updates
8256 */
8257void md_ack_all_badblocks(struct badblocks *bb)
8258{
8259 if (bb->page == NULL || bb->changed)
8260 /* no point even trying */
8261 return;
8262 write_seqlock_irq(&bb->lock);
8263
majianpengecb178b2012-03-19 12:46:42 +11008264 if (bb->changed == 0 && bb->unacked_exist) {
NeilBrown2230dfe2011-07-28 11:31:46 +10008265 u64 *p = bb->page;
8266 int i;
8267 for (i = 0; i < bb->count ; i++) {
8268 if (!BB_ACK(p[i])) {
8269 sector_t start = BB_OFFSET(p[i]);
8270 int len = BB_LEN(p[i]);
8271 p[i] = BB_MAKE(start, len, 1);
8272 }
8273 }
NeilBrownde393cd2011-07-28 11:31:48 +10008274 bb->unacked_exist = 0;
NeilBrown2230dfe2011-07-28 11:31:46 +10008275 }
8276 write_sequnlock_irq(&bb->lock);
8277}
8278EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8279
/* sysfs access to bad-blocks list.
 * We present two files.
 * 'bad-blocks' lists sector numbers and lengths of ranges that
 *    are recorded as bad. The list is truncated to fit within
 *    the one-page limit of sysfs.
 *    Writing "sector length" to this file adds an acknowledged
 *    bad block list.
 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
 *    been acknowledged. Writing to this file adds bad blocks
 *    without acknowledging them. This is largely for testing.
 */

static ssize_t
badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		/* bad-block tracking disabled for this device */
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);
		i++;

		/* for the 'unacknowledged' file, skip acked entries */
		if (unack && ack)
			continue;

		/* un-shift values back to real sector units for display */
		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	/* Nothing unacknowledged was printed, so none remain.
	 * NOTE(review): this clears state while holding only a read
	 * seqlock — looks racy against concurrent writers; confirm.
	 */
	if (unack && len == 0)
		bb->unacked_exist = 0;

	/* retry if a writer modified the table while we were reading */
	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
8330
#define DO_DEBUG 1

/*
 * Parse a "sector length" pair written via sysfs and add it to the
 * bad-block table.  When DO_DEBUG is set, a leading '-' requests
 * clearing the range instead (testing/debugging only).
 * Returns the number of bytes consumed, -EINVAL on a malformed
 * input, or -ENOSPC if the table is full.
 */
static ssize_t
badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
{
	unsigned long long sector;
	int length;
	char newline;
#ifdef DO_DEBUG
	/* Allow clearing via sysfs *only* for testing/debugging.
	 * Normally only a successful write may clear a badblock
	 */
	int clear = 0;
	if (page[0] == '-') {
		clear = 1;
		page++;
	}
#endif /* DO_DEBUG */

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		/* fall through - 'length' still needs validating */
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

#ifdef DO_DEBUG
	if (clear) {
		md_clear_badblocks(bb, sector, length);
		return len;
	}
#endif /* DO_DEBUG */
	/* writes to the 'unacknowledged' file record blocks unacked */
	if (md_set_badblocks(bb, sector, length, !unack))
		return len;
	else
		return -ENOSPC;
}
8373
/*
 * Reboot notifier: before the system goes down, stop writes on every
 * array we can lock without blocking and set its safemode, so a
 * mid-reboot crash is less likely to leave arrays dirty.
 */
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			/* only arrays with an active personality have
			 * writes to flush
			 */
			if (mddev->pers)
				__md_stop_writes(mddev);
			mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		/* NOTE(review): set for every array seen — even idle
		 * ones or ones we failed to lock — so the delay below
		 * fires whenever any array exists; confirm intended.
		 */
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots. While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}
8401
/* Reboot notifier registration; highest priority so md arrays are
 * quiesced before underlying device drivers see the reboot.
 */
static struct notifier_block md_notifier = {
	.notifier_call = md_notify_reboot,
	.next = NULL,
	.priority = INT_MAX, /* before any real devices */
};
8407
/* One-time setup of /proc/mdstat; called from md_init(). */
static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
8414
/*
 * Module initialisation: create the md workqueues, register the 'md'
 * and 'mdp' block device majors and their probe regions, hook the
 * reboot notifier and sysctl table, then create /proc/mdstat.
 * Error handling uses the goto-unwind pattern: each label undoes the
 * steps that succeeded before the failure point.
 */
static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	/* major 0 => dynamically allocated major for partitionable arrays */
	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07008454
#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

/* Queue of candidate devices: filled by md_autodetect_dev(),
 * drained by autostart_arrays().
 */
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;	/* device number of the candidate */
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07008467
8468void md_autodetect_dev(dev_t dev)
8469{
Michael J. Evans4d936ec2007-10-16 23:30:52 -07008470 struct detected_devices_node *node_detected_dev;
8471
8472 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8473 if (node_detected_dev) {
8474 node_detected_dev->dev = dev;
8475 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8476 } else {
8477 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8478 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8479 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07008480}
8481
8482
8483static void autostart_arrays(int part)
8484{
NeilBrown3cb03002011-10-11 16:45:26 +11008485 struct md_rdev *rdev;
Michael J. Evans4d936ec2007-10-16 23:30:52 -07008486 struct detected_devices_node *node_detected_dev;
8487 dev_t dev;
8488 int i_scanned, i_passed;
8489
8490 i_scanned = 0;
8491 i_passed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008492
8493 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8494
Michael J. Evans4d936ec2007-10-16 23:30:52 -07008495 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8496 i_scanned++;
8497 node_detected_dev = list_entry(all_detected_devices.next,
8498 struct detected_devices_node, list);
8499 list_del(&node_detected_dev->list);
8500 dev = node_detected_dev->dev;
8501 kfree(node_detected_dev);
NeilBrowndf968c42007-07-17 04:06:11 -07008502 rdev = md_import_device(dev,0, 90);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008503 if (IS_ERR(rdev))
8504 continue;
8505
NeilBrownb2d444d2005-11-08 21:39:31 -08008506 if (test_bit(Faulty, &rdev->flags)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07008507 MD_BUG();
8508 continue;
8509 }
NeilBrownd0fae182008-03-04 14:29:31 -08008510 set_bit(AutoDetected, &rdev->flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008511 list_add(&rdev->same_set, &pending_raid_disks);
Michael J. Evans4d936ec2007-10-16 23:30:52 -07008512 i_passed++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008513 }
Michael J. Evans4d936ec2007-10-16 23:30:52 -07008514
8515 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8516 i_scanned, i_passed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008517
8518 autorun_devices(part);
8519}
8520
Jeff Garzikfdee8ae2006-12-10 02:20:50 -08008521#endif /* !MODULE */
Linus Torvalds1da177e2005-04-16 15:20:36 -07008522
/*
 * Module teardown: unregister everything md_init() registered (in
 * roughly reverse order), release every known array, then destroy
 * the workqueues last since earlier steps may still queue work.
 */
static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
	remove_proc_entry("mdstat", NULL);
	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		/* presumably allows the mddev to be freed — confirm
		 * against mddev_put()
		 */
		mddev->hold_active = 0;
	}
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}
8543
/* Module entry and exit points. */
subsys_initcall(md_init);
module_exit(md_exit)
8546
/* Report the current value of the 'start_ro' module parameter. */
static int get_ro(char *buffer, struct kernel_param *kp)
{
	return sprintf(buffer, "%d", start_readonly);
}
8551static int set_ro(const char *val, struct kernel_param *kp)
8552{
8553 char *e;
8554 int num = simple_strtoul(val, &e, 10);
8555 if (*val && (*e == '\0' || *e == '\n')) {
8556 start_readonly = num;
NeilBrown4dbcdc72006-01-06 00:20:52 -08008557 return 0;
NeilBrownf91de922005-11-08 21:39:36 -08008558 }
8559 return -EINVAL;
8560}
8561
/* Module parameters. */
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);

module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);

/* Entry points exported for use by other modules. */
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_write_start);
EXPORT_SYMBOL(md_write_end);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);