dm mpath: add optional "queue_mode" feature

Allow a user to specify an optional feature 'queue_mode <mode>' where
<mode> may be "bio", "rq" or "mq" -- which correspond to bio-based,
request_fn rq-based, and blk-mq rq-based, respectively.

If the queue_mode feature isn't specified, the default for the
"multipath" target is still "rq"; but if dm_mod.use_blk_mq is set to Y,
it defaults to "mq" instead.
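
For example, the blk-mq default can be requested via the existing
module parameter (a sketch only; whether the parameter can also be
flipped at runtime through sysfs depends on the kernel build):

  # make blk-mq request-based the default for request-based DM devices
  modprobe dm_mod use_blk_mq=Y
  # or on the kernel command line: dm_mod.use_blk_mq=Y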

This new queue_mode feature allows each multipath device to have its
own queue_mode (whereas before this feature all multipath devices
effectively had to share the same queue_mode).
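
As an illustrative sketch (the sector counts, 8:16/8:32 path devices
and selector arguments below are placeholders, not from a real setup),
one device can now be loaded bio-based while another keeps the default:

  echo "0 1024000 multipath 2 queue_mode bio 0 1 1 round-robin 0 1 1 8:16 1" | \
    dmsetup create mpath_bio
  echo "0 1024000 multipath 0 0 1 1 round-robin 0 1 1 8:32 1" | \
    dmsetup create mpath_rq   # no queue_mode feature -> default "rq" (or "mq")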

This commit also goes a long way toward eliminating the awkward (ab)use
of DM_TYPE_*, the associated filter_md_type(), and other relatively
fragile and difficult-to-maintain code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2d10ff7..7eac080 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -90,6 +90,8 @@
 	atomic_t pg_init_in_progress;	/* Only one pg_init allowed at once */
 	atomic_t pg_init_count;		/* Number of times pg_init called */
 
+	unsigned queue_mode;
+
 	/*
 	 * We must use a mempool of dm_mpath_io structs so that we
 	 * can resubmit bios on error.
@@ -131,7 +133,6 @@
 #define MPATHF_PG_INIT_DISABLED 4		/* pg_init is not currently allowed */
 #define MPATHF_PG_INIT_REQUIRED 5		/* pg_init needs calling? */
 #define MPATHF_PG_INIT_DELAY_RETRY 6		/* Delay pg_init retry? */
-#define MPATHF_BIO_BASED 7			/* Device is bio-based? */
 
 /*-----------------------------------------------
  * Allocation routines
@@ -191,8 +192,7 @@
 	kfree(pg);
 }
 
-static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq,
-					 bool bio_based)
+static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
 
@@ -210,25 +210,7 @@
 		mutex_init(&m->work_mutex);
 
 		m->mpio_pool = NULL;
-		if (!use_blk_mq && !bio_based) {
-			unsigned min_ios = dm_get_reserved_rq_based_ios();
-
-			m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
-			if (!m->mpio_pool) {
-				kfree(m);
-				return NULL;
-			}
-		}
-
-		if (bio_based) {
-			INIT_WORK(&m->process_queued_bios, process_queued_bios);
-			set_bit(MPATHF_BIO_BASED, &m->flags);
-			/*
-			 * bio-based doesn't support any direct scsi_dh management;
-			 * it just discovers if a scsi_dh is attached.
-			 */
-			set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
-		}
+		m->queue_mode = DM_TYPE_NONE;
 
 		m->ti = ti;
 		ti->private = m;
@@ -237,6 +219,39 @@
 	return m;
 }
 
+static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
+{
+	if (m->queue_mode == DM_TYPE_NONE) {
+		/*
+		 * Default to request-based.
+		 */
+		if (dm_use_blk_mq(dm_table_get_md(ti->table)))
+			m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+		else
+			m->queue_mode = DM_TYPE_REQUEST_BASED;
+	}
+
+	if (m->queue_mode == DM_TYPE_REQUEST_BASED) {
+		unsigned min_ios = dm_get_reserved_rq_based_ios();
+
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
+		if (!m->mpio_pool)
+			return -ENOMEM;
+	}
+	else if (m->queue_mode == DM_TYPE_BIO_BASED) {
+		INIT_WORK(&m->process_queued_bios, process_queued_bios);
+		/*
+		 * bio-based doesn't support any direct scsi_dh management;
+		 * it just discovers if a scsi_dh is attached.
+		 */
+		set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags);
+	}
+
+	dm_table_set_type(ti->table, m->queue_mode);
+
+	return 0;
+}
+
 static void free_multipath(struct multipath *m)
 {
 	struct priority_group *pg, *tmp;
@@ -653,7 +668,7 @@
 
 static void process_queued_bios_list(struct multipath *m)
 {
-	if (test_bit(MPATHF_BIO_BASED, &m->flags))
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		queue_work(kmultipathd, &m->process_queued_bios);
 }
 
@@ -964,7 +979,7 @@
 	if (!hw_argc)
 		return 0;
 
-	if (test_bit(MPATHF_BIO_BASED, &m->flags)) {
+	if (m->queue_mode == DM_TYPE_BIO_BASED) {
 		dm_consume_args(as, hw_argc);
 		DMERR("bio-based multipath doesn't allow hardware handler args");
 		return 0;
@@ -1005,7 +1020,7 @@
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 6, "invalid number of feature args"},
+		{0, 8, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
@@ -1045,6 +1060,24 @@
 			continue;
 		}
 
+		if (!strcasecmp(arg_name, "queue_mode") &&
+		    (argc >= 1)) {
+			const char *queue_mode_name = dm_shift_arg(as);
+
+			if (!strcasecmp(queue_mode_name, "bio"))
+				m->queue_mode = DM_TYPE_BIO_BASED;
+			else if (!strcasecmp(queue_mode_name, "rq"))
+				m->queue_mode = DM_TYPE_REQUEST_BASED;
+			else if (!strcasecmp(queue_mode_name, "mq"))
+				m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
+			else {
+				ti->error = "Unknown 'queue_mode' requested";
+				r = -EINVAL;
+			}
+			argc--;
+			continue;
+		}
+
 		ti->error = "Unrecognised multipath feature request";
 		r = -EINVAL;
 	} while (argc && !r);
@@ -1052,8 +1085,7 @@
 	return r;
 }
 
-static int __multipath_ctr(struct dm_target *ti, unsigned int argc,
-			   char **argv, bool bio_based)
+static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	/* target arguments */
 	static struct dm_arg _args[] = {
@@ -1066,12 +1098,11 @@
 	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
-	bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table));
 
 	as.argc = argc;
 	as.argv = argv;
 
-	m = alloc_multipath(ti, use_blk_mq, bio_based);
+	m = alloc_multipath(ti);
 	if (!m) {
 		ti->error = "can't allocate multipath";
 		return -EINVAL;
@@ -1081,6 +1112,10 @@
 	if (r)
 		goto bad;
 
+	r = alloc_multipath_stage2(ti, m);
+	if (r)
+		goto bad;
+
 	r = parse_hw_handler(&as, m);
 	if (r)
 		goto bad;
@@ -1130,9 +1165,9 @@
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
-	if (bio_based)
+	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
-	else if (use_blk_mq)
+	else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
 		ti->per_io_data_size = sizeof(struct dm_mpath_io);
 
 	return 0;
@@ -1142,16 +1177,6 @@
 	return r;
 }
 
-static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
-{
-	return __multipath_ctr(ti, argc, argv, false);
-}
-
-static int multipath_bio_ctr(struct dm_target *ti, unsigned argc, char **argv)
-{
-	return __multipath_ctr(ti, argc, argv, true);
-}
-
 static void multipath_wait_for_pg_init_completion(struct multipath *m)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -1700,7 +1725,9 @@
 		DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) +
 			      (m->pg_init_retries > 0) * 2 +
 			      (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
-			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags));
+			      test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) +
+			      (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2);
+
 		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
 			DMEMIT("queue_if_no_path ");
 		if (m->pg_init_retries)
@@ -1709,6 +1736,16 @@
 			DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
 		if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags))
 			DMEMIT("retain_attached_hw_handler ");
+		if (m->queue_mode != DM_TYPE_REQUEST_BASED) {
+			switch(m->queue_mode) {
+			case DM_TYPE_BIO_BASED:
+				DMEMIT("queue_mode bio ");
+				break;
+			case DM_TYPE_MQ_REQUEST_BASED:
+				DMEMIT("queue_mode mq ");
+				break;
+			}
+		}
 	}
 
 	if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1995,7 +2032,7 @@
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 11, 0},
+	.version = {1, 12, 0},
 	.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE,
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
@@ -2004,22 +2041,6 @@
 	.clone_and_map_rq = multipath_clone_and_map,
 	.release_clone_rq = multipath_release_clone,
 	.rq_end_io = multipath_end_io,
-	.presuspend = multipath_presuspend,
-	.postsuspend = multipath_postsuspend,
-	.resume = multipath_resume,
-	.status = multipath_status,
-	.message = multipath_message,
-	.prepare_ioctl = multipath_prepare_ioctl,
-	.iterate_devices = multipath_iterate_devices,
-	.busy = multipath_busy,
-};
-
-static struct target_type multipath_bio_target = {
-	.name = "multipath-bio",
-	.version = {1, 0, 0},
-	.module = THIS_MODULE,
-	.ctr = multipath_bio_ctr,
-	.dtr = multipath_dtr,
 	.map = multipath_map_bio,
 	.end_io = multipath_end_io_bio,
 	.presuspend = multipath_presuspend,
@@ -2048,13 +2069,6 @@
 		goto bad_register_target;
 	}
 
-	r = dm_register_target(&multipath_bio_target);
-	if (r < 0) {
-		DMERR("bio-based register failed %d", r);
-		r = -EINVAL;
-		goto bad_register_bio_based_target;
-	}
-
 	kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
 	if (!kmultipathd) {
 		DMERR("failed to create workqueue kmpathd");
@@ -2081,8 +2095,6 @@
 bad_alloc_kmpath_handlerd:
 	destroy_workqueue(kmultipathd);
 bad_alloc_kmultipathd:
-	dm_unregister_target(&multipath_bio_target);
-bad_register_bio_based_target:
 	dm_unregister_target(&multipath_target);
 bad_register_target:
 	kmem_cache_destroy(_mpio_cache);
@@ -2096,7 +2108,6 @@
 	destroy_workqueue(kmultipathd);
 
 	dm_unregister_target(&multipath_target);
-	dm_unregister_target(&multipath_bio_target);
 	kmem_cache_destroy(_mpio_cache);
 }
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 787c81b..266f7b6 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -230,7 +230,14 @@
 
 	blk_rq_unprep_clone(clone);
 
-	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
+	/*
+	 * It is possible for a clone_old_rq() allocated clone to
+	 * get passed in -- it may not yet have a request_queue.
+	 * This is known to occur if the error target replaces
+	 * a multipath target that has a request_fn queue stacked
+	 * on blk-mq queue(s).
+	 */
+	if (clone->q && clone->q->mq_ops)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
 	else if (!md->queue->mq_ops)
@@ -561,7 +568,7 @@
 	 * Must clone a request if this .request_fn DM device
 	 * is stacked on .request_fn device(s).
 	 */
-	if (!dm_table_mq_request_based(table)) {
+	if (!dm_table_all_blk_mq_devices(table)) {
 		if (!clone_old_rq(rq, md, tio, gfp_mask)) {
 			dm_put_live_table(md, srcu_idx);
 			free_old_rq_tio(tio);
@@ -711,7 +718,7 @@
 {
 	unsigned deadline;
 
-	if (!dm_request_based(md) || md->use_blk_mq)
+	if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED)
 		return count;
 
 	if (kstrtouint(buf, 10, &deadline))
@@ -886,12 +893,13 @@
 	.init_request = dm_mq_init_request,
 };
 
-int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt)
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 {
 	struct request_queue *q;
+	struct dm_target *immutable_tgt;
 	int err;
 
-	if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+	if (!dm_table_all_blk_mq_devices(t)) {
 		DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
 		return -EINVAL;
 	}
@@ -908,6 +916,7 @@
 	md->tag_set->driver_data = md;
 
 	md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+	immutable_tgt = dm_table_get_immutable_target(t);
 	if (immutable_tgt && immutable_tgt->per_io_data_size) {
 		/* any target-specific per-io data is immediately after the tio */
 		md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index 1559f64..9e6f0a3 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -49,7 +49,7 @@
 bool dm_use_blk_mq(struct mapped_device *md);
 
 int dm_old_init_request_queue(struct mapped_device *md);
-int dm_mq_init_request_queue(struct mapped_device *md, struct dm_target *immutable_tgt);
+int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
 void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 
 void dm_start_queue(struct request_queue *q);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a682d51..88f0174 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -43,8 +43,10 @@
 	struct dm_target *targets;
 
 	struct target_type *immutable_target_type;
-	unsigned integrity_supported:1;
-	unsigned singleton:1;
+
+	bool integrity_supported:1;
+	bool singleton:1;
+	bool all_blk_mq:1;
 
 	/*
 	 * Indicates the rw permissions for the new logical
@@ -206,6 +208,7 @@
 		return -ENOMEM;
 	}
 
+	t->type = DM_TYPE_NONE;
 	t->mode = mode;
 	t->md = md;
 	*result = t;
@@ -703,7 +706,7 @@
 			      dm_device_name(t->md), type);
 			return -EINVAL;
 		}
-		t->singleton = 1;
+		t->singleton = true;
 	}
 
 	if (dm_target_always_writeable(tgt->type) && !(t->mode & FMODE_WRITE)) {
@@ -830,16 +833,29 @@
 		table_type == DM_TYPE_MQ_REQUEST_BASED);
 }
 
-static int dm_table_set_type(struct dm_table *t)
+void dm_table_set_type(struct dm_table *t, unsigned type)
+{
+	t->type = type;
+}
+EXPORT_SYMBOL_GPL(dm_table_set_type);
+
+static int dm_table_determine_type(struct dm_table *t)
 {
 	unsigned i;
 	unsigned bio_based = 0, request_based = 0, hybrid = 0;
-	bool use_blk_mq = false;
+	bool verify_blk_mq = false;
 	struct dm_target *tgt;
 	struct dm_dev_internal *dd;
-	struct list_head *devices;
+	struct list_head *devices = dm_table_get_devices(t);
 	unsigned live_md_type = dm_get_md_type(t->md);
 
+	if (t->type != DM_TYPE_NONE) {
+		/* target already set the table's type */
+		if (t->type == DM_TYPE_BIO_BASED)
+			return 0;
+		goto verify_rq_based;
+	}
+
 	for (i = 0; i < t->num_targets; i++) {
 		tgt = t->targets + i;
 		if (dm_target_hybrid(tgt))
@@ -876,6 +892,19 @@
 
 	BUG_ON(!request_based); /* No targets in this table */
 
+	if (list_empty(devices) && __table_type_request_based(live_md_type)) {
+		/* inherit live MD type */
+		t->type = live_md_type;
+		return 0;
+	}
+
+	/*
+	 * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
+	 * having a compatible target use dm_table_set_type.
+	 */
+	t->type = DM_TYPE_REQUEST_BASED;
+
+verify_rq_based:
 	/*
 	 * Request-based dm supports only tables that have a single target now.
 	 * To support multiple targets, request splitting support is needed,
@@ -888,7 +917,6 @@
 	}
 
 	/* Non-request-stackable devices can't be used for request-based dm */
-	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
 		struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev);
 
@@ -899,10 +927,10 @@
 		}
 
 		if (q->mq_ops)
-			use_blk_mq = true;
+			verify_blk_mq = true;
 	}
 
-	if (use_blk_mq) {
+	if (verify_blk_mq) {
 		/* verify _all_ devices in the table are blk-mq devices */
 		list_for_each_entry(dd, devices, list)
 			if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) {
@@ -910,14 +938,9 @@
 				      " are blk-mq request-stackable");
 				return -EINVAL;
 			}
-		t->type = DM_TYPE_MQ_REQUEST_BASED;
 
-	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
-		/* inherit live MD type */
-		t->type = live_md_type;
-
-	} else
-		t->type = DM_TYPE_REQUEST_BASED;
+		t->all_blk_mq = true;
+	}
 
 	return 0;
 }
@@ -961,9 +984,9 @@
 	return __table_type_request_based(dm_table_get_type(t));
 }
 
-bool dm_table_mq_request_based(struct dm_table *t)
+bool dm_table_all_blk_mq_devices(struct dm_table *t)
 {
-	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
+	return t->all_blk_mq;
 }
 
 static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
@@ -1106,7 +1129,7 @@
 		return 0;
 
 	if (!integrity_profile_exists(dm_disk(md))) {
-		t->integrity_supported = 1;
+		t->integrity_supported = true;
 		/*
 		 * Register integrity profile during table load; we can do
 		 * this because the final profile must match during resume.
@@ -1129,7 +1152,7 @@
 	}
 
 	/* Preserve existing integrity profile */
-	t->integrity_supported = 1;
+	t->integrity_supported = true;
 	return 0;
 }
 
@@ -1141,9 +1164,9 @@
 {
 	int r;
 
-	r = dm_table_set_type(t);
+	r = dm_table_determine_type(t);
 	if (r) {
-		DMERR("unable to set table type");
+		DMERR("unable to determine table type");
 		return r;
 	}
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8f22527..2c907bc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1738,23 +1738,14 @@
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
 int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 {
 	int r;
-	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
 
-	switch (md_type) {
+	switch (dm_get_md_type(md)) {
 	case DM_TYPE_REQUEST_BASED:
 		r = dm_old_init_request_queue(md);
 		if (r) {
@@ -1763,7 +1754,7 @@
 		}
 		break;
 	case DM_TYPE_MQ_REQUEST_BASED:
-		r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
+		r = dm_mq_init_request_queue(md, t);
 		if (r) {
 			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
 			return r;
@@ -2472,8 +2463,6 @@
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
-
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b611b30..2e0e4a5 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -34,14 +34,6 @@
 #define DM_STATUS_NOFLUSH_FLAG		(1 << 0)
 
 /*
- * Type of table and mapped_device's mempool
- */
-#define DM_TYPE_NONE			0
-#define DM_TYPE_BIO_BASED		1
-#define DM_TYPE_REQUEST_BASED		2
-#define DM_TYPE_MQ_REQUEST_BASED	3
-
-/*
  * List of devices that a metadevice uses and should open/close.
  */
 struct dm_dev_internal {
@@ -77,7 +69,7 @@
 struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
 struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-bool dm_table_mq_request_based(struct dm_table *t);
+bool dm_table_all_blk_mq_devices(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);