Merge branches 'cma', 'cxgb4', 'flowsteer', 'ipoib', 'misc', 'mlx4', 'mlx5', 'nes', 'ocrdma', 'qib' and 'srp' into for-next
diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp
index 5c53d28..b9688de 100644
--- a/Documentation/ABI/stable/sysfs-driver-ib_srp
+++ b/Documentation/ABI/stable/sysfs-driver-ib_srp
@@ -61,6 +61,12 @@
 		  interrupt is handled by a different CPU then the comp_vector
 		  parameter can be used to spread the SRP completion workload
 		  over multiple CPUs.
+		* tl_retry_count, a number in the range 2..7 specifying the
+		  IB RC retry count.
+		* queue_size, the maximum number of commands that the
+		  initiator is allowed to queue per SCSI host. The default
+		  value for this parameter is 62. The lowest supported value
+		  is 2.
 
 What:		/sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
 Date:		January 2, 2006
@@ -153,6 +159,13 @@
 Description:	InfiniBand service ID used for establishing communication with
 		the SRP target.
 
+What:		/sys/class/scsi_host/host<n>/sgid
+Date:		February 1, 2014
+KernelVersion:	3.13
+Contact:	linux-rdma@vger.kernel.org
+Description:	InfiniBand GID of the source port used for communication with
+		the SRP target.
+
 What:		/sys/class/scsi_host/host<n>/zero_req_lim
 Date:		September 20, 2006
 KernelVersion:	2.6.18
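
The tl_retry_count and queue_size parameters documented above are not
standalone sysfs files; they are comma-separated options in the login string
written to the per-port add_target file. A minimal user-space sketch, assuming
a hypothetical mlx4_0 HCA and placeholder target identifiers:

	#include <stdio.h>

	/* All identifier values below are placeholders; a real login string
	 * needs the target's actual id_ext, ioc_guid, dgid and service_id. */
	int add_srp_target(void)
	{
		FILE *f = fopen("/sys/class/infiniband_srp/srp-mlx4_0-1/add_target", "w");

		if (!f)
			return -1;
		fprintf(f, "id_ext=200100e08b000000,ioc_guid=00a0b80200402bd9,"
			"dgid=fe800000000000000002c90200402bd9,pkey=ffff,"
			"service_id=200100e08b000000,"
			"tl_retry_count=7,queue_size=128\n");
		return fclose(f);
	}
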
diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp
index b36fb0d..ec7af69 100644
--- a/Documentation/ABI/stable/sysfs-transport-srp
+++ b/Documentation/ABI/stable/sysfs-transport-srp
@@ -5,6 +5,24 @@
 Description:	Instructs an SRP initiator to disconnect from a target and to
 		remove all LUNs imported from that target.
 
+What:		/sys/class/srp_remote_ports/port-<h>:<n>/dev_loss_tmo
+Date:		February 1, 2014
+KernelVersion:	3.13
+Contact:	linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:	Number of seconds the SCSI layer will wait after a transport
+		layer error has been observed before removing a target port.
+		Zero means immediate removal. Setting this attribute to "off"
+		will disable the dev_loss timer.
+
+What:		/sys/class/srp_remote_ports/port-<h>:<n>/fast_io_fail_tmo
+Date:		February 1, 2014
+KernelVersion:	3.13
+Contact:	linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:	Number of seconds the SCSI layer will wait after a transport
+		layer error has been observed before failing I/O. Zero means
+		failing I/O immediately. Setting this attribute to "off" will
+		disable the fast_io_fail timer.
+
 What:		/sys/class/srp_remote_ports/port-<h>:<n>/port_id
 Date:		June 27, 2007
 KernelVersion:	2.6.24
@@ -12,8 +30,29 @@
 Description:	16-byte local SRP port identifier in hexadecimal format. An
 		example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00.
 
+What:		/sys/class/srp_remote_ports/port-<h>:<n>/reconnect_delay
+Date:		February 1, 2014
+KernelVersion:	3.13
+Contact:	linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:	Number of seconds the SCSI layer will wait after a reconnect
+		attempt failed before retrying. Setting this attribute to
+		"off" will disable time-based reconnecting.
+
 What:		/sys/class/srp_remote_ports/port-<h>:<n>/roles
 Date:		June 27, 2007
 KernelVersion:	2.6.24
 Contact:	linux-scsi@vger.kernel.org
 Description:	Role of the remote port. Either "SRP Initiator" or "SRP Target".
+
+What:		/sys/class/srp_remote_ports/port-<h>:<n>/state
+Date:		February 1, 2014
+KernelVersion:	3.13
+Contact:	linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org
+Description:	State of the transport layer used for communication with the
+		remote port. "running" if the transport layer is operational;
+		"blocked" if a transport layer error has been encountered but
+		the fast_io_fail_tmo timer has not yet fired; "fail-fast"
+		after the fast_io_fail_tmo timer has fired and before the
+		"dev_loss_tmo" timer has fired; "lost" after the
+		"dev_loss_tmo" timer has fired and before the port is finally
+		removed.
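
Together these attributes form the transport error-handling state machine
described under "state": fast_io_fail_tmo bounds how long I/O stays blocked
after a transport error, dev_loss_tmo bounds how long the port itself
survives, and reconnect_delay paces reconnect attempts in between. A sketch of
tuning them from user space (the port name "port-2:1" and the timeout values
are illustrative):

	#include <stdio.h>

	static int srp_set_attr(const char *attr, const char *val)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/class/srp_remote_ports/port-2:1/%s", attr);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		srp_set_attr("fast_io_fail_tmo", "15"); /* fail I/O after 15 s */
		srp_set_attr("dev_loss_tmo", "600");    /* remove port after 10 min */
		srp_set_attr("reconnect_delay", "10");  /* retry reconnect every 10 s */
		return 0;
	}
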
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index b84791f..5ceda71 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -31,17 +31,6 @@
 	  libibverbs, libibcm and a hardware driver library from
 	  <http://www.openfabrics.org/git/>.
 
-config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-       bool "Experimental and unstable ABI for userspace access to flow steering verbs"
-       depends on INFINIBAND_USER_ACCESS
-       depends on STAGING
-	---help---
-	  The final ABI for userspace access to flow steering verbs
-	  has not been defined.  To use the current ABI, *WHICH WILL
-	  CHANGE IN THE FUTURE*, say Y here.
-
-	  If unsure, say N.
-
 config INFINIBAND_USER_MEM
 	bool
 	depends on INFINIBAND_USER_ACCESS != n
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 784b97c..f2ef7ef 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -383,14 +383,11 @@
 {
 	unsigned long flags;
 	int id;
-	static int next_id;
 
 	idr_preload(GFP_KERNEL);
 	spin_lock_irqsave(&cm.lock, flags);
 
-	id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT);
-	if (id >= 0)
-		next_id = max(id + 1, 0);
+	id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
 
 	spin_unlock_irqrestore(&cm.lock, flags);
 	idr_preload_end();
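
idr_alloc_cyclic() subsumes exactly the next_id bookkeeping deleted here: the
last allocated ID is remembered inside the idr and the next search starts just
above it, wrapping back to the start bound on overflow. The usual calling
pattern, with my_table/my_lock/my_obj as illustrative names:

	static DEFINE_IDR(my_table);
	static DEFINE_SPINLOCK(my_lock);

	static int my_alloc_id(void *my_obj)
	{
		int id;

		idr_preload(GFP_KERNEL);
		spin_lock(&my_lock);
		/* start = 0, end = 0: the full non-negative ID range,
		 * handed out cyclically rather than lowest-free-first. */
		id = idr_alloc_cyclic(&my_table, my_obj, 0, 0, GFP_NOWAIT);
		spin_unlock(&my_lock);
		idr_preload_end();
		return id;	/* >= 0 on success, negative errno on failure */
	}
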
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index dab4b41..830c983 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -328,28 +328,6 @@
 	return ret;
 }
 
-static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num)
-{
-	int i;
-	int err;
-	struct ib_port_attr props;
-	union ib_gid tmp;
-
-	err = ib_query_port(device, port_num, &props);
-	if (err)
-		return err;
-
-	for (i = 0; i < props.gid_tbl_len; ++i) {
-		err = ib_query_gid(device, port_num, i, &tmp);
-		if (err)
-			return err;
-		if (!memcmp(&tmp, gid, sizeof tmp))
-			return 0;
-	}
-
-	return -EADDRNOTAVAIL;
-}
-
 static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
 {
 	dev_addr->dev_type = ARPHRD_INFINIBAND;
@@ -371,13 +349,14 @@
 	return ret;
 }
 
-static int cma_acquire_dev(struct rdma_id_private *id_priv)
+static int cma_acquire_dev(struct rdma_id_private *id_priv,
+			   struct rdma_id_private *listen_id_priv)
 {
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
 	struct cma_device *cma_dev;
 	union ib_gid gid, iboe_gid;
 	int ret = -ENODEV;
-	u8 port;
+	u8 port, found_port;
 	enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
 		IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
 
@@ -389,17 +368,39 @@
 	iboe_addr_get_sgid(dev_addr, &iboe_gid);
 	memcpy(&gid, dev_addr->src_dev_addr +
 	       rdma_addr_gid_offset(dev_addr), sizeof gid);
+	if (listen_id_priv &&
+	    rdma_port_get_link_layer(listen_id_priv->id.device,
+				     listen_id_priv->id.port_num) == dev_ll) {
+		cma_dev = listen_id_priv->cma_dev;
+		port = listen_id_priv->id.port_num;
+		if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+		    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+			ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
+						 &found_port, NULL);
+		else
+			ret = ib_find_cached_gid(cma_dev->device, &gid,
+						 &found_port, NULL);
+
+		if (!ret && (port == found_port)) {
+			id_priv->id.port_num = found_port;
+			goto out;
+		}
+	}
 	list_for_each_entry(cma_dev, &dev_list, list) {
 		for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+			if (listen_id_priv &&
+			    listen_id_priv->cma_dev == cma_dev &&
+			    listen_id_priv->id.port_num == port)
+				continue;
 			if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
 				if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
 				    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
-					ret = find_gid_port(cma_dev->device, &iboe_gid, port);
+					ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
 				else
-					ret = find_gid_port(cma_dev->device, &gid, port);
+					ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
 
-				if (!ret) {
-					id_priv->id.port_num = port;
+				if (!ret && (port == found_port)) {
+					id_priv->id.port_num = found_port;
 					goto out;
 				}
 			}
@@ -1292,7 +1293,7 @@
 	}
 
 	mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
-	ret = cma_acquire_dev(conn_id);
+	ret = cma_acquire_dev(conn_id, listen_id);
 	if (ret)
 		goto err2;
 
@@ -1451,7 +1452,6 @@
 {
 	struct rdma_cm_id *new_cm_id;
 	struct rdma_id_private *listen_id, *conn_id;
-	struct net_device *dev = NULL;
 	struct rdma_cm_event event;
 	int ret;
 	struct ib_device_attr attr;
@@ -1481,7 +1481,7 @@
 		goto out;
 	}
 
-	ret = cma_acquire_dev(conn_id);
+	ret = cma_acquire_dev(conn_id, listen_id);
 	if (ret) {
 		mutex_unlock(&conn_id->handler_mutex);
 		rdma_destroy_id(new_cm_id);
@@ -1529,8 +1529,6 @@
 	cma_deref_id(conn_id);
 
 out:
-	if (dev)
-		dev_put(dev);
 	mutex_unlock(&listen_id->handler_mutex);
 	return ret;
 }
@@ -2050,7 +2048,7 @@
 		goto out;
 
 	if (!status && !id_priv->cma_dev)
-		status = cma_acquire_dev(id_priv);
+		status = cma_acquire_dev(id_priv, NULL);
 
 	if (status) {
 		if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
@@ -2547,7 +2545,7 @@
 		if (ret)
 			goto err1;
 
-		ret = cma_acquire_dev(id_priv);
+		ret = cma_acquire_dev(id_priv, NULL);
 		if (ret)
 			goto err1;
 	}
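
ib_find_cached_gid() searches the cached GID tables of every port of a device
and reports which port holds the GID, which is why the callers above must
still compare the returned found_port against the port they intended to use. A
small sketch of the call (gid_on_port is an illustrative helper, not kernel
API):

	static bool gid_on_port(struct ib_device *device, union ib_gid *gid,
				u8 wanted_port)
	{
		u8 found_port;
		int ret;

		/* Returns 0 and sets found_port if some port of "device" has
		 * "gid" in its GID cache, a negative errno otherwise. */
		ret = ib_find_cached_gid(device, gid, &found_port, NULL);
		return !ret && found_port == wanted_port;
	}
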
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index da06abd..a1e9cba 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -148,7 +148,7 @@
 	list_for_each_entry(client, &client_list, list) {
 		if (client->index == index) {
 			if (op < 0 || op >= client->nops ||
-			    !client->cb_table[RDMA_NL_GET_OP(op)].dump)
+			    !client->cb_table[op].dump)
 				return -EINVAL;
 
 			{
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index cde1e7b..faad2ca 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -612,6 +612,7 @@
 	switch (dev->node_type) {
 	case RDMA_NODE_IB_CA:	  return sprintf(buf, "%d: CA\n", dev->node_type);
 	case RDMA_NODE_RNIC:	  return sprintf(buf, "%d: RNIC\n", dev->node_type);
+	case RDMA_NODE_USNIC:	  return sprintf(buf, "%d: usNIC\n", dev->node_type);
 	case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
 	case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
 	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index b0f189b..ab8b1c3 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -57,7 +57,7 @@
 static unsigned int max_backlog = 1024;
 
 static struct ctl_table_header *ucma_ctl_table_hdr;
-static ctl_table ucma_ctl_table[] = {
+static struct ctl_table ucma_ctl_table[] = {
 	{
 		.procname	= "max_backlog",
 		.data		= &max_backlog,
@@ -271,7 +271,7 @@
 			goto out;
 		}
 		ctx->backlog--;
-	} else if (!ctx->uid) {
+	} else if (!ctx->uid || ctx->cm_id != cm_id) {
 		/*
 		 * We ignore events for new connections until userspace has set
 		 * their context.  This can only happen if an error occurs on a
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index d8f9c6c..bdc842e 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -47,6 +47,14 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_user_verbs.h>
 
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
+	do {								\
+		(udata)->inbuf  = (void __user *) (ibuf);		\
+		(udata)->outbuf = (void __user *) (obuf);		\
+		(udata)->inlen  = (ilen);				\
+		(udata)->outlen = (olen);				\
+	} while (0)
+
 /*
  * Our lifetime rules for these structs are the following:
  *
@@ -178,6 +186,22 @@
 			     struct ib_event *event);
 void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
 
+struct ib_uverbs_flow_spec {
+	union {
+		union {
+			struct ib_uverbs_flow_spec_hdr hdr;
+			struct {
+				__u32 type;
+				__u16 size;
+				__u16 reserved;
+			};
+		};
+		struct ib_uverbs_flow_spec_eth     eth;
+		struct ib_uverbs_flow_spec_ipv4    ipv4;
+		struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+	};
+};
+
 #define IB_UVERBS_DECLARE_CMD(name)					\
 	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
 				 const char __user *buf, int in_len,	\
@@ -217,9 +241,13 @@
 IB_UVERBS_DECLARE_CMD(create_xsrq);
 IB_UVERBS_DECLARE_CMD(open_xrcd);
 IB_UVERBS_DECLARE_CMD(close_xrcd);
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-IB_UVERBS_DECLARE_CMD(create_flow);
-IB_UVERBS_DECLARE_CMD(destroy_flow);
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+
+#define IB_UVERBS_DECLARE_EX_CMD(name)				\
+	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\
+				struct ib_udata *ucore,		\
+				struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 
 #endif /* UVERBS_H */
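
Moving INIT_UDATA() into uverbs.h lets uverbs_main.c construct the two udata
regions (core and provider) for extended commands. The macro simply fills in
the four fields of a struct ib_udata; a sketch of its effect, with user_in_buf
and friends as illustrative names:

	struct ib_udata udata;

	INIT_UDATA(&udata, user_in_buf, user_out_buf, in_len, out_len);
	/* Now:
	 *   udata.inbuf  == (void __user *) user_in_buf
	 *   udata.outbuf == (void __user *) user_out_buf
	 *   udata.inlen  == in_len
	 *   udata.outlen == out_len
	 */
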
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 2f0f01b..65f6e7d 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -54,17 +54,7 @@
 static struct uverbs_lock_class ah_lock_class	= { .name = "AH-uobj" };
 static struct uverbs_lock_class srq_lock_class	= { .name = "SRQ-uobj" };
 static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
 static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
-
-#define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
-	do {								\
-		(udata)->inbuf  = (void __user *) (ibuf);		\
-		(udata)->outbuf = (void __user *) (obuf);		\
-		(udata)->inlen  = (ilen);				\
-		(udata)->outlen = (olen);				\
-	} while (0)
 
 /*
  * The ib_uobject locking scheme is as follows:
@@ -939,13 +929,9 @@
 	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
 		return -EINVAL;
 
-	/*
-	 * Local write permission is required if remote write or
-	 * remote atomic permission is also requested.
-	 */
-	if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
-	    !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE))
-		return -EINVAL;
+	ret = ib_check_mr_access(cmd.access_flags);
+	if (ret)
+		return ret;
 
 	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
 	if (!uobj)
@@ -2128,6 +2114,9 @@
 			}
 			next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
 			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+			if (next->opcode == IB_WR_SEND_WITH_IMM)
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
 		} else {
 			switch (next->opcode) {
 			case IB_WR_RDMA_WRITE_WITH_IMM:
@@ -2601,8 +2590,7 @@
 	return ret ? ret : in_len;
 }
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec,
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
 				union ib_flow_spec *ib_spec)
 {
 	ib_spec->type = kern_spec->type;
@@ -2642,28 +2630,31 @@
 	return 0;
 }
 
-ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
-			      const char __user *buf, int in_len,
-			      int out_len)
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+			     struct ib_udata *ucore,
+			     struct ib_udata *uhw)
 {
 	struct ib_uverbs_create_flow	  cmd;
 	struct ib_uverbs_create_flow_resp resp;
 	struct ib_uobject		  *uobj;
 	struct ib_flow			  *flow_id;
-	struct ib_kern_flow_attr	  *kern_flow_attr;
+	struct ib_uverbs_flow_attr	  *kern_flow_attr;
 	struct ib_flow_attr		  *flow_attr;
 	struct ib_qp			  *qp;
 	int err = 0;
 	void *kern_spec;
 	void *ib_spec;
 	int i;
-	int kern_attr_size;
 
-	if (out_len < sizeof(resp))
+	if (ucore->outlen < sizeof(resp))
 		return -ENOSPC;
 
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
+	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (err)
+		return err;
+
+	ucore->inbuf += sizeof(cmd);
+	ucore->inlen -= sizeof(cmd);
 
 	if (cmd.comp_mask)
 		return -EINVAL;
@@ -2672,32 +2663,27 @@
 	     !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
 		return -EPERM;
 
-	if (cmd.flow_attr.num_of_specs < 0 ||
-	    cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+	if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
 		return -EINVAL;
 
-	kern_attr_size = cmd.flow_attr.size - sizeof(cmd) -
-			 sizeof(struct ib_uverbs_cmd_hdr_ex);
-
-	if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len ||
-	    kern_attr_size < 0 || kern_attr_size >
-	    (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec)))
+	if (cmd.flow_attr.size > ucore->inlen ||
+	    cmd.flow_attr.size >
+	    (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
 		return -EINVAL;
 
 	if (cmd.flow_attr.num_of_specs) {
-		kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+		kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+					 GFP_KERNEL);
 		if (!kern_flow_attr)
 			return -ENOMEM;
 
 		memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
-		if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd),
-				   kern_attr_size)) {
-			err = -EFAULT;
+		err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+					 cmd.flow_attr.size);
+		if (err)
 			goto err_free_attr;
-		}
 	} else {
 		kern_flow_attr = &cmd.flow_attr;
-		kern_attr_size = sizeof(cmd.flow_attr);
 	}
 
 	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
@@ -2714,7 +2700,7 @@
 		goto err_uobj;
 	}
 
-	flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+	flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
 	if (!flow_attr) {
 		err = -ENOMEM;
 		goto err_put;
@@ -2729,19 +2715,22 @@
 
 	kern_spec = kern_flow_attr + 1;
 	ib_spec = flow_attr + 1;
-	for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) {
+	for (i = 0; i < flow_attr->num_of_specs &&
+	     cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+	     cmd.flow_attr.size >=
+	     ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
 		err = kern_spec_to_ib_spec(kern_spec, ib_spec);
 		if (err)
 			goto err_free;
 		flow_attr->size +=
 			((union ib_flow_spec *) ib_spec)->size;
-		kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size;
-		kern_spec += ((struct ib_kern_spec *) kern_spec)->size;
+		cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+		kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
 		ib_spec += ((union ib_flow_spec *) ib_spec)->size;
 	}
-	if (kern_attr_size) {
-		pr_warn("create flow failed, %d bytes left from uverb cmd\n",
-			kern_attr_size);
+	if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+		pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+			i, cmd.flow_attr.size);
 		goto err_free;
 	}
 	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
@@ -2760,11 +2749,10 @@
 	memset(&resp, 0, sizeof(resp));
 	resp.flow_handle = uobj->id;
 
-	if (copy_to_user((void __user *)(unsigned long) cmd.response,
-			 &resp, sizeof(resp))) {
-		err = -EFAULT;
+	err = ib_copy_to_udata(ucore,
+			       &resp, sizeof(resp));
+	if (err)
 		goto err_copy;
-	}
 
 	put_qp_read(qp);
 	mutex_lock(&file->mutex);
@@ -2777,7 +2765,7 @@
 	kfree(flow_attr);
 	if (cmd.flow_attr.num_of_specs)
 		kfree(kern_flow_attr);
-	return in_len;
+	return 0;
 err_copy:
 	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
 destroy_flow:
@@ -2794,16 +2782,18 @@
 	return err;
 }
 
-ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file,
-			       const char __user *buf, int in_len,
-			       int out_len) {
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+			      struct ib_udata *ucore,
+			      struct ib_udata *uhw)
+{
 	struct ib_uverbs_destroy_flow	cmd;
 	struct ib_flow			*flow_id;
 	struct ib_uobject		*uobj;
 	int				ret;
 
-	if (copy_from_user(&cmd, buf, sizeof(cmd)))
-		return -EFAULT;
+	ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (ret)
+		return ret;
 
 	uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
 			      file->ucontext);
@@ -2825,9 +2815,8 @@
 
 	put_uobj(uobj);
 
-	return ret ? ret : in_len;
+	return ret;
 }
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
 
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 				struct ib_uverbs_create_xsrq *cmd,
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 2df31f6..3438694 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -115,10 +115,13 @@
 	[IB_USER_VERBS_CMD_CLOSE_XRCD]		= ib_uverbs_close_xrcd,
 	[IB_USER_VERBS_CMD_CREATE_XSRQ]		= ib_uverbs_create_xsrq,
 	[IB_USER_VERBS_CMD_OPEN_QP]		= ib_uverbs_open_qp,
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-	[IB_USER_VERBS_CMD_CREATE_FLOW]		= ib_uverbs_create_flow,
-	[IB_USER_VERBS_CMD_DESTROY_FLOW]	= ib_uverbs_destroy_flow
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+				    struct ib_udata *ucore,
+				    struct ib_udata *uhw) = {
+	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
+	[IB_USER_VERBS_EX_CMD_DESTROY_FLOW]	= ib_uverbs_ex_destroy_flow
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -589,6 +592,7 @@
 {
 	struct ib_uverbs_file *file = filp->private_data;
 	struct ib_uverbs_cmd_hdr hdr;
+	__u32 flags;
 
 	if (count < sizeof hdr)
 		return -EINVAL;
@@ -596,45 +600,105 @@
 	if (copy_from_user(&hdr, buf, sizeof hdr))
 		return -EFAULT;
 
-	if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
-	    !uverbs_cmd_table[hdr.command])
-		return -EINVAL;
+	flags = (hdr.command &
+		 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
 
-	if (!file->ucontext &&
-	    hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
-		return -EINVAL;
+	if (!flags) {
+		__u32 command;
 
-	if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
-		return -ENOSYS;
-
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-	if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) {
-		struct ib_uverbs_cmd_hdr_ex hdr_ex;
-
-		if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex)))
-			return -EFAULT;
-
-		if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count)
+		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+					   IB_USER_VERBS_CMD_COMMAND_MASK))
 			return -EINVAL;
 
-		return uverbs_cmd_table[hdr.command](file,
-						     buf + sizeof(hdr_ex),
-						     (hdr_ex.in_words +
-						      hdr_ex.provider_in_words) * 4,
-						     (hdr_ex.out_words +
-						      hdr_ex.provider_out_words) * 4);
-	} else {
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+		if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+		    !uverbs_cmd_table[command])
+			return -EINVAL;
+
+		if (!file->ucontext &&
+		    command != IB_USER_VERBS_CMD_GET_CONTEXT)
+			return -EINVAL;
+
+		if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+			return -ENOSYS;
+
 		if (hdr.in_words * 4 != count)
 			return -EINVAL;
 
-		return uverbs_cmd_table[hdr.command](file,
-						     buf + sizeof(hdr),
-						     hdr.in_words * 4,
-						     hdr.out_words * 4);
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
+		return uverbs_cmd_table[command](file,
+						 buf + sizeof(hdr),
+						 hdr.in_words * 4,
+						 hdr.out_words * 4);
+
+	} else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+		__u32 command;
+
+		struct ib_uverbs_ex_cmd_hdr ex_hdr;
+		struct ib_udata ucore;
+		struct ib_udata uhw;
+		int err;
+		size_t written_count = count;
+
+		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+					   IB_USER_VERBS_CMD_COMMAND_MASK))
+			return -EINVAL;
+
+		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+		if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+		    !uverbs_ex_cmd_table[command])
+			return -ENOSYS;
+
+		if (!file->ucontext)
+			return -EINVAL;
+
+		if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+			return -ENOSYS;
+
+		if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+			return -EINVAL;
+
+		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+			return -EFAULT;
+
+		count -= sizeof(hdr) + sizeof(ex_hdr);
+		buf += sizeof(hdr) + sizeof(ex_hdr);
+
+		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+			return -EINVAL;
+
+		if (ex_hdr.response) {
+			if (!hdr.out_words && !ex_hdr.provider_out_words)
+				return -EINVAL;
+		} else {
+			if (hdr.out_words || ex_hdr.provider_out_words)
+				return -EINVAL;
+		}
+
+		INIT_UDATA(&ucore,
+			   (hdr.in_words) ? buf : 0,
+			   (unsigned long)ex_hdr.response,
+			   hdr.in_words * 8,
+			   hdr.out_words * 8);
+
+		INIT_UDATA(&uhw,
+			   (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0,
+			   (ex_hdr.provider_out_words) ? (unsigned long)ex_hdr.response + ucore.outlen : 0,
+			   ex_hdr.provider_in_words * 8,
+			   ex_hdr.provider_out_words * 8);
+
+		err = uverbs_ex_cmd_table[command](file,
+						   &ucore,
+						   &uhw);
+
+		if (err)
+			return err;
+
+		return written_count;
 	}
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+
+	return -ENOSYS;
 }
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
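
The extended-command payload is laid out as a struct ib_uverbs_cmd_hdr, then a
struct ib_uverbs_ex_cmd_hdr, then hdr.in_words + ex_hdr.provider_in_words of
request body counted in 8-byte units (legacy commands count 4-byte words and
include the header in in_words). A user-space sketch of sizing an extended
command, assuming the 3.13-era uapi constants and a "resp" response buffer
(specs_len is an illustrative payload length):

	struct ib_uverbs_cmd_hdr hdr;
	struct ib_uverbs_ex_cmd_hdr ex_hdr;
	size_t body = sizeof(struct ib_uverbs_create_flow) + specs_len;

	hdr.command = IB_USER_VERBS_EX_CMD_CREATE_FLOW |
		      (IB_USER_VERBS_CMD_FLAG_EXTENDED <<
		       IB_USER_VERBS_CMD_FLAGS_SHIFT);
	hdr.in_words  = body / 8;	/* 8-byte units, headers excluded */
	hdr.out_words = sizeof(struct ib_uverbs_create_flow_resp) / 8;
	ex_hdr.response = (__u64) (uintptr_t) &resp;
	ex_hdr.provider_in_words  = 0;
	ex_hdr.provider_out_words = 0;
	/* The count passed to write() must then equal
	 * sizeof(hdr) + sizeof(ex_hdr) +
	 * (hdr.in_words + ex_hdr.provider_in_words) * 8,
	 * or the kernel returns -EINVAL. */
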
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index a321df2..d4f6ddf 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -114,6 +114,8 @@
 		return RDMA_TRANSPORT_IB;
 	case RDMA_NODE_RNIC:
 		return RDMA_TRANSPORT_IWARP;
+	case RDMA_NODE_USNIC:
+		return RDMA_TRANSPORT_USNIC;
 	default:
 		BUG();
 		return 0;
@@ -130,6 +132,7 @@
 	case RDMA_TRANSPORT_IB:
 		return IB_LINK_LAYER_INFINIBAND;
 	case RDMA_TRANSPORT_IWARP:
+	case RDMA_TRANSPORT_USNIC:
 		return IB_LINK_LAYER_ETHERNET;
 	default:
 		return IB_LINK_LAYER_UNSPECIFIED;
@@ -958,6 +961,11 @@
 struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 {
 	struct ib_mr *mr;
+	int err;
+
+	err = ib_check_mr_access(mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
 
 	mr = pd->device->get_dma_mr(pd, mr_access_flags);
 
@@ -980,6 +988,11 @@
 			     u64 *iova_start)
 {
 	struct ib_mr *mr;
+	int err;
+
+	err = ib_check_mr_access(mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
 
 	if (!pd->device->reg_phys_mr)
 		return ERR_PTR(-ENOSYS);
@@ -1010,6 +1023,10 @@
 	struct ib_pd *old_pd;
 	int ret;
 
+	ret = ib_check_mr_access(mr_access_flags);
+	if (ret)
+		return ret;
+
 	if (!mr->device->rereg_phys_mr)
 		return -ENOSYS;
 
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 33d2cc6..4a03385 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -602,10 +602,10 @@
 	     rdev->lldi.vr->qp.size,
 	     rdev->lldi.vr->cq.start,
 	     rdev->lldi.vr->cq.size);
-	PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu "
+	PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu "
 	     "qpmask 0x%x cqshift %lu cqmask 0x%x\n",
 	     (unsigned)pci_resource_len(rdev->lldi.pdev, 2),
-	     (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2),
+	     (u64)pci_resource_start(rdev->lldi.pdev, 2),
 	     rdev->lldi.db_reg,
 	     rdev->lldi.gts_reg,
 	     rdev->qpshift, rdev->qpmask,
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index f5cb13b..cc04b7b 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -280,9 +280,7 @@
 	int j;
 	int ret;
 
-	ret = get_user_pages(current, current->mm, addr,
-			     npages, 0, 1, pages, NULL);
-
+	ret = get_user_pages_fast(addr, npages, 0, pages);
 	if (ret != npages) {
 		int i;
 
@@ -811,10 +809,7 @@
 	while (dim) {
 		const int mxp = 8;
 
-		down_write(&current->mm->mmap_sem);
 		ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
-		up_write(&current->mm->mmap_sem);
-
 		if (ret <= 0)
 			goto done_unlock;
 		else {
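
get_user_pages_fast() performs the same pinning as get_user_pages() but walks
the page tables locklessly and takes mmap_sem internally only if it must fall
back to the slow path; that is why the down_write(&current->mm->mmap_sem)
bracketing can be deleted along with the conversion. The calling convention,
as an illustrative helper:

	static int pin_user_buf(unsigned long addr, int npages,
				struct page **pages)
	{
		int got;

		/* Third argument 0 = read-only pinning; no mmap_sem held. */
		got = get_user_pages_fast(addr, npages, 0, pages);
		if (got == npages)
			return 0;

		/* Partial pin: release whatever we did get. */
		while (got > 0)
			put_page(pages[--got]);
		return -EFAULT;
	}
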
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index d5e60f4..66dbf80 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -324,7 +324,7 @@
 	u32 i;
 
 	i = cq->mcq.cons_index;
-	while (get_sw_cqe(cq, i & cq->ibcq.cqe))
+	while (get_sw_cqe(cq, i))
 		++i;
 
 	return i - cq->mcq.cons_index;
@@ -365,7 +365,7 @@
 
 	mutex_lock(&cq->resize_mutex);
 
-	if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+	if (entries < 1) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -376,6 +376,11 @@
 		goto out;
 	}
 
+	if (entries > dev->dev->caps.max_cqes) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	if (ibcq->uobject) {
 		err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
 		if (err)
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index f061264..1aad9b3 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1691,11 +1691,9 @@
 		ibdev->ib_dev.create_flow	= mlx4_ib_create_flow;
 		ibdev->ib_dev.destroy_flow	= mlx4_ib_destroy_flow;
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-		ibdev->ib_dev.uverbs_cmd_mask	|=
-			(1ull << IB_USER_VERBS_CMD_CREATE_FLOW) |
-			(1ull << IB_USER_VERBS_CMD_DESTROY_FLOW);
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+		ibdev->ib_dev.uverbs_ex_cmd_mask	|=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
 	}
 
 	mlx4_ib_alloc_eqs(dev, ibdev);
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 344ab03..b726274 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -556,7 +556,7 @@
 		goto err_db;
 	}
 	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0);
-	(*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+	(*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
 	*index = to_mucontext(context)->uuari.uars[0].index;
 
@@ -620,7 +620,7 @@
 	}
 	mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas);
 
-	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT;
+	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	*index = dev->mdev.priv.uuari.uars[0].index;
 
 	return 0;
@@ -653,8 +653,11 @@
 	int eqn;
 	int err;
 
+	if (entries < 0)
+		return ERR_PTR(-EINVAL);
+
 	entries = roundup_pow_of_two(entries + 1);
-	if (entries < 1 || entries > dev->mdev.caps.max_cqes)
+	if (entries > dev->mdev.caps.max_cqes)
 		return ERR_PTR(-EINVAL);
 
 	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
@@ -747,17 +750,9 @@
 	return 0;
 }
 
-static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq,
-			u32 rsn)
+static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
 {
-	u32 lrsn;
-
-	if (srq)
-		lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff;
-	else
-		lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff;
-
-	return rsn == lrsn;
+	return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);
 }
 
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
@@ -787,8 +782,8 @@
 	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
 		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
 		cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
-		if (is_equal_rsn(cqe64, srq, rsn)) {
-			if (srq)
+		if (is_equal_rsn(cqe64, rsn)) {
+			if (srq && (ntohl(cqe64->srqn) & 0xffffff))
 				mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
 			++nfreed;
 		} else if (nfreed) {
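
The log_pg_sz fields here and in the qp.c and srq.c hunks further down are
defined relative to the device's 4 KB page unit (MLX5_ADAPTER_PAGE_SHIFT ==
12), not the host's PAGE_SHIFT, so the old code was only correct on hosts with
4 KB pages. A worked example for a 64 KB-page host, where page_shift == 16:

	/* Old, wrong on this host: 16 - PAGE_SHIFT(16) == 0, which tells
	 * the device the buffer is made of 4 KB pages. */
	ctx.log_pg_sz = page_shift - PAGE_SHIFT;

	/* New: 16 - 12 == 4, i.e. 4 KB << 4 == 64 KB pages, as intended. */
	ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
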
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index b1a6cb3..30653410 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -745,7 +745,8 @@
 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
 	seg->start_addr = 0;
 
-	err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in));
+	err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in),
+				    NULL, NULL, NULL);
 	if (err) {
 		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
 		goto err_in;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 836be91..4c134d9 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -262,6 +262,9 @@
 	int			npages;
 	struct completion	done;
 	enum ib_wc_status	status;
+	struct mlx5_ib_dev     *dev;
+	struct mlx5_create_mkey_mbox_out out;
+	unsigned long		start;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -323,6 +326,7 @@
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
+	int			pending;
 };
 
 struct mlx5_mr_cache {
@@ -358,6 +362,8 @@
 	spinlock_t			mr_lock;
 	struct mlx5_ib_resources	devr;
 	struct mlx5_mr_cache		cache;
+	struct timer_list		delay_timer;
+	int				fill_delay;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3453580..039c3e4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -35,11 +35,12 @@
 #include <linux/random.h>
 #include <linux/debugfs.h>
 #include <linux/export.h>
+#include <linux/delay.h>
 #include <rdma/ib_umem.h>
 #include "mlx5_ib.h"
 
 enum {
-	DEF_CACHE_SIZE	= 10,
+	MAX_PENDING_REG_MR = 8,
 };
 
 enum {
@@ -63,6 +64,51 @@
 		return order - cache->ent[0].order;
 }
 
+static void reg_mr_callback(int status, void *context)
+{
+	struct mlx5_ib_mr *mr = context;
+	struct mlx5_ib_dev *dev = mr->dev;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	int c = order2idx(dev, mr->order);
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	u8 key;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ent->lock, flags);
+	ent->pending--;
+	spin_unlock_irqrestore(&ent->lock, flags);
+	if (status) {
+		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
+		kfree(mr);
+		dev->fill_delay = 1;
+		mod_timer(&dev->delay_timer, jiffies + HZ);
+		return;
+	}
+
+	if (mr->out.hdr.status) {
+		mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
+			     mr->out.hdr.status,
+			     be32_to_cpu(mr->out.hdr.syndrome));
+		kfree(mr);
+		dev->fill_delay = 1;
+		mod_timer(&dev->delay_timer, jiffies + HZ);
+		return;
+	}
+
+	spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags);
+	key = dev->mdev.priv.mkey_key++;
+	spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags);
+	mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
+
+	cache->last_add = jiffies;
+
+	spin_lock_irqsave(&ent->lock, flags);
+	list_add_tail(&mr->list, &ent->head);
+	ent->cur++;
+	ent->size++;
+	spin_unlock_irqrestore(&ent->lock, flags);
+}
+
 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
@@ -78,36 +124,39 @@
 		return -ENOMEM;
 
 	for (i = 0; i < num; i++) {
+		if (ent->pending >= MAX_PENDING_REG_MR) {
+			err = -EAGAIN;
+			break;
+		}
+
 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 		if (!mr) {
 			err = -ENOMEM;
-			goto out;
+			break;
 		}
 		mr->order = ent->order;
 		mr->umred = 1;
+		mr->dev = dev;
 		in->seg.status = 1 << 6;
 		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
 		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
 		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
 		in->seg.log2_page_size = 12;
 
+		spin_lock_irq(&ent->lock);
+		ent->pending++;
+		spin_unlock_irq(&ent->lock);
+		mr->start = jiffies;
 		err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in,
-					    sizeof(*in));
+					    sizeof(*in), reg_mr_callback,
+					    mr, &mr->out);
 		if (err) {
 			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
 			kfree(mr);
-			goto out;
+			break;
 		}
-		cache->last_add = jiffies;
-
-		spin_lock(&ent->lock);
-		list_add_tail(&mr->list, &ent->head);
-		ent->cur++;
-		ent->size++;
-		spin_unlock(&ent->lock);
 	}
 
-out:
 	kfree(in);
 	return err;
 }
@@ -121,16 +170,16 @@
 	int i;
 
 	for (i = 0; i < num; i++) {
-		spin_lock(&ent->lock);
+		spin_lock_irq(&ent->lock);
 		if (list_empty(&ent->head)) {
-			spin_unlock(&ent->lock);
+			spin_unlock_irq(&ent->lock);
 			return;
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_del(&mr->list);
 		ent->cur--;
 		ent->size--;
-		spin_unlock(&ent->lock);
+		spin_unlock_irq(&ent->lock);
 		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
 		if (err)
 			mlx5_ib_warn(dev, "failed destroy mkey\n");
@@ -162,9 +211,13 @@
 		return -EINVAL;
 
 	if (var > ent->size) {
-		err = add_keys(dev, c, var - ent->size);
-		if (err)
-			return err;
+		do {
+			err = add_keys(dev, c, var - ent->size);
+			if (err && err != -EAGAIN)
+				return err;
+
+			usleep_range(3000, 5000);
+		} while (err);
 	} else if (var < ent->size) {
 		remove_keys(dev, c, ent->size - var);
 	}
@@ -280,23 +333,37 @@
 	struct mlx5_ib_dev *dev = ent->dev;
 	struct mlx5_mr_cache *cache = &dev->cache;
 	int i = order2idx(dev, ent->order);
+	int err;
 
 	if (cache->stopped)
 		return;
 
 	ent = &dev->cache.ent[i];
-	if (ent->cur < 2 * ent->limit) {
-		add_keys(dev, i, 1);
-		if (ent->cur < 2 * ent->limit)
-			queue_work(cache->wq, &ent->work);
+	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+		err = add_keys(dev, i, 1);
+		if (ent->cur < 2 * ent->limit) {
+			if (err == -EAGAIN) {
+				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
+					    i + 2);
+				queue_delayed_work(cache->wq, &ent->dwork,
+						   msecs_to_jiffies(3));
+			} else if (err) {
+				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
+					     i + 2, err);
+				queue_delayed_work(cache->wq, &ent->dwork,
+						   msecs_to_jiffies(1000));
+			} else {
+				queue_work(cache->wq, &ent->work);
+			}
+		}
 	} else if (ent->cur > 2 * ent->limit) {
 		if (!someone_adding(cache) &&
-		    time_after(jiffies, cache->last_add + 60 * HZ)) {
+		    time_after(jiffies, cache->last_add + 300 * HZ)) {
 			remove_keys(dev, i, 1);
 			if (ent->cur > ent->limit)
 				queue_work(cache->wq, &ent->work);
 		} else {
-			queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ);
+			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 		}
 	}
 }
@@ -336,18 +403,18 @@
 
 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
 
-		spin_lock(&ent->lock);
+		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 					      list);
 			list_del(&mr->list);
 			ent->cur--;
-			spin_unlock(&ent->lock);
+			spin_unlock_irq(&ent->lock);
 			if (ent->cur < ent->limit)
 				queue_work(cache->wq, &ent->work);
 			break;
 		}
-		spin_unlock(&ent->lock);
+		spin_unlock_irq(&ent->lock);
 
 		queue_work(cache->wq, &ent->work);
 
@@ -374,12 +441,12 @@
 		return;
 	}
 	ent = &cache->ent[c];
-	spin_lock(&ent->lock);
+	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
 	ent->cur++;
 	if (ent->cur > 2 * ent->limit)
 		shrink = 1;
-	spin_unlock(&ent->lock);
+	spin_unlock_irq(&ent->lock);
 
 	if (shrink)
 		queue_work(cache->wq, &ent->work);
@@ -394,16 +461,16 @@
 
 	cancel_delayed_work(&ent->dwork);
 	while (1) {
-		spin_lock(&ent->lock);
+		spin_lock_irq(&ent->lock);
 		if (list_empty(&ent->head)) {
-			spin_unlock(&ent->lock);
+			spin_unlock_irq(&ent->lock);
 			return;
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_del(&mr->list);
 		ent->cur--;
 		ent->size--;
-		spin_unlock(&ent->lock);
+		spin_unlock_irq(&ent->lock);
 		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
 		if (err)
 			mlx5_ib_warn(dev, "failed destroy mkey\n");
@@ -464,12 +531,18 @@
 	debugfs_remove_recursive(dev->cache.root);
 }
 
+static void delay_time_func(unsigned long ctx)
+{
+	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+
+	dev->fill_delay = 0;
+}
+
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int limit;
-	int size;
 	int err;
 	int i;
 
@@ -479,6 +552,7 @@
 		return -ENOMEM;
 	}
 
+	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		INIT_LIST_HEAD(&cache->ent[i].head);
 		spin_lock_init(&cache->ent[i].lock);
@@ -489,13 +563,11 @@
 		ent->order = i + 2;
 		ent->dev = dev;
 
-		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) {
-			size = dev->mdev.profile->mr_cache[i].size;
+		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE)
 			limit = dev->mdev.profile->mr_cache[i].limit;
-		} else {
-			size = DEF_CACHE_SIZE;
+		else
 			limit = 0;
-		}
+
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 		ent->limit = limit;
@@ -522,6 +594,7 @@
 		clean_keys(dev, i);
 
 	destroy_workqueue(dev->cache.wq);
+	del_timer_sync(&dev->delay_timer);
 
 	return 0;
 }
@@ -551,7 +624,8 @@
 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
 	seg->start_addr = 0;
 
-	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in));
+	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
+				    NULL);
 	if (err)
 		goto err_in;
 
@@ -660,14 +734,14 @@
 	int err;
 	int i;
 
-	for (i = 0; i < 10; i++) {
+	for (i = 0; i < 1; i++) {
 		mr = alloc_cached_mr(dev, order);
 		if (mr)
 			break;
 
 		err = add_keys(dev, order2idx(dev, order), 1);
-		if (err) {
-			mlx5_ib_warn(dev, "add_keys failed\n");
+		if (err && err != -EAGAIN) {
+			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
 			break;
 		}
 	}
@@ -759,8 +833,10 @@
 	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
 	in->seg.log2_page_size = page_shift;
 	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
-	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen);
+	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
+							 1 << page_shift));
+	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL,
+				    NULL, NULL);
 	if (err) {
 		mlx5_ib_warn(dev, "create mkey failed\n");
 		goto err_2;
@@ -944,7 +1020,8 @@
 	 * TBD not needed - issue 197292 */
 	in->seg.log2_page_size = PAGE_SHIFT;
 
-	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in));
+	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
+				    NULL, NULL);
 	kfree(in);
 	if (err)
 		goto err_free;
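
mlx5_core_create_mkey() grows three trailing parameters in this series: a
completion callback, a context pointer for it, and an out-mailbox that
receives the asynchronous reply. Passing NULL for all three, as every call
site outside the MR cache does, keeps the old synchronous behaviour. The two
calling styles side by side:

	/* Synchronous: the function returns only when the firmware
	 * command has completed. */
	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in),
				    NULL, NULL, NULL);

	/* Asynchronous (MR cache fill path): reg_mr_callback(status, mr)
	 * runs when the command completes, with the reply in mr->out;
	 * ent->pending throttles the number of commands in flight. */
	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in),
				    reg_mr_callback, mr, &mr->out);
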
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 5659ea8..7c6b4ba 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -551,7 +551,7 @@
 	}
 	mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
 	(*in)->ctx.log_pg_sz_remote_qpn =
-		cpu_to_be32((page_shift - PAGE_SHIFT) << 24);
+		cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
 	(*in)->ctx.params2 = cpu_to_be32(offset << 6);
 
 	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
@@ -648,7 +648,8 @@
 		goto err_buf;
 	}
 	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
-	(*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24);
+	(*in)->ctx.log_pg_sz_remote_qpn =
+		cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
 	/* Set "fast registration enabled" for all kernel QPs */
 	(*in)->ctx.params1 |= cpu_to_be32(1 << 11);
 	(*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
@@ -1317,9 +1318,11 @@
 					  MLX5_QP_OPTPAR_RAE		|
 					  MLX5_QP_OPTPAR_RWE		|
 					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
-					  MLX5_QP_OPTPAR_PM_STATE,
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
-					  MLX5_QP_OPTPAR_PM_STATE,
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
 					  MLX5_QP_OPTPAR_SRQN		|
 					  MLX5_QP_OPTPAR_CQN_RCV,
@@ -1550,7 +1553,7 @@
 	mlx5_cur = to_mlx5_state(cur_state);
 	mlx5_new = to_mlx5_state(new_state);
 	mlx5_st = to_mlx5_st(ibqp->qp_type);
-	if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0)
+	if (mlx5_st < 0)
 		goto out;
 
 	optpar = ib_mask_to_mlx5_opt(attr_mask);
@@ -1744,6 +1747,7 @@
 			MLX5_MKEY_MASK_PD		|
 			MLX5_MKEY_MASK_LR		|
 			MLX5_MKEY_MASK_LW		|
+			MLX5_MKEY_MASK_KEY		|
 			MLX5_MKEY_MASK_RR		|
 			MLX5_MKEY_MASK_RW		|
 			MLX5_MKEY_MASK_A		|
@@ -1800,7 +1804,8 @@
 	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
 	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
 	seg->log2_page_size = wr->wr.fast_reg.page_shift;
-	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+				       mlx5_mkey_variant(wr->wr.fast_reg.rkey));
 }
 
 static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
@@ -1913,6 +1918,10 @@
 	if (unlikely((*seg == qp->sq.qend)))
 		*seg = mlx5_get_send_wqe(qp, 0);
 	if (!li) {
+		if (unlikely(wr->wr.fast_reg.page_list_len >
+			     wr->wr.fast_reg.page_list->max_page_list_len))
+			return -ENOMEM;
+
 		set_frwr_pages(*seg, wr, mdev, pd, writ);
 		*seg += sizeof(struct mlx5_wqe_data_seg);
 		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index 0aa478b..210b3ea 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -123,7 +123,7 @@
 		goto err_in;
 	}
 
-	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	(*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
 
 	return 0;
@@ -192,7 +192,7 @@
 	}
 	srq->wq_sig = !!srq_signature;
 
-	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+	(*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 
 	return 0;
 
@@ -390,9 +390,7 @@
 		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
 		ib_umem_release(msrq->umem);
 	} else {
-		kfree(msrq->wrid);
-		mlx5_buf_free(&dev->mdev, &msrq->buf);
-		mlx5_db_free(&dev->mdev, &msrq->db);
+		destroy_srq_kernel(dev, msrq);
 	}
 
 	kfree(srq);
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 5b53ca5..8308e36 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2834,7 +2834,7 @@
 	init_attr->qp_context = nesqp->ibqp.qp_context;
 	init_attr->send_cq = nesqp->ibqp.send_cq;
 	init_attr->recv_cq = nesqp->ibqp.recv_cq;
-	init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq;
+	init_attr->srq = nesqp->ibqp.srq;
 	init_attr->cap = attr->cap;
 
 	return 0;
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index 016e7429..5bfc02f 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -6190,21 +6190,20 @@
 {
 	struct qib_devdata *dd;
 	unsigned long val;
-	int ret;
-
+	char *n;
 	if (strlen(str) >= MAX_ATTEN_LEN) {
 		pr_info("txselect_values string too long\n");
 		return -ENOSPC;
 	}
-	ret = kstrtoul(str, 0, &val);
-	if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+	val = simple_strtoul(str, &n, 0);
+	if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
 				TXDDS_MFG_SZ)) {
 		pr_info("txselect_values must start with a number < %d\n",
 			TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ);
-		return ret ? ret : -EINVAL;
+		return -EINVAL;
 	}
-
 	strcpy(txselect_list, str);
+
 	list_for_each_entry(dd, &qib_dev_list, list)
 		if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
 			set_no_qsfp_atten(dd, 1);
diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
index 28874f86..941d4d5 100644
--- a/drivers/infiniband/hw/qib/qib_mad.h
+++ b/drivers/infiniband/hw/qib/qib_mad.h
@@ -54,7 +54,7 @@
 	__be32 revision;
 	u8 local_port_num;
 	u8 vendor_id[3];
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_mad_notice_attr {
 	u8 generic_type;
@@ -73,7 +73,7 @@
 			__be16	reserved;
 			__be16	lid;		/* where violation happened */
 			u8	port_num;	/* where violation happened */
-		} __attribute__ ((packed)) ntc_129_131;
+		} __packed ntc_129_131;
 
 		struct {
 			__be16	reserved;
@@ -83,14 +83,14 @@
 			__be32	new_cap_mask;	/* new capability mask */
 			u8	reserved3;
 			u8	change_flags;	/* low 3 bits only */
-		} __attribute__ ((packed)) ntc_144;
+		} __packed ntc_144;
 
 		struct {
 			__be16	reserved;
 			__be16	lid;		/* lid where sys guid changed */
 			__be16	reserved2;
 			__be64	new_sys_guid;
-		} __attribute__ ((packed)) ntc_145;
+		} __packed ntc_145;
 
 		struct {
 			__be16	reserved;
@@ -104,7 +104,7 @@
 			u8	reserved3;
 			u8	dr_trunc_hop;
 			u8	dr_rtn_path[30];
-		} __attribute__ ((packed)) ntc_256;
+		} __packed ntc_256;
 
 		struct {
 			__be16		reserved;
@@ -115,7 +115,7 @@
 			__be32		qp2;	/* high 8 bits reserved */
 			union ib_gid	gid1;
 			union ib_gid	gid2;
-		} __attribute__ ((packed)) ntc_257_258;
+		} __packed ntc_257_258;
 
 	} details;
 };
@@ -209,7 +209,7 @@
 	__be64 port_rcv_packets;
 	__be64 port_xmit_wait;
 	__be64 port_adr_events;
-} __attribute__ ((packed));
+} __packed;
 
 #define IB_PMA_CONG_HW_CONTROL_TIMER            0x00
 #define IB_PMA_CONG_HW_CONTROL_SAMPLE           0x01
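
__packed is the kernel's standard spelling of the GCC packed attribute,
defined in the compiler headers (compiler-gcc.h at the time) roughly as below,
so the conversions in this file and in qib_verbs.h are purely cosmetic:

	#define __packed	__attribute__((packed))

	/* The two forms are therefore equivalent: */
	struct example {
		u8	type;
		__be32	value;	/* no padding inserted before this field */
	} __packed;
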
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index d0a0ea0..165aee2 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -594,8 +594,7 @@
 		else
 			j = npages;
 
-		ret = get_user_pages(current, current->mm, addr,
-			     j, 0, 1, pages, NULL);
+		ret = get_user_pages_fast(addr, j, 0, pages);
 		if (ret != j) {
 			i = 0;
 			j = ret;
@@ -1294,11 +1293,8 @@
 		int mxp = 8;
 		int ndesc = 0;
 
-		down_write(&current->mm->mmap_sem);
 		ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
 				iov, dim, &list, &mxp, &ndesc);
-		up_write(&current->mm->mmap_sem);
-
 		if (ret < 0)
 			goto done_unlock;
 		else {
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 012e2c7..a01c7d2 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -150,14 +150,14 @@
 	__be64 vaddr;
 	__be32 rkey;
 	__be32 length;
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_atomic_eth {
 	__be32 vaddr[2];        /* unaligned so access as 2 32-bit words */
 	__be32 rkey;
 	__be64 swap_data;
 	__be64 compare_data;
-} __attribute__ ((packed));
+} __packed;
 
 struct qib_other_headers {
 	__be32 bth[3];
@@ -178,7 +178,7 @@
 		__be32 aeth;
 		struct ib_atomic_eth atomic_eth;
 	} u;
-} __attribute__ ((packed));
+} __packed;
 
 /*
  * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
@@ -195,12 +195,12 @@
 		} l;
 		struct qib_other_headers oth;
 	} u;
-} __attribute__ ((packed));
+} __packed;
 
 struct qib_pio_header {
 	__le32 pbc[2];
 	struct qib_ib_header hdr;
-} __attribute__ ((packed));
+} __packed;
 
 /*
  * There is one struct qib_mcast for each multicast GID.
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index eb71aaa..c639f90 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -101,6 +101,7 @@
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
 	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+	IPOIB_MCAST_JOIN_STARTED  = 4,
 
 	MAX_SEND_CQE		  = 16,
 	IPOIB_CM_COPYBREAK	  = 256,
@@ -151,6 +152,7 @@
 	struct sk_buff_head pkt_queue;
 
 	struct net_device *dev;
+	struct completion done;
 };
 
 struct ipoib_rx_buf {
@@ -299,7 +301,7 @@
 
 	unsigned long flags;
 
-	struct mutex vlan_mutex;
+	struct rw_semaphore vlan_rwsem;
 
 	struct rb_root  path_tree;
 	struct list_head path_list;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 7a31754..1377f85 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -140,7 +140,8 @@
 static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
 					     struct ipoib_cm_rx_buf *rx_ring,
 					     int id, int frags,
-					     u64 mapping[IPOIB_CM_RX_SG])
+					     u64 mapping[IPOIB_CM_RX_SG],
+					     gfp_t gfp)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
@@ -164,7 +165,7 @@
 	}
 
 	for (i = 0; i < frags; i++) {
-		struct page *page = alloc_page(GFP_ATOMIC);
+		struct page *page = alloc_page(gfp);
 
 		if (!page)
 			goto partial_error;
@@ -382,7 +383,8 @@
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
 		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
-					   rx->rx_ring[i].mapping)) {
+					   rx->rx_ring[i].mapping,
+					   GFP_KERNEL)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 				ret = -ENOMEM;
 				goto err_count;
@@ -639,7 +641,8 @@
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
-	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
+	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
+				       mapping, GFP_ATOMIC);
 	if (unlikely(!newskb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
@@ -1556,7 +1559,8 @@
 		for (i = 0; i < ipoib_recvq_size; ++i) {
 			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
 						   priv->cm.num_frags - 1,
-						   priv->cm.srq_ring[i].mapping)) {
+						   priv->cm.srq_ring[i].mapping,
+						   GFP_KERNEL)) {
 				ipoib_warn(priv, "failed to allocate "
 					   "receive buffer %d\n", i);
 				ipoib_cm_dev_cleanup(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 196b1d1..6a7003d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -685,15 +685,13 @@
 	ret = ipoib_ib_post_receives(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-		ipoib_ib_dev_stop(dev, 1);
-		return -1;
+		goto dev_stop;
 	}
 
 	ret = ipoib_cm_dev_open(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
-		ipoib_ib_dev_stop(dev, 1);
-		return -1;
+		goto dev_stop;
 	}
 
 	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
@@ -704,6 +702,11 @@
 		napi_enable(&priv->napi);
 
 	return 0;
+dev_stop:
+	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+		napi_enable(&priv->napi);
+	ipoib_ib_dev_stop(dev, 1);
+	return -1;
 }
 
 static void ipoib_pkey_dev_check_presence(struct net_device *dev)
@@ -746,10 +749,8 @@
 	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
 		mutex_lock(&pkey_mutex);
 		set_bit(IPOIB_PKEY_STOP, &priv->flags);
-		cancel_delayed_work(&priv->pkey_poll_task);
+		cancel_delayed_work_sync(&priv->pkey_poll_task);
 		mutex_unlock(&pkey_mutex);
-		if (flush)
-			flush_workqueue(ipoib_workqueue);
 	}
 
 	ipoib_mcast_stop_thread(dev, flush);
@@ -974,7 +975,7 @@
 	u16 new_index;
 	int result;
 
-	mutex_lock(&priv->vlan_mutex);
+	down_read(&priv->vlan_rwsem);
 
 	/*
 	 * Flush any child interfaces too -- they might be up even if
@@ -983,7 +984,7 @@
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
 		__ipoib_ib_dev_flush(cpriv, level);
 
-	mutex_unlock(&priv->vlan_mutex);
+	up_read(&priv->vlan_rwsem);
 
 	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
 		/* for non-child devices must check/update the pkey value here */
@@ -1081,6 +1082,11 @@
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	ipoib_dbg(priv, "cleaning up ib_dev\n");
+	/*
+	 * We must make sure there are no more (path) completions
+	 * that may wish to touch priv fields that are no longer valid
+	 */
+	ipoib_flush_paths(dev);
 
 	ipoib_mcast_stop_thread(dev, 1);
 	ipoib_mcast_dev_flush(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 82cec1a..d64ed05 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -119,7 +119,7 @@
 		struct ipoib_dev_priv *cpriv;
 
 		/* Bring up any child interfaces too */
-		mutex_lock(&priv->vlan_mutex);
+		down_read(&priv->vlan_rwsem);
 		list_for_each_entry(cpriv, &priv->child_intfs, list) {
 			int flags;
 
@@ -129,7 +129,7 @@
 
 			dev_change_flags(cpriv->dev, flags | IFF_UP);
 		}
-		mutex_unlock(&priv->vlan_mutex);
+		up_read(&priv->vlan_rwsem);
 	}
 
 	netif_start_queue(dev);
@@ -162,7 +162,7 @@
 		struct ipoib_dev_priv *cpriv;
 
 		/* Bring down any child interfaces too */
-		mutex_lock(&priv->vlan_mutex);
+		down_read(&priv->vlan_rwsem);
 		list_for_each_entry(cpriv, &priv->child_intfs, list) {
 			int flags;
 
@@ -172,7 +172,7 @@
 
 			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
 		}
-		mutex_unlock(&priv->vlan_mutex);
+		up_read(&priv->vlan_rwsem);
 	}
 
 	return 0;
@@ -1350,7 +1350,7 @@
 
 	ipoib_set_ethtool_ops(dev);
 
-	netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
+	netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
 
 	dev->watchdog_timeo	 = HZ;
 
@@ -1372,7 +1372,7 @@
 
 	spin_lock_init(&priv->lock);
 
-	mutex_init(&priv->vlan_mutex);
+	init_rwsem(&priv->vlan_rwsem);
 
 	INIT_LIST_HEAD(&priv->path_list);
 	INIT_LIST_HEAD(&priv->child_intfs);
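
With vlan_mutex replaced by vlan_rwsem, the paths that merely walk
priv->child_intfs (bringing children up or down, flushing) take the
semaphore for reading and may run concurrently; only child add/delete
needs it exclusively. A sketch of the split, assuming the ipoib_dev_priv
layout used above:

	static void walk_children(struct ipoib_dev_priv *priv)
	{
		struct ipoib_dev_priv *cpriv;

		down_read(&priv->vlan_rwsem);	/* shared: list is only read */
		list_for_each_entry(cpriv, &priv->child_intfs, list)
			dev_change_flags(cpriv->dev, cpriv->dev->flags | IFF_UP);
		up_read(&priv->vlan_rwsem);
	}

	static void add_child(struct ipoib_dev_priv *priv,
			      struct ipoib_dev_priv *child)
	{
		down_write(&priv->vlan_rwsem);	/* exclusive: list is modified */
		list_add_tail(&child->list, &priv->child_intfs);
		up_write(&priv->vlan_rwsem);
	}
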
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index cecb98a..d4e0057 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -386,8 +386,10 @@
 			mcast->mcmember.mgid.raw, status);
 
 	/* We trap for port events ourselves. */
-	if (status == -ENETRESET)
-		return 0;
+	if (status == -ENETRESET) {
+		status = 0;
+		goto out;
+	}
 
 	if (!status)
 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
@@ -407,7 +409,8 @@
 		if (mcast == priv->broadcast)
 			queue_work(ipoib_workqueue, &priv->carrier_on_task);
 
-		return 0;
+		status = 0;
+		goto out;
 	}
 
 	if (mcast->logcount++ < 20) {
@@ -434,7 +437,8 @@
 				   mcast->backoff * HZ);
 	spin_unlock_irq(&priv->lock);
 	mutex_unlock(&mcast_mutex);
-
+out:
+	complete(&mcast->done);
 	return status;
 }
 
@@ -484,11 +488,15 @@
 	}
 
 	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	init_completion(&mcast->done);
+	set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
+
 	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
 					 &rec, comp_mask, GFP_KERNEL,
 					 ipoib_mcast_join_complete, mcast);
 	if (IS_ERR(mcast->mc)) {
 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+		complete(&mcast->done);
 		ret = PTR_ERR(mcast->mc);
 		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
 
@@ -510,10 +518,18 @@
 	struct ipoib_dev_priv *priv =
 		container_of(work, struct ipoib_dev_priv, mcast_task.work);
 	struct net_device *dev = priv->dev;
+	struct ib_port_attr port_attr;
 
 	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
 		return;
 
+	if (ib_query_port(priv->ca, priv->port, &port_attr) ||
+	    port_attr.state != IB_PORT_ACTIVE) {
+		ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n",
+			  port_attr.state);
+		return;
+	}
+
 	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
 		ipoib_warn(priv, "ib_query_gid() failed\n");
 	else
@@ -751,6 +767,11 @@
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
+	/* Wait for all pending join handlers to complete before leaving */
+	list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+		if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
+			wait_for_completion(&mcast->done);
+
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
 		ipoib_mcast_leave(dev, mcast);
 		ipoib_mcast_free(mcast);
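
The IPOIB_MCAST_JOIN_STARTED flag and the mcast->done completion added
above close a race between a still-running SA join callback and the flush
path freeing the mcast object. Reduced to a sketch (simplified,
hypothetical types; bit 0 stands in for IPOIB_MCAST_JOIN_STARTED):

	struct mc {
		unsigned long		flags;
		struct completion	done;
	};

	static int join_complete(int status, struct mc *m)	/* SA callback */
	{
		/* ... handle the join result ... */
		complete(&m->done);	/* every exit path signals done */
		return status;
	}

	static void start_join(struct mc *m)
	{
		init_completion(&m->done);	/* arm before the callback can run */
		set_bit(0, &m->flags);
		/* if the join call itself fails, the caller completes
		 * m->done directly, as the patch does above */
	}

	static void flush(struct mc *m)
	{
		if (test_bit(0, &m->flags))		/* join ever started? */
			wait_for_completion(&m->done);	/* callback finished */
		/* only now is it safe to leave the group and free m */
	}
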
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
index f81abe1..c29b5c8 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -142,10 +142,10 @@
 	priv = netdev_priv(dev);
 	ppriv = netdev_priv(priv->parent);
 
-	mutex_lock(&ppriv->vlan_mutex);
+	down_write(&ppriv->vlan_rwsem);
 	unregister_netdevice_queue(dev, head);
 	list_del(&priv->list);
-	mutex_unlock(&ppriv->vlan_mutex);
+	up_write(&ppriv->vlan_rwsem);
 }
 
 static size_t ipoib_get_size(const struct net_device *dev)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 8292554..9fad7b5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -140,7 +140,7 @@
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	mutex_lock(&ppriv->vlan_mutex);
+	down_write(&ppriv->vlan_rwsem);
 
 	/*
 	 * First ensure this isn't a duplicate. We check the parent device and
@@ -163,7 +163,7 @@
 	result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
 
 out:
-	mutex_unlock(&ppriv->vlan_mutex);
+	up_write(&ppriv->vlan_rwsem);
 
 	if (result)
 		free_netdev(priv->dev);
@@ -185,7 +185,8 @@
 
 	if (!rtnl_trylock())
 		return restart_syscall();
-	mutex_lock(&ppriv->vlan_mutex);
+
+	down_write(&ppriv->vlan_rwsem);
 	list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
 		if (priv->pkey == pkey &&
 		    priv->child_type == IPOIB_LEGACY_CHILD) {
@@ -195,7 +196,8 @@
 			break;
 		}
 	}
-	mutex_unlock(&ppriv->vlan_mutex);
+	up_write(&ppriv->vlan_rwsem);
+
 	rtnl_unlock();
 
 	if (dev) {
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index f93baf8..a886319 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -46,6 +46,7 @@
 #include <scsi/scsi.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_dbg.h>
+#include <scsi/scsi_tcq.h>
 #include <scsi/srp.h>
 #include <scsi/scsi_transport_srp.h>
 
@@ -86,6 +87,32 @@
 MODULE_PARM_DESC(topspin_workarounds,
 		 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
 
+static struct kernel_param_ops srp_tmo_ops;
+
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
+static int srp_fast_io_fail_tmo = 15;
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fast_io_fail_tmo,
+		 "Number of seconds between the observation of a transport"
+		 " layer error and failing all I/O. \"off\" means that this"
+		 " functionality is disabled.");
+
+static int srp_dev_loss_tmo = 600;
+module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dev_loss_tmo,
+		 "Maximum number of seconds that the SRP transport should"
+		 " insulate transport layer errors. After this time has been"
+		 " exceeded the SCSI host is removed. Should be"
+		 " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		 " if fast_io_fail_tmo has not been set. \"off\" means that"
+		 " this functionality is disabled.");
+
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
 static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
@@ -102,6 +129,48 @@
 
 static struct ib_sa_client srp_sa_client;
 
+static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
+{
+	int tmo = *(int *)kp->arg;
+
+	if (tmo >= 0)
+		return sprintf(buffer, "%d", tmo);
+	else
+		return sprintf(buffer, "off");
+}
+
+static int srp_tmo_set(const char *val, const struct kernel_param *kp)
+{
+	int tmo, res;
+
+	if (strncmp(val, "off", 3) != 0) {
+		res = kstrtoint(val, 0, &tmo);
+		if (res)
+			goto out;
+	} else {
+		tmo = -1;
+	}
+	if (kp->arg == &srp_reconnect_delay)
+		res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+				    srp_dev_loss_tmo);
+	else if (kp->arg == &srp_fast_io_fail_tmo)
+		res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
+	else
+		res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+				    tmo);
+	if (res)
+		goto out;
+	*(int *)kp->arg = tmo;
+
+out:
+	return res;
+}
+
+static struct kernel_param_ops srp_tmo_ops = {
+	.get = srp_tmo_get,
+	.set = srp_tmo_set,
+};
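
module_param_cb() ties these ops to the three parameters declared above,
so writes through /sys/module/ib_srp/parameters/ are cross-validated by
srp_tmo_valid() before they take effect. A stand-alone sketch of the same
"integer or off" convention for a single hypothetical parameter (a real
implementation would also mirror srp_tmo_get() so that reads print "off"):

	static int example_tmo = 30;

	static int example_tmo_set(const char *val,
				   const struct kernel_param *kp)
	{
		int tmo, res;

		if (strncmp(val, "off", 3) == 0) {
			tmo = -1;		/* "off" is stored as -1 */
		} else {
			res = kstrtoint(val, 0, &tmo);
			if (res)
				return res;
		}
		if (tmo == 0)
			return -EINVAL;		/* reject nonsensical values */
		*(int *)kp->arg = tmo;
		return 0;
	}

	static struct kernel_param_ops example_tmo_ops = {
		.set = example_tmo_set,
		.get = param_get_int,
	};
	module_param_cb(example_tmo, &example_tmo_ops, &example_tmo,
			S_IRUGO | S_IWUSR);
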
+
 static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
 {
 	return (struct srp_target_port *) host->hostdata;
@@ -231,16 +300,16 @@
 		return -ENOMEM;
 
 	recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-			       srp_recv_completion, NULL, target, SRP_RQ_SIZE,
-			       target->comp_vector);
+			       srp_recv_completion, NULL, target,
+			       target->queue_size, target->comp_vector);
 	if (IS_ERR(recv_cq)) {
 		ret = PTR_ERR(recv_cq);
 		goto err;
 	}
 
 	send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-			       srp_send_completion, NULL, target, SRP_SQ_SIZE,
-			       target->comp_vector);
+			       srp_send_completion, NULL, target,
+			       target->queue_size, target->comp_vector);
 	if (IS_ERR(send_cq)) {
 		ret = PTR_ERR(send_cq);
 		goto err_recv_cq;
@@ -249,8 +318,8 @@
 	ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
 
 	init_attr->event_handler       = srp_qp_event;
-	init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
-	init_attr->cap.max_recv_wr     = SRP_RQ_SIZE;
+	init_attr->cap.max_send_wr     = target->queue_size;
+	init_attr->cap.max_recv_wr     = target->queue_size;
 	init_attr->cap.max_recv_sge    = 1;
 	init_attr->cap.max_send_sge    = 1;
 	init_attr->sq_sig_type         = IB_SIGNAL_ALL_WR;
@@ -296,6 +365,10 @@
 	return ret;
 }
 
+/*
+ * Note: this function may be called without srp_alloc_iu_bufs() having been
+ * invoked. Hence the target->[rt]x_ring checks.
+ */
 static void srp_free_target_ib(struct srp_target_port *target)
 {
 	int i;
@@ -307,10 +380,18 @@
 	target->qp = NULL;
 	target->send_cq = target->recv_cq = NULL;
 
-	for (i = 0; i < SRP_RQ_SIZE; ++i)
-		srp_free_iu(target->srp_host, target->rx_ring[i]);
-	for (i = 0; i < SRP_SQ_SIZE; ++i)
-		srp_free_iu(target->srp_host, target->tx_ring[i]);
+	if (target->rx_ring) {
+		for (i = 0; i < target->queue_size; ++i)
+			srp_free_iu(target->srp_host, target->rx_ring[i]);
+		kfree(target->rx_ring);
+		target->rx_ring = NULL;
+	}
+	if (target->tx_ring) {
+		for (i = 0; i < target->queue_size; ++i)
+			srp_free_iu(target->srp_host, target->tx_ring[i]);
+		kfree(target->tx_ring);
+		target->tx_ring = NULL;
+	}
 }
 
 static void srp_path_rec_completion(int status,
@@ -390,7 +471,7 @@
 	req->param.responder_resources	      = 4;
 	req->param.remote_cm_response_timeout = 20;
 	req->param.local_cm_response_timeout  = 20;
-	req->param.retry_count 		      = 7;
+	req->param.retry_count                = target->tl_retry_count;
 	req->param.rnr_retry_count 	      = 7;
 	req->param.max_cm_retries 	      = 15;
 
@@ -496,7 +577,11 @@
 	struct srp_request *req;
 	int i;
 
-	for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) {
+	if (!target->req_ring)
+		return;
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		req = &target->req_ring[i];
 		kfree(req->fmr_list);
 		kfree(req->map_page);
 		if (req->indirect_dma_addr) {
@@ -506,6 +591,50 @@
 		}
 		kfree(req->indirect_desc);
 	}
+
+	kfree(target->req_ring);
+	target->req_ring = NULL;
+}
+
+static int srp_alloc_req_data(struct srp_target_port *target)
+{
+	struct srp_device *srp_dev = target->srp_host->srp_dev;
+	struct ib_device *ibdev = srp_dev->dev;
+	struct srp_request *req;
+	dma_addr_t dma_addr;
+	int i, ret = -ENOMEM;
+
+	INIT_LIST_HEAD(&target->free_reqs);
+
+	target->req_ring = kzalloc(target->req_ring_size *
+				   sizeof(*target->req_ring), GFP_KERNEL);
+	if (!target->req_ring)
+		goto out;
+
+	for (i = 0; i < target->req_ring_size; ++i) {
+		req = &target->req_ring[i];
+		req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+					GFP_KERNEL);
+		req->map_page = kmalloc(SRP_FMR_SIZE * sizeof(void *),
+					GFP_KERNEL);
+		req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
+		if (!req->fmr_list || !req->map_page || !req->indirect_desc)
+			goto out;
+
+		dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
+					     target->indirect_size,
+					     DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(ibdev, dma_addr))
+			goto out;
+
+		req->indirect_dma_addr = dma_addr;
+		req->index = i;
+		list_add_tail(&req->list, &target->free_reqs);
+	}
+	ret = 0;
+
+out:
+	return ret;
 }
 
 /**
@@ -528,12 +657,20 @@
 	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
 
 	srp_del_scsi_host_attr(target->scsi_host);
+	srp_rport_get(target->rport);
 	srp_remove_host(target->scsi_host);
 	scsi_remove_host(target->scsi_host);
 	srp_disconnect_target(target);
 	ib_destroy_cm_id(target->cm_id);
 	srp_free_target_ib(target);
+	cancel_work_sync(&target->tl_err_work);
+	srp_rport_put(target->rport);
 	srp_free_req_data(target);
+
+	spin_lock(&target->srp_host->target_lock);
+	list_del(&target->list);
+	spin_unlock(&target->srp_host->target_lock);
+
 	scsi_host_put(target->scsi_host);
 }
 
@@ -545,10 +682,6 @@
 	WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
 
 	srp_remove_target(target);
-
-	spin_lock(&target->srp_host->target_lock);
-	list_del(&target->list);
-	spin_unlock(&target->srp_host->target_lock);
 }
 
 static void srp_rport_delete(struct srp_rport *rport)
@@ -686,23 +819,42 @@
 	spin_unlock_irqrestore(&target->lock, flags);
 }
 
-static void srp_reset_req(struct srp_target_port *target, struct srp_request *req)
+static void srp_finish_req(struct srp_target_port *target,
+			   struct srp_request *req, int result)
 {
 	struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
 
 	if (scmnd) {
 		srp_free_req(target, req, scmnd, 0);
-		scmnd->result = DID_RESET << 16;
+		scmnd->result = result;
 		scmnd->scsi_done(scmnd);
 	}
 }
 
-static int srp_reconnect_target(struct srp_target_port *target)
+static void srp_terminate_io(struct srp_rport *rport)
 {
-	struct Scsi_Host *shost = target->scsi_host;
-	int i, ret;
+	struct srp_target_port *target = rport->lld_data;
+	int i;
 
-	scsi_target_block(&shost->shost_gendev);
+	for (i = 0; i < target->req_ring_size; ++i) {
+		struct srp_request *req = &target->req_ring[i];
+		srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
+	}
+}
+
+/*
+ * It is up to the caller to ensure that srp_rport_reconnect() calls are
+ * serialized and that no concurrent srp_queuecommand(), srp_abort(),
+ * srp_reset_device() or srp_reset_host() calls will occur while this function
+ * is in progress. One way to ensure this is not to call this function
+ * directly but to call srp_reconnect_rport() instead, since the latter
+ * serializes calls of this function via rport->mutex and also blocks
+ * srp_queuecommand() calls before invoking this function.
+ */
+static int srp_rport_reconnect(struct srp_rport *rport)
+{
+	struct srp_target_port *target = rport->lld_data;
+	int i, ret;
 
 	srp_disconnect_target(target);
 	/*
@@ -721,41 +873,21 @@
 	else
 		srp_create_target_ib(target);
 
-	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+	for (i = 0; i < target->req_ring_size; ++i) {
 		struct srp_request *req = &target->req_ring[i];
-		if (req->scmnd)
-			srp_reset_req(target, req);
+		srp_finish_req(target, req, DID_RESET << 16);
 	}
 
 	INIT_LIST_HEAD(&target->free_tx);
-	for (i = 0; i < SRP_SQ_SIZE; ++i)
+	for (i = 0; i < target->queue_size; ++i)
 		list_add(&target->tx_ring[i]->list, &target->free_tx);
 
 	if (ret == 0)
 		ret = srp_connect_target(target);
 
-	scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING :
-			    SDEV_TRANSPORT_OFFLINE);
-	target->transport_offline = !!ret;
-
-	if (ret)
-		goto err;
-
-	shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n");
-
-	return ret;
-
-err:
-	shost_printk(KERN_ERR, target->scsi_host,
-		     PFX "reconnect failed (%d), removing target port.\n", ret);
-
-	/*
-	 * We couldn't reconnect, so kill our target port off.
-	 * However, we have to defer the real removal because we
-	 * are in the context of the SCSI error handler now, which
-	 * will deadlock if we call scsi_remove_host().
-	 */
-	srp_queue_remove_work(target);
+	if (ret == 0)
+		shost_printk(KERN_INFO, target->scsi_host,
+			     PFX "reconnect succeeded\n");
 
 	return ret;
 }
@@ -1302,15 +1434,30 @@
 			     PFX "Recv failed with error code %d\n", res);
 }
 
-static void srp_handle_qp_err(enum ib_wc_status wc_status,
-			      enum ib_wc_opcode wc_opcode,
+/**
+ * srp_tl_err_work() - handle a transport layer error
+ *
+ * Note: This function may get invoked before the rport has been created,
+ * hence the target->rport test.
+ */
+static void srp_tl_err_work(struct work_struct *work)
+{
+	struct srp_target_port *target;
+
+	target = container_of(work, struct srp_target_port, tl_err_work);
+	if (target->rport)
+		srp_start_tl_fail_timers(target->rport);
+}
+
+static void srp_handle_qp_err(enum ib_wc_status wc_status, bool send_err,
 			      struct srp_target_port *target)
 {
 	if (target->connected && !target->qp_in_error) {
 		shost_printk(KERN_ERR, target->scsi_host,
 			     PFX "failed %s status %d\n",
-			     wc_opcode & IB_WC_RECV ? "receive" : "send",
+			     send_err ? "send" : "receive",
 			     wc_status);
+		queue_work(system_long_wq, &target->tl_err_work);
 	}
 	target->qp_in_error = true;
 }
@@ -1325,7 +1472,7 @@
 		if (likely(wc.status == IB_WC_SUCCESS)) {
 			srp_handle_recv(target, &wc);
 		} else {
-			srp_handle_qp_err(wc.status, wc.opcode, target);
+			srp_handle_qp_err(wc.status, false, target);
 		}
 	}
 }
@@ -1341,7 +1488,7 @@
 			iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
 			list_add(&iu->list, &target->free_tx);
 		} else {
-			srp_handle_qp_err(wc.status, wc.opcode, target);
+			srp_handle_qp_err(wc.status, true, target);
 		}
 	}
 }
@@ -1349,17 +1496,29 @@
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(shost);
+	struct srp_rport *rport = target->rport;
 	struct srp_request *req;
 	struct srp_iu *iu;
 	struct srp_cmd *cmd;
 	struct ib_device *dev;
 	unsigned long flags;
-	int len;
+	int len, result;
+	const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
 
-	if (unlikely(target->transport_offline)) {
-		scmnd->result = DID_NO_CONNECT << 16;
+	/*
+	 * The SCSI EH thread is the only context from which srp_queuecommand()
+	 * can get invoked for blocked devices (SDEV_BLOCK /
+	 * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+	 * locking the rport mutex if invoked from inside the SCSI EH.
+	 */
+	if (in_scsi_eh)
+		mutex_lock(&rport->mutex);
+
+	result = srp_chkready(target->rport);
+	if (unlikely(result)) {
+		scmnd->result = result;
 		scmnd->scsi_done(scmnd);
-		return 0;
+		goto unlock_rport;
 	}
 
 	spin_lock_irqsave(&target->lock, flags);
@@ -1404,6 +1563,10 @@
 		goto err_unmap;
 	}
 
+unlock_rport:
+	if (in_scsi_eh)
+		mutex_unlock(&rport->mutex);
+
 	return 0;
 
 err_unmap:
@@ -1418,14 +1581,30 @@
 err_unlock:
 	spin_unlock_irqrestore(&target->lock, flags);
 
+	if (in_scsi_eh)
+		mutex_unlock(&rport->mutex);
+
 	return SCSI_MLQUEUE_HOST_BUSY;
 }
 
+/*
+ * Note: the resources allocated in this function are freed in
+ * srp_free_target_ib().
+ */
 static int srp_alloc_iu_bufs(struct srp_target_port *target)
 {
 	int i;
 
-	for (i = 0; i < SRP_RQ_SIZE; ++i) {
+	target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring),
+				  GFP_KERNEL);
+	if (!target->rx_ring)
+		goto err_no_ring;
+	target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring),
+				  GFP_KERNEL);
+	if (!target->tx_ring)
+		goto err_no_ring;
+
+	for (i = 0; i < target->queue_size; ++i) {
 		target->rx_ring[i] = srp_alloc_iu(target->srp_host,
 						  target->max_ti_iu_len,
 						  GFP_KERNEL, DMA_FROM_DEVICE);
@@ -1433,7 +1612,7 @@
 			goto err;
 	}
 
-	for (i = 0; i < SRP_SQ_SIZE; ++i) {
+	for (i = 0; i < target->queue_size; ++i) {
 		target->tx_ring[i] = srp_alloc_iu(target->srp_host,
 						  target->max_iu_len,
 						  GFP_KERNEL, DMA_TO_DEVICE);
@@ -1446,15 +1625,17 @@
 	return 0;
 
 err:
-	for (i = 0; i < SRP_RQ_SIZE; ++i) {
+	for (i = 0; i < target->queue_size; ++i) {
 		srp_free_iu(target->srp_host, target->rx_ring[i]);
-		target->rx_ring[i] = NULL;
+		srp_free_iu(target->srp_host, target->tx_ring[i]);
 	}
 
-	for (i = 0; i < SRP_SQ_SIZE; ++i) {
-		srp_free_iu(target->srp_host, target->tx_ring[i]);
-		target->tx_ring[i] = NULL;
-	}
+err_no_ring:
+	kfree(target->tx_ring);
+	target->tx_ring = NULL;
+	kfree(target->rx_ring);
+	target->rx_ring = NULL;
 
 	return -ENOMEM;
 }
@@ -1506,6 +1687,9 @@
 		target->scsi_host->can_queue
 			= min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE,
 			      target->scsi_host->can_queue);
+		target->scsi_host->cmd_per_lun
+			= min_t(int, target->scsi_host->can_queue,
+				target->scsi_host->cmd_per_lun);
 	} else {
 		shost_printk(KERN_WARNING, target->scsi_host,
 			     PFX "Unhandled RSP opcode %#x\n", lrsp->opcode);
@@ -1513,7 +1697,7 @@
 		goto error;
 	}
 
-	if (!target->rx_ring[0]) {
+	if (!target->rx_ring) {
 		ret = srp_alloc_iu_bufs(target);
 		if (ret)
 			goto error;
@@ -1533,7 +1717,7 @@
 	if (ret)
 		goto error_free;
 
-	for (i = 0; i < SRP_RQ_SIZE; i++) {
+	for (i = 0; i < target->queue_size; i++) {
 		struct srp_iu *iu = target->rx_ring[i];
 		ret = srp_post_recv(target, iu);
 		if (ret)
@@ -1672,6 +1856,7 @@
 		if (ib_send_cm_drep(cm_id, NULL, 0))
 			shost_printk(KERN_ERR, target->scsi_host,
 				     PFX "Sending CM DREP failed\n");
+		queue_work(system_long_wq, &target->tl_err_work);
 		break;
 
 	case IB_CM_TIMEWAIT_EXIT:
@@ -1698,9 +1883,61 @@
 	return 0;
 }
 
+/**
+ * srp_change_queue_type - change the device queue tag type
+ * @sdev: scsi device struct
+ * @tag_type: requested tag type
+ *
+ * Returns queue tag type.
+ */
+static int
+srp_change_queue_type(struct scsi_device *sdev, int tag_type)
+{
+	if (sdev->tagged_supported) {
+		scsi_set_tag_type(sdev, tag_type);
+		if (tag_type)
+			scsi_activate_tcq(sdev, sdev->queue_depth);
+		else
+			scsi_deactivate_tcq(sdev, sdev->queue_depth);
+	} else
+		tag_type = 0;
+
+	return tag_type;
+}
+
+/**
+ * srp_change_queue_depth - set the device queue depth
+ * @sdev: scsi device struct
+ * @qdepth: requested queue depth
+ * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP
+ * (see include/scsi/scsi_host.h for definition)
+ *
+ * Returns queue depth.
+ */
+static int
+srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason)
+{
+	struct Scsi_Host *shost = sdev->host;
+	int max_depth;
+
+	if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) {
+		max_depth = shost->can_queue;
+		if (!sdev->tagged_supported)
+			max_depth = 1;
+		if (qdepth > max_depth)
+			qdepth = max_depth;
+		scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth);
+	} else if (reason == SCSI_QDEPTH_QFULL)
+		scsi_track_queue_full(sdev, qdepth);
+	else
+		return -EOPNOTSUPP;
+
+	return sdev->queue_depth;
+}
+
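
For context (standard midlayer behaviour, hedged): writing a new value to
/sys/block/<disk>/device/queue_depth reaches srp_change_queue_depth() with
reason == SCSI_QDEPTH_DEFAULT and is clamped to shost->can_queue, while a
QUEUE FULL response from the target arrives as SCSI_QDEPTH_QFULL, where
scsi_track_queue_full() steps the depth down gradually rather than
applying a caller-chosen value. A hypothetical driver-internal use with
the same semantics as a sysfs write:

	static void set_depth(struct scsi_device *sdev, int depth)
	{
		srp_change_queue_depth(sdev, depth, SCSI_QDEPTH_DEFAULT);
	}
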
 static int srp_send_tsk_mgmt(struct srp_target_port *target,
 			     u64 req_tag, unsigned int lun, u8 func)
 {
+	struct srp_rport *rport = target->rport;
 	struct ib_device *dev = target->srp_host->srp_dev->dev;
 	struct srp_iu *iu;
 	struct srp_tsk_mgmt *tsk_mgmt;
@@ -1710,12 +1947,20 @@
 
 	init_completion(&target->tsk_mgmt_done);
 
+	/*
+	 * Lock the rport mutex to avoid that srp_create_target_ib() is
+	 * invoked while a task management function is being sent.
+	 */
+	mutex_lock(&rport->mutex);
 	spin_lock_irq(&target->lock);
 	iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT);
 	spin_unlock_irq(&target->lock);
 
-	if (!iu)
+	if (!iu) {
+		mutex_unlock(&rport->mutex);
+
 		return -1;
+	}
 
 	ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
 				   DMA_TO_DEVICE);
@@ -1732,8 +1977,11 @@
 				      DMA_TO_DEVICE);
 	if (srp_post_send(target, iu, sizeof *tsk_mgmt)) {
 		srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT);
+		mutex_unlock(&rport->mutex);
+
 		return -1;
 	}
+	mutex_unlock(&rport->mutex);
 
 	if (!wait_for_completion_timeout(&target->tsk_mgmt_done,
 					 msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
@@ -1751,11 +1999,11 @@
 	shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
 
 	if (!req || !srp_claim_req(target, req, scmnd))
-		return FAILED;
+		return SUCCESS;
 	if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
 			      SRP_TSK_ABORT_TASK) == 0)
 		ret = SUCCESS;
-	else if (target->transport_offline)
+	else if (target->rport->state == SRP_RPORT_LOST)
 		ret = FAST_IO_FAIL;
 	else
 		ret = FAILED;
@@ -1779,10 +2027,10 @@
 	if (target->tsk_mgmt_status)
 		return FAILED;
 
-	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+	for (i = 0; i < target->req_ring_size; ++i) {
 		struct srp_request *req = &target->req_ring[i];
 		if (req->scmnd && req->scmnd->device == scmnd->device)
-			srp_reset_req(target, req);
+			srp_finish_req(target, req, DID_RESET << 16);
 	}
 
 	return SUCCESS;
@@ -1791,14 +2039,10 @@
 static int srp_reset_host(struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
-	int ret = FAILED;
 
 	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
 
-	if (!srp_reconnect_target(target))
-		ret = SUCCESS;
-
-	return ret;
+	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
 static int srp_slave_configure(struct scsi_device *sdev)
@@ -1851,6 +2095,14 @@
 	return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey));
 }
 
+static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%pI6\n", target->path.sgid.raw);
+}
+
 static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -1907,6 +2159,14 @@
 	return sprintf(buf, "%d\n", target->comp_vector);
 }
 
+static ssize_t show_tl_retry_count(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->tl_retry_count);
+}
+
 static ssize_t show_cmd_sg_entries(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -1927,6 +2187,7 @@
 static DEVICE_ATTR(ioc_guid,	    S_IRUGO, show_ioc_guid,	   NULL);
 static DEVICE_ATTR(service_id,	    S_IRUGO, show_service_id,	   NULL);
 static DEVICE_ATTR(pkey,	    S_IRUGO, show_pkey,		   NULL);
+static DEVICE_ATTR(sgid,	    S_IRUGO, show_sgid,		   NULL);
 static DEVICE_ATTR(dgid,	    S_IRUGO, show_dgid,		   NULL);
 static DEVICE_ATTR(orig_dgid,	    S_IRUGO, show_orig_dgid,	   NULL);
 static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);
@@ -1934,6 +2195,7 @@
 static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);
 static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
 static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL);
+static DEVICE_ATTR(tl_retry_count,  S_IRUGO, show_tl_retry_count,  NULL);
 static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL);
 static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);
 
@@ -1942,6 +2204,7 @@
 	&dev_attr_ioc_guid,
 	&dev_attr_service_id,
 	&dev_attr_pkey,
+	&dev_attr_sgid,
 	&dev_attr_dgid,
 	&dev_attr_orig_dgid,
 	&dev_attr_req_lim,
@@ -1949,6 +2212,7 @@
 	&dev_attr_local_ib_port,
 	&dev_attr_local_ib_device,
 	&dev_attr_comp_vector,
+	&dev_attr_tl_retry_count,
 	&dev_attr_cmd_sg_entries,
 	&dev_attr_allow_ext_sg,
 	NULL
@@ -1961,14 +2225,16 @@
 	.slave_configure		= srp_slave_configure,
 	.info				= srp_target_info,
 	.queuecommand			= srp_queuecommand,
+	.change_queue_depth             = srp_change_queue_depth,
+	.change_queue_type              = srp_change_queue_type,
 	.eh_abort_handler		= srp_abort,
 	.eh_device_reset_handler	= srp_reset_device,
 	.eh_host_reset_handler		= srp_reset_host,
 	.skip_settle_delay		= true,
 	.sg_tablesize			= SRP_DEF_SG_TABLESIZE,
-	.can_queue			= SRP_CMD_SQ_SIZE,
+	.can_queue			= SRP_DEFAULT_CMD_SQ_SIZE,
 	.this_id			= -1,
-	.cmd_per_lun			= SRP_CMD_SQ_SIZE,
+	.cmd_per_lun			= SRP_DEFAULT_CMD_SQ_SIZE,
 	.use_clustering			= ENABLE_CLUSTERING,
 	.shost_attrs			= srp_host_attrs
 };
@@ -1994,6 +2260,7 @@
 	}
 
 	rport->lld_data = target;
+	target->rport = rport;
 
 	spin_lock(&host->target_lock);
 	list_add_tail(&target->list, &host->target_list);
@@ -2073,6 +2340,8 @@
 	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
 	SRP_OPT_SG_TABLESIZE	= 1 << 11,
 	SRP_OPT_COMP_VECTOR	= 1 << 12,
+	SRP_OPT_TL_RETRY_COUNT	= 1 << 13,
+	SRP_OPT_QUEUE_SIZE	= 1 << 14,
 	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|
 				   SRP_OPT_IOC_GUID	|
 				   SRP_OPT_DGID		|
@@ -2094,6 +2363,8 @@
 	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
 	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
 	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
+	{ SRP_OPT_TL_RETRY_COUNT,	"tl_retry_count=%u"	},
+	{ SRP_OPT_QUEUE_SIZE,		"queue_size=%d"		},
 	{ SRP_OPT_ERR,			NULL 			}
 };
 
@@ -2188,13 +2459,25 @@
 			target->scsi_host->max_sectors = token;
 			break;
 
+		case SRP_OPT_QUEUE_SIZE:
+			if (match_int(args, &token) || token < 1) {
+				pr_warn("bad queue_size parameter '%s'\n", p);
+				goto out;
+			}
+			target->scsi_host->can_queue = token;
+			target->queue_size = token + SRP_RSP_SQ_SIZE +
+					     SRP_TSK_MGMT_SQ_SIZE;
+			if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+				target->scsi_host->cmd_per_lun = token;
+			break;
+
 		case SRP_OPT_MAX_CMD_PER_LUN:
-			if (match_int(args, &token)) {
+			if (match_int(args, &token) || token < 1) {
 				pr_warn("bad max cmd_per_lun parameter '%s'\n",
 					p);
 				goto out;
 			}
-			target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE);
+			target->scsi_host->cmd_per_lun = token;
 			break;
 
 		case SRP_OPT_IO_CLASS:
@@ -2257,6 +2540,15 @@
 			target->comp_vector = token;
 			break;
 
+		case SRP_OPT_TL_RETRY_COUNT:
+			if (match_int(args, &token) || token < 2 || token > 7) {
+				pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n",
+					p);
+				goto out;
+			}
+			target->tl_retry_count = token;
+			break;
+
 		default:
 			pr_warn("unknown parameter or missing value '%s' in target creation request\n",
 				p);
@@ -2273,6 +2565,12 @@
 				pr_warn("target creation request is missing parameter '%s'\n",
 					srp_opt_tokens[i].pattern);
 
+	if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
+	    && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+		pr_warn("cmd_per_lun = %d > queue_size = %d\n",
+			target->scsi_host->cmd_per_lun,
+			target->scsi_host->can_queue);
+
 out:
 	kfree(options);
 	return ret;
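
To make the sizing concrete (constants from the ib_srp.h hunk at the end
of this patch, SRP_RSP_SQ_SIZE = 1 and SRP_TSK_MGMT_SQ_SIZE = 1): a login
string containing queue_size=100 yields

	can_queue          = 100
	target->queue_size = 100 + 1 + 1 = 102
	req_ring_size      = 102 - 1     = 101	(set in srp_create_target())

while without the option the defaults give target->queue_size =
SRP_DEFAULT_QUEUE_SIZE = 64 and can_queue = SRP_DEFAULT_CMD_SQ_SIZE =
64 - 1 - 1 = 62.
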
@@ -2287,8 +2585,7 @@
 	struct Scsi_Host *target_host;
 	struct srp_target_port *target;
 	struct ib_device *ibdev = host->srp_dev->dev;
-	dma_addr_t dma_addr;
-	int i, ret;
+	int ret;
 
 	target_host = scsi_host_alloc(&srp_template,
 				      sizeof (struct srp_target_port));
@@ -2311,11 +2608,15 @@
 	target->cmd_sg_cnt	= cmd_sg_entries;
 	target->sg_tablesize	= indirect_sg_entries ? : cmd_sg_entries;
 	target->allow_ext_sg	= allow_ext_sg;
+	target->tl_retry_count	= 7;
+	target->queue_size	= SRP_DEFAULT_QUEUE_SIZE;
 
 	ret = srp_parse_options(buf, target);
 	if (ret)
 		goto err;
 
+	target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
+
 	if (!srp_conn_unique(target->srp_host, target)) {
 		shost_printk(KERN_INFO, target->scsi_host,
 			     PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
@@ -2339,31 +2640,13 @@
 			     sizeof (struct srp_indirect_buf) +
 			     target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
 
+	INIT_WORK(&target->tl_err_work, srp_tl_err_work);
 	INIT_WORK(&target->remove_work, srp_remove_work);
 	spin_lock_init(&target->lock);
 	INIT_LIST_HEAD(&target->free_tx);
-	INIT_LIST_HEAD(&target->free_reqs);
-	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
-		struct srp_request *req = &target->req_ring[i];
-
-		req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *),
-					GFP_KERNEL);
-		req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *),
-					GFP_KERNEL);
-		req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
-		if (!req->fmr_list || !req->map_page || !req->indirect_desc)
-			goto err_free_mem;
-
-		dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
-					     target->indirect_size,
-					     DMA_TO_DEVICE);
-		if (ib_dma_mapping_error(ibdev, dma_addr))
-			goto err_free_mem;
-
-		req->indirect_dma_addr = dma_addr;
-		req->index = i;
-		list_add_tail(&req->list, &target->free_reqs);
-	}
+	ret = srp_alloc_req_data(target);
+	if (ret)
+		goto err_free_mem;
 
 	ib_query_gid(ibdev, host->port, 0, &target->path.sgid);
 
@@ -2612,7 +2895,14 @@
 }
 
 static struct srp_function_template ib_srp_transport_functions = {
+	.has_rport_state	 = true,
+	.reset_timer_if_blocked	 = true,
+	.reconnect_delay	 = &srp_reconnect_delay,
+	.fast_io_fail_tmo	 = &srp_fast_io_fail_tmo,
+	.dev_loss_tmo		 = &srp_dev_loss_tmo,
+	.reconnect		 = srp_rport_reconnect,
 	.rport_delete		 = srp_rport_delete,
+	.terminate_rport_io	 = srp_terminate_io,
 };
 
 static int __init srp_init_module(void)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index e641088..5756810 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -57,14 +57,11 @@
 	SRP_MAX_LUN		= 512,
 	SRP_DEF_SG_TABLESIZE	= 12,
 
-	SRP_RQ_SHIFT    	= 6,
-	SRP_RQ_SIZE		= 1 << SRP_RQ_SHIFT,
-
-	SRP_SQ_SIZE		= SRP_RQ_SIZE,
+	SRP_DEFAULT_QUEUE_SIZE	= 1 << 6,
 	SRP_RSP_SQ_SIZE		= 1,
-	SRP_REQ_SQ_SIZE		= SRP_SQ_SIZE - SRP_RSP_SQ_SIZE,
 	SRP_TSK_MGMT_SQ_SIZE	= 1,
-	SRP_CMD_SQ_SIZE		= SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE,
+	SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE -
+				  SRP_TSK_MGMT_SQ_SIZE,
 
 	SRP_TAG_NO_REQ		= ~0U,
 	SRP_TAG_TSK_MGMT	= 1U << 31,
@@ -140,7 +137,6 @@
 	unsigned int		cmd_sg_cnt;
 	unsigned int		indirect_size;
 	bool			allow_ext_sg;
-	bool			transport_offline;
 
 	/* Everything above this point is used in the hot path of
 	 * command processing. Try to keep them packed into cachelines.
@@ -153,10 +149,14 @@
 	u16			io_class;
 	struct srp_host	       *srp_host;
 	struct Scsi_Host       *scsi_host;
+	struct srp_rport       *rport;
 	char			target_name[32];
 	unsigned int		scsi_id;
 	unsigned int		sg_tablesize;
+	int			queue_size;
+	int			req_ring_size;
 	int			comp_vector;
+	int			tl_retry_count;
 
 	struct ib_sa_path_rec	path;
 	__be16			orig_dgid[8];
@@ -172,10 +172,11 @@
 
 	int			zero_req_lim;
 
-	struct srp_iu	       *tx_ring[SRP_SQ_SIZE];
-	struct srp_iu	       *rx_ring[SRP_RQ_SIZE];
-	struct srp_request	req_ring[SRP_CMD_SQ_SIZE];
+	struct srp_iu	       **tx_ring;
+	struct srp_iu	       **rx_ring;
+	struct srp_request	*req_ring;
 
+	struct work_struct	tl_err_work;
 	struct work_struct	remove_work;
 
 	struct list_head	list;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 6ca3073..8675d26 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -98,6 +98,7 @@
 static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
 					   struct mlx5_cmd_msg *in,
 					   struct mlx5_cmd_msg *out,
+					   void *uout, int uout_size,
 					   mlx5_cmd_cbk_t cbk,
 					   void *context, int page_queue)
 {
@@ -110,6 +111,8 @@
 
 	ent->in		= in;
 	ent->out	= out;
+	ent->uout	= uout;
+	ent->uout_size	= uout_size;
 	ent->callback	= cbk;
 	ent->context	= context;
 	ent->cmd	= cmd;
@@ -534,6 +537,7 @@
 	ent->lay = lay;
 	memset(lay, 0, sizeof(*lay));
 	memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+	ent->op = be32_to_cpu(lay->in[0]) >> 16;
 	if (ent->in->next)
 		lay->in_ptr = cpu_to_be64(ent->in->next->dma);
 	lay->inlen = cpu_to_be32(ent->in->len);
@@ -628,7 +632,8 @@
 *    2. page queue commands do not support asynchronous completion
  */
 static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
-			   struct mlx5_cmd_msg *out, mlx5_cmd_cbk_t callback,
+			   struct mlx5_cmd_msg *out, void *uout, int uout_size,
+			   mlx5_cmd_cbk_t callback,
 			   void *context, int page_queue, u8 *status)
 {
 	struct mlx5_cmd *cmd = &dev->cmd;
@@ -642,7 +647,8 @@
 	if (callback && page_queue)
 		return -EINVAL;
 
-	ent = alloc_cmd(cmd, in, out, callback, context, page_queue);
+	ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context,
+			page_queue);
 	if (IS_ERR(ent))
 		return PTR_ERR(ent);
 
@@ -670,10 +676,10 @@
 		op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode);
 		if (op < ARRAY_SIZE(cmd->stats)) {
 			stats = &cmd->stats[op];
-			spin_lock(&stats->lock);
+			spin_lock_irq(&stats->lock);
 			stats->sum += ds;
 			++stats->n;
-			spin_unlock(&stats->lock);
+			spin_unlock_irq(&stats->lock);
 		}
 		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
 				   "fw exec time for %s is %lld nsec\n",
@@ -826,7 +832,7 @@
 	int n;
 	int i;
 
-	msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+	msg = kzalloc(sizeof(*msg), flags);
 	if (!msg)
 		return ERR_PTR(-ENOMEM);
 
@@ -1109,6 +1115,19 @@
 		up(&cmd->sem);
 }
 
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+	unsigned long flags;
+
+	if (msg->cache) {
+		spin_lock_irqsave(&msg->cache->lock, flags);
+		list_add_tail(&msg->list, &msg->cache->head);
+		spin_unlock_irqrestore(&msg->cache->lock, flags);
+	} else {
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
 void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
 {
 	struct mlx5_cmd *cmd = &dev->cmd;
@@ -1117,6 +1136,10 @@
 	void *context;
 	int err;
 	int i;
+	ktime_t t1, t2, delta;
+	s64 ds;
+	struct mlx5_cmd_stats *stats;
+	unsigned long flags;
 
 	for (i = 0; i < (1 << cmd->log_sz); i++) {
 		if (test_bit(i, &vector)) {
@@ -1141,9 +1164,29 @@
 			}
 			free_ent(cmd, ent->idx);
 			if (ent->callback) {
+				t1 = timespec_to_ktime(ent->ts1);
+				t2 = timespec_to_ktime(ent->ts2);
+				delta = ktime_sub(t2, t1);
+				ds = ktime_to_ns(delta);
+				if (ent->op < ARRAY_SIZE(cmd->stats)) {
+					stats = &cmd->stats[ent->op];
+					spin_lock_irqsave(&stats->lock, flags);
+					stats->sum += ds;
+					++stats->n;
+					spin_unlock_irqrestore(&stats->lock, flags);
+				}
+
 				callback = ent->callback;
 				context = ent->context;
 				err = ent->ret;
+				if (!err)
+					err = mlx5_copy_from_msg(ent->uout,
+								 ent->out,
+								 ent->uout_size);
+
+				mlx5_free_cmd_msg(dev, ent->out);
+				free_msg(dev, ent->in);
+
 				free_cmd(ent);
 				callback(err, context);
 			} else {
@@ -1160,7 +1203,8 @@
 	return status ? -1 : 0; /* TBD more meaningful codes */
 }
 
-static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
+				      gfp_t gfp)
 {
 	struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
 	struct mlx5_cmd *cmd = &dev->cmd;
@@ -1172,7 +1216,7 @@
 		ent = &cmd->cache.med;
 
 	if (ent) {
-		spin_lock(&ent->lock);
+		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
 			msg = list_entry(ent->head.next, typeof(*msg), list);
 			/* For cached lists, we must explicitly state what is
@@ -1181,43 +1225,34 @@
 			msg->len = in_size;
 			list_del(&msg->list);
 		}
-		spin_unlock(&ent->lock);
+		spin_unlock_irq(&ent->lock);
 	}
 
 	if (IS_ERR(msg))
-		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, in_size);
+		msg = mlx5_alloc_cmd_msg(dev, gfp, in_size);
 
 	return msg;
 }
 
-static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
-{
-	if (msg->cache) {
-		spin_lock(&msg->cache->lock);
-		list_add_tail(&msg->list, &msg->cache->head);
-		spin_unlock(&msg->cache->lock);
-	} else {
-		mlx5_free_cmd_msg(dev, msg);
-	}
-}
-
 static int is_manage_pages(struct mlx5_inbox_hdr *in)
 {
 	return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
 }
 
-int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
-		  int out_size)
+static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		    int out_size, mlx5_cmd_cbk_t callback, void *context)
 {
 	struct mlx5_cmd_msg *inb;
 	struct mlx5_cmd_msg *outb;
 	int pages_queue;
+	gfp_t gfp;
 	int err;
 	u8 status = 0;
 
 	pages_queue = is_manage_pages(in);
+	gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
 
-	inb = alloc_msg(dev, in_size);
+	inb = alloc_msg(dev, in_size, gfp);
 	if (IS_ERR(inb)) {
 		err = PTR_ERR(inb);
 		return err;
@@ -1229,13 +1264,14 @@
 		goto out_in;
 	}
 
-	outb = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, out_size);
+	outb = mlx5_alloc_cmd_msg(dev, gfp, out_size);
 	if (IS_ERR(outb)) {
 		err = PTR_ERR(outb);
 		goto out_in;
 	}
 
-	err = mlx5_cmd_invoke(dev, inb, outb, NULL, NULL, pages_queue, &status);
+	err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context,
+			      pages_queue, &status);
 	if (err)
 		goto out_out;
 
@@ -1248,14 +1284,30 @@
 	err = mlx5_copy_from_msg(out, outb, out_size);
 
 out_out:
-	mlx5_free_cmd_msg(dev, outb);
+	if (!callback)
+		mlx5_free_cmd_msg(dev, outb);
 
 out_in:
-	free_msg(dev, inb);
+	if (!callback)
+		free_msg(dev, inb);
 	return err;
 }
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size)
+{
+	return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL);
+}
 EXPORT_SYMBOL(mlx5_cmd_exec);
 
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+		     void *out, int out_size, mlx5_cmd_cbk_t callback,
+		     void *context)
+{
+	return cmd_exec(dev, in, in_size, out, out_size, callback, context);
+}
+EXPORT_SYMBOL(mlx5_cmd_exec_cb);
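
A hypothetical caller of the new asynchronous entry point; the output
buffer and the context must stay alive until the callback runs, since the
completion handler copies the reply into uout and frees both command
messages before invoking the callback (the names and the output layout
below are illustrative only):

	struct my_async_ctx {
		struct mlx5_outbox_hdr	out;	/* illustrative reply layout */
		struct completion	done;
		int			status;
	};

	static void my_cmd_done(int status, void *context)
	{
		struct my_async_ctx *ctx = context;

		ctx->status = status;	/* ctx->out already holds the reply */
		complete(&ctx->done);
	}

	static int my_cmd_start(struct mlx5_core_dev *dev, void *in,
				int in_size, struct my_async_ctx *ctx)
	{
		init_completion(&ctx->done);
		return mlx5_cmd_exec_cb(dev, in, in_size, &ctx->out,
					sizeof(ctx->out), my_cmd_done, ctx);
	}
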
+
 static void destroy_msg_cache(struct mlx5_core_dev *dev)
 {
 	struct mlx5_cmd *cmd = &dev->cmd;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
index 9c7194b..80f6d12 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -154,10 +154,10 @@
 		return 0;
 
 	stats = filp->private_data;
-	spin_lock(&stats->lock);
+	spin_lock_irq(&stats->lock);
 	if (stats->n)
 		field = div64_u64(stats->sum, stats->n);
-	spin_unlock(&stats->lock);
+	spin_unlock_irq(&stats->lock);
 	ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
 	if (ret > 0) {
 		if (copy_to_user(buf, tbuf, ret))
@@ -175,10 +175,10 @@
 	struct mlx5_cmd_stats *stats;
 
 	stats = filp->private_data;
-	spin_lock(&stats->lock);
+	spin_lock_irq(&stats->lock);
 	stats->sum = 0;
 	stats->n = 0;
-	spin_unlock(&stats->lock);
+	spin_unlock_irq(&stats->lock);
 
 	*pos += count;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 2231d93..64a61b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -354,7 +354,7 @@
 	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ);
 	in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index);
 	in->ctx.intr = vecidx;
-	in->ctx.log_page_size = PAGE_SHIFT - 12;
+	in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
 	in->events_mask = cpu_to_be64(mask);
 
 	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index bc0f5fb..40a9f5e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -159,6 +159,36 @@
 	u8      rsvd[15];
 };
 
+#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos))
+
+enum {
+	MLX5_CAP_BITS_RW_MASK	= CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) |
+				  CAP_MASK(MLX5_CAP_OFF_DCT, 1),
+};
+
+/* selectively copy writable fields clearing any reserved area
+ */
+static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from)
+{
+	u64 v64;
+
+	to->log_max_qp = from->log_max_qp & 0x1f;
+	to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f;
+	to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f;
+	to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f;
+	to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f;
+	to->log_max_atomic_size_qp = from->log_max_atomic_size_qp;
+	to->log_max_atomic_size_dc = from->log_max_atomic_size_dc;
+	v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK;
+	to->flags = cpu_to_be64(v64);
+}
+
+enum {
+	HCA_CAP_OPMOD_GET_MAX	= 0,
+	HCA_CAP_OPMOD_GET_CUR	= 1,
+};
+
 static int handle_hca_cap(struct mlx5_core_dev *dev)
 {
 	struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL;
@@ -180,7 +210,7 @@
 	}
 
 	query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP);
-	query_ctx.hdr.opmod  = cpu_to_be16(0x1);
+	query_ctx.hdr.opmod  = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR);
 	err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx),
 				 query_out, sizeof(*query_out));
 	if (err)
@@ -192,8 +222,7 @@
 		goto query_ex;
 	}
 
-	memcpy(&set_ctx->hca_cap, &query_out->hca_cap,
-	       sizeof(set_ctx->hca_cap));
+	copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap);
 
 	if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE)
 		set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
index 5b44e2e..35e514d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -37,31 +37,41 @@
 #include "mlx5_core.h"
 
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-			  struct mlx5_create_mkey_mbox_in *in, int inlen)
+			  struct mlx5_create_mkey_mbox_in *in, int inlen,
+			  mlx5_cmd_cbk_t callback, void *context,
+			  struct mlx5_create_mkey_mbox_out *out)
 {
-	struct mlx5_create_mkey_mbox_out out;
+	struct mlx5_create_mkey_mbox_out lout;
 	int err;
 	u8 key;
 
-	memset(&out, 0, sizeof(out));
-	spin_lock(&dev->priv.mkey_lock);
+	memset(&lout, 0, sizeof(lout));
+	spin_lock_irq(&dev->priv.mkey_lock);
 	key = dev->priv.mkey_key++;
-	spin_unlock(&dev->priv.mkey_lock);
+	spin_unlock_irq(&dev->priv.mkey_lock);
 	in->seg.qpn_mkey7_0 |= cpu_to_be32(key);
 	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY);
-	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (callback) {
+		err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out),
+				       callback, context);
+		return err;
+	} else {
+		err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout));
+	}
+
 	if (err) {
 		mlx5_core_dbg(dev, "cmd exec faile %d\n", err);
 		return err;
 	}
 
-	if (out.hdr.status) {
-		mlx5_core_dbg(dev, "status %d\n", out.hdr.status);
-		return mlx5_cmd_status_to_err(&out.hdr);
+	if (lout.hdr.status) {
+		mlx5_core_dbg(dev, "status %d\n", lout.hdr.status);
+		return mlx5_cmd_status_to_err(&lout.hdr);
 	}
 
-	mr->key = mlx5_idx_to_mkey(be32_to_cpu(out.mkey) & 0xffffff) | key;
-	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(out.mkey), key, mr->key);
+	mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key;
+	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n",
+		      be32_to_cpu(lout.mkey), key, mr->key);
 
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index 7b12acf..37b6ad1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -57,10 +57,13 @@
 };
 
 struct fw_page {
-	struct rb_node	rb_node;
-	u64		addr;
-	struct page	*page;
-	u16		func_id;
+	struct rb_node		rb_node;
+	u64			addr;
+	struct page	       *page;
+	u16			func_id;
+	unsigned long		bitmask;
+	struct list_head	list;
+	unsigned		free_count;
 };
 
 struct mlx5_query_pages_inbox {
@@ -94,6 +97,11 @@
 	MAX_RECLAIM_TIME_MSECS	= 5000,
 };
 
+enum {
+	MLX5_MAX_RECLAIM_TIME_MILI	= 5000,
+	MLX5_NUM_4K_IN_PAGE		= PAGE_SIZE / 4096,
+};
+
 static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id)
 {
 	struct rb_root *root = &dev->priv.page_root;
@@ -101,6 +109,7 @@
 	struct rb_node *parent = NULL;
 	struct fw_page *nfp;
 	struct fw_page *tfp;
+	int i;
 
 	while (*new) {
 		parent = *new;
@@ -113,25 +122,29 @@
 			return -EEXIST;
 	}
 
-	nfp = kmalloc(sizeof(*nfp), GFP_KERNEL);
+	nfp = kzalloc(sizeof(*nfp), GFP_KERNEL);
 	if (!nfp)
 		return -ENOMEM;
 
 	nfp->addr = addr;
 	nfp->page = page;
 	nfp->func_id = func_id;
+	nfp->free_count = MLX5_NUM_4K_IN_PAGE;
+	for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++)
+		set_bit(i, &nfp->bitmask);
 
 	rb_link_node(&nfp->rb_node, parent, new);
 	rb_insert_color(&nfp->rb_node, root);
+	list_add(&nfp->list, &dev->priv.free_list);
 
 	return 0;
 }
 
-static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr)
+static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr)
 {
 	struct rb_root *root = &dev->priv.page_root;
 	struct rb_node *tmp = root->rb_node;
-	struct page *result = NULL;
+	struct fw_page *result = NULL;
 	struct fw_page *tfp;
 
 	while (tmp) {
@@ -141,9 +154,7 @@
 		} else if (tfp->addr > addr) {
 			tmp = tmp->rb_right;
 		} else {
-			rb_erase(&tfp->rb_node, root);
-			result = tfp->page;
-			kfree(tfp);
+			result = tfp;
 			break;
 		}
 	}
@@ -176,12 +187,98 @@
 	return err;
 }
 
+static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr)
+{
+	struct fw_page *fp;
+	unsigned n;
+
+	if (list_empty(&dev->priv.free_list))
+		return -ENOMEM;
+
+	fp = list_entry(dev->priv.free_list.next, struct fw_page, list);
+	n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask));
+	if (n >= MLX5_NUM_4K_IN_PAGE) {
+		mlx5_core_warn(dev, "alloc 4k bug\n");
+		return -ENOENT;
+	}
+	clear_bit(n, &fp->bitmask);
+	fp->free_count--;
+	if (!fp->free_count)
+		list_del(&fp->list);
+
+	*addr = fp->addr + n * 4096;
+
+	return 0;
+}
+
+static void free_4k(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct fw_page *fwp;
+	int n;
+
+	fwp = find_fw_page(dev, addr & PAGE_MASK);
+	if (!fwp) {
+		mlx5_core_warn(dev, "page not found\n");
+		return;
+	}
+
+	n = (addr & ~PAGE_MASK) / 4096;
+	fwp->free_count++;
+	set_bit(n, &fwp->bitmask);
+	if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) {
+		rb_erase(&fwp->rb_node, &dev->priv.page_root);
+		if (fwp->free_count != 1)
+			list_del(&fwp->list);
+		dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+		__free_page(fwp->page);
+		kfree(fwp);
+	} else if (fwp->free_count == 1) {
+		list_add(&fwp->list, &dev->priv.free_list);
+	}
+}
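
Worked example of the 4K bookkeeping above, assuming 64KB kernel pages:
MLX5_NUM_4K_IN_PAGE = 65536 / 4096 = 16, so each fw_page tracks sixteen
4K chunks in its bitmask. alloc_4k() hands out

	*addr = fp->addr + n * 4096;		/* n = first free chunk */

and free_4k() recovers the index from the address alone:

	n = (addr & ~PAGE_MASK) / 4096;		/* chunk index in the page */

The "free_count != 1" test before list_del() covers 4KB-page systems,
where MLX5_NUM_4K_IN_PAGE == 1: such a page leaves the free list as soon
as its only chunk is allocated, so when that chunk comes back the page is
completely free without ever having been re-linked.
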
+
+static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id)
+{
+	struct page *page;
+	u64 addr;
+	int err;
+
+	page = alloc_page(GFP_HIGHUSER);
+	if (!page) {
+		mlx5_core_warn(dev, "failed to allocate page\n");
+		return -ENOMEM;
+	}
+	addr = dma_map_page(&dev->pdev->dev, page, 0,
+			    PAGE_SIZE, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&dev->pdev->dev, addr)) {
+		mlx5_core_warn(dev, "failed dma mapping page\n");
+		err = -ENOMEM;
+		goto out_alloc;
+	}
+	err = insert_page(dev, addr, page, func_id);
+	if (err) {
+		mlx5_core_err(dev, "failed to track allocated page\n");
+		goto out_mapping;
+	}
+
+	return 0;
+
+out_mapping:
+	dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+
+out_alloc:
+	__free_page(page);
+
+	return err;
+}
+
 static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
 		      int notify_fail)
 {
 	struct mlx5_manage_pages_inbox *in;
 	struct mlx5_manage_pages_outbox out;
-	struct page *page;
+	struct mlx5_manage_pages_inbox *nin;
 	int inlen;
 	u64 addr;
 	int err;
@@ -196,27 +293,15 @@
 	memset(&out, 0, sizeof(out));
 
 	for (i = 0; i < npages; i++) {
-		page = alloc_page(GFP_HIGHUSER);
-		if (!page) {
-			err = -ENOMEM;
-			mlx5_core_warn(dev, "failed to allocate page\n");
-			goto out_alloc;
-		}
-		addr = dma_map_page(&dev->pdev->dev, page, 0,
-				    PAGE_SIZE, DMA_BIDIRECTIONAL);
-		if (dma_mapping_error(&dev->pdev->dev, addr)) {
-			mlx5_core_warn(dev, "failed dma mapping page\n");
-			__free_page(page);
-			err = -ENOMEM;
-			goto out_alloc;
-		}
-		err = insert_page(dev, addr, page, func_id);
+retry:
+		err = alloc_4k(dev, &addr);
 		if (err) {
-			mlx5_core_err(dev, "failed to track allocated page\n");
-			dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-			__free_page(page);
-			err = -ENOMEM;
-			goto out_alloc;
+			if (err == -ENOMEM)
+				err = alloc_system_page(dev, func_id);
+			if (err)
+				goto out_4k;
+
+			goto retry;
 		}
 		in->pas[i] = cpu_to_be64(addr);
 	}
@@ -226,7 +311,6 @@
 	in->func_id = cpu_to_be16(func_id);
 	in->num_entries = cpu_to_be32(npages);
 	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
-	mlx5_core_dbg(dev, "err %d\n", err);
 	if (err) {
 		mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err);
 		goto out_alloc;
@@ -247,25 +331,22 @@
 
 out_alloc:
 	if (notify_fail) {
-		memset(in, 0, inlen);
-		memset(&out, 0, sizeof(out));
-		in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
-		in->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
-		if (mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)))
-			mlx5_core_warn(dev, "\n");
-	}
-	for (i--; i >= 0; i--) {
-		addr = be64_to_cpu(in->pas[i]);
-		page = remove_page(dev, addr);
-		if (!page) {
-			mlx5_core_err(dev, "BUG: can't remove page at addr 0x%llx\n",
-				      addr);
-			continue;
+		nin = kzalloc(sizeof(*nin), GFP_KERNEL);
+		if (!nin) {
+			mlx5_core_warn(dev, "allocation failed\n");
+			goto out_4k;
 		}
-		dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-		__free_page(page);
+		memset(&out, 0, sizeof(out));
+		nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+		nin->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
+		if (mlx5_cmd_exec(dev, nin, sizeof(*nin), &out, sizeof(out)))
+			mlx5_core_warn(dev, "page notify failed\n");
+		kfree(nin);
 	}
 
+out_4k:
+	for (i--; i >= 0; i--)
+		free_4k(dev, be64_to_cpu(in->pas[i]));
 out_free:
 	mlx5_vfree(in);
 	return err;
@@ -276,7 +357,6 @@
 {
 	struct mlx5_manage_pages_inbox   in;
 	struct mlx5_manage_pages_outbox *out;
-	struct page *page;
 	int num_claimed;
 	int outlen;
 	u64 addr;
@@ -315,13 +395,7 @@
 
 	for (i = 0; i < num_claimed; i++) {
 		addr = be64_to_cpu(out->pas[i]);
-		page = remove_page(dev, addr);
-		if (!page) {
-			mlx5_core_warn(dev, "FW reported unknown DMA address 0x%llx\n", addr);
-		} else {
-			dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
-			__free_page(page);
-		}
+		free_4k(dev, addr);
 	}
 
 out_free:
@@ -381,14 +455,19 @@
 	return give_pages(dev, func_id, npages, 0);
 }
 
+enum {
+	MLX5_BLKS_FOR_RECLAIM_PAGES = 12
+};
+
 static int optimal_reclaimed_pages(void)
 {
 	struct mlx5_cmd_prot_block *block;
 	struct mlx5_cmd_layout *lay;
 	int ret;
 
-	ret = (sizeof(lay->in) + sizeof(block->data) -
-	       sizeof(struct mlx5_manage_pages_outbox)) / 8;
+	ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) -
+	       sizeof(struct mlx5_manage_pages_outbox)) /
+	       FIELD_SIZEOF(struct mlx5_manage_pages_outbox, pas[0]);
 
 	return ret;
 }
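
For scale (hedged: this assumes the customary mlx5 command layout of a
16-byte out[] array in struct mlx5_cmd_layout, a 512-byte data area per
mlx5_cmd_prot_block and 8-byte PAS entries, none of which are visible in
this hunk): the old formula allowed roughly (16 + 512 - 16) / 8 = 64
pages per reclaim command, whereas spanning twelve blocks allows
(16 + 12 * 512 - 16) / 8 = 768.
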
@@ -427,6 +506,7 @@
 void mlx5_pagealloc_init(struct mlx5_core_dev *dev)
 {
 	dev->priv.page_root = RB_ROOT;
+	INIT_LIST_HEAD(&dev->priv.free_list);
 }
 
 void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev)
diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c
index f379c7f..2700a5a 100644
--- a/drivers/scsi/scsi_transport_srp.c
+++ b/drivers/scsi/scsi_transport_srp.c
@@ -24,12 +24,15 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/delay.h>
 
 #include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_transport.h>
 #include <scsi/scsi_transport_srp.h>
+#include "scsi_priv.h"
 #include "scsi_transport_srp_internal.h"
 
 struct srp_host_attrs {
@@ -38,7 +41,7 @@
 #define to_srp_host_attrs(host)	((struct srp_host_attrs *)(host)->shost_data)
 
 #define SRP_HOST_ATTRS 0
-#define SRP_RPORT_ATTRS 3
+#define SRP_RPORT_ATTRS 8
 
 struct srp_internal {
 	struct scsi_transport_template t;
@@ -54,6 +57,36 @@
 
 #define	dev_to_rport(d)	container_of(d, struct srp_rport, dev)
 #define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent)
+static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r)
+{
+	return dev_to_shost(r->dev.parent);
+}
+
+/**
+ * srp_tmo_valid() - check timeout combination validity
+ *
+ * The combination of the timeout parameters must be such that SCSI commands
+ * are finished in a reasonable time. Hence do not allow the fast I/O fail
+ * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these
+ * parameters must be such that multipath can detect failed paths in a
+ * timely manner. Hence do not allow all three parameters to be disabled
+ * simultaneously.
+ */
+int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo)
+{
+	if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0)
+		return -EINVAL;
+	if (reconnect_delay == 0)
+		return -EINVAL;
+	if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		return -EINVAL;
+	if (dev_loss_tmo >= LONG_MAX / HZ)
+		return -EINVAL;
+	if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 &&
+	    fast_io_fail_tmo >= dev_loss_tmo)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(srp_tmo_valid);
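
Worked example against the ib_srp defaults earlier in this patch:
srp_tmo_valid(10, 15, 600) returns 0, since not all three values are
negative, reconnect_delay is non-zero, 15 presumably stays below
SCSI_DEVICE_BLOCK_MAX_TIMEOUT, and 15 < 600. By contrast,
srp_tmo_valid(10, 600, 600) fails with -EINVAL because fast_io_fail_tmo
must be strictly smaller than dev_loss_tmo, and srp_tmo_valid(-1, -1, -1)
fails because disabling all three timers would leave multipath unable to
detect failed paths.
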
 
 static int srp_host_setup(struct transport_container *tc, struct device *dev,
 			  struct device *cdev)
@@ -134,10 +167,465 @@
 
 static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete);
 
+static ssize_t show_srp_rport_state(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	static const char *const state_name[] = {
+		[SRP_RPORT_RUNNING]	= "running",
+		[SRP_RPORT_BLOCKED]	= "blocked",
+		[SRP_RPORT_FAIL_FAST]	= "fail-fast",
+		[SRP_RPORT_LOST]	= "lost",
+	};
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	enum srp_rport_state state = rport->state;
+
+	return sprintf(buf, "%s\n",
+		       (unsigned)state < ARRAY_SIZE(state_name) ?
+		       state_name[state] : "???");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL);
+
+static ssize_t srp_show_tmo(char *buf, int tmo)
+{
+	return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n");
+}
+
+static int srp_parse_tmo(int *tmo, const char *buf)
+{
+	int res = 0;
+
+	if (strncmp(buf, "off", 3) != 0)
+		res = kstrtoint(buf, 0, tmo);
+	else
+		*tmo = -1;
+
+	return res;
+}
+
+static ssize_t show_reconnect_delay(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->reconnect_delay);
+}
+
+static ssize_t store_reconnect_delay(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res, delay;
+
+	res = srp_parse_tmo(&delay, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(delay, rport->fast_io_fail_tmo,
+			    rport->dev_loss_tmo);
+	if (res)
+		goto out;
+
+	if (rport->reconnect_delay <= 0 && delay > 0 &&
+	    rport->state != SRP_RPORT_RUNNING) {
+		queue_delayed_work(system_long_wq, &rport->reconnect_work,
+				   delay * HZ);
+	} else if (delay <= 0) {
+		cancel_delayed_work(&rport->reconnect_work);
+	}
+	rport->reconnect_delay = delay;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay,
+		   store_reconnect_delay);
+
+static ssize_t show_failed_reconnects(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return sprintf(buf, "%d\n", rport->failed_reconnects);
+}
+
+static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL);
+
+static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev,
+					       struct device_attribute *attr,
+					       char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->fast_io_fail_tmo);
+}
+
+static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev,
+						struct device_attribute *attr,
+						const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res;
+	int fast_io_fail_tmo;
+
+	res = srp_parse_tmo(&fast_io_fail_tmo, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo,
+			    rport->dev_loss_tmo);
+	if (res)
+		goto out;
+	rport->fast_io_fail_tmo = fast_io_fail_tmo;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR,
+		   show_srp_rport_fast_io_fail_tmo,
+		   store_srp_rport_fast_io_fail_tmo);
+
+static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+
+	return srp_show_tmo(buf, rport->dev_loss_tmo);
+}
+
+static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct srp_rport *rport = transport_class_to_srp_rport(dev);
+	int res;
+	int dev_loss_tmo;
+
+	res = srp_parse_tmo(&dev_loss_tmo, buf);
+	if (res)
+		goto out;
+	res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo,
+			    dev_loss_tmo);
+	if (res)
+		goto out;
+	rport->dev_loss_tmo = dev_loss_tmo;
+	res = count;
+
+out:
+	return res;
+}
+
+static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR,
+		   show_srp_rport_dev_loss_tmo,
+		   store_srp_rport_dev_loss_tmo);
+
+static int srp_rport_set_state(struct srp_rport *rport,
+			       enum srp_rport_state new_state)
+{
+	enum srp_rport_state old_state = rport->state;
+
+	lockdep_assert_held(&rport->mutex);
+
+	switch (new_state) {
+	case SRP_RPORT_RUNNING:
+		switch (old_state) {
+		case SRP_RPORT_LOST:
+			goto invalid;
+		default:
+			break;
+		}
+		break;
+	case SRP_RPORT_BLOCKED:
+		switch (old_state) {
+		case SRP_RPORT_RUNNING:
+			break;
+		default:
+			goto invalid;
+		}
+		break;
+	case SRP_RPORT_FAIL_FAST:
+		switch (old_state) {
+		case SRP_RPORT_LOST:
+			goto invalid;
+		default:
+			break;
+		}
+		break;
+	case SRP_RPORT_LOST:
+		break;
+	}
+	rport->state = new_state;
+	return 0;
+
+invalid:
+	return -EINVAL;
+}
+
+/**
+ * srp_reconnect_work() - reconnect and schedule a new attempt if necessary
+ * @work: Work structure used for scheduling this operation.
+ */
+static void srp_reconnect_work(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, reconnect_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	int delay, res;
+
+	res = srp_reconnect_rport(rport);
+	if (res != 0) {
+		shost_printk(KERN_ERR, shost,
+			     "reconnect attempt %d failed (%d)\n",
+			     ++rport->failed_reconnects, res);
+		delay = rport->reconnect_delay *
+			min(100, max(1, rport->failed_reconnects - 10));
+		if (delay > 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->reconnect_work, delay * HZ);
+	}
+}
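
[Editorial note, not part of the patch] Worked example of the backoff formula above: with reconnect_delay = 10 the scaling factor is min(100, max(1, failed_reconnects - 10)), so attempts 1 through 11 are retried after 10 seconds each; the twelfth failed attempt waits 20 seconds, the thirteenth 30, and so on, until the factor saturates at 100, i.e. 1000 seconds between attempts.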
+
+static void __rport_fail_io_fast(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i;
+
+	lockdep_assert_held(&rport->mutex);
+
+	if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST))
+		return;
+	scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+
+	/* Involve the LLD if possible to terminate all I/O on the rport. */
+	i = to_srp_internal(shost->transportt);
+	if (i->f->terminate_rport_io)
+		i->f->terminate_rport_io(rport);
+}
+
+/**
+ * rport_fast_io_fail_timedout() - fast I/O failure timeout handler
+ * @work: Work structure used for scheduling this operation.
+ */
+static void rport_fast_io_fail_timedout(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, fast_io_fail_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+
+	pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n",
+		dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+	mutex_lock(&rport->mutex);
+	if (rport->state == SRP_RPORT_BLOCKED)
+		__rport_fail_io_fast(rport);
+	mutex_unlock(&rport->mutex);
+}
+
+/**
+ * rport_dev_loss_timedout() - device loss timeout handler
+ * @work: Work structure used for scheduling this operation.
+ */
+static void rport_dev_loss_timedout(struct work_struct *work)
+{
+	struct srp_rport *rport = container_of(to_delayed_work(work),
+					struct srp_rport, dev_loss_work);
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+
+	pr_info("dev_loss_tmo expired for SRP %s / %s.\n",
+		dev_name(&rport->dev), dev_name(&shost->shost_gendev));
+
+	mutex_lock(&rport->mutex);
+	WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0);
+	scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE);
+	mutex_unlock(&rport->mutex);
+
+	i->f->rport_delete(rport);
+}
+
+static void __srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	int delay, fast_io_fail_tmo, dev_loss_tmo;
+
+	lockdep_assert_held(&rport->mutex);
+
+	if (!rport->deleted) {
+		delay = rport->reconnect_delay;
+		fast_io_fail_tmo = rport->fast_io_fail_tmo;
+		dev_loss_tmo = rport->dev_loss_tmo;
+		pr_debug("%s current state: %d\n",
+			 dev_name(&shost->shost_gendev), rport->state);
+
+		if (delay > 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->reconnect_work,
+					   1UL * delay * HZ);
+		if (fast_io_fail_tmo >= 0 &&
+		    srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
+			pr_debug("%s new state: %d\n",
+				 dev_name(&shost->shost_gendev),
+				 rport->state);
+			scsi_target_block(&shost->shost_gendev);
+			queue_delayed_work(system_long_wq,
+					   &rport->fast_io_fail_work,
+					   1UL * fast_io_fail_tmo * HZ);
+		}
+		if (dev_loss_tmo >= 0)
+			queue_delayed_work(system_long_wq,
+					   &rport->dev_loss_work,
+					   1UL * dev_loss_tmo * HZ);
+	} else {
+		pr_debug("%s has already been deleted\n",
+			 dev_name(&shost->shost_gendev));
+		srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST);
+		scsi_target_unblock(&shost->shost_gendev,
+				    SDEV_TRANSPORT_OFFLINE);
+	}
+}
+
+/**
+ * srp_start_tl_fail_timers() - start the transport layer failure timers
+ * @rport: SRP target port.
+ *
+ * Start the transport layer fast I/O failure and device loss timers. Do not
+ * modify a timer that was already started.
+ */
+void srp_start_tl_fail_timers(struct srp_rport *rport)
+{
+	mutex_lock(&rport->mutex);
+	__srp_start_tl_fail_timers(rport);
+	mutex_unlock(&rport->mutex);
+}
+EXPORT_SYMBOL(srp_start_tl_fail_timers);
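
[Editorial note, not part of the patch] Illustrative only, with hypothetical struct and helper names: an initiator driver would call this from whichever path first observes a transport failure, e.g. a QP error or CM disconnect handler:

	static void my_handle_transport_error(struct my_target_port *target)
	{
		/* Arms reconnect_work, fast_io_fail_work and dev_loss_work
		 * for this rport, unless the corresponding timeouts are
		 * disabled ("off").
		 */
		srp_start_tl_fail_timers(target->rport);
	}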
+
+/**
+ * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
+ * @shost: SCSI host for which to count the number of scsi_request_fn() callers.
+ */
+static int scsi_request_fn_active(struct Scsi_Host *shost)
+{
+	struct scsi_device *sdev;
+	struct request_queue *q;
+	int request_fn_active = 0;
+
+	shost_for_each_device(sdev, shost) {
+		q = sdev->request_queue;
+
+		spin_lock_irq(q->queue_lock);
+		request_fn_active += q->request_fn_active;
+		spin_unlock_irq(q->queue_lock);
+	}
+
+	return request_fn_active;
+}
+
+/**
+ * srp_reconnect_rport() - reconnect to an SRP target port
+ * @rport: SRP target port to reconnect to.
+ *
+ * Blocks SCSI command queueing before invoking reconnect() such that
+ * queuecommand() won't be invoked concurrently with reconnect() from outside
+ * the SCSI EH. This is important since a reconnect() implementation may
+ * reallocate resources needed by queuecommand().
+ *
+ * Notes:
+ * - This function neither waits until outstanding requests have finished nor
+ *   tries to abort these. It is the responsibility of the reconnect()
+ *   function to finish outstanding commands before reconnecting to the target
+ *   port.
+ * - It is the responsibility of the caller to ensure that the resources
+ *   reallocated by the reconnect() function won't be used while this function
+ *   is in progress. One possible strategy is to invoke this function from
+ *   the context of the SCSI EH thread only. Another possible strategy is to
+ *   lock the rport mutex inside each SCSI LLD callback that can be invoked by
+ *   the SCSI EH (the scsi_host_template.eh_*() functions and also the
+ *   scsi_host_template.queuecommand() function).
+ */
+int srp_reconnect_rport(struct srp_rport *rport)
+{
+	struct Scsi_Host *shost = rport_to_shost(rport);
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+	struct scsi_device *sdev;
+	int res;
+
+	pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev));
+
+	res = mutex_lock_interruptible(&rport->mutex);
+	if (res)
+		goto out;
+	scsi_target_block(&shost->shost_gendev);
+	while (scsi_request_fn_active(shost))
+		msleep(20);
+	res = i->f->reconnect(rport);
+	pr_debug("%s (state %d): transport.reconnect() returned %d\n",
+		 dev_name(&shost->shost_gendev), rport->state, res);
+	if (res == 0) {
+		cancel_delayed_work(&rport->fast_io_fail_work);
+		cancel_delayed_work(&rport->dev_loss_work);
+
+		rport->failed_reconnects = 0;
+		srp_rport_set_state(rport, SRP_RPORT_RUNNING);
+		scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING);
+		/*
+		 * If the SCSI error handler has offlined one or more devices,
+		 * invoking scsi_target_unblock() won't change the state of
+		 * these devices into running so do that explicitly.
+		 */
+		spin_lock_irq(shost->host_lock);
+		__shost_for_each_device(sdev, shost)
+			if (sdev->sdev_state == SDEV_OFFLINE)
+				sdev->sdev_state = SDEV_RUNNING;
+		spin_unlock_irq(shost->host_lock);
+	} else if (rport->state == SRP_RPORT_RUNNING) {
+		/*
+		 * srp_reconnect_rport() was invoked with fast_io_fail
+		 * off. Mark the port as failed and start the TL failure
+		 * timers if these had not yet been started.
+		 */
+		__rport_fail_io_fast(rport);
+		scsi_target_unblock(&shost->shost_gendev,
+				    SDEV_TRANSPORT_OFFLINE);
+		__srp_start_tl_fail_timers(rport);
+	} else if (rport->state != SRP_RPORT_BLOCKED) {
+		scsi_target_unblock(&shost->shost_gendev,
+				    SDEV_TRANSPORT_OFFLINE);
+	}
+	mutex_unlock(&rport->mutex);
+
+out:
+	return res;
+}
+EXPORT_SYMBOL(srp_reconnect_rport);
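
[Editorial note, not part of the patch] A sketch of the "invoke from the SCSI EH thread only" strategy mentioned in the notes above; the my_* names are hypothetical:

	static int my_eh_host_reset_handler(struct scsi_cmnd *scmd)
	{
		struct my_target_port *target = shost_priv(scmd->device->host);

		/* Serializes against reconnect() reallocating resources. */
		return srp_reconnect_rport(target->rport) == 0 ?
			SUCCESS : FAILED;
	}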
+
+/**
+ * srp_timed_out() - SRP transport intercept of the SCSI timeout EH
+ * @scmd: SCSI command for which the timeout has been triggered.
+ *
+ * If a timeout occurs while an rport is in the blocked state, ask the SCSI
+ * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core
+ * handle the timeout (BLK_EH_NOT_HANDLED).
+ *
+ * Note: This function is called from soft-IRQ context and with the request
+ * queue lock held.
+ */
+static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd)
+{
+	struct scsi_device *sdev = scmd->device;
+	struct Scsi_Host *shost = sdev->host;
+	struct srp_internal *i = to_srp_internal(shost->transportt);
+
+	pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev));
+	return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ?
+		BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
+}
+
 static void srp_rport_release(struct device *dev)
 {
 	struct srp_rport *rport = dev_to_rport(dev);
 
+	cancel_delayed_work_sync(&rport->reconnect_work);
+	cancel_delayed_work_sync(&rport->fast_io_fail_work);
+	cancel_delayed_work_sync(&rport->dev_loss_work);
+
 	put_device(dev->parent);
 	kfree(rport);
 }
@@ -185,6 +673,24 @@
 }
 
 /**
+ * srp_rport_get() - increment rport reference count
+ * @rport: SRP target port whose reference count to increment.
+ */
+void srp_rport_get(struct srp_rport *rport)
+{
+	get_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_get);
+
+/**
+ * srp_rport_put() - decrement rport reference count
+ * @rport: SRP target port whose reference count to decrement.
+ */
+void srp_rport_put(struct srp_rport *rport)
+{
+	put_device(&rport->dev);
+}
+EXPORT_SYMBOL(srp_rport_put);
+
+/**
  * srp_rport_add - add a SRP remote port to the device hierarchy
  * @shost:	scsi host the remote port is connected to.
  * @ids:	The port id for the remote port.
@@ -196,12 +702,15 @@
 {
 	struct srp_rport *rport;
 	struct device *parent = &shost->shost_gendev;
+	struct srp_internal *i = to_srp_internal(shost->transportt);
 	int id, ret;
 
 	rport = kzalloc(sizeof(*rport), GFP_KERNEL);
 	if (!rport)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&rport->mutex);
+
 	device_initialize(&rport->dev);
 
 	rport->dev.parent = get_device(parent);
@@ -210,6 +719,17 @@
 	memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id));
 	rport->roles = ids->roles;
 
+	if (i->f->reconnect)
+		rport->reconnect_delay = i->f->reconnect_delay ?
+			*i->f->reconnect_delay : 10;
+	INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work);
+	rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ?
+		*i->f->fast_io_fail_tmo : 15;
+	rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60;
+	INIT_DELAYED_WORK(&rport->fast_io_fail_work,
+			  rport_fast_io_fail_timedout);
+	INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout);
+
 	id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id);
 	dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id);
 
@@ -259,6 +779,13 @@
 	transport_remove_device(dev);
 	device_del(dev);
 	transport_destroy_device(dev);
+
+	mutex_lock(&rport->mutex);
+	if (rport->state == SRP_RPORT_BLOCKED)
+		__rport_fail_io_fast(rport);
+	rport->deleted = true;
+	mutex_unlock(&rport->mutex);
+
 	put_device(dev);
 }
 EXPORT_SYMBOL_GPL(srp_rport_del);
@@ -310,6 +837,8 @@
 	if (!i)
 		return NULL;
 
+	i->t.eh_timed_out = srp_timed_out;
+
 	i->t.tsk_mgmt_response = srp_tsk_mgmt_response;
 	i->t.it_nexus_response = srp_it_nexus_response;
 
@@ -327,6 +856,15 @@
 	count = 0;
 	i->rport_attrs[count++] = &dev_attr_port_id;
 	i->rport_attrs[count++] = &dev_attr_roles;
+	if (ft->has_rport_state) {
+		i->rport_attrs[count++] = &dev_attr_state;
+		i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo;
+		i->rport_attrs[count++] = &dev_attr_dev_loss_tmo;
+	}
+	if (ft->reconnect) {
+		i->rport_attrs[count++] = &dev_attr_reconnect_delay;
+		i->rport_attrs[count++] = &dev_attr_failed_reconnects;
+	}
 	if (ft->rport_delete)
 		i->rport_attrs[count++] = &dev_attr_delete;
 	i->rport_attrs[count++] = NULL;
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 5eb4e31..da78875 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -230,6 +230,15 @@
 	MLX5_MAX_PAGE_SHIFT		= 31
 };
 
+enum {
+	MLX5_ADAPTER_PAGE_SHIFT		= 12
+};
+
+enum {
+	MLX5_CAP_OFF_DCT		= 41,
+	MLX5_CAP_OFF_CMDIF_CSUM		= 46,
+};
+
 struct mlx5_inbox_hdr {
 	__be16		opcode;
 	u8		rsvd[4];
@@ -319,9 +328,9 @@
 	u8	rsvd25[42];
 	__be16  log_uar_page_sz;
 	u8	rsvd26[28];
-	u8	log_msx_atomic_size_qp;
+	u8	log_max_atomic_size_qp;
 	u8	rsvd27[2];
-	u8	log_msx_atomic_size_dc;
+	u8	log_max_atomic_size_dc;
 	u8	rsvd28[76];
 };
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6b8c496..554548c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -483,6 +483,7 @@
 	struct rb_root		page_root;
 	int			fw_pages;
 	int			reg_pages;
+	struct list_head	free_list;
 
 	struct mlx5_core_health health;
 
@@ -557,9 +558,11 @@
 struct mlx5_cmd_work_ent {
 	struct mlx5_cmd_msg    *in;
 	struct mlx5_cmd_msg    *out;
+	void		       *uout;
+	int			uout_size;
 	mlx5_cmd_cbk_t		callback;
 	void		       *context;
-	int idx;
+	int			idx;
 	struct completion	done;
 	struct mlx5_cmd        *cmd;
 	struct work_struct	work;
@@ -570,6 +573,7 @@
 	u8			token;
 	struct timespec		ts1;
 	struct timespec		ts2;
+	u16			op;
 };
 
 struct mlx5_pas {
@@ -653,6 +657,9 @@
 int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
 int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
 		  int out_size);
+int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size,
+		     void *out, int out_size, mlx5_cmd_cbk_t callback,
+		     void *context);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
@@ -676,7 +683,9 @@
 int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
 		      u16 lwm, int is_srq);
 int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
-			  struct mlx5_create_mkey_mbox_in *in, int inlen);
+			  struct mlx5_create_mkey_mbox_in *in, int inlen,
+			  mlx5_cmd_cbk_t callback, void *context,
+			  struct mlx5_create_mkey_mbox_out *out);
 int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr);
 int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
 			 struct mlx5_query_mkey_mbox_out *out, int outlen);
@@ -745,6 +754,11 @@
 	return mkey_idx << 8;
 }
 
+static inline u8 mlx5_mkey_variant(u32 mkey)
+{
+	return mkey & 0xff;
+}
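
[Editorial note, not part of the patch] This helper complements the index accessor above: an mkey packs a 24-bit index in the upper bits and an 8-bit variant (key) in the low byte, so the two halves recombine as in this illustrative line:

	u32 mkey = (mkey_idx << 8) | key_variant;	/* inverse of the two accessors */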
+
 enum {
 	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
 	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 1,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e393171..979874c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -67,12 +67,14 @@
 	RDMA_NODE_IB_CA 	= 1,
 	RDMA_NODE_IB_SWITCH,
 	RDMA_NODE_IB_ROUTER,
-	RDMA_NODE_RNIC
+	RDMA_NODE_RNIC,
+	RDMA_NODE_USNIC,
 };
 
 enum rdma_transport_type {
 	RDMA_TRANSPORT_IB,
-	RDMA_TRANSPORT_IWARP
+	RDMA_TRANSPORT_IWARP,
+	RDMA_TRANSPORT_USNIC
 };
 
 enum rdma_transport_type
@@ -1436,6 +1438,7 @@
 
 	int			     uverbs_abi_ver;
 	u64			     uverbs_cmd_mask;
+	u64			     uverbs_ex_cmd_mask;
 
 	char			     node_desc[64];
 	__be64			     node_guid;
@@ -2384,4 +2387,17 @@
 			       struct ib_flow_attr *flow_attr, int domain);
 int ib_destroy_flow(struct ib_flow *flow_id);
 
+static inline int ib_check_mr_access(int flags)
+{
+	/*
+	 * Local write permission is required if remote write or
+	 * remote atomic permission is also requested.
+	 */
+	if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+	    !(flags & IB_ACCESS_LOCAL_WRITE))
+		return -EINVAL;
+
+	return 0;
+}
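
[Editorial note, not part of the patch] Verbs providers would gate MR registration on this check; e.g., early in a (hypothetical) registration path:

	int err = ib_check_mr_access(mr_access_flags);

	if (err)	/* e.g. REMOTE_WRITE requested without LOCAL_WRITE */
		return ERR_PTR(err);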
+
 #endif /* IB_VERBS_H */
diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h
index ff0f04a..4ebf6913 100644
--- a/include/scsi/scsi_transport_srp.h
+++ b/include/scsi/scsi_transport_srp.h
@@ -13,6 +13,27 @@
 	u8 roles;
 };
 
+/**
+ * enum srp_rport_state - SRP transport layer state
+ * @SRP_RPORT_RUNNING:   Transport layer operational.
+ * @SRP_RPORT_BLOCKED:   Transport layer not operational; fast I/O fail timer
+ *                       is running and I/O has been blocked.
+ * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
+ * @SRP_RPORT_LOST:      Device loss timer has expired; port is being removed.
+ */
+enum srp_rport_state {
+	SRP_RPORT_RUNNING,
+	SRP_RPORT_BLOCKED,
+	SRP_RPORT_FAIL_FAST,
+	SRP_RPORT_LOST,
+};
+
+/**
+ * struct srp_rport - SRP initiator / target port information
+ * @lld_data:          LLD private data.
+ * @mutex:             Protects against concurrent rport reconnect /
+ *                     fast_io_fail / dev_loss_tmo activity.
+ * @state:             rport state.
+ * @deleted:           Whether or not srp_rport_del() has already been invoked.
+ * @reconnect_delay:   Reconnect delay in seconds.
+ * @failed_reconnects: Number of failed reconnect attempts.
+ * @reconnect_work:    Work structure used for scheduling reconnect attempts.
+ * @fast_io_fail_tmo:  Fast I/O fail timeout in seconds.
+ * @dev_loss_tmo:      Device loss timeout in seconds.
+ * @fast_io_fail_work: Work structure used for scheduling fast I/O fail work.
+ * @dev_loss_work:     Work structure used for scheduling device loss work.
+ */
 struct srp_rport {
 	/* for initiator and target drivers */
 
@@ -23,11 +44,43 @@
 
 	/* for initiator drivers */
 
-	void *lld_data;	/* LLD private data */
+	void			*lld_data;
+
+	struct mutex		mutex;
+	enum srp_rport_state	state;
+	bool			deleted;
+	int			reconnect_delay;
+	int			failed_reconnects;
+	struct delayed_work	reconnect_work;
+	int			fast_io_fail_tmo;
+	int			dev_loss_tmo;
+	struct delayed_work	fast_io_fail_work;
+	struct delayed_work	dev_loss_work;
 };
 
+/**
+ * struct srp_function_template
+ * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and
+ *     dev_loss_tmo sysfs attributes for an rport.
+ * @reset_timer_if_blocked: Whether or not srp_timed_out() should reset the
+ *     command timer if the device on which it has been queued is blocked.
+ * @reconnect_delay: If not NULL, points to the default reconnect_delay value.
+ * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value.
+ * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value.
+ * @reconnect: Callback function for reconnecting to the target. See also
+ *     srp_reconnect_rport().
+ * @terminate_rport_io: Callback function for terminating all outstanding I/O
+ *     requests for an rport.
+ * @rport_delete: Callback function that removes an rport.
+ */
 struct srp_function_template {
 	/* for initiator drivers */
+	bool has_rport_state;
+	bool reset_timer_if_blocked;
+	int *reconnect_delay;
+	int *fast_io_fail_tmo;
+	int *dev_loss_tmo;
+	int (*reconnect)(struct srp_rport *rport);
+	void (*terminate_rport_io)(struct srp_rport *rport);
 	void (*rport_delete)(struct srp_rport *rport);
 	/* for target drivers */
 	int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int);
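
[Editorial note, not part of the patch] For orientation, a sketch of how an initiator driver might populate the initiator half of this template and hand it to srp_attach_transport(); all my_* names and default values are hypothetical:

	static int my_reconnect_delay = 10;
	static int my_fast_io_fail_tmo = 15;
	static int my_dev_loss_tmo = 60;

	static struct srp_function_template my_srp_tf = {
		.has_rport_state	= true,
		.reset_timer_if_blocked	= true,
		.reconnect_delay	= &my_reconnect_delay,
		.fast_io_fail_tmo	= &my_fast_io_fail_tmo,
		.dev_loss_tmo		= &my_dev_loss_tmo,
		.reconnect		= my_rport_reconnect,
		.terminate_rport_io	= my_terminate_io,
		.rport_delete		= my_rport_delete,
	};

	/* at module init: */
	my_scsi_transport_template = srp_attach_transport(&my_srp_tf);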
@@ -38,10 +91,36 @@
 srp_attach_transport(struct srp_function_template *);
 extern void srp_release_transport(struct scsi_transport_template *);
 
+extern void srp_rport_get(struct srp_rport *rport);
+extern void srp_rport_put(struct srp_rport *rport);
 extern struct srp_rport *srp_rport_add(struct Scsi_Host *,
 				       struct srp_rport_identifiers *);
 extern void srp_rport_del(struct srp_rport *);
-
+extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
+			 int dev_loss_tmo);
+extern int srp_reconnect_rport(struct srp_rport *rport);
+extern void srp_start_tl_fail_timers(struct srp_rport *rport);
 extern void srp_remove_host(struct Scsi_Host *);
 
+/**
+ * srp_chkready() - evaluate the transport layer state before I/O
+ * @rport: SRP target port pointer.
+ *
+ * Returns a SCSI result code that can be returned by the LLD queuecommand()
+ * implementation. The role of this function is similar to that of
+ * fc_remote_port_chkready().
+ */
+static inline int srp_chkready(struct srp_rport *rport)
+{
+	switch (rport->state) {
+	case SRP_RPORT_RUNNING:
+	case SRP_RPORT_BLOCKED:
+	default:
+		return 0;
+	case SRP_RPORT_FAIL_FAST:
+		return DID_TRANSPORT_FAILFAST << 16;
+	case SRP_RPORT_LOST:
+		return DID_NO_CONNECT << 16;
+	}
+}
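
[Editorial note, not part of the patch] Typical use at the top of an LLD queuecommand() implementation; the surrounding names are illustrative:

	static int my_queuecommand(struct Scsi_Host *shost,
				   struct scsi_cmnd *scmnd)
	{
		struct my_target_port *target = shost_priv(shost);
		int result = srp_chkready(target->rport);

		if (unlikely(result)) {
			scmnd->result = result;
			scmnd->scsi_done(scmnd);
			return 0;
		}
		/* ... build and post the SRP_CMD ... */
		return 0;
	}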
+
 #endif
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index e3ddd86..cbfdd4c 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -87,10 +87,11 @@
 	IB_USER_VERBS_CMD_CLOSE_XRCD,
 	IB_USER_VERBS_CMD_CREATE_XSRQ,
 	IB_USER_VERBS_CMD_OPEN_QP,
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-	IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
-	IB_USER_VERBS_CMD_DESTROY_FLOW
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
+};
+
+enum {
+	IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
+	IB_USER_VERBS_EX_CMD_DESTROY_FLOW
 };
 
 /*
@@ -122,22 +123,24 @@
  * the rest of the command struct based on these value.
  */
 
+#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff
+#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u
+#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24
+
+#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80
+
 struct ib_uverbs_cmd_hdr {
 	__u32 command;
 	__u16 in_words;
 	__u16 out_words;
 };
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-struct ib_uverbs_cmd_hdr_ex {
-	__u32 command;
-	__u16 in_words;
-	__u16 out_words;
+struct ib_uverbs_ex_cmd_hdr {
+	__u64 response;
 	__u16 provider_in_words;
 	__u16 provider_out_words;
 	__u32 cmd_hdr_reserved;
 };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
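
[Editorial note, not part of the patch] With the Kconfig guard removed, extended commands are now detected at run time rather than compile time: the low byte of the command word selects the command and the top byte carries flags. A decode sketch using the masks defined above (local variable names assumed):

	__u32 command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
	__u32 flags = (hdr.command & IB_USER_VERBS_CMD_FLAGS_MASK) >>
		      IB_USER_VERBS_CMD_FLAGS_SHIFT;

	if (flags & IB_USER_VERBS_CMD_FLAG_EXTENDED) {
		/* A struct ib_uverbs_ex_cmd_hdr follows the base header;
		 * its 'response' field replaces the inline response pointer
		 * that the old experimental command layout carried.
		 */
	}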
 
 struct ib_uverbs_get_context {
 	__u64 response;
@@ -700,62 +703,71 @@
 	__u64 driver_data[0];
 };
 
-#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING
-struct ib_kern_eth_filter {
+struct ib_uverbs_flow_spec_hdr {
+	__u32 type;
+	__u16 size;
+	__u16 reserved;
+	/* followed by flow_spec */
+	__u64 flow_spec_data[0];
+};
+
+struct ib_uverbs_flow_eth_filter {
 	__u8  dst_mac[6];
 	__u8  src_mac[6];
 	__be16 ether_type;
 	__be16 vlan_tag;
 };
 
-struct ib_kern_spec_eth {
-	__u32  type;
-	__u16  size;
-	__u16  reserved;
-	struct ib_kern_eth_filter val;
-	struct ib_kern_eth_filter mask;
-};
-
-struct ib_kern_ipv4_filter {
-	__be32 src_ip;
-	__be32 dst_ip;
-};
-
-struct ib_kern_spec_ipv4 {
-	__u32  type;
-	__u16  size;
-	__u16  reserved;
-	struct ib_kern_ipv4_filter val;
-	struct ib_kern_ipv4_filter mask;
-};
-
-struct ib_kern_tcp_udp_filter {
-	__be16 dst_port;
-	__be16 src_port;
-};
-
-struct ib_kern_spec_tcp_udp {
-	__u32  type;
-	__u16  size;
-	__u16  reserved;
-	struct ib_kern_tcp_udp_filter val;
-	struct ib_kern_tcp_udp_filter mask;
-};
-
-struct ib_kern_spec {
+struct ib_uverbs_flow_spec_eth {
 	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
 		struct {
 			__u32 type;
 			__u16 size;
 			__u16 reserved;
 		};
-		struct ib_kern_spec_eth	    eth;
-		struct ib_kern_spec_ipv4    ipv4;
-		struct ib_kern_spec_tcp_udp tcp_udp;
 	};
+	struct ib_uverbs_flow_eth_filter val;
+	struct ib_uverbs_flow_eth_filter mask;
 };
 
-struct ib_kern_flow_attr {
+struct ib_uverbs_flow_ipv4_filter {
+	__be32 src_ip;
+	__be32 dst_ip;
+};
+
+struct ib_uverbs_flow_spec_ipv4 {
+	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+	};
+	struct ib_uverbs_flow_ipv4_filter val;
+	struct ib_uverbs_flow_ipv4_filter mask;
+};
+
+struct ib_uverbs_flow_tcp_udp_filter {
+	__be16 dst_port;
+	__be16 src_port;
+};
+
+struct ib_uverbs_flow_spec_tcp_udp {
+	union {
+		struct ib_uverbs_flow_spec_hdr hdr;
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+	};
+	struct ib_uverbs_flow_tcp_udp_filter val;
+	struct ib_uverbs_flow_tcp_udp_filter mask;
+};
+
+struct ib_uverbs_flow_attr {
 	__u32 type;
 	__u16 size;
 	__u16 priority;
@@ -767,13 +779,13 @@
 	 * struct ib_flow_spec_xxx
 	 * struct ib_flow_spec_yyy
 	 */
+	struct ib_uverbs_flow_spec_hdr flow_specs[0];
 };
 
 struct ib_uverbs_create_flow  {
 	__u32 comp_mask;
-	__u64 response;
 	__u32 qp_handle;
-	struct ib_kern_flow_attr flow_attr;
+	struct ib_uverbs_flow_attr flow_attr;
 };
 
 struct ib_uverbs_create_flow_resp {
@@ -785,7 +797,6 @@
 	__u32 comp_mask;
 	__u32 flow_handle;
 };
-#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */
 
 struct ib_uverbs_create_srq {
 	__u64 response;