target: Drop lun_sep_lock for se_lun->lun_se_dev RCU usage

With se_port and t10_alua_tg_pt_gp_member being absored into se_lun,
there is no need for an extra lock to protect se_lun->lun_se_dev
assignment.

This patch also converts backend drivers to use call_rcu() release
to allow any se_device readers to complete.  The call_rcu() instead
of kfree_rcu() is required here because se_device is embedded into
the backend driver specific structure.

Also, convert se_lun->lun_stats to use atomic_long_t within the
target_complete_ok_work() completion callback, and add FIXME for
transport_lookup_tmr_lun() with se_lun->lun_ref.

Finally, update sbp_update_unit_directory() special case usage with
proper rcu_dereference_raw() and configfs symlink comment.

Reported-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Chris Boot <bootc@bootc.net>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index ef6fdd8..d8a5912 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1261,10 +1261,7 @@
 		return ret;
 
 	cmd->se_cmd_flags |= SCF_SUPPORTED_SAM_OPCODE;
-
-	spin_lock(&cmd->se_lun->lun_sep_lock);
-	cmd->se_lun->lun_stats.cmd_pdus++;
-	spin_unlock(&cmd->se_lun->lun_sep_lock);
+	atomic_long_inc(&cmd->se_lun->lun_stats.cmd_pdus);
 	return 0;
 }
 EXPORT_SYMBOL(target_setup_cmd_from_cdb);
@@ -2060,9 +2057,8 @@
 queue_rsp:
 	switch (cmd->data_direction) {
 	case DMA_FROM_DEVICE:
-		spin_lock(&cmd->se_lun->lun_sep_lock);
-		cmd->se_lun->lun_stats.tx_data_octets += cmd->data_length;
-		spin_unlock(&cmd->se_lun->lun_sep_lock);
+		atomic_long_add(cmd->data_length,
+				&cmd->se_lun->lun_stats.tx_data_octets);
 		/*
 		 * Perform READ_STRIP of PI using software emulation when
 		 * backend had PI enabled, if the transport will not be
@@ -2085,16 +2081,14 @@
 			goto queue_full;
 		break;
 	case DMA_TO_DEVICE:
-		spin_lock(&cmd->se_lun->lun_sep_lock);
-		cmd->se_lun->lun_stats.rx_data_octets += cmd->data_length;
-		spin_unlock(&cmd->se_lun->lun_sep_lock);
+		atomic_long_add(cmd->data_length,
+				&cmd->se_lun->lun_stats.rx_data_octets);
 		/*
 		 * Check if we need to send READ payload for BIDI-COMMAND
 		 */
 		if (cmd->se_cmd_flags & SCF_BIDI) {
-			spin_lock(&cmd->se_lun->lun_sep_lock);
-			cmd->se_lun->lun_stats.tx_data_octets += cmd->data_length;
-			spin_unlock(&cmd->se_lun->lun_sep_lock);
+			atomic_long_add(cmd->data_length,
+					&cmd->se_lun->lun_stats.tx_data_octets);
 			ret = cmd->se_tfo->queue_data_in(cmd);
 			if (ret == -EAGAIN || ret == -ENOMEM)
 				goto queue_full;