Merge tag 'iommu-updates-v4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu updates for from Joerg Roedel:
 "This time the IOMMU updates are mostly cleanups or fixes.  No big new
  features or drivers this time.  In particular the changes include:

   - Bigger cleanup of the Domain<->IOMMU data structures and the code
     that manages them in the Intel VT-d driver.  This makes the code
     easier to understand and maintain, and also easier to keep the data
     structures in sync.  It is also a preparation step to make use of
     default domains from the IOMMU core in the Intel VT-d driver.

   - Fixes for a couple of DMA-API misuses in ARM IOMMU drivers, namely
     in the ARM and Tegra SMMU drivers.

   - Fix for a potential buffer overflow in the OMAP iommu driver's
     debug code

   - A couple of smaller fixes and cleanups in various drivers

   - One small new feature: Report domain-id usage in the Intel VT-d
     driver to easier detect bugs where these are leaked"

* tag 'iommu-updates-v4.3' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: (83 commits)
  iommu/vt-d: Really use upper context table when necessary
  x86/vt-d: Fix documentation of DRHD
  iommu/fsl: Really fix init section(s) content
  iommu/io-pgtable-arm: Unmap and free table when overwriting with block
  iommu/io-pgtable-arm: Move init-fn declarations to io-pgtable.h
  iommu/msm: Use BUG_ON instead of if () BUG()
  iommu/vt-d: Access iomem correctly
  iommu/vt-d: Make two functions static
  iommu/vt-d: Use BUG_ON instead of if () BUG()
  iommu/vt-d: Return false instead of 0 in irq_remapping_cap()
  iommu/amd: Use BUG_ON instead of if () BUG()
  iommu/amd: Make a symbol static
  iommu/amd: Simplify allocation in irq_remapping_alloc()
  iommu/tegra-smmu: Parameterize number of TLB lines
  iommu/tegra-smmu: Factor out tegra_smmu_set_pde()
  iommu/tegra-smmu: Extract tegra_smmu_pte_get_use()
  iommu/tegra-smmu: Use __GFP_ZERO to allocate zeroed pages
  iommu/tegra-smmu: Remove PageReserved manipulation
  iommu/tegra-smmu: Convert to use DMA API
  iommu/tegra-smmu: smmu_flush_ptc() wants device addresses
  ...
diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.txt b/Documentation/devicetree/bindings/iommu/arm,smmu.txt
index 0676050..7180745 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.txt
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.txt
@@ -43,6 +43,12 @@
 
 ** System MMU optional properties:
 
+- dma-coherent  : Present if page table walks made by the SMMU are
+                  cache coherent with the CPU.
+
+                  NOTE: this only applies to the SMMU itself, not
+                  masters connected upstream of the SMMU.
+
 - calxeda,smmu-secure-config-access : Enable proper handling of buggy
                   implementations that always use secure access to
                   SMMU configuration registers. In this case non-secure
diff --git a/Documentation/devicetree/bindings/iommu/ti,omap-iommu.txt b/Documentation/devicetree/bindings/iommu/ti,omap-iommu.txt
index 42531dc..8696999 100644
--- a/Documentation/devicetree/bindings/iommu/ti,omap-iommu.txt
+++ b/Documentation/devicetree/bindings/iommu/ti,omap-iommu.txt
@@ -8,6 +8,11 @@
 - ti,hwmods  : Name of the hwmod associated with the IOMMU instance
 - reg        : Address space for the configuration registers
 - interrupts : Interrupt specifier for the IOMMU instance
+- #iommu-cells : Should be 0. OMAP IOMMUs are all "single-master" devices,
+                 and needs no additional data in the pargs specifier. Please
+                 also refer to the generic bindings document for more info
+                 on this property,
+                     Documentation/devicetree/bindings/iommu/iommu.txt
 
 Optional properties:
 - ti,#tlb-entries : Number of entries in the translation look-aside buffer.
@@ -18,6 +23,7 @@
 Example:
 	/* OMAP3 ISP MMU */
 	mmu_isp: mmu@480bd400 {
+		#iommu-cells = <0>;
 		compatible = "ti,omap2-iommu";
 		reg = <0x480bd400 0x80>;
 		interrupts = <24>;
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f491aec..4664c2a 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -23,7 +23,8 @@
 config IOMMU_IO_PGTABLE_LPAE
 	bool "ARMv7/v8 Long Descriptor Format"
 	select IOMMU_IO_PGTABLE
-	depends on ARM || ARM64 || COMPILE_TEST
+	# SWIOTLB guarantees a dma_to_phys() implementation
+	depends on ARM || ARM64 || (COMPILE_TEST && SWIOTLB)
 	help
 	  Enable support for the ARM long descriptor pagetable format.
 	  This allocator supports 4K/2M/1G, 16K/32M and 64K/512M page
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 658ee39..f82060e7 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1835,8 +1835,8 @@
 		free_gcr3_tbl_level2(domain->gcr3_tbl);
 	else if (domain->glx == 1)
 		free_gcr3_tbl_level1(domain->gcr3_tbl);
-	else if (domain->glx != 0)
-		BUG();
+	else
+		BUG_ON(domain->glx != 0);
 
 	free_page((unsigned long)domain->gcr3_tbl);
 }
@@ -3947,11 +3947,6 @@
 	if (ret < 0)
 		return ret;
 
-	ret = -ENOMEM;
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		goto out_free_parent;
-
 	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
 		if (get_irq_table(devid, true))
 			index = info->ioapic_pin;
@@ -3962,7 +3957,6 @@
 	}
 	if (index < 0) {
 		pr_warn("Failed to allocate IRTE\n");
-		kfree(data);
 		goto out_free_parent;
 	}
 
@@ -3974,17 +3968,18 @@
 			goto out_free_data;
 		}
 
-		if (i > 0) {
-			data = kzalloc(sizeof(*data), GFP_KERNEL);
-			if (!data)
-				goto out_free_data;
-		}
+		ret = -ENOMEM;
+		data = kzalloc(sizeof(*data), GFP_KERNEL);
+		if (!data)
+			goto out_free_data;
+
 		irq_data->hwirq = (devid << 16) + i;
 		irq_data->chip_data = data;
 		irq_data->chip = &amd_ir_chip;
 		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
 		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
 	}
+
 	return 0;
 
 out_free_data:
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index a24495e..5ef347a 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -154,7 +154,7 @@
 u32 amd_iommu_max_pasid __read_mostly = ~0;
 
 bool amd_iommu_v2_present __read_mostly;
-bool amd_iommu_pc_present __read_mostly;
+static bool amd_iommu_pc_present __read_mostly;
 
 bool amd_iommu_force_isolation __read_mostly;
 
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index f7b875b..1131664 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -356,8 +356,8 @@
 		free_pasid_states_level2(dev_state->states);
 	else if (dev_state->pasid_levels == 1)
 		free_pasid_states_level1(dev_state->states);
-	else if (dev_state->pasid_levels != 0)
-		BUG();
+	else
+		BUG_ON(dev_state->pasid_levels != 0);
 
 	free_page((unsigned long)dev_state->states);
 }
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index da902ba..dafaf59 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -118,6 +118,7 @@
 
 #define ARM_SMMU_IRQ_CTRL		0x50
 #define IRQ_CTRL_EVTQ_IRQEN		(1 << 2)
+#define IRQ_CTRL_PRIQ_IRQEN		(1 << 1)
 #define IRQ_CTRL_GERROR_IRQEN		(1 << 0)
 
 #define ARM_SMMU_IRQ_CTRLACK		0x54
@@ -173,14 +174,14 @@
 #define ARM_SMMU_PRIQ_IRQ_CFG2		0xdc
 
 /* Common MSI config fields */
-#define MSI_CFG0_SH_SHIFT		60
-#define MSI_CFG0_SH_NSH			(0UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_SH_OSH			(2UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_SH_ISH			(3UL << MSI_CFG0_SH_SHIFT)
-#define MSI_CFG0_MEMATTR_SHIFT		56
-#define MSI_CFG0_MEMATTR_DEVICE_nGnRE	(0x1 << MSI_CFG0_MEMATTR_SHIFT)
 #define MSI_CFG0_ADDR_SHIFT		2
 #define MSI_CFG0_ADDR_MASK		0x3fffffffffffUL
+#define MSI_CFG2_SH_SHIFT		4
+#define MSI_CFG2_SH_NSH			(0UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_OSH			(2UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_SH_ISH			(3UL << MSI_CFG2_SH_SHIFT)
+#define MSI_CFG2_MEMATTR_SHIFT		0
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE	(0x1 << MSI_CFG2_MEMATTR_SHIFT)
 
 #define Q_IDX(q, p)			((p) & ((1 << (q)->max_n_shift) - 1))
 #define Q_WRP(q, p)			((p) & (1 << (q)->max_n_shift))
@@ -1330,33 +1331,10 @@
 	arm_smmu_cmdq_issue_cmd(smmu, &cmd);
 }
 
-static void arm_smmu_flush_pgtable(void *addr, size_t size, void *cookie)
-{
-	struct arm_smmu_domain *smmu_domain = cookie;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
-
-	if (smmu->features & ARM_SMMU_FEAT_COHERENCY) {
-		dsb(ishst);
-	} else {
-		dma_addr_t dma_addr;
-		struct device *dev = smmu->dev;
-
-		dma_addr = dma_map_page(dev, virt_to_page(addr), offset, size,
-					DMA_TO_DEVICE);
-
-		if (dma_mapping_error(dev, dma_addr))
-			dev_err(dev, "failed to flush pgtable at %p\n", addr);
-		else
-			dma_unmap_page(dev, dma_addr, size, DMA_TO_DEVICE);
-	}
-}
-
 static struct iommu_gather_ops arm_smmu_gather_ops = {
 	.tlb_flush_all	= arm_smmu_tlb_inv_context,
 	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
 	.tlb_sync	= arm_smmu_tlb_sync,
-	.flush_pgtable	= arm_smmu_flush_pgtable,
 };
 
 /* IOMMU API */
@@ -1531,6 +1509,7 @@
 		.ias		= ias,
 		.oas		= oas,
 		.tlb		= &arm_smmu_gather_ops,
+		.iommu_dev	= smmu->dev,
 	};
 
 	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
@@ -2053,9 +2032,17 @@
 	int ret;
 	struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg;
 
-	/* Calculate the L1 size, capped to the SIDSIZE */
-	size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
-	size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+	/*
+	 * If we can resolve everything with a single L2 table, then we
+	 * just need a single L1 descriptor. Otherwise, calculate the L1
+	 * size, capped to the SIDSIZE.
+	 */
+	if (smmu->sid_bits < STRTAB_SPLIT) {
+		size = 0;
+	} else {
+		size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
+		size = min(size, smmu->sid_bits - STRTAB_SPLIT);
+	}
 	cfg->num_l1_ents = 1 << size;
 
 	size += STRTAB_SPLIT;
@@ -2198,6 +2185,7 @@
 static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 {
 	int ret, irq;
+	u32 irqen_flags = IRQ_CTRL_EVTQ_IRQEN | IRQ_CTRL_GERROR_IRQEN;
 
 	/* Disable IRQs first */
 	ret = arm_smmu_write_reg_sync(smmu, 0, ARM_SMMU_IRQ_CTRL,
@@ -2252,13 +2240,13 @@
 			if (IS_ERR_VALUE(ret))
 				dev_warn(smmu->dev,
 					 "failed to enable priq irq\n");
+			else
+				irqen_flags |= IRQ_CTRL_PRIQ_IRQEN;
 		}
 	}
 
 	/* Enable interrupt generation on the SMMU */
-	ret = arm_smmu_write_reg_sync(smmu,
-				      IRQ_CTRL_EVTQ_IRQEN |
-				      IRQ_CTRL_GERROR_IRQEN,
+	ret = arm_smmu_write_reg_sync(smmu, irqen_flags,
 				      ARM_SMMU_IRQ_CTRL, ARM_SMMU_IRQ_CTRLACK);
 	if (ret)
 		dev_warn(smmu->dev, "failed to enable irqs\n");
@@ -2540,12 +2528,12 @@
 	case IDR5_OAS_44_BIT:
 		smmu->oas = 44;
 		break;
+	default:
+		dev_info(smmu->dev,
+			"unknown output address size. Truncating to 48-bit\n");
+		/* Fallthrough */
 	case IDR5_OAS_48_BIT:
 		smmu->oas = 48;
-		break;
-	default:
-		dev_err(smmu->dev, "unknown output address size!\n");
-		return -ENXIO;
 	}
 
 	/* Set the DMA mask for our table walker */
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 4cd0c29..48a39df 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -37,6 +37,7 @@
 #include <linux/iopoll.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_address.h>
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -607,34 +608,10 @@
 	}
 }
 
-static void arm_smmu_flush_pgtable(void *addr, size_t size, void *cookie)
-{
-	struct arm_smmu_domain *smmu_domain = cookie;
-	struct arm_smmu_device *smmu = smmu_domain->smmu;
-	unsigned long offset = (unsigned long)addr & ~PAGE_MASK;
-
-
-	/* Ensure new page tables are visible to the hardware walker */
-	if (smmu->features & ARM_SMMU_FEAT_COHERENT_WALK) {
-		dsb(ishst);
-	} else {
-		/*
-		 * If the SMMU can't walk tables in the CPU caches, treat them
-		 * like non-coherent DMA since we need to flush the new entries
-		 * all the way out to memory. There's no possibility of
-		 * recursion here as the SMMU table walker will not be wired
-		 * through another SMMU.
-		 */
-		dma_map_page(smmu->dev, virt_to_page(addr), offset, size,
-			     DMA_TO_DEVICE);
-	}
-}
-
 static struct iommu_gather_ops arm_smmu_gather_ops = {
 	.tlb_flush_all	= arm_smmu_tlb_inv_context,
 	.tlb_add_flush	= arm_smmu_tlb_inv_range_nosync,
 	.tlb_sync	= arm_smmu_tlb_sync,
-	.flush_pgtable	= arm_smmu_flush_pgtable,
 };
 
 static irqreturn_t arm_smmu_context_fault(int irq, void *dev)
@@ -898,6 +875,7 @@
 		.ias		= ias,
 		.oas		= oas,
 		.tlb		= &arm_smmu_gather_ops,
+		.iommu_dev	= smmu->dev,
 	};
 
 	smmu_domain->smmu = smmu;
@@ -1532,6 +1510,7 @@
 	unsigned long size;
 	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
 	u32 id;
+	bool cttw_dt, cttw_reg;
 
 	dev_notice(smmu->dev, "probing hardware configuration...\n");
 	dev_notice(smmu->dev, "SMMUv%d with:\n", smmu->version);
@@ -1571,10 +1550,22 @@
 		dev_notice(smmu->dev, "\taddress translation ops\n");
 	}
 
-	if (id & ID0_CTTW) {
+	/*
+	 * In order for DMA API calls to work properly, we must defer to what
+	 * the DT says about coherency, regardless of what the hardware claims.
+	 * Fortunately, this also opens up a workaround for systems where the
+	 * ID register value has ended up configured incorrectly.
+	 */
+	cttw_dt = of_dma_is_coherent(smmu->dev->of_node);
+	cttw_reg = !!(id & ID0_CTTW);
+	if (cttw_dt)
 		smmu->features |= ARM_SMMU_FEAT_COHERENT_WALK;
-		dev_notice(smmu->dev, "\tcoherent table walk\n");
-	}
+	if (cttw_dt || cttw_reg)
+		dev_notice(smmu->dev, "\t%scoherent table walk\n",
+			   cttw_dt ? "" : "non-");
+	if (cttw_dt != cttw_reg)
+		dev_notice(smmu->dev,
+			   "\t(IDR0.CTTW overridden by dma-coherent property)\n");
 
 	if (id & ID0_SMS) {
 		u32 smr, sid, mask;
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index c9db04d4..8757f8d 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1068,7 +1068,7 @@
 	if (intel_iommu_enabled)
 		iommu->iommu_dev = iommu_device_create(NULL, iommu,
 						       intel_iommu_groups,
-						       iommu->name);
+						       "%s", iommu->name);
 
 	return 0;
 
diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c
index abeedc9..2570f2a2 100644
--- a/drivers/iommu/fsl_pamu.c
+++ b/drivers/iommu/fsl_pamu.c
@@ -41,7 +41,6 @@
 
 static struct paace *ppaact;
 static struct paace *spaact;
-static struct ome *omt __initdata;
 
 /*
  * Table for matching compatible strings, for device tree
@@ -50,7 +49,7 @@
  * SOCs. For the older SOCs "fsl,qoriq-device-config-1.0"
  * string would be used.
  */
-static const struct of_device_id guts_device_ids[] __initconst = {
+static const struct of_device_id guts_device_ids[] = {
 	{ .compatible = "fsl,qoriq-device-config-1.0", },
 	{ .compatible = "fsl,qoriq-device-config-2.0", },
 	{}
@@ -599,7 +598,7 @@
  * Memory accesses to QMAN and BMAN private memory need not be coherent, so
  * clear the PAACE entry coherency attribute for them.
  */
-static void __init setup_qbman_paace(struct paace *ppaace, int  paace_type)
+static void setup_qbman_paace(struct paace *ppaace, int  paace_type)
 {
 	switch (paace_type) {
 	case QMAN_PAACE:
@@ -629,7 +628,7 @@
  * this table to translate device transaction to appropriate corenet
  * transaction.
  */
-static void __init setup_omt(struct ome *omt)
+static void setup_omt(struct ome *omt)
 {
 	struct ome *ome;
 
@@ -666,7 +665,7 @@
  * Get the maximum number of PAACT table entries
  * and subwindows supported by PAMU
  */
-static void __init get_pamu_cap_values(unsigned long pamu_reg_base)
+static void get_pamu_cap_values(unsigned long pamu_reg_base)
 {
 	u32 pc_val;
 
@@ -676,9 +675,9 @@
 }
 
 /* Setup PAMU registers pointing to PAACT, SPAACT and OMT */
-static int __init setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
-				 phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
-				 phys_addr_t omt_phys)
+static int setup_one_pamu(unsigned long pamu_reg_base, unsigned long pamu_reg_size,
+			  phys_addr_t ppaact_phys, phys_addr_t spaact_phys,
+			  phys_addr_t omt_phys)
 {
 	u32 *pc;
 	struct pamu_mmap_regs *pamu_regs;
@@ -720,7 +719,7 @@
 }
 
 /* Enable all device LIODNS */
-static void __init setup_liodns(void)
+static void setup_liodns(void)
 {
 	int i, len;
 	struct paace *ppaace;
@@ -846,7 +845,7 @@
 /*
  * Create a coherence subdomain for a given memory block.
  */
-static int __init create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
+static int create_csd(phys_addr_t phys, size_t size, u32 csd_port_id)
 {
 	struct device_node *np;
 	const __be32 *iprop;
@@ -988,7 +987,7 @@
 static const struct {
 	u32 svr;
 	u32 port_id;
-} port_id_map[] __initconst = {
+} port_id_map[] = {
 	{(SVR_P2040 << 8) | 0x10, 0xFF000000},	/* P2040 1.0 */
 	{(SVR_P2040 << 8) | 0x11, 0xFF000000},	/* P2040 1.1 */
 	{(SVR_P2041 << 8) | 0x10, 0xFF000000},	/* P2041 1.0 */
@@ -1006,7 +1005,7 @@
 
 #define SVR_SECURITY	0x80000	/* The Security (E) bit */
 
-static int __init fsl_pamu_probe(struct platform_device *pdev)
+static int fsl_pamu_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	void __iomem *pamu_regs = NULL;
@@ -1022,6 +1021,7 @@
 	int irq;
 	phys_addr_t ppaact_phys;
 	phys_addr_t spaact_phys;
+	struct ome *omt;
 	phys_addr_t omt_phys;
 	size_t mem_size = 0;
 	unsigned int order = 0;
@@ -1200,7 +1200,7 @@
 	return ret;
 }
 
-static struct platform_driver fsl_of_pamu_driver __initdata = {
+static struct platform_driver fsl_of_pamu_driver = {
 	.driver = {
 		.name = "fsl-of-pamu",
 	},
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c82ebee..2d7349a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -364,7 +364,8 @@
 static struct dmar_domain *si_domain;
 static int hw_pass_through = 1;
 
-/* domain represents a virtual machine, more than one devices
+/*
+ * Domain represents a virtual machine, more than one devices
  * across iommus may be owned in one domain, e.g. kvm guest.
  */
 #define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
@@ -372,11 +373,21 @@
 /* si_domain contains mulitple devices */
 #define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
 
+#define for_each_domain_iommu(idx, domain)			\
+	for (idx = 0; idx < g_num_of_iommus; idx++)		\
+		if (domain->iommu_refcnt[idx])
+
 struct dmar_domain {
-	int	id;			/* domain id */
 	int	nid;			/* node id */
-	DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
-					/* bitmap of iommus this domain uses*/
+
+	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
+					/* Refcount of devices per iommu */
+
+
+	u16		iommu_did[DMAR_UNITS_SUPPORTED];
+					/* Domain ids per IOMMU. Use u16 since
+					 * domain ids are 16 bit wide according
+					 * to VT-d spec, section 9.3 */
 
 	struct list_head devices;	/* all devices' list */
 	struct iova_domain iovad;	/* iova's that belong to this domain */
@@ -395,7 +406,6 @@
 	int		iommu_superpage;/* Level of superpages supported:
 					   0 == 4KiB (no superpages), 1 == 2MiB,
 					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
-	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 	u64		max_addr;	/* maximum mapped address */
 
 	struct iommu_domain domain;	/* generic domain data structure for
@@ -465,10 +475,11 @@
 
 static void domain_exit(struct dmar_domain *domain);
 static void domain_remove_dev_info(struct dmar_domain *domain);
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-				       struct device *dev);
-static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
-					   struct device *dev);
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+				     struct device *dev);
+static void __dmar_remove_one_dev_info(struct device_domain_info *info);
+static void domain_context_clear(struct intel_iommu *iommu,
+				 struct device *dev);
 static int domain_detach_iommu(struct dmar_domain *domain,
 			       struct intel_iommu *iommu);
 
@@ -568,6 +579,36 @@
 static struct kmem_cache *iommu_domain_cache;
 static struct kmem_cache *iommu_devinfo_cache;
 
+static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
+{
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	domains = iommu->domains[idx];
+	if (!domains)
+		return NULL;
+
+	return domains[did & 0xff];
+}
+
+static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
+			     struct dmar_domain *domain)
+{
+	struct dmar_domain **domains;
+	int idx = did >> 8;
+
+	if (!iommu->domains[idx]) {
+		size_t size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
+	}
+
+	domains = iommu->domains[idx];
+	if (WARN_ON(!domains))
+		return;
+	else
+		domains[did & 0xff] = domain;
+}
+
 static inline void *alloc_pgtable_page(int node)
 {
 	struct page *page;
@@ -609,6 +650,11 @@
 	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 }
 
+static inline int domain_type_is_si(struct dmar_domain *domain)
+{
+	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
+}
+
 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 {
 	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
@@ -663,7 +709,9 @@
 
 	/* si_domain and vm domain should not get here. */
 	BUG_ON(domain_type_is_vm_or_si(domain));
-	iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
+	for_each_domain_iommu(iommu_id, domain)
+		break;
+
 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 		return NULL;
 
@@ -679,7 +727,7 @@
 
 	domain->iommu_coherency = 1;
 
-	for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
+	for_each_domain_iommu(i, domain) {
 		found = true;
 		if (!ecap_coherent(g_iommus[i]->ecap)) {
 			domain->iommu_coherency = 0;
@@ -759,6 +807,7 @@
 	struct context_entry *context;
 	u64 *entry;
 
+	entry = &root->lo;
 	if (ecs_enabled(iommu)) {
 		if (devfn >= 0x80) {
 			devfn -= 0x80;
@@ -766,7 +815,6 @@
 		}
 		devfn *= 2;
 	}
-	entry = &root->lo;
 	if (*entry & 1)
 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 	else {
@@ -1166,9 +1214,9 @@
 /* We can't just free the pages because the IOMMU may still be walking
    the page tables, and may have cached the intermediate levels. The
    pages can only be freed after the IOTLB flush has been done. */
-struct page *domain_unmap(struct dmar_domain *domain,
-			  unsigned long start_pfn,
-			  unsigned long last_pfn)
+static struct page *domain_unmap(struct dmar_domain *domain,
+				 unsigned long start_pfn,
+				 unsigned long last_pfn)
 {
 	struct page *freelist = NULL;
 
@@ -1192,7 +1240,7 @@
 	return freelist;
 }
 
-void dma_free_pagelist(struct page *freelist)
+static void dma_free_pagelist(struct page *freelist)
 {
 	struct page *pg;
 
@@ -1360,24 +1408,23 @@
 			 u8 bus, u8 devfn)
 {
 	bool found = false;
-	unsigned long flags;
 	struct device_domain_info *info;
 	struct pci_dev *pdev;
 
+	assert_spin_locked(&device_domain_lock);
+
 	if (!ecap_dev_iotlb_support(iommu->ecap))
 		return NULL;
 
 	if (!iommu->qi)
 		return NULL;
 
-	spin_lock_irqsave(&device_domain_lock, flags);
 	list_for_each_entry(info, &domain->devices, link)
 		if (info->iommu == iommu && info->bus == bus &&
 		    info->devfn == devfn) {
 			found = true;
 			break;
 		}
-	spin_unlock_irqrestore(&device_domain_lock, flags);
 
 	if (!found || !info->dev || !dev_is_pci(info->dev))
 		return NULL;
@@ -1436,11 +1483,14 @@
 	spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
-static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-				  unsigned long pfn, unsigned int pages, int ih, int map)
+static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
+				  struct dmar_domain *domain,
+				  unsigned long pfn, unsigned int pages,
+				  int ih, int map)
 {
 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
+	u16 did = domain->iommu_did[iommu->seq_id];
 
 	BUG_ON(pages == 0);
 
@@ -1464,7 +1514,8 @@
 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
 	 */
 	if (!cap_caching_mode(iommu->cap) || !map)
-		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
+		iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
+				      addr, mask);
 }
 
 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
@@ -1519,65 +1570,80 @@
 
 static int iommu_init_domains(struct intel_iommu *iommu)
 {
-	unsigned long ndomains;
-	unsigned long nlongs;
+	u32 ndomains, nlongs;
+	size_t size;
 
 	ndomains = cap_ndoms(iommu->cap);
-	pr_debug("%s: Number of Domains supported <%ld>\n",
+	pr_debug("%s: Number of Domains supported <%d>\n",
 		 iommu->name, ndomains);
 	nlongs = BITS_TO_LONGS(ndomains);
 
 	spin_lock_init(&iommu->lock);
 
-	/* TBD: there might be 64K domains,
-	 * consider other allocation for future chip
-	 */
 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
 	if (!iommu->domain_ids) {
 		pr_err("%s: Allocating domain id array failed\n",
 		       iommu->name);
 		return -ENOMEM;
 	}
-	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
-			GFP_KERNEL);
-	if (!iommu->domains) {
+
+	size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
+	iommu->domains = kzalloc(size, GFP_KERNEL);
+
+	if (iommu->domains) {
+		size = 256 * sizeof(struct dmar_domain *);
+		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
+	}
+
+	if (!iommu->domains || !iommu->domains[0]) {
 		pr_err("%s: Allocating domain array failed\n",
 		       iommu->name);
 		kfree(iommu->domain_ids);
+		kfree(iommu->domains);
 		iommu->domain_ids = NULL;
+		iommu->domains    = NULL;
 		return -ENOMEM;
 	}
 
+
+
 	/*
-	 * if Caching mode is set, then invalid translations are tagged
-	 * with domainid 0. Hence we need to pre-allocate it.
+	 * If Caching mode is set, then invalid translations are tagged
+	 * with domain-id 0, hence we need to pre-allocate it. We also
+	 * use domain-id 0 as a marker for non-allocated domain-id, so
+	 * make sure it is not used for a real domain.
 	 */
-	if (cap_caching_mode(iommu->cap))
-		set_bit(0, iommu->domain_ids);
+	set_bit(0, iommu->domain_ids);
+
 	return 0;
 }
 
 static void disable_dmar_iommu(struct intel_iommu *iommu)
 {
-	struct dmar_domain *domain;
-	int i;
+	struct device_domain_info *info, *tmp;
+	unsigned long flags;
 
-	if ((iommu->domains) && (iommu->domain_ids)) {
-		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
-			/*
-			 * Domain id 0 is reserved for invalid translation
-			 * if hardware supports caching mode.
-			 */
-			if (cap_caching_mode(iommu->cap) && i == 0)
-				continue;
+	if (!iommu->domains || !iommu->domain_ids)
+		return;
 
-			domain = iommu->domains[i];
-			clear_bit(i, iommu->domain_ids);
-			if (domain_detach_iommu(domain, iommu) == 0 &&
-			    !domain_type_is_vm(domain))
-				domain_exit(domain);
-		}
+	spin_lock_irqsave(&device_domain_lock, flags);
+	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
+		struct dmar_domain *domain;
+
+		if (info->iommu != iommu)
+			continue;
+
+		if (!info->dev || !info->domain)
+			continue;
+
+		domain = info->domain;
+
+		dmar_remove_one_dev_info(domain, info->dev);
+
+		if (!domain_type_is_vm_or_si(domain))
+			domain_exit(domain);
 	}
+	spin_unlock_irqrestore(&device_domain_lock, flags);
 
 	if (iommu->gcmd & DMA_GCMD_TE)
 		iommu_disable_translation(iommu);
@@ -1586,6 +1652,11 @@
 static void free_dmar_iommu(struct intel_iommu *iommu)
 {
 	if ((iommu->domains) && (iommu->domain_ids)) {
+		int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
+		int i;
+
+		for (i = 0; i < elems; i++)
+			kfree(iommu->domains[i]);
 		kfree(iommu->domains);
 		kfree(iommu->domain_ids);
 		iommu->domains = NULL;
@@ -1600,8 +1671,6 @@
 
 static struct dmar_domain *alloc_domain(int flags)
 {
-	/* domain id for virtual machine, it won't be set in context */
-	static atomic_t vm_domid = ATOMIC_INIT(0);
 	struct dmar_domain *domain;
 
 	domain = alloc_domain_mem();
@@ -1611,111 +1680,64 @@
 	memset(domain, 0, sizeof(*domain));
 	domain->nid = -1;
 	domain->flags = flags;
-	spin_lock_init(&domain->iommu_lock);
 	INIT_LIST_HEAD(&domain->devices);
-	if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
-		domain->id = atomic_inc_return(&vm_domid);
 
 	return domain;
 }
 
-static int __iommu_attach_domain(struct dmar_domain *domain,
-				 struct intel_iommu *iommu)
-{
-	int num;
-	unsigned long ndomains;
-
-	ndomains = cap_ndoms(iommu->cap);
-	num = find_first_zero_bit(iommu->domain_ids, ndomains);
-	if (num < ndomains) {
-		set_bit(num, iommu->domain_ids);
-		iommu->domains[num] = domain;
-	} else {
-		num = -ENOSPC;
-	}
-
-	return num;
-}
-
-static int iommu_attach_domain(struct dmar_domain *domain,
+/* Must be called with iommu->lock */
+static int domain_attach_iommu(struct dmar_domain *domain,
 			       struct intel_iommu *iommu)
 {
-	int num;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	num = __iommu_attach_domain(domain, iommu);
-	spin_unlock_irqrestore(&iommu->lock, flags);
-	if (num < 0)
-		pr_err("%s: No free domain ids\n", iommu->name);
-
-	return num;
-}
-
-static int iommu_attach_vm_domain(struct dmar_domain *domain,
-				  struct intel_iommu *iommu)
-{
-	int num;
 	unsigned long ndomains;
+	int num;
 
-	ndomains = cap_ndoms(iommu->cap);
-	for_each_set_bit(num, iommu->domain_ids, ndomains)
-		if (iommu->domains[num] == domain)
-			return num;
+	assert_spin_locked(&device_domain_lock);
+	assert_spin_locked(&iommu->lock);
 
-	return __iommu_attach_domain(domain, iommu);
-}
-
-static void iommu_detach_domain(struct dmar_domain *domain,
-				struct intel_iommu *iommu)
-{
-	unsigned long flags;
-	int num, ndomains;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	if (domain_type_is_vm_or_si(domain)) {
+	domain->iommu_refcnt[iommu->seq_id] += 1;
+	domain->iommu_count += 1;
+	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
 		ndomains = cap_ndoms(iommu->cap);
-		for_each_set_bit(num, iommu->domain_ids, ndomains) {
-			if (iommu->domains[num] == domain) {
-				clear_bit(num, iommu->domain_ids);
-				iommu->domains[num] = NULL;
-				break;
-			}
+		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
+
+		if (num >= ndomains) {
+			pr_err("%s: No free domain ids\n", iommu->name);
+			domain->iommu_refcnt[iommu->seq_id] -= 1;
+			domain->iommu_count -= 1;
+			return -ENOSPC;
 		}
-	} else {
-		clear_bit(domain->id, iommu->domain_ids);
-		iommu->domains[domain->id] = NULL;
-	}
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
 
-static void domain_attach_iommu(struct dmar_domain *domain,
-			       struct intel_iommu *iommu)
-{
-	unsigned long flags;
+		set_bit(num, iommu->domain_ids);
+		set_iommu_domain(iommu, num, domain);
 
-	spin_lock_irqsave(&domain->iommu_lock, flags);
-	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
-		domain->iommu_count++;
-		if (domain->iommu_count == 1)
-			domain->nid = iommu->node;
+		domain->iommu_did[iommu->seq_id] = num;
+		domain->nid			 = iommu->node;
+
 		domain_update_iommu_cap(domain);
 	}
-	spin_unlock_irqrestore(&domain->iommu_lock, flags);
+
+	return 0;
 }
 
 static int domain_detach_iommu(struct dmar_domain *domain,
 			       struct intel_iommu *iommu)
 {
-	unsigned long flags;
-	int count = INT_MAX;
+	int num, count = INT_MAX;
 
-	spin_lock_irqsave(&domain->iommu_lock, flags);
-	if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
-		count = --domain->iommu_count;
+	assert_spin_locked(&device_domain_lock);
+	assert_spin_locked(&iommu->lock);
+
+	domain->iommu_refcnt[iommu->seq_id] -= 1;
+	count = --domain->iommu_count;
+	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
+		num = domain->iommu_did[iommu->seq_id];
+		clear_bit(num, iommu->domain_ids);
+		set_iommu_domain(iommu, num, NULL);
+
 		domain_update_iommu_cap(domain);
+		domain->iommu_did[iommu->seq_id] = 0;
 	}
-	spin_unlock_irqrestore(&domain->iommu_lock, flags);
 
 	return count;
 }
@@ -1782,9 +1804,9 @@
 	return agaw;
 }
 
-static int domain_init(struct dmar_domain *domain, int guest_width)
+static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
+		       int guest_width)
 {
-	struct intel_iommu *iommu;
 	int adjust_width, agaw;
 	unsigned long sagaw;
 
@@ -1793,7 +1815,6 @@
 	domain_reserve_special_ranges(domain);
 
 	/* calculate AGAW */
-	iommu = domain_get_iommu(domain);
 	if (guest_width > cap_mgaw(iommu->cap))
 		guest_width = cap_mgaw(iommu->cap);
 	domain->gaw = guest_width;
@@ -1836,8 +1857,6 @@
 
 static void domain_exit(struct dmar_domain *domain)
 {
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
 	struct page *freelist = NULL;
 
 	/* Domain 0 is reserved, so dont process it */
@@ -1848,22 +1867,16 @@
 	if (!intel_iommu_strict)
 		flush_unmaps_timeout(0);
 
-	/* remove associated devices */
+	/* Remove associated devices and clear attached or cached domains */
+	rcu_read_lock();
 	domain_remove_dev_info(domain);
+	rcu_read_unlock();
 
 	/* destroy iovas */
 	put_iova_domain(&domain->iovad);
 
 	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
-	/* clear attached or cached domains */
-	rcu_read_lock();
-	for_each_active_iommu(iommu, drhd)
-		if (domain_type_is_vm(domain) ||
-		    test_bit(iommu->seq_id, domain->iommu_bmp))
-			iommu_detach_domain(domain, iommu);
-	rcu_read_unlock();
-
 	dma_free_pagelist(freelist);
 
 	free_domain_mem(domain);
@@ -1871,79 +1884,68 @@
 
 static int domain_context_mapping_one(struct dmar_domain *domain,
 				      struct intel_iommu *iommu,
-				      u8 bus, u8 devfn, int translation)
+				      u8 bus, u8 devfn)
 {
+	u16 did = domain->iommu_did[iommu->seq_id];
+	int translation = CONTEXT_TT_MULTI_LEVEL;
+	struct device_domain_info *info = NULL;
 	struct context_entry *context;
 	unsigned long flags;
 	struct dma_pte *pgd;
-	int id;
-	int agaw;
-	struct device_domain_info *info = NULL;
+	int ret, agaw;
+
+	WARN_ON(did == 0);
+
+	if (hw_pass_through && domain_type_is_si(domain))
+		translation = CONTEXT_TT_PASS_THROUGH;
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 
 	BUG_ON(!domain->pgd);
-	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
-	       translation != CONTEXT_TT_MULTI_LEVEL);
 
-	spin_lock_irqsave(&iommu->lock, flags);
+	spin_lock_irqsave(&device_domain_lock, flags);
+	spin_lock(&iommu->lock);
+
+	ret = -ENOMEM;
 	context = iommu_context_addr(iommu, bus, devfn, 1);
-	spin_unlock_irqrestore(&iommu->lock, flags);
 	if (!context)
-		return -ENOMEM;
-	spin_lock_irqsave(&iommu->lock, flags);
-	if (context_present(context)) {
-		spin_unlock_irqrestore(&iommu->lock, flags);
-		return 0;
-	}
+		goto out_unlock;
 
-	context_clear_entry(context);
+	ret = 0;
+	if (context_present(context))
+		goto out_unlock;
 
-	id = domain->id;
 	pgd = domain->pgd;
 
-	if (domain_type_is_vm_or_si(domain)) {
-		if (domain_type_is_vm(domain)) {
-			id = iommu_attach_vm_domain(domain, iommu);
-			if (id < 0) {
-				spin_unlock_irqrestore(&iommu->lock, flags);
-				pr_err("%s: No free domain ids\n", iommu->name);
-				return -EFAULT;
-			}
-		}
+	context_clear_entry(context);
+	context_set_domain_id(context, did);
 
-		/* Skip top levels of page tables for
-		 * iommu which has less agaw than default.
-		 * Unnecessary for PT mode.
-		 */
-		if (translation != CONTEXT_TT_PASS_THROUGH) {
-			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
-				pgd = phys_to_virt(dma_pte_addr(pgd));
-				if (!dma_pte_present(pgd)) {
-					spin_unlock_irqrestore(&iommu->lock, flags);
-					return -ENOMEM;
-				}
-			}
-		}
-	}
-
-	context_set_domain_id(context, id);
-
+	/*
+	 * Skip top levels of page tables for iommu which has less agaw
+	 * than default.  Unnecessary for PT mode.
+	 */
 	if (translation != CONTEXT_TT_PASS_THROUGH) {
+		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
+			ret = -ENOMEM;
+			pgd = phys_to_virt(dma_pte_addr(pgd));
+			if (!dma_pte_present(pgd))
+				goto out_unlock;
+		}
+
 		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
 		translation = info ? CONTEXT_TT_DEV_IOTLB :
 				     CONTEXT_TT_MULTI_LEVEL;
-	}
-	/*
-	 * In pass through mode, AW must be programmed to indicate the largest
-	 * AGAW value supported by hardware. And ASR is ignored by hardware.
-	 */
-	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
-		context_set_address_width(context, iommu->msagaw);
-	else {
+
 		context_set_address_root(context, virt_to_phys(pgd));
 		context_set_address_width(context, iommu->agaw);
+	} else {
+		/*
+		 * In pass through mode, AW must be programmed to
+		 * indicate the largest AGAW value supported by
+		 * hardware. And ASR is ignored by hardware.
+		 */
+		context_set_address_width(context, iommu->msagaw);
 	}
 
 	context_set_translation_type(context, translation);
@@ -1962,14 +1964,17 @@
 					   (((u16)bus) << 8) | devfn,
 					   DMA_CCMD_MASK_NOBIT,
 					   DMA_CCMD_DEVICE_INVL);
-		iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
+		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
 	} else {
 		iommu_flush_write_buffer(iommu);
 	}
 	iommu_enable_dev_iotlb(info);
-	spin_unlock_irqrestore(&iommu->lock, flags);
 
-	domain_attach_iommu(domain, iommu);
+	ret = 0;
+
+out_unlock:
+	spin_unlock(&iommu->lock);
+	spin_unlock_irqrestore(&device_domain_lock, flags);
 
 	return 0;
 }
@@ -1977,7 +1982,6 @@
 struct domain_context_mapping_data {
 	struct dmar_domain *domain;
 	struct intel_iommu *iommu;
-	int translation;
 };
 
 static int domain_context_mapping_cb(struct pci_dev *pdev,
@@ -1986,13 +1990,11 @@
 	struct domain_context_mapping_data *data = opaque;
 
 	return domain_context_mapping_one(data->domain, data->iommu,
-					  PCI_BUS_NUM(alias), alias & 0xff,
-					  data->translation);
+					  PCI_BUS_NUM(alias), alias & 0xff);
 }
 
 static int
-domain_context_mapping(struct dmar_domain *domain, struct device *dev,
-		       int translation)
+domain_context_mapping(struct dmar_domain *domain, struct device *dev)
 {
 	struct intel_iommu *iommu;
 	u8 bus, devfn;
@@ -2003,12 +2005,10 @@
 		return -ENODEV;
 
 	if (!dev_is_pci(dev))
-		return domain_context_mapping_one(domain, iommu, bus, devfn,
-						  translation);
+		return domain_context_mapping_one(domain, iommu, bus, devfn);
 
 	data.domain = domain;
 	data.iommu = iommu;
-	data.translation = translation;
 
 	return pci_for_each_dma_alias(to_pci_dev(dev),
 				      &domain_context_mapping_cb, &data);
@@ -2194,7 +2194,7 @@
 	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
 }
 
-static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
+static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
 {
 	if (!iommu)
 		return;
@@ -2220,21 +2220,8 @@
 	unsigned long flags;
 
 	spin_lock_irqsave(&device_domain_lock, flags);
-	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
-		unlink_domain_info(info);
-		spin_unlock_irqrestore(&device_domain_lock, flags);
-
-		iommu_disable_dev_iotlb(info);
-		iommu_detach_dev(info->iommu, info->bus, info->devfn);
-
-		if (domain_type_is_vm(domain)) {
-			iommu_detach_dependent_devices(info->iommu, info->dev);
-			domain_detach_iommu(domain, info->iommu);
-		}
-
-		free_devinfo_mem(info);
-		spin_lock_irqsave(&device_domain_lock, flags);
-	}
+	list_for_each_entry_safe(info, tmp, &domain->devices, link)
+		__dmar_remove_one_dev_info(info);
 	spin_unlock_irqrestore(&device_domain_lock, flags);
 }
 
@@ -2266,14 +2253,15 @@
 	return NULL;
 }
 
-static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
-						int bus, int devfn,
-						struct device *dev,
-						struct dmar_domain *domain)
+static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
+						    int bus, int devfn,
+						    struct device *dev,
+						    struct dmar_domain *domain)
 {
 	struct dmar_domain *found = NULL;
 	struct device_domain_info *info;
 	unsigned long flags;
+	int ret;
 
 	info = alloc_devinfo_mem();
 	if (!info)
@@ -2290,12 +2278,16 @@
 	spin_lock_irqsave(&device_domain_lock, flags);
 	if (dev)
 		found = find_domain(dev);
-	else {
+
+	if (!found) {
 		struct device_domain_info *info2;
 		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
-		if (info2)
-			found = info2->domain;
+		if (info2) {
+			found      = info2->domain;
+			info2->dev = dev;
+		}
 	}
+
 	if (found) {
 		spin_unlock_irqrestore(&device_domain_lock, flags);
 		free_devinfo_mem(info);
@@ -2303,12 +2295,27 @@
 		return found;
 	}
 
+	spin_lock(&iommu->lock);
+	ret = domain_attach_iommu(domain, iommu);
+	spin_unlock(&iommu->lock);
+
+	if (ret) {
+		spin_unlock_irqrestore(&device_domain_lock, flags);
+		return NULL;
+	}
+
 	list_add(&info->link, &domain->devices);
 	list_add(&info->global, &device_domain_list);
 	if (dev)
 		dev->archdata.iommu = info;
 	spin_unlock_irqrestore(&device_domain_lock, flags);
 
+	if (dev && domain_context_mapping(domain, dev)) {
+		pr_err("Domain context map for %s failed\n", dev_name(dev));
+		dmar_remove_one_dev_info(domain, dev);
+		return NULL;
+	}
+
 	return domain;
 }
 
@@ -2321,10 +2328,10 @@
 /* domain is initialized */
 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
 {
+	struct device_domain_info *info = NULL;
 	struct dmar_domain *domain, *tmp;
 	struct intel_iommu *iommu;
-	struct device_domain_info *info;
-	u16 dma_alias;
+	u16 req_id, dma_alias;
 	unsigned long flags;
 	u8 bus, devfn;
 
@@ -2336,6 +2343,8 @@
 	if (!iommu)
 		return NULL;
 
+	req_id = ((u16)bus << 8) | devfn;
+
 	if (dev_is_pci(dev)) {
 		struct pci_dev *pdev = to_pci_dev(dev);
 
@@ -2360,21 +2369,15 @@
 	domain = alloc_domain(0);
 	if (!domain)
 		return NULL;
-	domain->id = iommu_attach_domain(domain, iommu);
-	if (domain->id < 0) {
-		free_domain_mem(domain);
-		return NULL;
-	}
-	domain_attach_iommu(domain, iommu);
-	if (domain_init(domain, gaw)) {
+	if (domain_init(domain, iommu, gaw)) {
 		domain_exit(domain);
 		return NULL;
 	}
 
 	/* register PCI DMA alias device */
-	if (dev_is_pci(dev)) {
-		tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
-					   dma_alias & 0xff, NULL, domain);
+	if (req_id != dma_alias && dev_is_pci(dev)) {
+		tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
+					       dma_alias & 0xff, NULL, domain);
 
 		if (!tmp || tmp != domain) {
 			domain_exit(domain);
@@ -2386,7 +2389,7 @@
 	}
 
 found_domain:
-	tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
+	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
 
 	if (!tmp || tmp != domain) {
 		domain_exit(domain);
@@ -2414,8 +2417,7 @@
 		return -ENOMEM;
 	}
 
-	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
-		 start, end, domain->id);
+	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
 	/*
 	 * RMRR range might have overlap with physical memory range,
 	 * clear it first
@@ -2476,11 +2478,6 @@
 	if (ret)
 		goto error;
 
-	/* context entry init */
-	ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
-	if (ret)
-		goto error;
-
 	return 0;
 
  error:
@@ -2526,37 +2523,18 @@
 
 static int __init si_domain_init(int hw)
 {
-	struct dmar_drhd_unit *drhd;
-	struct intel_iommu *iommu;
 	int nid, ret = 0;
-	bool first = true;
 
 	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
 	if (!si_domain)
 		return -EFAULT;
 
-	for_each_active_iommu(iommu, drhd) {
-		ret = iommu_attach_domain(si_domain, iommu);
-		if (ret < 0) {
-			domain_exit(si_domain);
-			return -EFAULT;
-		} else if (first) {
-			si_domain->id = ret;
-			first = false;
-		} else if (si_domain->id != ret) {
-			domain_exit(si_domain);
-			return -EFAULT;
-		}
-		domain_attach_iommu(si_domain, iommu);
-	}
-
 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
 		domain_exit(si_domain);
 		return -EFAULT;
 	}
 
-	pr_debug("Identity mapping domain is domain %d\n",
-		 si_domain->id);
+	pr_debug("Identity mapping domain allocated\n");
 
 	if (hw)
 		return 0;
@@ -2590,28 +2568,20 @@
 	return 0;
 }
 
-static int domain_add_dev_info(struct dmar_domain *domain,
-			       struct device *dev, int translation)
+static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
 {
 	struct dmar_domain *ndomain;
 	struct intel_iommu *iommu;
 	u8 bus, devfn;
-	int ret;
 
 	iommu = device_to_iommu(dev, &bus, &devfn);
 	if (!iommu)
 		return -ENODEV;
 
-	ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
+	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
 	if (ndomain != domain)
 		return -EBUSY;
 
-	ret = domain_context_mapping(domain, dev, translation);
-	if (ret) {
-		domain_remove_one_dev_info(domain, dev);
-		return ret;
-	}
-
 	return 0;
 }
 
@@ -2751,9 +2721,7 @@
 	if (!iommu_should_identity_map(dev, 1))
 		return 0;
 
-	ret = domain_add_dev_info(si_domain, dev,
-				  hw ? CONTEXT_TT_PASS_THROUGH :
-				       CONTEXT_TT_MULTI_LEVEL);
+	ret = domain_add_dev_info(si_domain, dev);
 	if (!ret)
 		pr_info("%s identity mapping for device %s\n",
 			hw ? "Hardware" : "Software", dev_name(dev));
@@ -2839,15 +2807,18 @@
 }
 
 static int copy_context_table(struct intel_iommu *iommu,
-			      struct root_entry *old_re,
+			      struct root_entry __iomem *old_re,
 			      struct context_entry **tbl,
 			      int bus, bool ext)
 {
-	struct context_entry *old_ce = NULL, *new_ce = NULL, ce;
 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
+	struct context_entry __iomem *old_ce = NULL;
+	struct context_entry *new_ce = NULL, ce;
+	struct root_entry re;
 	phys_addr_t old_ce_phys;
 
 	tbl_idx = ext ? bus * 2 : bus;
+	memcpy_fromio(&re, old_re, sizeof(re));
 
 	for (devfn = 0; devfn < 256; devfn++) {
 		/* First calculate the correct index */
@@ -2867,9 +2838,9 @@
 
 			ret = 0;
 			if (devfn < 0x80)
-				old_ce_phys = root_entry_lctp(old_re);
+				old_ce_phys = root_entry_lctp(&re);
 			else
-				old_ce_phys = root_entry_uctp(old_re);
+				old_ce_phys = root_entry_uctp(&re);
 
 			if (!old_ce_phys) {
 				if (ext && devfn == 0) {
@@ -2894,7 +2865,7 @@
 		}
 
 		/* Now copy the context entry */
-		ce = old_ce[idx];
+		memcpy_fromio(&ce, old_ce + idx, sizeof(ce));
 
 		if (!__context_present(&ce))
 			continue;
@@ -2938,8 +2909,8 @@
 
 static int copy_translation_tables(struct intel_iommu *iommu)
 {
+	struct root_entry __iomem *old_rt;
 	struct context_entry **ctxt_tbls;
-	struct root_entry *old_rt;
 	phys_addr_t old_rt_phys;
 	int ctxt_table_entries;
 	unsigned long flags;
@@ -3269,7 +3240,6 @@
 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
 {
 	struct dmar_domain *domain;
-	int ret;
 
 	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 	if (!domain) {
@@ -3278,16 +3248,6 @@
 		return NULL;
 	}
 
-	/* make sure context mapping is ok */
-	if (unlikely(!domain_context_mapped(dev))) {
-		ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
-		if (ret) {
-			pr_err("Domain context map for %s failed\n",
-			       dev_name(dev));
-			return NULL;
-		}
-	}
-
 	return domain;
 }
 
@@ -3323,7 +3283,7 @@
 			 * 32 bit DMA is removed from si_domain and fall back
 			 * to non-identity mapping.
 			 */
-			domain_remove_one_dev_info(si_domain, dev);
+			dmar_remove_one_dev_info(si_domain, dev);
 			pr_info("32bit %s uses non-identity mapping\n",
 				dev_name(dev));
 			return 0;
@@ -3335,10 +3295,7 @@
 		 */
 		if (iommu_should_identity_map(dev, 0)) {
 			int ret;
-			ret = domain_add_dev_info(si_domain, dev,
-						  hw_pass_through ?
-						  CONTEXT_TT_PASS_THROUGH :
-						  CONTEXT_TT_MULTI_LEVEL);
+			ret = domain_add_dev_info(si_domain, dev);
 			if (!ret) {
 				pr_info("64bit %s uses identity mapping\n",
 					dev_name(dev));
@@ -3399,7 +3356,9 @@
 
 	/* it's a non-present to present mapping. Only flush if caching mode */
 	if (cap_caching_mode(iommu->cap))
-		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
+		iommu_flush_iotlb_psi(iommu, domain,
+				      mm_to_dma_pfn(iova->pfn_lo),
+				      size, 0, 1);
 	else
 		iommu_flush_write_buffer(iommu);
 
@@ -3450,7 +3409,7 @@
 
 			/* On real hardware multiple invalidations are expensive */
 			if (cap_caching_mode(iommu->cap))
-				iommu_flush_iotlb_psi(iommu, domain->id,
+				iommu_flush_iotlb_psi(iommu, domain,
 					iova->pfn_lo, iova_size(iova),
 					!deferred_flush[i].freelist[j], 0);
 			else {
@@ -3534,7 +3493,7 @@
 	freelist = domain_unmap(domain, start_pfn, last_pfn);
 
 	if (intel_iommu_strict) {
-		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
+		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
 				      last_pfn - start_pfn + 1, !freelist, 0);
 		/* free iova */
 		__free_iova(&domain->iovad, iova);
@@ -3692,7 +3651,7 @@
 
 	/* it's a non-present to present mapping. Only flush if caching mode */
 	if (cap_caching_mode(iommu->cap))
-		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
+		iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
 	else
 		iommu_flush_write_buffer(iommu);
 
@@ -4169,13 +4128,6 @@
 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 	iommu_enable_translation(iommu);
 
-	if (si_domain) {
-		ret = iommu_attach_domain(si_domain, iommu);
-		if (ret < 0 || si_domain->id != ret)
-			goto disable_iommu;
-		domain_attach_iommu(si_domain, iommu);
-	}
-
 	iommu_disable_protect_mem_regions(iommu);
 	return 0;
 
@@ -4337,11 +4289,9 @@
 	if (!domain)
 		return 0;
 
-	down_read(&dmar_global_lock);
-	domain_remove_one_dev_info(domain, dev);
+	dmar_remove_one_dev_info(domain, dev);
 	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
 		domain_exit(domain);
-	up_read(&dmar_global_lock);
 
 	return 0;
 }
@@ -4398,7 +4348,7 @@
 
 			rcu_read_lock();
 			for_each_active_iommu(iommu, drhd)
-				iommu_flush_iotlb_psi(iommu, si_domain->id,
+				iommu_flush_iotlb_psi(iommu, si_domain,
 					iova->pfn_lo, iova_size(iova),
 					!freelist, 0);
 			rcu_read_unlock();
@@ -4457,11 +4407,32 @@
 }
 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
 
+static ssize_t intel_iommu_show_ndoms(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
+{
+	struct intel_iommu *iommu = dev_get_drvdata(dev);
+	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
+}
+static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
+
+static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct intel_iommu *iommu = dev_get_drvdata(dev);
+	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
+						  cap_ndoms(iommu->cap)));
+}
+static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
+
 static struct attribute *intel_iommu_attrs[] = {
 	&dev_attr_version.attr,
 	&dev_attr_address.attr,
 	&dev_attr_cap.attr,
 	&dev_attr_ecap.attr,
+	&dev_attr_domains_supported.attr,
+	&dev_attr_domains_used.attr,
 	NULL,
 };
 
@@ -4541,7 +4512,7 @@
 	for_each_active_iommu(iommu, drhd)
 		iommu->iommu_dev = iommu_device_create(NULL, iommu,
 						       intel_iommu_groups,
-						       iommu->name);
+						       "%s", iommu->name);
 
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	bus_register_notifier(&pci_bus_type, &device_nb);
@@ -4561,11 +4532,11 @@
 	return ret;
 }
 
-static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
+static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
 {
 	struct intel_iommu *iommu = opaque;
 
-	iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
+	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
 	return 0;
 }
 
@@ -4575,63 +4546,50 @@
  * devices, unbinding the driver from any one of them will possibly leave
  * the others unable to operate.
  */
-static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
-					   struct device *dev)
+static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
 {
 	if (!iommu || !dev || !dev_is_pci(dev))
 		return;
 
-	pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
+	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
 }
 
-static void domain_remove_one_dev_info(struct dmar_domain *domain,
-				       struct device *dev)
+static void __dmar_remove_one_dev_info(struct device_domain_info *info)
 {
-	struct device_domain_info *info, *tmp;
 	struct intel_iommu *iommu;
 	unsigned long flags;
-	bool found = false;
-	u8 bus, devfn;
 
-	iommu = device_to_iommu(dev, &bus, &devfn);
-	if (!iommu)
+	assert_spin_locked(&device_domain_lock);
+
+	if (WARN_ON(!info))
 		return;
 
+	iommu = info->iommu;
+
+	if (info->dev) {
+		iommu_disable_dev_iotlb(info);
+		domain_context_clear(iommu, info->dev);
+	}
+
+	unlink_domain_info(info);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	domain_detach_iommu(info->domain, iommu);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	free_devinfo_mem(info);
+}
+
+static void dmar_remove_one_dev_info(struct dmar_domain *domain,
+				     struct device *dev)
+{
+	struct device_domain_info *info;
+	unsigned long flags;
+
 	spin_lock_irqsave(&device_domain_lock, flags);
-	list_for_each_entry_safe(info, tmp, &domain->devices, link) {
-		if (info->iommu == iommu && info->bus == bus &&
-		    info->devfn == devfn) {
-			unlink_domain_info(info);
-			spin_unlock_irqrestore(&device_domain_lock, flags);
-
-			iommu_disable_dev_iotlb(info);
-			iommu_detach_dev(iommu, info->bus, info->devfn);
-			iommu_detach_dependent_devices(iommu, dev);
-			free_devinfo_mem(info);
-
-			spin_lock_irqsave(&device_domain_lock, flags);
-
-			if (found)
-				break;
-			else
-				continue;
-		}
-
-		/* if there is no other devices under the same iommu
-		 * owned by this domain, clear this iommu in iommu_bmp
-		 * update iommu count and coherency
-		 */
-		if (info->iommu == iommu)
-			found = true;
-	}
-
+	info = dev->archdata.iommu;
+	__dmar_remove_one_dev_info(info);
 	spin_unlock_irqrestore(&device_domain_lock, flags);
-
-	if (found == 0) {
-		domain_detach_iommu(domain, iommu);
-		if (!domain_type_is_vm_or_si(domain))
-			iommu_detach_domain(domain, iommu);
-	}
 }
 
 static int md_domain_init(struct dmar_domain *domain, int guest_width)
@@ -4712,10 +4670,9 @@
 
 		old_domain = find_domain(dev);
 		if (old_domain) {
-			if (domain_type_is_vm_or_si(dmar_domain))
-				domain_remove_one_dev_info(old_domain, dev);
-			else
-				domain_remove_dev_info(old_domain);
+			rcu_read_lock();
+			dmar_remove_one_dev_info(old_domain, dev);
+			rcu_read_unlock();
 
 			if (!domain_type_is_vm_or_si(old_domain) &&
 			     list_empty(&old_domain->devices))
@@ -4755,13 +4712,13 @@
 		dmar_domain->agaw--;
 	}
 
-	return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
+	return domain_add_dev_info(dmar_domain, dev);
 }
 
 static void intel_iommu_detach_device(struct iommu_domain *domain,
 				      struct device *dev)
 {
-	domain_remove_one_dev_info(to_dmar_domain(domain), dev);
+	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
 }
 
 static int intel_iommu_map(struct iommu_domain *domain,
@@ -4810,12 +4767,11 @@
 	struct intel_iommu *iommu;
 	unsigned long start_pfn, last_pfn;
 	unsigned int npages;
-	int iommu_id, num, ndomains, level = 0;
+	int iommu_id, level = 0;
 
 	/* Cope with horrid API which requires us to unmap more than the
 	   size argument if it happens to be a large-page mapping. */
-	if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
-		BUG();
+	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
 
 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
@@ -4827,19 +4783,11 @@
 
 	npages = last_pfn - start_pfn + 1;
 
-	for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
-               iommu = g_iommus[iommu_id];
+	for_each_domain_iommu(iommu_id, dmar_domain) {
+		iommu = g_iommus[iommu_id];
 
-               /*
-                * find bit position of dmar_domain
-                */
-               ndomains = cap_ndoms(iommu->cap);
-               for_each_set_bit(num, iommu->domain_ids, ndomains) {
-                       if (iommu->domains[num] == dmar_domain)
-                               iommu_flush_iotlb_psi(iommu, num, start_pfn,
-						     npages, !freelist, 0);
-	       }
-
+		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
+				      start_pfn, npages, !freelist, 0);
 	}
 
 	dma_free_pagelist(freelist);
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index f15692a..9ec4e0d 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -384,7 +384,7 @@
 
 static int iommu_load_old_irte(struct intel_iommu *iommu)
 {
-	struct irte *old_ir_table;
+	struct irte __iomem *old_ir_table;
 	phys_addr_t irt_phys;
 	unsigned int i;
 	size_t size;
@@ -413,7 +413,7 @@
 		return -ENOMEM;
 
 	/* Copy data over */
-	memcpy(iommu->ir_table->base, old_ir_table, size);
+	memcpy_fromio(iommu->ir_table->base, old_ir_table, size);
 
 	__iommu_flush_cache(iommu, iommu->ir_table->base, size);
 
@@ -426,6 +426,8 @@
 			bitmap_set(iommu->ir_table->bitmap, i, 1);
 	}
 
+	iounmap(old_ir_table);
+
 	return 0;
 }
 
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 4e46021..73c0748 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/types.h>
 
+#include <asm/barrier.h>
+
 #include "io-pgtable.h"
 
 #define ARM_LPAE_MAX_ADDR_BITS		48
@@ -200,20 +202,97 @@
 
 static bool selftest_running = false;
 
+static dma_addr_t __arm_lpae_dma_addr(struct device *dev, void *pages)
+{
+	return phys_to_dma(dev, virt_to_phys(pages));
+}
+
+static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp,
+				    struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+	dma_addr_t dma;
+	void *pages = alloc_pages_exact(size, gfp | __GFP_ZERO);
+
+	if (!pages)
+		return NULL;
+
+	if (!selftest_running) {
+		dma = dma_map_single(dev, pages, size, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, dma))
+			goto out_free;
+		/*
+		 * We depend on the IOMMU being able to work with any physical
+		 * address directly, so if the DMA layer suggests it can't by
+		 * giving us back some translation, that bodes very badly...
+		 */
+		if (dma != __arm_lpae_dma_addr(dev, pages))
+			goto out_unmap;
+	}
+
+	return pages;
+
+out_unmap:
+	dev_err(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+	free_pages_exact(pages, size);
+	return NULL;
+}
+
+static void __arm_lpae_free_pages(void *pages, size_t size,
+				  struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+
+	if (!selftest_running)
+		dma_unmap_single(dev, __arm_lpae_dma_addr(dev, pages),
+				 size, DMA_TO_DEVICE);
+	free_pages_exact(pages, size);
+}
+
+static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
+			       struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+
+	*ptep = pte;
+
+	if (!selftest_running)
+		dma_sync_single_for_device(dev, __arm_lpae_dma_addr(dev, ptep),
+					   sizeof(pte), DMA_TO_DEVICE);
+}
+
+static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
+			    unsigned long iova, size_t size, int lvl,
+			    arm_lpae_iopte *ptep);
+
 static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 			     unsigned long iova, phys_addr_t paddr,
 			     arm_lpae_iopte prot, int lvl,
 			     arm_lpae_iopte *ptep)
 {
 	arm_lpae_iopte pte = prot;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
-	/* We require an unmap first */
 	if (iopte_leaf(*ptep, lvl)) {
+		/* We require an unmap first */
 		WARN_ON(!selftest_running);
 		return -EEXIST;
+	} else if (iopte_type(*ptep, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+		/*
+		 * We need to unmap and free the old table before
+		 * overwriting it with a block entry.
+		 */
+		arm_lpae_iopte *tblp;
+		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
+
+		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
+		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+			return -EINVAL;
 	}
 
-	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 		pte |= ARM_LPAE_PTE_NS;
 
 	if (lvl == ARM_LPAE_MAX_LEVELS - 1)
@@ -224,8 +303,7 @@
 	pte |= ARM_LPAE_PTE_AF | ARM_LPAE_PTE_SH_IS;
 	pte |= pfn_to_iopte(paddr >> data->pg_shift, data);
 
-	*ptep = pte;
-	data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), data->iop.cookie);
+	__arm_lpae_set_pte(ptep, pte, cfg);
 	return 0;
 }
 
@@ -234,14 +312,14 @@
 			  int lvl, arm_lpae_iopte *ptep)
 {
 	arm_lpae_iopte *cptep, pte;
-	void *cookie = data->iop.cookie;
 	size_t block_size = ARM_LPAE_BLOCK_SIZE(lvl, data);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
 	/* Find our entry at the current level */
 	ptep += ARM_LPAE_LVL_IDX(iova, lvl, data);
 
 	/* If we can install a leaf entry at this level, then do so */
-	if (size == block_size && (size & data->iop.cfg.pgsize_bitmap))
+	if (size == block_size && (size & cfg->pgsize_bitmap))
 		return arm_lpae_init_pte(data, iova, paddr, prot, lvl, ptep);
 
 	/* We can't allocate tables at the final level */
@@ -251,18 +329,15 @@
 	/* Grab a pointer to the next level */
 	pte = *ptep;
 	if (!pte) {
-		cptep = alloc_pages_exact(1UL << data->pg_shift,
-					 GFP_ATOMIC | __GFP_ZERO);
+		cptep = __arm_lpae_alloc_pages(1UL << data->pg_shift,
+					       GFP_ATOMIC, cfg);
 		if (!cptep)
 			return -ENOMEM;
 
-		data->iop.cfg.tlb->flush_pgtable(cptep, 1UL << data->pg_shift,
-						 cookie);
 		pte = __pa(cptep) | ARM_LPAE_PTE_TYPE_TABLE;
-		if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 			pte |= ARM_LPAE_PTE_NSTABLE;
-		*ptep = pte;
-		data->iop.cfg.tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+		__arm_lpae_set_pte(ptep, pte, cfg);
 	} else {
 		cptep = iopte_deref(pte, data);
 	}
@@ -309,7 +384,7 @@
 {
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	arm_lpae_iopte *ptep = data->pgd;
-	int lvl = ARM_LPAE_START_LVL(data);
+	int ret, lvl = ARM_LPAE_START_LVL(data);
 	arm_lpae_iopte prot;
 
 	/* If no access, then nothing to do */
@@ -317,7 +392,14 @@
 		return 0;
 
 	prot = arm_lpae_prot_to_pte(data, iommu_prot);
-	return __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+	ret = __arm_lpae_map(data, iova, paddr, size, prot, lvl, ptep);
+	/*
+	 * Synchronise all PTE updates for the new mapping before there's
+	 * a chance for anything to kick off a table walk for the new iova.
+	 */
+	wmb();
+
+	return ret;
 }
 
 static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
@@ -347,7 +429,7 @@
 		__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
 	}
 
-	free_pages_exact(start, table_size);
+	__arm_lpae_free_pages(start, table_size, &data->iop.cfg);
 }
 
 static void arm_lpae_free_pgtable(struct io_pgtable *iop)
@@ -366,8 +448,7 @@
 	unsigned long blk_start, blk_end;
 	phys_addr_t blk_paddr;
 	arm_lpae_iopte table = 0;
-	void *cookie = data->iop.cookie;
-	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
 
 	blk_start = iova & ~(blk_size - 1);
 	blk_end = blk_start + blk_size;
@@ -393,10 +474,9 @@
 		}
 	}
 
-	*ptep = table;
-	tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+	__arm_lpae_set_pte(ptep, table, cfg);
 	iova &= ~(blk_size - 1);
-	tlb->tlb_add_flush(iova, blk_size, true, cookie);
+	cfg->tlb->tlb_add_flush(iova, blk_size, true, data->iop.cookie);
 	return size;
 }
 
@@ -418,13 +498,12 @@
 
 	/* If the size matches this level, we're in the right place */
 	if (size == blk_size) {
-		*ptep = 0;
-		tlb->flush_pgtable(ptep, sizeof(*ptep), cookie);
+		__arm_lpae_set_pte(ptep, 0, &data->iop.cfg);
 
 		if (!iopte_leaf(pte, lvl)) {
 			/* Also flush any partial walks */
 			tlb->tlb_add_flush(iova, size, false, cookie);
-			tlb->tlb_sync(data->iop.cookie);
+			tlb->tlb_sync(cookie);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
 		} else {
@@ -640,11 +719,12 @@
 	cfg->arm_lpae_s1_cfg.mair[1] = 0;
 
 	/* Looking good; allocate a pgd */
-	data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
 	if (!data->pgd)
 		goto out_free_data;
 
-	cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
 
 	/* TTBRs */
 	cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd);
@@ -728,11 +808,12 @@
 	cfg->arm_lpae_s2_cfg.vtcr = reg;
 
 	/* Allocate pgd pages */
-	data->pgd = alloc_pages_exact(data->pgd_size, GFP_KERNEL | __GFP_ZERO);
+	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg);
 	if (!data->pgd)
 		goto out_free_data;
 
-	cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+	/* Ensure the empty pgd is visible before any actual TTBR write */
+	wmb();
 
 	/* VTTBR */
 	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
@@ -818,16 +899,10 @@
 	WARN_ON(cookie != cfg_cookie);
 }
 
-static void dummy_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-	WARN_ON(cookie != cfg_cookie);
-}
-
 static struct iommu_gather_ops dummy_tlb_ops __initdata = {
 	.tlb_flush_all	= dummy_tlb_flush_all,
 	.tlb_add_flush	= dummy_tlb_add_flush,
 	.tlb_sync	= dummy_tlb_sync,
-	.flush_pgtable	= dummy_flush_pgtable,
 };
 
 static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 6436fe2..6f2e319 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -24,11 +24,6 @@
 
 #include "io-pgtable.h"
 
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
-extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
-
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
 {
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 10e32f6..ac9e234 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -17,8 +17,9 @@
  *
  * @tlb_flush_all: Synchronously invalidate the entire TLB context.
  * @tlb_add_flush: Queue up a TLB invalidation for a virtual address range.
- * @tlb_sync:      Ensure any queue TLB invalidation has taken effect.
- * @flush_pgtable: Ensure page table updates are visible to the IOMMU.
+ * @tlb_sync:      Ensure any queued TLB invalidation has taken effect, and
+ *                 any corresponding page table updates are visible to the
+ *                 IOMMU.
  *
  * Note that these can all be called in atomic context and must therefore
  * not block.
@@ -28,7 +29,6 @@
 	void (*tlb_add_flush)(unsigned long iova, size_t size, bool leaf,
 			      void *cookie);
 	void (*tlb_sync)(void *cookie);
-	void (*flush_pgtable)(void *ptr, size_t size, void *cookie);
 };
 
 /**
@@ -41,6 +41,8 @@
  * @ias:           Input address (iova) size, in bits.
  * @oas:           Output address (paddr) size, in bits.
  * @tlb:           TLB management callbacks for this set of tables.
+ * @iommu_dev:     The device representing the DMA configuration for the
+ *                 page table walker.
  */
 struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_NS	(1 << 0)	/* Set NS bit in PTEs */
@@ -49,6 +51,7 @@
 	unsigned int			ias;
 	unsigned int			oas;
 	const struct iommu_gather_ops	*tlb;
+	struct device			*iommu_dev;
 
 	/* Low-level data specific to the table format */
 	union {
@@ -140,4 +143,9 @@
 	void (*free)(struct io_pgtable *iop);
 };
 
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+
 #endif /* __IO_PGTABLE_H */
diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index 1a67c53..8cf605fa 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -283,24 +283,10 @@
 	/* The hardware doesn't support selective TLB flush. */
 }
 
-static void ipmmu_flush_pgtable(void *ptr, size_t size, void *cookie)
-{
-	unsigned long offset = (unsigned long)ptr & ~PAGE_MASK;
-	struct ipmmu_vmsa_domain *domain = cookie;
-
-	/*
-	 * TODO: Add support for coherent walk through CCI with DVM and remove
-	 * cache handling.
-	 */
-	dma_map_page(domain->mmu->dev, virt_to_page(ptr), offset, size,
-		     DMA_TO_DEVICE);
-}
-
 static struct iommu_gather_ops ipmmu_gather_ops = {
 	.tlb_flush_all = ipmmu_tlb_flush_all,
 	.tlb_add_flush = ipmmu_tlb_add_flush,
 	.tlb_sync = ipmmu_tlb_flush_all,
-	.flush_pgtable = ipmmu_flush_pgtable,
 };
 
 /* -----------------------------------------------------------------------------
@@ -327,6 +313,11 @@
 	domain->cfg.ias = 32;
 	domain->cfg.oas = 40;
 	domain->cfg.tlb = &ipmmu_gather_ops;
+	/*
+	 * TODO: Add support for coherent walk through CCI with DVM and remove
+	 * cache handling. For now, delegate it to the io-pgtable code.
+	 */
+	domain->cfg.iommu_dev = domain->mmu->dev;
 
 	domain->iop = alloc_io_pgtable_ops(ARM_32_LPAE_S1, &domain->cfg,
 					   domain);
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 2d99930..913455a 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -84,7 +84,7 @@
 bool irq_remapping_cap(enum irq_remap_cap cap)
 {
 	if (!remap_ops || disable_irq_post)
-		return 0;
+		return false;
 
 	return (remap_ops->capability & (1 << cap));
 }
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 15a2063..e321fa5 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -106,8 +106,8 @@
 #endif
 
 	list_for_each_entry(ctx_drvdata, &priv->list_attached, attached_elm) {
-		if (!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent)
-			BUG();
+
+		BUG_ON(!ctx_drvdata->pdev || !ctx_drvdata->pdev->dev.parent);
 
 		iommu_drvdata = dev_get_drvdata(ctx_drvdata->pdev->dev.parent);
 		BUG_ON(!iommu_drvdata);
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 43429ab..60ba238 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -141,10 +141,12 @@
 	struct iommu_ops *ops = NULL;
 	int idx = 0;
 
-	if (dev_is_pci(dev)) {
-		dev_err(dev, "IOMMU is currently not supported for PCI\n");
+	/*
+	 * We can't do much for PCI devices without knowing how
+	 * device IDs are wired up from the PCI bus to the IOMMU.
+	 */
+	if (dev_is_pci(dev))
 		return NULL;
-	}
 
 	/*
 	 * We don't currently walk up the tree looking for a parent IOMMU.
diff --git a/drivers/iommu/omap-iommu-debug.c b/drivers/iommu/omap-iommu-debug.c
index f3d20a2..0717aa9 100644
--- a/drivers/iommu/omap-iommu-debug.c
+++ b/drivers/iommu/omap-iommu-debug.c
@@ -14,6 +14,7 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
 #include <linux/debugfs.h>
 #include <linux/platform_data/iommu-omap.h>
 
@@ -29,6 +30,59 @@
 	return !obj->domain;
 }
 
+#define pr_reg(name)							\
+	do {								\
+		ssize_t bytes;						\
+		const char *str = "%20s: %08x\n";			\
+		const int maxcol = 32;					\
+		bytes = snprintf(p, maxcol, str, __stringify(name),	\
+				 iommu_read_reg(obj, MMU_##name));	\
+		p += bytes;						\
+		len -= bytes;						\
+		if (len < maxcol)					\
+			goto out;					\
+	} while (0)
+
+static ssize_t
+omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
+{
+	char *p = buf;
+
+	pr_reg(REVISION);
+	pr_reg(IRQSTATUS);
+	pr_reg(IRQENABLE);
+	pr_reg(WALKING_ST);
+	pr_reg(CNTL);
+	pr_reg(FAULT_AD);
+	pr_reg(TTB);
+	pr_reg(LOCK);
+	pr_reg(LD_TLB);
+	pr_reg(CAM);
+	pr_reg(RAM);
+	pr_reg(GFLUSH);
+	pr_reg(FLUSH_ENTRY);
+	pr_reg(READ_CAM);
+	pr_reg(READ_RAM);
+	pr_reg(EMU_FAULT_AD);
+out:
+	return p - buf;
+}
+
+static ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf,
+				   ssize_t bytes)
+{
+	if (!obj || !buf)
+		return -EINVAL;
+
+	pm_runtime_get_sync(obj->dev);
+
+	bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
+
+	pm_runtime_put_sync(obj->dev);
+
+	return bytes;
+}
+
 static ssize_t debug_read_regs(struct file *file, char __user *userbuf,
 			       size_t count, loff_t *ppos)
 {
@@ -55,34 +109,71 @@
 	return bytes;
 }
 
-static ssize_t debug_read_tlb(struct file *file, char __user *userbuf,
-			      size_t count, loff_t *ppos)
+static int
+__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
 {
-	struct omap_iommu *obj = file->private_data;
-	char *p, *buf;
-	ssize_t bytes, rest;
+	int i;
+	struct iotlb_lock saved;
+	struct cr_regs tmp;
+	struct cr_regs *p = crs;
+
+	pm_runtime_get_sync(obj->dev);
+	iotlb_lock_get(obj, &saved);
+
+	for_each_iotlb_cr(obj, num, i, tmp) {
+		if (!iotlb_cr_valid(&tmp))
+			continue;
+		*p++ = tmp;
+	}
+
+	iotlb_lock_set(obj, &saved);
+	pm_runtime_put_sync(obj->dev);
+
+	return  p - crs;
+}
+
+static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
+			     struct seq_file *s)
+{
+	return seq_printf(s, "%08x %08x %01x\n", cr->cam, cr->ram,
+			  (cr->cam & MMU_CAM_P) ? 1 : 0);
+}
+
+static size_t omap_dump_tlb_entries(struct omap_iommu *obj, struct seq_file *s)
+{
+	int i, num;
+	struct cr_regs *cr;
+
+	num = obj->nr_tlb_entries;
+
+	cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
+	if (!cr)
+		return 0;
+
+	num = __dump_tlb_entries(obj, cr, num);
+	for (i = 0; i < num; i++)
+		iotlb_dump_cr(obj, cr + i, s);
+	kfree(cr);
+
+	return 0;
+}
+
+static int debug_read_tlb(struct seq_file *s, void *data)
+{
+	struct omap_iommu *obj = s->private;
 
 	if (is_omap_iommu_detached(obj))
 		return -EPERM;
 
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-	p = buf;
-
 	mutex_lock(&iommu_debug_lock);
 
-	p += sprintf(p, "%8s %8s\n", "cam:", "ram:");
-	p += sprintf(p, "-----------------------------------------\n");
-	rest = count - (p - buf);
-	p += omap_dump_tlb_entries(obj, p, rest);
-
-	bytes = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+	seq_printf(s, "%8s %8s\n", "cam:", "ram:");
+	seq_puts(s, "-----------------------------------------\n");
+	omap_dump_tlb_entries(obj, s);
 
 	mutex_unlock(&iommu_debug_lock);
-	kfree(buf);
 
-	return bytes;
+	return 0;
 }
 
 static void dump_ioptable(struct seq_file *s)
@@ -154,10 +245,10 @@
 		.open = simple_open,					\
 		.read = debug_read_##name,				\
 		.llseek = generic_file_llseek,				\
-	};
+	}
 
 DEBUG_FOPS_RO(regs);
-DEBUG_FOPS_RO(tlb);
+DEBUG_SEQ_FOPS_RO(tlb);
 DEBUG_SEQ_FOPS_RO(pagetable);
 
 #define __DEBUG_ADD_FILE(attr, mode)					\
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index a22c33d..36d0033 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -12,7 +12,6 @@
  */
 
 #include <linux/err.h>
-#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
@@ -38,11 +37,6 @@
 #define to_iommu(dev)							\
 	((struct omap_iommu *)platform_get_drvdata(to_platform_device(dev)))
 
-#define for_each_iotlb_cr(obj, n, __i, cr)				\
-	for (__i = 0;							\
-	     (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true);	\
-	     __i++)
-
 /* bitmap of the page sizes currently supported */
 #define OMAP_IOMMU_PGSIZES	(SZ_4K | SZ_64K | SZ_1M | SZ_16M)
 
@@ -72,11 +66,6 @@
 #define MMU_LOCK_VICT(x)	\
 	((x & MMU_LOCK_VICT_MASK) >> MMU_LOCK_VICT_SHIFT)
 
-struct iotlb_lock {
-	short base;
-	short vict;
-};
-
 static struct platform_driver omap_iommu_driver;
 static struct kmem_cache *iopte_cachep;
 
@@ -213,14 +202,6 @@
 /*
  *	TLB operations
  */
-static inline int iotlb_cr_valid(struct cr_regs *cr)
-{
-	if (!cr)
-		return -EINVAL;
-
-	return cr->cam & MMU_CAM_V;
-}
-
 static u32 iotlb_cr_to_virt(struct cr_regs *cr)
 {
 	u32 page_size = cr->cam & MMU_CAM_PGSZ_MASK;
@@ -260,7 +241,7 @@
 	return status;
 }
 
-static void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l)
 {
 	u32 val;
 
@@ -268,10 +249,9 @@
 
 	l->base = MMU_LOCK_BASE(val);
 	l->vict = MMU_LOCK_VICT(val);
-
 }
 
-static void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l)
 {
 	u32 val;
 
@@ -297,7 +277,7 @@
 }
 
 /* only used in iotlb iteration for-loop */
-static struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n)
 {
 	struct cr_regs cr;
 	struct iotlb_lock l;
@@ -468,129 +448,6 @@
 	pm_runtime_put_sync(obj->dev);
 }
 
-#ifdef CONFIG_OMAP_IOMMU_DEBUG
-
-#define pr_reg(name)							\
-	do {								\
-		ssize_t bytes;						\
-		const char *str = "%20s: %08x\n";			\
-		const int maxcol = 32;					\
-		bytes = snprintf(p, maxcol, str, __stringify(name),	\
-				 iommu_read_reg(obj, MMU_##name));	\
-		p += bytes;						\
-		len -= bytes;						\
-		if (len < maxcol)					\
-			goto out;					\
-	} while (0)
-
-static ssize_t
-omap2_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len)
-{
-	char *p = buf;
-
-	pr_reg(REVISION);
-	pr_reg(IRQSTATUS);
-	pr_reg(IRQENABLE);
-	pr_reg(WALKING_ST);
-	pr_reg(CNTL);
-	pr_reg(FAULT_AD);
-	pr_reg(TTB);
-	pr_reg(LOCK);
-	pr_reg(LD_TLB);
-	pr_reg(CAM);
-	pr_reg(RAM);
-	pr_reg(GFLUSH);
-	pr_reg(FLUSH_ENTRY);
-	pr_reg(READ_CAM);
-	pr_reg(READ_RAM);
-	pr_reg(EMU_FAULT_AD);
-out:
-	return p - buf;
-}
-
-ssize_t omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t bytes)
-{
-	if (!obj || !buf)
-		return -EINVAL;
-
-	pm_runtime_get_sync(obj->dev);
-
-	bytes = omap2_iommu_dump_ctx(obj, buf, bytes);
-
-	pm_runtime_put_sync(obj->dev);
-
-	return bytes;
-}
-
-static int
-__dump_tlb_entries(struct omap_iommu *obj, struct cr_regs *crs, int num)
-{
-	int i;
-	struct iotlb_lock saved;
-	struct cr_regs tmp;
-	struct cr_regs *p = crs;
-
-	pm_runtime_get_sync(obj->dev);
-	iotlb_lock_get(obj, &saved);
-
-	for_each_iotlb_cr(obj, num, i, tmp) {
-		if (!iotlb_cr_valid(&tmp))
-			continue;
-		*p++ = tmp;
-	}
-
-	iotlb_lock_set(obj, &saved);
-	pm_runtime_put_sync(obj->dev);
-
-	return  p - crs;
-}
-
-/**
- * iotlb_dump_cr - Dump an iommu tlb entry into buf
- * @obj:	target iommu
- * @cr:		contents of cam and ram register
- * @buf:	output buffer
- **/
-static ssize_t iotlb_dump_cr(struct omap_iommu *obj, struct cr_regs *cr,
-			     char *buf)
-{
-	char *p = buf;
-
-	/* FIXME: Need more detail analysis of cam/ram */
-	p += sprintf(p, "%08x %08x %01x\n", cr->cam, cr->ram,
-					(cr->cam & MMU_CAM_P) ? 1 : 0);
-
-	return p - buf;
-}
-
-/**
- * omap_dump_tlb_entries - dump cr arrays to given buffer
- * @obj:	target iommu
- * @buf:	output buffer
- **/
-size_t omap_dump_tlb_entries(struct omap_iommu *obj, char *buf, ssize_t bytes)
-{
-	int i, num;
-	struct cr_regs *cr;
-	char *p = buf;
-
-	num = bytes / sizeof(*cr);
-	num = min(obj->nr_tlb_entries, num);
-
-	cr = kcalloc(num, sizeof(*cr), GFP_KERNEL);
-	if (!cr)
-		return 0;
-
-	num = __dump_tlb_entries(obj, cr, num);
-	for (i = 0; i < num; i++)
-		p += iotlb_dump_cr(obj, cr + i, p);
-	kfree(cr);
-
-	return p - buf;
-}
-
-#endif /* CONFIG_OMAP_IOMMU_DEBUG */
-
 /*
  *	H/W pagetable operations
  */
@@ -930,14 +787,14 @@
 
 	if (!iopgd_is_table(*iopgd)) {
 		dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:px%08x\n",
-				obj->name, errs, da, iopgd, *iopgd);
+			obj->name, errs, da, iopgd, *iopgd);
 		return IRQ_NONE;
 	}
 
 	iopte = iopte_offset(iopgd, da);
 
 	dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x pte:0x%p *pte:0x%08x\n",
-			obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
+		obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
 
 	return IRQ_NONE;
 }
@@ -963,9 +820,8 @@
 	struct device *dev;
 	struct omap_iommu *obj;
 
-	dev = driver_find_device(&omap_iommu_driver.driver, NULL,
-				(void *)name,
-				device_match_by_alias);
+	dev = driver_find_device(&omap_iommu_driver.driver, NULL, (void *)name,
+				 device_match_by_alias);
 	if (!dev)
 		return ERR_PTR(-ENODEV);
 
@@ -1089,7 +945,6 @@
 	{ .compatible = "ti,dra7-iommu"	},
 	{},
 };
-MODULE_DEVICE_TABLE(of, omap_iommu_of_match);
 
 static struct platform_driver omap_iommu_driver = {
 	.probe	= omap_iommu_probe,
@@ -1121,7 +976,7 @@
 }
 
 static int omap_iommu_map(struct iommu_domain *domain, unsigned long da,
-			 phys_addr_t pa, size_t bytes, int prot)
+			  phys_addr_t pa, size_t bytes, int prot)
 {
 	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
 	struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1148,7 +1003,7 @@
 }
 
 static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da,
-			    size_t size)
+			       size_t size)
 {
 	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
 	struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1199,7 +1054,7 @@
 }
 
 static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
-			struct device *dev)
+				   struct device *dev)
 {
 	struct omap_iommu *oiommu = dev_to_omap_iommu(dev);
 	struct omap_iommu_arch_data *arch_data = dev->archdata.iommu;
@@ -1220,7 +1075,7 @@
 }
 
 static void omap_iommu_detach_dev(struct iommu_domain *domain,
-				 struct device *dev)
+				  struct device *dev)
 {
 	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
 
@@ -1237,16 +1092,12 @@
 		return NULL;
 
 	omap_domain = kzalloc(sizeof(*omap_domain), GFP_KERNEL);
-	if (!omap_domain) {
-		pr_err("kzalloc failed\n");
+	if (!omap_domain)
 		goto out;
-	}
 
 	omap_domain->pgtable = kzalloc(IOPGD_TABLE_SIZE, GFP_KERNEL);
-	if (!omap_domain->pgtable) {
-		pr_err("kzalloc failed\n");
+	if (!omap_domain->pgtable)
 		goto fail_nomem;
-	}
 
 	/*
 	 * should never fail, but please keep this around to ensure
@@ -1285,7 +1136,7 @@
 }
 
 static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain,
-					  dma_addr_t da)
+					   dma_addr_t da)
 {
 	struct omap_iommu_domain *omap_domain = to_omap_domain(domain);
 	struct omap_iommu *oiommu = omap_domain->iommu_dev;
@@ -1302,7 +1153,7 @@
 			ret = omap_iommu_translate(*pte, da, IOLARGE_MASK);
 		else
 			dev_err(dev, "bogus pte 0x%x, da 0x%llx", *pte,
-							(unsigned long long)da);
+				(unsigned long long)da);
 	} else {
 		if (iopgd_is_section(*pgd))
 			ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK);
@@ -1310,7 +1161,7 @@
 			ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK);
 		else
 			dev_err(dev, "bogus pgd 0x%x, da 0x%llx", *pgd,
-							(unsigned long long)da);
+				(unsigned long long)da);
 	}
 
 	return ret;
@@ -1405,20 +1256,5 @@
 
 	return platform_driver_register(&omap_iommu_driver);
 }
-/* must be ready before omap3isp is probed */
 subsys_initcall(omap_iommu_init);
-
-static void __exit omap_iommu_exit(void)
-{
-	kmem_cache_destroy(iopte_cachep);
-
-	platform_driver_unregister(&omap_iommu_driver);
-
-	omap_iommu_debugfs_exit();
-}
-module_exit(omap_iommu_exit);
-
-MODULE_DESCRIPTION("omap iommu: tlb and pagetable primitives");
-MODULE_ALIAS("platform:omap-iommu");
-MODULE_AUTHOR("Hiroshi DOYU, Paul Mundt and Toshihiro Kobayashi");
-MODULE_LICENSE("GPL v2");
+/* must be ready before omap3isp is probed */
diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h
index d736630..a656df2 100644
--- a/drivers/iommu/omap-iommu.h
+++ b/drivers/iommu/omap-iommu.h
@@ -13,16 +13,18 @@
 #ifndef _OMAP_IOMMU_H
 #define _OMAP_IOMMU_H
 
+#include <linux/bitops.h>
+
+#define for_each_iotlb_cr(obj, n, __i, cr)				\
+	for (__i = 0;							\
+	     (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true);	\
+	     __i++)
+
 struct iotlb_entry {
 	u32 da;
 	u32 pa;
 	u32 pgsz, prsvd, valid;
-	union {
-		u16 ap;
-		struct {
-			u32 endian, elsz, mixed;
-		};
-	};
+	u32 endian, elsz, mixed;
 };
 
 struct omap_iommu {
@@ -49,20 +51,13 @@
 };
 
 struct cr_regs {
-	union {
-		struct {
-			u16 cam_l;
-			u16 cam_h;
-		};
-		u32 cam;
-	};
-	union {
-		struct {
-			u16 ram_l;
-			u16 ram_h;
-		};
-		u32 ram;
-	};
+	u32 cam;
+	u32 ram;
+};
+
+struct iotlb_lock {
+	short base;
+	short vict;
 };
 
 /**
@@ -103,11 +98,11 @@
  * MMU Register bit definitions
  */
 /* IRQSTATUS & IRQENABLE */
-#define MMU_IRQ_MULTIHITFAULT	(1 << 4)
-#define MMU_IRQ_TABLEWALKFAULT	(1 << 3)
-#define MMU_IRQ_EMUMISS		(1 << 2)
-#define MMU_IRQ_TRANSLATIONFAULT	(1 << 1)
-#define MMU_IRQ_TLBMISS		(1 << 0)
+#define MMU_IRQ_MULTIHITFAULT	BIT(4)
+#define MMU_IRQ_TABLEWALKFAULT	BIT(3)
+#define MMU_IRQ_EMUMISS		BIT(2)
+#define MMU_IRQ_TRANSLATIONFAULT	BIT(1)
+#define MMU_IRQ_TLBMISS		BIT(0)
 
 #define __MMU_IRQ_FAULT		\
 	(MMU_IRQ_MULTIHITFAULT | MMU_IRQ_EMUMISS | MMU_IRQ_TRANSLATIONFAULT)
@@ -119,16 +114,16 @@
 /* MMU_CNTL */
 #define MMU_CNTL_SHIFT		1
 #define MMU_CNTL_MASK		(7 << MMU_CNTL_SHIFT)
-#define MMU_CNTL_EML_TLB	(1 << 3)
-#define MMU_CNTL_TWL_EN		(1 << 2)
-#define MMU_CNTL_MMU_EN		(1 << 1)
+#define MMU_CNTL_EML_TLB	BIT(3)
+#define MMU_CNTL_TWL_EN		BIT(2)
+#define MMU_CNTL_MMU_EN		BIT(1)
 
 /* CAM */
 #define MMU_CAM_VATAG_SHIFT	12
 #define MMU_CAM_VATAG_MASK \
 	((~0UL >> MMU_CAM_VATAG_SHIFT) << MMU_CAM_VATAG_SHIFT)
-#define MMU_CAM_P		(1 << 3)
-#define MMU_CAM_V		(1 << 2)
+#define MMU_CAM_P		BIT(3)
+#define MMU_CAM_V		BIT(2)
 #define MMU_CAM_PGSZ_MASK	3
 #define MMU_CAM_PGSZ_1M		(0 << 0)
 #define MMU_CAM_PGSZ_64K	(1 << 0)
@@ -141,9 +136,9 @@
 	((~0UL >> MMU_RAM_PADDR_SHIFT) << MMU_RAM_PADDR_SHIFT)
 
 #define MMU_RAM_ENDIAN_SHIFT	9
-#define MMU_RAM_ENDIAN_MASK	(1 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_MASK	BIT(MMU_RAM_ENDIAN_SHIFT)
 #define MMU_RAM_ENDIAN_LITTLE	(0 << MMU_RAM_ENDIAN_SHIFT)
-#define MMU_RAM_ENDIAN_BIG	(1 << MMU_RAM_ENDIAN_SHIFT)
+#define MMU_RAM_ENDIAN_BIG	BIT(MMU_RAM_ENDIAN_SHIFT)
 
 #define MMU_RAM_ELSZ_SHIFT	7
 #define MMU_RAM_ELSZ_MASK	(3 << MMU_RAM_ELSZ_SHIFT)
@@ -152,7 +147,7 @@
 #define MMU_RAM_ELSZ_32		(2 << MMU_RAM_ELSZ_SHIFT)
 #define MMU_RAM_ELSZ_NONE	(3 << MMU_RAM_ELSZ_SHIFT)
 #define MMU_RAM_MIXED_SHIFT	6
-#define MMU_RAM_MIXED_MASK	(1 << MMU_RAM_MIXED_SHIFT)
+#define MMU_RAM_MIXED_MASK	BIT(MMU_RAM_MIXED_SHIFT)
 #define MMU_RAM_MIXED		MMU_RAM_MIXED_MASK
 
 #define MMU_GP_REG_BUS_ERR_BACK_EN	0x1
@@ -190,12 +185,12 @@
 /*
  * global functions
  */
-#ifdef CONFIG_OMAP_IOMMU_DEBUG
-extern ssize_t
-omap_iommu_dump_ctx(struct omap_iommu *obj, char *buf, ssize_t len);
-extern size_t
-omap_dump_tlb_entries(struct omap_iommu *obj, char *buf, ssize_t len);
 
+struct cr_regs __iotlb_read_cr(struct omap_iommu *obj, int n);
+void iotlb_lock_get(struct omap_iommu *obj, struct iotlb_lock *l);
+void iotlb_lock_set(struct omap_iommu *obj, struct iotlb_lock *l);
+
+#ifdef CONFIG_OMAP_IOMMU_DEBUG
 void omap_iommu_debugfs_init(void);
 void omap_iommu_debugfs_exit(void);
 
@@ -222,4 +217,12 @@
 	__raw_writel(val, obj->regbase + offs);
 }
 
+static inline int iotlb_cr_valid(struct cr_regs *cr)
+{
+	if (!cr)
+		return -EINVAL;
+
+	return cr->cam & MMU_CAM_V;
+}
+
 #endif /* _OMAP_IOMMU_H */
diff --git a/drivers/iommu/omap-iopgtable.h b/drivers/iommu/omap-iopgtable.h
index f891683..01a3152 100644
--- a/drivers/iommu/omap-iopgtable.h
+++ b/drivers/iommu/omap-iopgtable.h
@@ -10,25 +10,30 @@
  * published by the Free Software Foundation.
  */
 
+#ifndef _OMAP_IOPGTABLE_H
+#define _OMAP_IOPGTABLE_H
+
+#include <linux/bitops.h>
+
 /*
  * "L2 table" address mask and size definitions.
  */
 #define IOPGD_SHIFT		20
-#define IOPGD_SIZE		(1UL << IOPGD_SHIFT)
+#define IOPGD_SIZE		BIT(IOPGD_SHIFT)
 #define IOPGD_MASK		(~(IOPGD_SIZE - 1))
 
 /*
  * "section" address mask and size definitions.
  */
 #define IOSECTION_SHIFT		20
-#define IOSECTION_SIZE		(1UL << IOSECTION_SHIFT)
+#define IOSECTION_SIZE		BIT(IOSECTION_SHIFT)
 #define IOSECTION_MASK		(~(IOSECTION_SIZE - 1))
 
 /*
  * "supersection" address mask and size definitions.
  */
 #define IOSUPER_SHIFT		24
-#define IOSUPER_SIZE		(1UL << IOSUPER_SHIFT)
+#define IOSUPER_SIZE		BIT(IOSUPER_SHIFT)
 #define IOSUPER_MASK		(~(IOSUPER_SIZE - 1))
 
 #define PTRS_PER_IOPGD		(1UL << (32 - IOPGD_SHIFT))
@@ -38,14 +43,14 @@
  * "small page" address mask and size definitions.
  */
 #define IOPTE_SHIFT		12
-#define IOPTE_SIZE		(1UL << IOPTE_SHIFT)
+#define IOPTE_SIZE		BIT(IOPTE_SHIFT)
 #define IOPTE_MASK		(~(IOPTE_SIZE - 1))
 
 /*
  * "large page" address mask and size definitions.
  */
 #define IOLARGE_SHIFT		16
-#define IOLARGE_SIZE		(1UL << IOLARGE_SHIFT)
+#define IOLARGE_SIZE		BIT(IOLARGE_SHIFT)
 #define IOLARGE_MASK		(~(IOLARGE_SIZE - 1))
 
 #define PTRS_PER_IOPTE		(1UL << (IOPGD_SHIFT - IOPTE_SHIFT))
@@ -69,16 +74,16 @@
 /*
  * some descriptor attributes.
  */
-#define IOPGD_TABLE		(1 << 0)
-#define IOPGD_SECTION		(2 << 0)
-#define IOPGD_SUPER		(1 << 18 | 2 << 0)
+#define IOPGD_TABLE		(1)
+#define IOPGD_SECTION		(2)
+#define IOPGD_SUPER		(BIT(18) | IOPGD_SECTION)
 
 #define iopgd_is_table(x)	(((x) & 3) == IOPGD_TABLE)
 #define iopgd_is_section(x)	(((x) & (1 << 18 | 3)) == IOPGD_SECTION)
 #define iopgd_is_super(x)	(((x) & (1 << 18 | 3)) == IOPGD_SUPER)
 
-#define IOPTE_SMALL		(2 << 0)
-#define IOPTE_LARGE		(1 << 0)
+#define IOPTE_SMALL		(2)
+#define IOPTE_LARGE		(1)
 
 #define iopte_is_small(x)	(((x) & 2) == IOPTE_SMALL)
 #define iopte_is_large(x)	(((x) & 3) == IOPTE_LARGE)
@@ -93,3 +98,5 @@
 /* to find an entry in the second-level page table. */
 #define iopte_index(da)		(((da) >> IOPTE_SHIFT) & (PTRS_PER_IOPTE - 1))
 #define iopte_offset(iopgd, da)	(iopgd_page_vaddr(iopgd) + iopte_index(da))
+
+#endif /* _OMAP_IOPGTABLE_H */
diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index c1f2e52..9305964 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -27,6 +27,7 @@
 	const struct tegra_smmu_soc *soc;
 
 	unsigned long pfn_mask;
+	unsigned long tlb_mask;
 
 	unsigned long *asids;
 	struct mutex lock;
@@ -40,8 +41,10 @@
 	struct iommu_domain domain;
 	struct tegra_smmu *smmu;
 	unsigned int use_count;
-	struct page *count;
+	u32 *count;
+	struct page **pts;
 	struct page *pd;
+	dma_addr_t pd_dma;
 	unsigned id;
 	u32 attr;
 };
@@ -68,7 +71,8 @@
 #define SMMU_TLB_CONFIG 0x14
 #define  SMMU_TLB_CONFIG_HIT_UNDER_MISS (1 << 29)
 #define  SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION (1 << 28)
-#define  SMMU_TLB_CONFIG_ACTIVE_LINES(x) ((x) & 0x3f)
+#define  SMMU_TLB_CONFIG_ACTIVE_LINES(smmu) \
+	((smmu)->soc->num_tlb_lines & (smmu)->tlb_mask)
 
 #define SMMU_PTC_CONFIG 0x18
 #define  SMMU_PTC_CONFIG_ENABLE (1 << 29)
@@ -79,9 +83,9 @@
 #define  SMMU_PTB_ASID_VALUE(x) ((x) & 0x7f)
 
 #define SMMU_PTB_DATA 0x020
-#define  SMMU_PTB_DATA_VALUE(page, attr) (page_to_phys(page) >> 12 | (attr))
+#define  SMMU_PTB_DATA_VALUE(dma, attr) ((dma) >> 12 | (attr))
 
-#define SMMU_MK_PDE(page, attr) (page_to_phys(page) >> SMMU_PTE_SHIFT | (attr))
+#define SMMU_MK_PDE(dma, attr) ((dma) >> SMMU_PTE_SHIFT | (attr))
 
 #define SMMU_TLB_FLUSH 0x030
 #define  SMMU_TLB_FLUSH_VA_MATCH_ALL     (0 << 0)
@@ -134,29 +138,49 @@
 #define SMMU_PTE_ATTR		(SMMU_PTE_READABLE | SMMU_PTE_WRITABLE | \
 				 SMMU_PTE_NONSECURE)
 
-static inline void smmu_flush_ptc(struct tegra_smmu *smmu, struct page *page,
+static unsigned int iova_pd_index(unsigned long iova)
+{
+	return (iova >> SMMU_PDE_SHIFT) & (SMMU_NUM_PDE - 1);
+}
+
+static unsigned int iova_pt_index(unsigned long iova)
+{
+	return (iova >> SMMU_PTE_SHIFT) & (SMMU_NUM_PTE - 1);
+}
+
+static bool smmu_dma_addr_valid(struct tegra_smmu *smmu, dma_addr_t addr)
+{
+	addr >>= 12;
+	return (addr & smmu->pfn_mask) == addr;
+}
+
+static dma_addr_t smmu_pde_to_dma(u32 pde)
+{
+	return pde << 12;
+}
+
+static void smmu_flush_ptc_all(struct tegra_smmu *smmu)
+{
+	smmu_writel(smmu, SMMU_PTC_FLUSH_TYPE_ALL, SMMU_PTC_FLUSH);
+}
+
+static inline void smmu_flush_ptc(struct tegra_smmu *smmu, dma_addr_t dma,
 				  unsigned long offset)
 {
-	phys_addr_t phys = page ? page_to_phys(page) : 0;
 	u32 value;
 
-	if (page) {
-		offset &= ~(smmu->mc->soc->atom_size - 1);
+	offset &= ~(smmu->mc->soc->atom_size - 1);
 
-		if (smmu->mc->soc->num_address_bits > 32) {
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
-			value = (phys >> 32) & SMMU_PTC_FLUSH_HI_MASK;
+	if (smmu->mc->soc->num_address_bits > 32) {
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+		value = (dma >> 32) & SMMU_PTC_FLUSH_HI_MASK;
 #else
-			value = 0;
+		value = 0;
 #endif
-			smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
-		}
-
-		value = (phys + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
-	} else {
-		value = SMMU_PTC_FLUSH_TYPE_ALL;
+		smmu_writel(smmu, value, SMMU_PTC_FLUSH_HI);
 	}
 
+	value = (dma + offset) | SMMU_PTC_FLUSH_TYPE_ADR;
 	smmu_writel(smmu, value, SMMU_PTC_FLUSH);
 }
 
@@ -236,8 +260,6 @@
 static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
 {
 	struct tegra_smmu_as *as;
-	unsigned int i;
-	uint32_t *pd;
 
 	if (type != IOMMU_DOMAIN_UNMANAGED)
 		return NULL;
@@ -248,32 +270,26 @@
 
 	as->attr = SMMU_PD_READABLE | SMMU_PD_WRITABLE | SMMU_PD_NONSECURE;
 
-	as->pd = alloc_page(GFP_KERNEL | __GFP_DMA);
+	as->pd = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
 	if (!as->pd) {
 		kfree(as);
 		return NULL;
 	}
 
-	as->count = alloc_page(GFP_KERNEL);
+	as->count = kcalloc(SMMU_NUM_PDE, sizeof(u32), GFP_KERNEL);
 	if (!as->count) {
 		__free_page(as->pd);
 		kfree(as);
 		return NULL;
 	}
 
-	/* clear PDEs */
-	pd = page_address(as->pd);
-	SetPageReserved(as->pd);
-
-	for (i = 0; i < SMMU_NUM_PDE; i++)
-		pd[i] = 0;
-
-	/* clear PDE usage counters */
-	pd = page_address(as->count);
-	SetPageReserved(as->count);
-
-	for (i = 0; i < SMMU_NUM_PDE; i++)
-		pd[i] = 0;
+	as->pts = kcalloc(SMMU_NUM_PDE, sizeof(*as->pts), GFP_KERNEL);
+	if (!as->pts) {
+		kfree(as->count);
+		__free_page(as->pd);
+		kfree(as);
+		return NULL;
+	}
 
 	/* setup aperture */
 	as->domain.geometry.aperture_start = 0;
@@ -288,7 +304,6 @@
 	struct tegra_smmu_as *as = to_smmu_as(domain);
 
 	/* TODO: free page directory and page tables */
-	ClearPageReserved(as->pd);
 
 	kfree(as);
 }
@@ -376,16 +391,26 @@
 		return 0;
 	}
 
+	as->pd_dma = dma_map_page(smmu->dev, as->pd, 0, SMMU_SIZE_PD,
+				  DMA_TO_DEVICE);
+	if (dma_mapping_error(smmu->dev, as->pd_dma))
+		return -ENOMEM;
+
+	/* We can't handle 64-bit DMA addresses */
+	if (!smmu_dma_addr_valid(smmu, as->pd_dma)) {
+		err = -ENOMEM;
+		goto err_unmap;
+	}
+
 	err = tegra_smmu_alloc_asid(smmu, &as->id);
 	if (err < 0)
-		return err;
+		goto err_unmap;
 
-	smmu->soc->ops->flush_dcache(as->pd, 0, SMMU_SIZE_PD);
-	smmu_flush_ptc(smmu, as->pd, 0);
+	smmu_flush_ptc(smmu, as->pd_dma, 0);
 	smmu_flush_tlb_asid(smmu, as->id);
 
 	smmu_writel(smmu, as->id & 0x7f, SMMU_PTB_ASID);
-	value = SMMU_PTB_DATA_VALUE(as->pd, as->attr);
+	value = SMMU_PTB_DATA_VALUE(as->pd_dma, as->attr);
 	smmu_writel(smmu, value, SMMU_PTB_DATA);
 	smmu_flush(smmu);
 
@@ -393,6 +418,10 @@
 	as->use_count++;
 
 	return 0;
+
+err_unmap:
+	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+	return err;
 }
 
 static void tegra_smmu_as_unprepare(struct tegra_smmu *smmu,
@@ -402,6 +431,9 @@
 		return;
 
 	tegra_smmu_free_asid(smmu, as->id);
+
+	dma_unmap_page(smmu->dev, as->pd_dma, SMMU_SIZE_PD, DMA_TO_DEVICE);
+
 	as->smmu = NULL;
 }
 
@@ -465,96 +497,155 @@
 	}
 }
 
-static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
-		       struct page **pagep)
+static void tegra_smmu_set_pde(struct tegra_smmu_as *as, unsigned long iova,
+			       u32 value)
 {
-	u32 *pd = page_address(as->pd), *pt, *count;
-	u32 pde = (iova >> SMMU_PDE_SHIFT) & 0x3ff;
-	u32 pte = (iova >> SMMU_PTE_SHIFT) & 0x3ff;
+	unsigned int pd_index = iova_pd_index(iova);
 	struct tegra_smmu *smmu = as->smmu;
-	struct page *page;
-	unsigned int i;
+	u32 *pd = page_address(as->pd);
+	unsigned long offset = pd_index * sizeof(*pd);
 
-	if (pd[pde] == 0) {
-		page = alloc_page(GFP_KERNEL | __GFP_DMA);
+	/* Set the page directory entry first */
+	pd[pd_index] = value;
+
+	/* The flush the page directory entry from caches */
+	dma_sync_single_range_for_device(smmu->dev, as->pd_dma, offset,
+					 sizeof(*pd), DMA_TO_DEVICE);
+
+	/* And flush the iommu */
+	smmu_flush_ptc(smmu, as->pd_dma, offset);
+	smmu_flush_tlb_section(smmu, as->id, iova);
+	smmu_flush(smmu);
+}
+
+static u32 *tegra_smmu_pte_offset(struct page *pt_page, unsigned long iova)
+{
+	u32 *pt = page_address(pt_page);
+
+	return pt + iova_pt_index(iova);
+}
+
+static u32 *tegra_smmu_pte_lookup(struct tegra_smmu_as *as, unsigned long iova,
+				  dma_addr_t *dmap)
+{
+	unsigned int pd_index = iova_pd_index(iova);
+	struct page *pt_page;
+	u32 *pd;
+
+	pt_page = as->pts[pd_index];
+	if (!pt_page)
+		return NULL;
+
+	pd = page_address(as->pd);
+	*dmap = smmu_pde_to_dma(pd[pd_index]);
+
+	return tegra_smmu_pte_offset(pt_page, iova);
+}
+
+static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
+		       dma_addr_t *dmap)
+{
+	unsigned int pde = iova_pd_index(iova);
+	struct tegra_smmu *smmu = as->smmu;
+
+	if (!as->pts[pde]) {
+		struct page *page;
+		dma_addr_t dma;
+
+		page = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
 		if (!page)
 			return NULL;
 
-		pt = page_address(page);
-		SetPageReserved(page);
+		dma = dma_map_page(smmu->dev, page, 0, SMMU_SIZE_PT,
+				   DMA_TO_DEVICE);
+		if (dma_mapping_error(smmu->dev, dma)) {
+			__free_page(page);
+			return NULL;
+		}
 
-		for (i = 0; i < SMMU_NUM_PTE; i++)
-			pt[i] = 0;
+		if (!smmu_dma_addr_valid(smmu, dma)) {
+			dma_unmap_page(smmu->dev, dma, SMMU_SIZE_PT,
+				       DMA_TO_DEVICE);
+			__free_page(page);
+			return NULL;
+		}
 
-		smmu->soc->ops->flush_dcache(page, 0, SMMU_SIZE_PT);
+		as->pts[pde] = page;
 
-		pd[pde] = SMMU_MK_PDE(page, SMMU_PDE_ATTR | SMMU_PDE_NEXT);
+		tegra_smmu_set_pde(as, iova, SMMU_MK_PDE(dma, SMMU_PDE_ATTR |
+							      SMMU_PDE_NEXT));
 
-		smmu->soc->ops->flush_dcache(as->pd, pde << 2, 4);
-		smmu_flush_ptc(smmu, as->pd, pde << 2);
-		smmu_flush_tlb_section(smmu, as->id, iova);
-		smmu_flush(smmu);
+		*dmap = dma;
 	} else {
-		page = pfn_to_page(pd[pde] & smmu->pfn_mask);
-		pt = page_address(page);
+		u32 *pd = page_address(as->pd);
+
+		*dmap = smmu_pde_to_dma(pd[pde]);
 	}
 
-	*pagep = page;
-
-	/* Keep track of entries in this page table. */
-	count = page_address(as->count);
-	if (pt[pte] == 0)
-		count[pde]++;
-
-	return &pt[pte];
+	return tegra_smmu_pte_offset(as->pts[pde], iova);
 }
 
-static void as_put_pte(struct tegra_smmu_as *as, dma_addr_t iova)
+static void tegra_smmu_pte_get_use(struct tegra_smmu_as *as, unsigned long iova)
 {
-	u32 pde = (iova >> SMMU_PDE_SHIFT) & 0x3ff;
-	u32 pte = (iova >> SMMU_PTE_SHIFT) & 0x3ff;
-	u32 *count = page_address(as->count);
-	u32 *pd = page_address(as->pd), *pt;
-	struct page *page;
+	unsigned int pd_index = iova_pd_index(iova);
 
-	page = pfn_to_page(pd[pde] & as->smmu->pfn_mask);
-	pt = page_address(page);
+	as->count[pd_index]++;
+}
+
+static void tegra_smmu_pte_put_use(struct tegra_smmu_as *as, unsigned long iova)
+{
+	unsigned int pde = iova_pd_index(iova);
+	struct page *page = as->pts[pde];
 
 	/*
 	 * When no entries in this page table are used anymore, return the
 	 * memory page to the system.
 	 */
-	if (pt[pte] != 0) {
-		if (--count[pde] == 0) {
-			ClearPageReserved(page);
-			__free_page(page);
-			pd[pde] = 0;
-		}
+	if (--as->count[pde] == 0) {
+		struct tegra_smmu *smmu = as->smmu;
+		u32 *pd = page_address(as->pd);
+		dma_addr_t pte_dma = smmu_pde_to_dma(pd[pde]);
 
-		pt[pte] = 0;
+		tegra_smmu_set_pde(as, iova, 0);
+
+		dma_unmap_page(smmu->dev, pte_dma, SMMU_SIZE_PT, DMA_TO_DEVICE);
+		__free_page(page);
+		as->pts[pde] = NULL;
 	}
 }
 
+static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
+			       u32 *pte, dma_addr_t pte_dma, u32 val)
+{
+	struct tegra_smmu *smmu = as->smmu;
+	unsigned long offset = offset_in_page(pte);
+
+	*pte = val;
+
+	dma_sync_single_range_for_device(smmu->dev, pte_dma, offset,
+					 4, DMA_TO_DEVICE);
+	smmu_flush_ptc(smmu, pte_dma, offset);
+	smmu_flush_tlb_group(smmu, as->id, iova);
+	smmu_flush(smmu);
+}
+
 static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
 			  phys_addr_t paddr, size_t size, int prot)
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
-	struct tegra_smmu *smmu = as->smmu;
-	unsigned long offset;
-	struct page *page;
+	dma_addr_t pte_dma;
 	u32 *pte;
 
-	pte = as_get_pte(as, iova, &page);
+	pte = as_get_pte(as, iova, &pte_dma);
 	if (!pte)
 		return -ENOMEM;
 
-	*pte = __phys_to_pfn(paddr) | SMMU_PTE_ATTR;
-	offset = offset_in_page(pte);
+	/* If we aren't overwriting a pre-existing entry, increment use */
+	if (*pte == 0)
+		tegra_smmu_pte_get_use(as, iova);
 
-	smmu->soc->ops->flush_dcache(page, offset, 4);
-	smmu_flush_ptc(smmu, page, offset);
-	smmu_flush_tlb_group(smmu, as->id, iova);
-	smmu_flush(smmu);
+	tegra_smmu_set_pte(as, iova, pte, pte_dma,
+			   __phys_to_pfn(paddr) | SMMU_PTE_ATTR);
 
 	return 0;
 }
@@ -563,22 +654,15 @@
 			       size_t size)
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
-	struct tegra_smmu *smmu = as->smmu;
-	unsigned long offset;
-	struct page *page;
+	dma_addr_t pte_dma;
 	u32 *pte;
 
-	pte = as_get_pte(as, iova, &page);
-	if (!pte)
+	pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+	if (!pte || !*pte)
 		return 0;
 
-	offset = offset_in_page(pte);
-	as_put_pte(as, iova);
-
-	smmu->soc->ops->flush_dcache(page, offset, 4);
-	smmu_flush_ptc(smmu, page, offset);
-	smmu_flush_tlb_group(smmu, as->id, iova);
-	smmu_flush(smmu);
+	tegra_smmu_set_pte(as, iova, pte, pte_dma, 0);
+	tegra_smmu_pte_put_use(as, iova);
 
 	return size;
 }
@@ -587,11 +671,14 @@
 					   dma_addr_t iova)
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
-	struct page *page;
 	unsigned long pfn;
+	dma_addr_t pte_dma;
 	u32 *pte;
 
-	pte = as_get_pte(as, iova, &page);
+	pte = tegra_smmu_pte_lookup(as, iova, &pte_dma);
+	if (!pte || !*pte)
+		return 0;
+
 	pfn = *pte & as->smmu->pfn_mask;
 
 	return PFN_PHYS(pfn);
@@ -816,6 +903,9 @@
 	smmu->pfn_mask = BIT_MASK(mc->soc->num_address_bits - PAGE_SHIFT) - 1;
 	dev_dbg(dev, "address bits: %u, PFN mask: %#lx\n",
 		mc->soc->num_address_bits, smmu->pfn_mask);
+	smmu->tlb_mask = (smmu->soc->num_tlb_lines << 1) - 1;
+	dev_dbg(dev, "TLB lines: %u, mask: %#lx\n", smmu->soc->num_tlb_lines,
+		smmu->tlb_mask);
 
 	value = SMMU_PTC_CONFIG_ENABLE | SMMU_PTC_CONFIG_INDEX_MAP(0x3f);
 
@@ -825,14 +915,14 @@
 	smmu_writel(smmu, value, SMMU_PTC_CONFIG);
 
 	value = SMMU_TLB_CONFIG_HIT_UNDER_MISS |
-		SMMU_TLB_CONFIG_ACTIVE_LINES(0x20);
+		SMMU_TLB_CONFIG_ACTIVE_LINES(smmu);
 
 	if (soc->supports_round_robin_arbitration)
 		value |= SMMU_TLB_CONFIG_ROUND_ROBIN_ARBITRATION;
 
 	smmu_writel(smmu, value, SMMU_TLB_CONFIG);
 
-	smmu_flush_ptc(smmu, NULL, 0);
+	smmu_flush_ptc_all(smmu);
 	smmu_flush_tlb(smmu);
 	smmu_writel(smmu, SMMU_CONFIG_ENABLE, SMMU_CONFIG);
 	smmu_flush(smmu);
diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c
index c8765db..ba8fff3 100644
--- a/drivers/memory/tegra/tegra114.c
+++ b/drivers/memory/tegra/tegra114.c
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra114-mc.h>
 
 #include "mc.h"
@@ -914,20 +912,6 @@
 	{ .name = "tsec",      .swgroup = TEGRA_SWGROUP_TSEC,      .reg = 0x294 },
 };
 
-static void tegra114_flush_dcache(struct page *page, unsigned long offset,
-				  size_t size)
-{
-	phys_addr_t phys = page_to_phys(page) + offset;
-	void *virt = page_address(page) + offset;
-
-	__cpuc_flush_dcache_area(virt, size);
-	outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra114_smmu_ops = {
-	.flush_dcache = tegra114_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra114_smmu_soc = {
 	.clients = tegra114_mc_clients,
 	.num_clients = ARRAY_SIZE(tegra114_mc_clients),
@@ -935,8 +919,8 @@
 	.num_swgroups = ARRAY_SIZE(tegra114_swgroups),
 	.supports_round_robin_arbitration = false,
 	.supports_request_limit = false,
+	.num_tlb_lines = 32,
 	.num_asids = 4,
-	.ops = &tegra114_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra114_mc_soc = {
diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c
index 060fb3d..21e7255 100644
--- a/drivers/memory/tegra/tegra124.c
+++ b/drivers/memory/tegra/tegra124.c
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra124-mc.h>
 
 #include "mc.h"
@@ -1002,20 +1000,6 @@
 };
 
 #ifdef CONFIG_ARCH_TEGRA_124_SOC
-static void tegra124_flush_dcache(struct page *page, unsigned long offset,
-				  size_t size)
-{
-	phys_addr_t phys = page_to_phys(page) + offset;
-	void *virt = page_address(page) + offset;
-
-	__cpuc_flush_dcache_area(virt, size);
-	outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra124_smmu_ops = {
-	.flush_dcache = tegra124_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra124_smmu_soc = {
 	.clients = tegra124_mc_clients,
 	.num_clients = ARRAY_SIZE(tegra124_mc_clients),
@@ -1024,7 +1008,6 @@
 	.supports_round_robin_arbitration = true,
 	.supports_request_limit = true,
 	.num_asids = 128,
-	.ops = &tegra124_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra124_mc_soc = {
@@ -1040,18 +1023,6 @@
 #endif /* CONFIG_ARCH_TEGRA_124_SOC */
 
 #ifdef CONFIG_ARCH_TEGRA_132_SOC
-static void tegra132_flush_dcache(struct page *page, unsigned long offset,
-				  size_t size)
-{
-	void *virt = page_address(page) + offset;
-
-	__flush_dcache_area(virt, size);
-}
-
-static const struct tegra_smmu_ops tegra132_smmu_ops = {
-	.flush_dcache = tegra132_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra132_smmu_soc = {
 	.clients = tegra124_mc_clients,
 	.num_clients = ARRAY_SIZE(tegra124_mc_clients),
@@ -1059,8 +1030,8 @@
 	.num_swgroups = ARRAY_SIZE(tegra124_swgroups),
 	.supports_round_robin_arbitration = true,
 	.supports_request_limit = true,
+	.num_tlb_lines = 32,
 	.num_asids = 128,
-	.ops = &tegra132_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra132_mc_soc = {
diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c
index 52e16c7..b447378 100644
--- a/drivers/memory/tegra/tegra30.c
+++ b/drivers/memory/tegra/tegra30.c
@@ -9,8 +9,6 @@
 #include <linux/of.h>
 #include <linux/mm.h>
 
-#include <asm/cacheflush.h>
-
 #include <dt-bindings/memory/tegra30-mc.h>
 
 #include "mc.h"
@@ -936,20 +934,6 @@
 	{ .name = "isp",  .swgroup = TEGRA_SWGROUP_ISP,  .reg = 0x258 },
 };
 
-static void tegra30_flush_dcache(struct page *page, unsigned long offset,
-				 size_t size)
-{
-	phys_addr_t phys = page_to_phys(page) + offset;
-	void *virt = page_address(page) + offset;
-
-	__cpuc_flush_dcache_area(virt, size);
-	outer_flush_range(phys, phys + size);
-}
-
-static const struct tegra_smmu_ops tegra30_smmu_ops = {
-	.flush_dcache = tegra30_flush_dcache,
-};
-
 static const struct tegra_smmu_soc tegra30_smmu_soc = {
 	.clients = tegra30_mc_clients,
 	.num_clients = ARRAY_SIZE(tegra30_mc_clients),
@@ -957,8 +941,8 @@
 	.num_swgroups = ARRAY_SIZE(tegra30_swgroups),
 	.supports_round_robin_arbitration = false,
 	.supports_request_limit = false,
+	.num_tlb_lines = 16,
 	.num_asids = 4,
-	.ops = &tegra30_smmu_ops,
 };
 
 const struct tegra_mc_soc tegra30_mc_soc = {
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index d9a366d..6240063 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -344,7 +344,7 @@
 
 #ifdef CONFIG_INTEL_IOMMU
 	unsigned long 	*domain_ids; /* bitmap of domains */
-	struct dmar_domain **domains; /* ptr to domains */
+	struct dmar_domain ***domains; /* ptr to domains */
 	spinlock_t	lock; /* protect context, domain ids */
 	struct root_entry *root_entry; /* virtual address */
 
diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h
index 370f2909..44202ff 100644
--- a/include/soc/tegra/mc.h
+++ b/include/soc/tegra/mc.h
@@ -51,11 +51,6 @@
 	unsigned int reg;
 };
 
-struct tegra_smmu_ops {
-	void (*flush_dcache)(struct page *page, unsigned long offset,
-			     size_t size);
-};
-
 struct tegra_smmu_soc {
 	const struct tegra_mc_client *clients;
 	unsigned int num_clients;
@@ -66,9 +61,8 @@
 	bool supports_round_robin_arbitration;
 	bool supports_request_limit;
 
+	unsigned int num_tlb_lines;
 	unsigned int num_asids;
-
-	const struct tegra_smmu_ops *ops;
 };
 
 struct tegra_mc;