/*
* SPDX-License-Identifier: GPL-2.0
* Copyright (C) 2022 Intel Corporation
*/
#include <linux/intel-iommu.h>
#include <pkvm_spinlock.h>
#include <pkvm.h>
#include <gfp.h>
#include "pkvm_hyp.h"
#include "memory.h"
#include "mmu.h"
#include "ept.h"
#include "pgtable.h"
#include "iommu_internal.h"
#include "debug.h"
#include "ptdev.h"
#define for_each_valid_iommu(p) \
for ((p) = iommus; (p) < iommus + PKVM_MAX_IOMMU_NUM; (p)++) \
if (!(p) || !(p)->iommu.reg_phys) { \
continue; \
} else
static struct pkvm_iommu iommus[PKVM_MAX_IOMMU_NUM];
static struct pkvm_pool iommu_pool;
/*
* Guest page table walking parameter.
* pkvm IOMMU driver walks the guest page table when syncing
* with the shadow page table.
*/
struct pgt_sync_walk_data {
struct pkvm_iommu *iommu;
/*
* Holds the shadow page table physical address used
* to sync shadow entries at each page table level.
*/
u64 shadow_pa[IOMMU_SM_LEVEL_NUM];
/*
* When set to a non-zero did value, only the shadow
* page table entries matching this did are synced.
*/
u16 did;
};
#define DEFINE_PGT_SYNC_WALK_DATA(name, _iommu, domain_id) \
struct pgt_sync_walk_data (name) = { \
.iommu = (_iommu), \
.shadow_pa = {0}, \
.did = (domain_id), \
}
/*
* Used to configure a shadow page table entry at the
* root/context/pasid level.
*/
struct pgt_sync_data {
union {
u64 root_entry;
struct context_entry ct_entry;
struct pasid_dir_entry pd_entry;
struct pasid_entry p_entry;
};
void *guest_ptep;
void *shadow_ptep;
int level;
u64 iommu_ecap;
u64 shadow_pa;
struct pkvm_pgtable *spgt;
unsigned long vaddr;
};
static inline void *iommu_zalloc_pages(size_t size)
{
return pkvm_alloc_pages(&iommu_pool, get_order(size));
}
static void *iommu_zalloc_page(void)
{
return pkvm_alloc_pages(&iommu_pool, 0);
}
static void iommu_get_page(void *vaddr)
{
pkvm_get_page(&iommu_pool, vaddr);
}
static void iommu_put_page(void *vaddr)
{
pkvm_put_page(&iommu_pool, vaddr);
}
static void iommu_flush_cache(void *ptep, unsigned int size)
{
pkvm_clflush_cache_range(ptep, size);
}
static struct pkvm_mm_ops viommu_mm_ops = {
.phys_to_virt = host_gpa2hva,
};
static struct pkvm_mm_ops iommu_pw_coherency_mm_ops = {
.phys_to_virt = pkvm_phys_to_virt,
.virt_to_phys = pkvm_virt_to_phys,
.zalloc_page = iommu_zalloc_page,
.get_page = iommu_get_page,
.put_page = iommu_put_page,
.page_count = pkvm_page_count,
};
static struct pkvm_mm_ops iommu_pw_noncoherency_mm_ops = {
.phys_to_virt = pkvm_phys_to_virt,
.virt_to_phys = pkvm_virt_to_phys,
.zalloc_page = iommu_zalloc_page,
.get_page = iommu_get_page,
.put_page = iommu_put_page,
.page_count = pkvm_page_count,
.flush_cache = iommu_flush_cache,
};
static bool iommu_id_entry_present(void *ptep)
{
u64 val;
val = *(u64 *)ptep;
return !!(val & 1);
}
static unsigned long iommu_id_entry_to_phys(void *ptep)
{
u64 val = *(u64 *)ptep;
return val & VTD_PAGE_MASK;
}
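/*
 * In scalable mode the walker's virtual address encodes the identity of a
 * translation: the bus number in the top bits (SM_BUS_SHIFT), the devfn
 * below it (DEVFN_SHIFT) and the PASID in the low bits. Each level of the
 * identity page table extracts its table index from its own bit field.
 */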
static int iommu_sm_id_entry_to_index(unsigned long vaddr, int level)
{
switch (level) {
case IOMMU_PASID_TABLE:
return vaddr & (BIT(PASIDDIR_BITS) - 1);
case IOMMU_PASID_DIR:
return (vaddr >> PASIDDIR_SHIFT) & (BIT(PASIDDIR_BITS) - 1);
case IOMMU_SM_CONTEXT:
return (vaddr >> DEVFN_SHIFT) & (BIT(SM_DEVFN_BITS) - 1);
case IOMMU_SM_ROOT:
return (vaddr >> SM_BUS_SHIFT) & (BIT(SM_BUS_BITS) - 1);
default:
break;
}
return -EINVAL;
}
static bool iommu_id_entry_is_leaf(void *ptep, int level)
{
if (LAST_LEVEL(level) ||
!iommu_id_entry_present(ptep))
return true;
return false;
}
static int iommu_sm_id_level_entry_size(int level)
{
switch (level) {
case IOMMU_PASID_TABLE:
return sizeof(struct pasid_entry);
case IOMMU_PASID_DIR:
return sizeof(struct pasid_dir_entry);
case IOMMU_SM_CONTEXT:
/* scalable mode requires 32 bytes per context entry */
return sizeof(struct context_entry) * 2;
case IOMMU_SM_ROOT:
return sizeof(u64);
default:
break;
}
return -EINVAL;
}
static int iommu_sm_id_level_to_entries(int level)
{
switch (level) {
case IOMMU_PASID_TABLE:
return 1 << PASIDTAB_BITS;
case IOMMU_PASID_DIR:
return 1 << PASIDDIR_BITS;
case IOMMU_SM_CONTEXT:
return 1 << SM_DEVFN_BITS;
case IOMMU_SM_ROOT:
return 1 << SM_BUS_BITS;
default:
break;
}
return -EINVAL;
}
static unsigned long iommu_sm_id_level_to_size(int level)
{
switch (level) {
case IOMMU_PASID_TABLE:
return 1;
case IOMMU_PASID_DIR:
return 1 << PASIDDIR_SHIFT;
case IOMMU_SM_CONTEXT:
return 1 << DEVFN_SHIFT;
case IOMMU_SM_ROOT:
return 1 << SM_BUS_SHIFT;
default:
break;
}
return 0;
}
struct pkvm_pgtable_ops iommu_sm_id_ops = {
.pgt_entry_present = iommu_id_entry_present,
.pgt_entry_to_phys = iommu_id_entry_to_phys,
.pgt_entry_to_index = iommu_sm_id_entry_to_index,
.pgt_entry_is_leaf = iommu_id_entry_is_leaf,
.pgt_level_entry_size = iommu_sm_id_level_entry_size,
.pgt_level_to_entries = iommu_sm_id_level_to_entries,
.pgt_level_to_size = iommu_sm_id_level_to_size,
};
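/*
 * In legacy mode the walker's virtual address only encodes the bus number
 * (LM_BUS_SHIFT) and the devfn (LM_DEVFN_SHIFT); there is no PASID level.
 */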
static int iommu_lm_id_entry_to_index(unsigned long vaddr, int level)
{
switch (level) {
case IOMMU_LM_CONTEXT:
return (vaddr >> LM_DEVFN_SHIFT) & (BIT(LM_DEVFN_BITS) - 1);
case IOMMU_LM_ROOT:
return (vaddr >> LM_BUS_SHIFT) & (BIT(LM_BUS_BITS) - 1);
default:
break;
}
return -EINVAL;
}
static int iommu_lm_id_level_entry_size(int level)
{
switch (level) {
case IOMMU_LM_CONTEXT:
return sizeof(struct context_entry);
case IOMMU_LM_ROOT:
return sizeof(struct root_entry);
default:
break;
}
return -EINVAL;
}
static int iommu_lm_id_level_to_entries(int level)
{
switch (level) {
case IOMMU_LM_CONTEXT:
return 1 << LM_DEVFN_BITS;
case IOMMU_LM_ROOT:
return 1 << LM_BUS_BITS;
default:
break;
}
return -EINVAL;
}
static unsigned long iommu_lm_id_level_to_size(int level)
{
switch (level) {
case IOMMU_LM_CONTEXT:
return 1 << LM_DEVFN_SHIFT;
case IOMMU_LM_ROOT:
return 1 << LM_BUS_SHIFT;
default:
break;
}
return 0;
}
struct pkvm_pgtable_ops iommu_lm_id_ops = {
.pgt_entry_present = iommu_id_entry_present,
.pgt_entry_to_phys = iommu_id_entry_to_phys,
.pgt_entry_to_index = iommu_lm_id_entry_to_index,
.pgt_entry_is_leaf = iommu_id_entry_is_leaf,
.pgt_level_entry_size = iommu_lm_id_level_entry_size,
.pgt_level_to_entries = iommu_lm_id_level_to_entries,
.pgt_level_to_size = iommu_lm_id_level_to_size,
};
static int iommu_pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr,
unsigned long vaddr_end, struct pkvm_pgtable_walker *walker)
{
if (!pgt->root_pa)
return 0;
return pgtable_walk(pgt, vaddr, vaddr_end - vaddr, false, walker);
}
/* Present the root entry when shadow_pa is valid, otherwise un-present it */
static bool sync_root_entry(struct pgt_sync_data *sdata)
{
u64 *sre = sdata->shadow_ptep;
u64 sre_val = sdata->shadow_pa ? (sdata->shadow_pa | 1) : 0;
if (READ_ONCE(*sre) != sre_val) {
WRITE_ONCE(*sre, sre_val);
return true;
}
return false;
}
/* Sync the context entry when guest_ptep & shadow_pa are valid, otherwise un-present it */
static bool sync_shadow_context_entry(struct pgt_sync_data *sdata)
{
struct context_entry *shadow_ce = sdata->shadow_ptep, tmp = {0};
struct context_entry *guest_ce = sdata->guest_ptep;
bool updated = false;
u8 aw;
if (sdata->guest_ptep && sdata->shadow_pa) {
tmp.hi = guest_ce->hi;
tmp.lo = sdata->shadow_pa | (guest_ce->lo & 0xfff);
if (ecap_smts(sdata->iommu_ecap))
/* Clear DTE to make sure device TLB is disabled for security */
context_clear_dte(&tmp);
else {
/*
* Set translation type to CONTEXT_TT_MULTI_LEVEL to ensure using
* 2nd-level translation and to disable device TLB for security.
*/
context_lm_set_tt(&tmp, CONTEXT_TT_MULTI_LEVEL);
/*
* For now, set the address width only when host IOMMU driver
* is using pass-through mode.
* FIXME: Once shadow 2nd-stage page tables are supported by pKVM,
* set the address width in all cases.
*/
if (sdata->shadow_pa == pkvm_hyp->host_vm.ept->root_pa) {
aw = (pkvm_hyp->ept_iommu_pgt_level == 4) ? 2 : 3;
context_lm_set_aw(&tmp, aw);
}
}
}
if (READ_ONCE(shadow_ce->hi) != tmp.hi) {
WRITE_ONCE(shadow_ce->hi, tmp.hi);
updated = true;
}
if (READ_ONCE(shadow_ce->lo) != tmp.lo) {
WRITE_ONCE(shadow_ce->lo, tmp.lo);
updated = true;
}
return updated;
}
/* Sync the pasid dir entry when guest_ptep & shadow_pa are valid, otherwise un-present it */
static bool sync_shadow_pasid_dir_entry(struct pgt_sync_data *sdata)
{
struct pasid_dir_entry *shadow_pde = sdata->shadow_ptep;
u64 val = 0;
if (sdata->guest_ptep && sdata->shadow_pa) {
struct pasid_dir_entry *guest_pde = sdata->guest_ptep;
val = guest_pde->val & (PASID_PTE_FPD | PASID_PTE_PRESENT);
val |= sdata->shadow_pa;
}
if (READ_ONCE(shadow_pde->val) != val) {
WRITE_ONCE(shadow_pde->val, val);
return true;
}
return false;
}
static int iommu_audit_did(struct pkvm_iommu *iommu, u16 did, int shadow_vm_handle)
{
struct pkvm_ptdev *tmp;
int ret = 0;
list_for_each_entry(tmp, &iommu->ptdev_head, iommu_node) {
if (tmp->shadow_vm_handle != shadow_vm_handle) {
if (tmp->did == did) {
/*
* Devices that belong to different VMs but sit behind
* the same IOMMU cannot use the same did.
*/
ret = -EPERM;
break;
}
}
}
return ret;
}
static struct pkvm_ptdev *iommu_find_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid)
{
struct pkvm_ptdev *p;
list_for_each_entry(p, &iommu->ptdev_head, iommu_node) {
if (match_ptdev(p, bdf, pasid))
return p;
}
return NULL;
}
static struct pkvm_ptdev *iommu_add_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid)
{
struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid);
if (!ptdev)
return NULL;
list_add_tail(&ptdev->iommu_node, &iommu->ptdev_head);
return ptdev;
}
static void iommu_del_ptdev(struct pkvm_iommu *iommu, struct pkvm_ptdev *ptdev)
{
list_del_init(&ptdev->iommu_node);
pkvm_put_ptdev(ptdev);
}
/* Sync the pasid table entry when guest_ptep is valid, otherwise un-present it */
static bool sync_shadow_pasid_table_entry(struct pgt_sync_data *sdata)
{
u16 bdf = sdata->vaddr >> DEVFN_SHIFT;
u32 pasid = sdata->vaddr & ((1UL << MAX_NR_PASID_BITS) - 1);
struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->spgt);
struct pkvm_ptdev *ptdev = iommu_find_ptdev(iommu, bdf, pasid);
struct pasid_entry *shadow_pte = sdata->shadow_ptep, tmp_pte = {0};
struct pasid_entry *guest_pte;
bool synced = false;
u64 type, aw;
if (!ptdev) {
ptdev = iommu_add_ptdev(iommu, bdf, pasid);
if (!ptdev)
return false;
}
if (!sdata->guest_ptep) {
if (pasid_pte_is_present(shadow_pte)) {
/*
* Making a pasid entry non-present requires removing
* the corresponding ptdev from this IOMMU: reset the
* ptdev's vpgt/did and delete the ptdev from this
* iommu's list.
*/
pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL);
pkvm_setup_ptdev_did(ptdev, 0);
iommu_del_ptdev(iommu, ptdev);
synced = pasid_copy_entry(shadow_pte, &tmp_pte);
}
return synced;
}
guest_pte = sdata->guest_ptep;
type = pasid_pte_get_pgtt(guest_pte);
if (type == PASID_ENTRY_PGTT_FL_ONLY) {
struct pkvm_pgtable_cap cap;
if (ptdev_attached_to_vm(ptdev))
/*
* For an attached ptdev, use SL-only mode with
* ptdev->pgt so that the translation is fully
* controlled by pkvm.
*/
type = PASID_ENTRY_PGTT_SL_ONLY;
else
/*
* For any other ptdev, pkvm IOMMU uses nested
* translation to add one more translation layer,
* the primary VM's EPT, to guarantee the
* protection.
*/
type = PASID_ENTRY_PGTT_NESTED;
/* The ptdev vpgt can be initialized with the guest first-level page table pointer (flptr) */
cap.level = pasid_get_flpm(guest_pte) == 0 ? 4 : 5;
cap.allowed_pgsz = pkvm_hyp->mmu_cap.allowed_pgsz;
pkvm_setup_ptdev_vpgt(ptdev, pasid_get_flptr(guest_pte),
&viommu_mm_ops, &mmu_ops, &cap);
} else if (type == PASID_ENTRY_PGTT_PT) {
/*
* When the host IOMMU driver is using pass-through mode, pkvm
* IOMMU actually uses second-level-only translation
* to guarantee the protection. This second level is also
* the EPT.
*/
type = PASID_ENTRY_PGTT_SL_ONLY;
} else {
/*
* The host IOMMU driver in a pkvm-enabled kernel has
* already been configured to use first-level-only or
* pass-through mode, so no other mode is expected. In
* case it happens anyway, reset the ptdev vpgt/did while
* keeping the ptdev linked to this IOMMU, and clear the
* shadow entry so the mode is not supported.
*/
pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL);
pkvm_setup_ptdev_did(ptdev, 0);
pkvm_err("pkvm: unsupported pasid type %lld\n", type);
return pasid_copy_entry(shadow_pte, &tmp_pte);
}
pkvm_setup_ptdev_did(ptdev, pasid_get_domain_id(guest_pte));
if (iommu_audit_did(iommu, ptdev->did, ptdev->shadow_vm_handle))
/*
* It is possible that this ptdev will be attached to a protected
* VM, so the primary VM allocates the same did used by that
* protected VM and issues a TLB flush. At this moment the ptdev
* is not attached yet, so the audit fails. In this case, skip
* the sync of this pasid table entry; it will be synced again
* when the ptdev is attached.
*
* It is also possible that this ptdev was just detached from a
* protected VM but still uses its previous did because the
* primary VM has not reconfigured the ptdev yet. The did of this
* ptdev is then still the same as the did used by other ptdevs
* not yet detached. In this case, skip the sync of this pasid
* table entry; it will be synced again when the primary VM
* reconfigures the ptdev.
*
* If neither case applies and the primary VM does this on
* purpose, also skip syncing the pasid table entry to guarantee
* the isolation.
*/
return false;
/*
* ptdev->pgt will be used as second-level translation table
* which should be EPT format.
*/
if (!is_pgt_ops_ept(ptdev->pgt))
return false;
/*
* Copy all the bits from guest_pte. As the translation type is
* re-configured below, some bits inherited from guest_pte will
* simply be ignored by hardware according to the translation
* type.
*/
memcpy(&tmp_pte, guest_pte, sizeof(struct pasid_entry));
pasid_set_page_snoop(&tmp_pte, !!ecap_smpwc(sdata->iommu_ecap));
if (ecap_sc_support(sdata->iommu_ecap))
pasid_set_pgsnp(&tmp_pte);
/*
* Modify the second-level related bits:
* Set PGTT/SLPTR/AW.
* Clear SSADE/SSEE.
* Reuse FPD/P.
*/
pasid_set_translation_type(&tmp_pte, type);
pasid_set_slptr(&tmp_pte, ptdev->pgt->root_pa);
aw = (ptdev->pgt->level == 4) ? 2 : 3;
pasid_set_address_width(&tmp_pte, aw);
pasid_set_ssade(&tmp_pte, 0);
pasid_set_ssee(&tmp_pte, 0);
return pasid_copy_entry(shadow_pte, &tmp_pte);
}
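/*
 * Sync one shadow entry by dispatching to the level-specific handler for
 * scalable or legacy mode. If the entry was modified and the IOMMU is not
 * page-walk coherent, flush the CPU cache for the updated entry.
 */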
static bool iommu_id_sync_entry(struct pgt_sync_data *sdata)
{
bool ret = false;
struct pkvm_pgtable *spgt = sdata->spgt;
if (ecap_smts(sdata->iommu_ecap)) {
switch (sdata->level) {
case IOMMU_PASID_TABLE:
ret = sync_shadow_pasid_table_entry(sdata);
break;
case IOMMU_PASID_DIR:
ret = sync_shadow_pasid_dir_entry(sdata);
break;
case IOMMU_SM_CONTEXT:
ret = sync_shadow_context_entry(sdata);
break;
case IOMMU_SM_ROOT:
ret = sync_root_entry(sdata);
break;
default:
break;
}
} else {
switch (sdata->level) {
case IOMMU_LM_CONTEXT:
ret = sync_shadow_context_entry(sdata);
break;
case IOMMU_LM_ROOT:
ret = sync_root_entry(sdata);
break;
default:
break;
}
}
if (ret) {
int entry_size = spgt->pgt_ops->pgt_level_entry_size(sdata->level);
if (entry_size && spgt->mm_ops->flush_cache)
spgt->mm_ops->flush_cache(sdata->shadow_ptep, entry_size);
}
return ret;
}
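/*
 * Initialize the IOMMU page tables: point the virtual IOMMU page table at
 * the guest root table currently programmed in DMAR_RTADDR_REG, and create
 * the shadow page table owned by pkvm. Scalable vs legacy mode selects the
 * paging structure ops, and page-walk coherency selects whether CPU cache
 * flushing is needed when shadow entries are modified.
 */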
static int initialize_iommu_pgt(struct pkvm_iommu *iommu)
{
struct pkvm_pgtable *pgt = &iommu->pgt;
struct pkvm_pgtable *vpgt = &iommu->viommu.pgt;
static struct pkvm_mm_ops *iommu_mm_ops;
struct pkvm_pgtable_ops *iommu_ops;
struct pkvm_pgtable_cap cap;
u64 grt_pa = readq(iommu->iommu.reg + DMAR_RTADDR_REG) & VTD_PAGE_MASK;
int ret;
if (ecap_smts(iommu->iommu.ecap)) {
cap.level = IOMMU_SM_ROOT;
iommu_ops = &iommu_sm_id_ops;
} else {
cap.level = IOMMU_LM_ROOT;
iommu_ops = &iommu_lm_id_ops;
}
vpgt->root_pa = grt_pa;
ret = pkvm_pgtable_init(vpgt, &viommu_mm_ops, iommu_ops, &cap, false);
if (ret)
return ret;
/*
* For an IOMMU without Page-Walk Coherency, use
* iommu_pw_noncoherency_mm_ops to flush the CPU cache when
* modifying any remapping structure entry.
*
* For an IOMMU with Page-Walk Coherency, use
* iommu_pw_coherency_mm_ops and skip the CPU cache flushing.
*/
if (!ecap_coherent(iommu->iommu.ecap))
iommu_mm_ops = &iommu_pw_noncoherency_mm_ops;
else
iommu_mm_ops = &iommu_pw_coherency_mm_ops;
ret = pkvm_pgtable_init(pgt, iommu_mm_ops, iommu_ops, &cap, true);
if (!ret) {
/*
* Hold an additional reference count to make
* sure the root page won't be freed.
*/
void *root = pgt->mm_ops->phys_to_virt(pgt->root_pa);
pgt->mm_ops->get_page(root);
}
return ret;
}
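/*
 * Early IOMMU setup: initialize the IOMMU page pool, map each IOMMU's
 * register block into pkvm with an uncached mapping, cache CAP/ECAP and the
 * enabled bits of the Global Status register, and unmap the register range
 * from the host EPT (host accesses are then emulated through
 * pkvm_access_iommu()).
 */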
int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages)
{
struct pkvm_iommu_info *info = &pkvm_hyp->iommu_infos[0];
struct pkvm_iommu *piommu = &iommus[0];
int i, ret = pkvm_pool_init(&iommu_pool, mem_base >> PAGE_SHIFT, nr_pages, 0);
if (ret)
return ret;
for (i = 0; i < PKVM_MAX_IOMMU_NUM; piommu++, info++, i++) {
if (!info->reg_phys)
break;
INIT_LIST_HEAD(&piommu->ptdev_head);
pkvm_spinlock_init(&piommu->lock);
piommu->iommu.reg_phys = info->reg_phys;
piommu->iommu.reg_size = info->reg_size;
piommu->iommu.reg = pkvm_iophys_to_virt(info->reg_phys);
if ((unsigned long)piommu->iommu.reg == INVALID_ADDR)
return -ENOMEM;
piommu->iommu.seq_id = i;
ret = pkvm_mmu_map((unsigned long)piommu->iommu.reg,
(unsigned long)info->reg_phys,
info->reg_size, 1 << PG_LEVEL_4K,
PKVM_PAGE_IO_NOCACHE);
if (ret)
return ret;
piommu->iommu.cap = readq(piommu->iommu.reg + DMAR_CAP_REG);
piommu->iommu.ecap = readq(piommu->iommu.reg + DMAR_ECAP_REG);
/* cache the enabled features from Global Status register */
piommu->iommu.gcmd = readl(piommu->iommu.reg + DMAR_GSTS_REG) &
DMAR_GSTS_EN_BITS;
ret = pkvm_host_ept_unmap((unsigned long)info->reg_phys,
(unsigned long)info->reg_phys,
info->reg_size);
if (ret)
return ret;
}
return 0;
}
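/*
 * Page table walker callback used when freeing shadow entries. Leaf entries
 * are simply un-presented; a non-leaf entry is un-presented only once its
 * child page holds no other reference (refcount of 1), in which case the
 * child page is released as well.
 */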
static int free_shadow_id_cb(struct pkvm_pgtable *pgt, unsigned long vaddr,
unsigned long vaddr_end, int level, void *ptep,
unsigned long flags, struct pgt_flush_data *flush_data,
void *const arg)
{
struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops;
struct pkvm_mm_ops *mm_ops = pgt->mm_ops;
struct pgt_sync_data sync_data = {0};
struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(pgt);
void *child_ptep;
/* Nothing to do if the shadow entry is not present */
if (!pgt_ops->pgt_entry_present(ptep))
return 0;
sync_data.shadow_ptep = ptep;
sync_data.level = level;
sync_data.spgt = pgt;
sync_data.iommu_ecap = iommu->iommu.ecap;
sync_data.vaddr = vaddr;
/* Un-present a present leaf entry (PASID table in scalable mode) */
if (LAST_LEVEL(level)) {
if (iommu_id_sync_entry(&sync_data))
mm_ops->put_page(ptep);
return 0;
}
/*
* It's a present entry for PASID dir, context or root.
* If its child page has already been freed (refcnt == 1),
* this entry can be un-presented as well now.
*/
child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep));
if (mm_ops->page_count(child_ptep) == 1) {
if (iommu_id_sync_entry(&sync_data)) {
mm_ops->put_page(ptep);
mm_ops->put_page(child_ptep);
}
}
return 0;
}
/* sync_data != NULL, data != NULL */
static int init_sync_id_data(struct pgt_sync_data *sync_data,
struct pgt_sync_walk_data *data,
struct pkvm_iommu *iommu, void *guest_ptep,
unsigned long vaddr, int level)
{
struct pkvm_pgtable *spgt = &iommu->pgt;
int idx = spgt->pgt_ops->pgt_entry_to_index(vaddr, level);
int entry_size = spgt->pgt_ops->pgt_level_entry_size(level);
if (ecap_smts(iommu->iommu.ecap)) {
switch (level) {
case IOMMU_PASID_TABLE:
sync_data->p_entry = *((struct pasid_entry *)guest_ptep);
sync_data->guest_ptep = &sync_data->p_entry;
break;
case IOMMU_PASID_DIR:
sync_data->pd_entry = *((struct pasid_dir_entry *)guest_ptep);
sync_data->guest_ptep = &sync_data->pd_entry;
break;
case IOMMU_SM_CONTEXT:
sync_data->ct_entry = *((struct context_entry *)guest_ptep);
sync_data->guest_ptep = &sync_data->ct_entry;
break;
case IOMMU_SM_ROOT:
sync_data->root_entry = *((u64 *)guest_ptep);
sync_data->guest_ptep = &sync_data->root_entry;
break;
default:
return -EINVAL;
}
} else {
switch (level) {
case IOMMU_LM_CONTEXT:
sync_data->ct_entry = *((struct context_entry *)guest_ptep);
sync_data->guest_ptep = &sync_data->ct_entry;
break;
case IOMMU_LM_ROOT:
sync_data->root_entry = *((u64 *)guest_ptep);
sync_data->guest_ptep = &sync_data->root_entry;
break;
default:
return -EINVAL;
}
}
/* shadow_pa of current level must be there */
if (!data->shadow_pa[level])
return -EINVAL;
/* get current shadow_ptep */
sync_data->shadow_ptep = spgt->mm_ops->phys_to_virt(data->shadow_pa[level]);
sync_data->shadow_ptep += idx * entry_size;
sync_data->level = level;
sync_data->spgt = spgt;
sync_data->iommu_ecap = iommu->iommu.ecap;
sync_data->shadow_pa = 0;
sync_data->vaddr = vaddr;
return 0;
}
static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr,
unsigned long vaddr_end);
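/*
 * Page table walker callback used when syncing shadow entries from the
 * guest page table. A snapshot of the guest entry is taken first, shadow
 * pages are allocated or freed to match the guest's present bits, and the
 * shadow entry is then rewritten through iommu_id_sync_entry().
 */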
static int sync_shadow_id_cb(struct pkvm_pgtable *vpgt, unsigned long vaddr,
unsigned long vaddr_end, int level, void *ptep,
unsigned long flags, struct pgt_flush_data *flush_data,
void *const arg)
{
struct pkvm_pgtable_ops *vpgt_ops = vpgt->pgt_ops;
struct pgt_sync_walk_data *data = arg;
struct pkvm_iommu *iommu = data->iommu;
struct pkvm_pgtable *spgt = &iommu->pgt;
struct pgt_sync_data sync_data;
void *shadow_ptep, *guest_ptep;
bool shadow_p, guest_p;
int ret = init_sync_id_data(&sync_data, data, iommu, ptep, vaddr, level);
if (ret < 0)
return ret;
guest_ptep = sync_data.guest_ptep;
shadow_ptep = sync_data.shadow_ptep;
/*
* WALK_TABLE_PRE is for non-leaf entries and WALK_LEAF is for
* leaf entries. If they don't match, the guest changed the entry;
* return -EAGAIN to re-walk the page table.
*/
if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE &&
vpgt_ops->pgt_entry_is_leaf(guest_ptep, level)) ||
(flags == PKVM_PGTABLE_WALK_LEAF &&
!vpgt_ops->pgt_entry_is_leaf(guest_ptep, level)))
return -EAGAIN;
shadow_p = spgt->pgt_ops->pgt_entry_present(shadow_ptep);
guest_p = vpgt_ops->pgt_entry_present(guest_ptep);
if (!guest_p) {
if (shadow_p) {
/*
* The guest entry is not present but the shadow entry is,
* so free the shadow to make them consistent.
*/
unsigned long new_vaddr_end = spgt->pgt_ops->pgt_level_to_size(level) +
vaddr;
/*
* Get a reference count before freeing to make sure the current
* page of this level and the pages of its parent levels won't be
* freed, as only the specific sub-level should be freed here.
*/
spgt->mm_ops->get_page(shadow_ptep);
free_shadow_id(iommu, vaddr, new_vaddr_end);
spgt->mm_ops->put_page(shadow_ptep);
}
/*
* Both guest and shadow are now non-present,
* so nothing more needs to be done.
*/
return ret;
}
if (LAST_LEVEL(level)) {
if (ecap_smts(iommu->iommu.ecap)) {
/*
* For PASID_TABLE, a cache invalidation may only want to
* sync the PASID entries whose did matches, so check the
* did before syncing the entry.
*
* According to VT-d spec 6.2.2.1, software must not
* use a domain-id value of 0 when programming
* context-entries on implementations reporting CM=1
* in the Capability register.
*
* So a non-zero did means a real did from host software.
*/
if (data->did && (pasid_get_domain_id(guest_ptep) != data->did))
return ret;
/*
* The shadow_pa used to configure the PASID table entry
* depends on the pgt used by the corresponding ptdev,
* so there is no need to set sync_data.shadow_pa here.
*/
} else {
switch (context_lm_get_tt(guest_ptep)) {
case CONTEXT_TT_MULTI_LEVEL:
case CONTEXT_TT_DEV_IOTLB:
/*
* For now, reference the guest's 2nd-stage paging table.
* FIXME: Reference a shadow paging table once shadowing the
* 2nd-stage paging table is supported by pKVM, to ensure
* the memory protection.
*/
sync_data.shadow_pa = vpgt_ops->pgt_entry_to_phys(guest_ptep);
break;
case CONTEXT_TT_PASS_THROUGH:
/*
* When the host IOMMU driver is using pass-through mode, pkvm
* IOMMU actually uses 2nd-level translation through the
* host EPT to guarantee the memory protection.
*/
sync_data.shadow_pa = pkvm_hyp->host_vm.ept->root_pa;
break;
default:
/*
* Context entry with an unsupported (reserved) value of
* Translation Type shall be non-present to the physical IOMMU.
*/
break;
}
}
} else if (!shadow_p) {
/*
* For a non-present non-leaf entry (root/context/pasid dir),
* a new page needs to be allocated to make this entry
* present. Root and context tables are always a single 4K
* page. As pasid support is fixed at 15 bits, the pasid dir
* also fits in a single 4K page.
*/
void *shadow = spgt->mm_ops->zalloc_page();
if (!shadow)
return -ENOMEM;
/* Get the shadow page physical address of the child level */
sync_data.shadow_pa = spgt->mm_ops->virt_to_phys(shadow);
} else
/*
* For a present non-leaf entry (root/context/pasid dir),
* get the shadow page physical address of its child level.
*/
sync_data.shadow_pa = spgt->pgt_ops->pgt_entry_to_phys(shadow_ptep);
if (iommu_id_sync_entry(&sync_data)) {
if (!shadow_p)
/*
* Changing from non-present to present requires taking
* a new reference count on the shadow page.
*/
spgt->mm_ops->get_page(shadow_ptep);
}
if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE) && (!LAST_LEVEL(level))) {
/*
* As guest page table walking will go to the child level, pass
* the shadow page physical address of the child level to sync.
*/
data->shadow_pa[level - 1] = sync_data.shadow_pa;
}
return ret;
}
static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr,
unsigned long vaddr_end)
{
struct pkvm_pgtable_walker walker = {
.cb = free_shadow_id_cb,
.flags = PKVM_PGTABLE_WALK_LEAF |
PKVM_PGTABLE_WALK_TABLE_POST,
};
/*
* To free shadow entries, walk the shadow IOMMU
* page table itself.
*/
if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES))
return 0;
return iommu_pgtable_walk(&iommu->pgt, vaddr, vaddr_end, &walker);
}
static int sync_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr,
unsigned long vaddr_end, u16 did)
{
DEFINE_PGT_SYNC_WALK_DATA(arg, iommu, did);
struct pkvm_pgtable_walker walker = {
.cb = sync_shadow_id_cb,
.flags = PKVM_PGTABLE_WALK_TABLE_PRE |
PKVM_PGTABLE_WALK_LEAF,
.arg = &arg,
};
int ret, retry_cnt = 0;
if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES))
return 0;
retry:
if (ecap_smts(iommu->iommu.ecap))
arg.shadow_pa[IOMMU_SM_ROOT] = iommu->pgt.root_pa;
else
arg.shadow_pa[IOMMU_LM_ROOT] = iommu->pgt.root_pa;
/*
* To sync the shadow IOMMU page table, walk the guest IOMMU
* page table.
*/
ret = iommu_pgtable_walk(&iommu->viommu.pgt, vaddr, vaddr_end, &walker);
if ((ret == -EAGAIN) && (retry_cnt++ < 5))
goto retry;
return ret;
}
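/*
 * Point the hardware invalidation queue at pkvm's own descriptor ring:
 * disable QI if it is currently enabled, reset the tail register, program
 * IQA with the descriptor width and queue size, then re-enable QI.
 */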
static void enable_qi(struct pkvm_iommu *iommu)
{
void *desc = iommu->qi.desc;
int dw, qs;
u32 sts;
dw = !!ecap_smts(iommu->iommu.ecap);
qs = fls(iommu->qi.free_cnt >> (7 + !dw)) - 1;
/* Disable QI */
sts = readl(iommu->iommu.reg + DMAR_GSTS_REG);
if (sts & DMA_GSTS_QIES) {
iommu->iommu.gcmd &= ~DMA_GCMD_QIE;
writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG,
readl, !(sts & DMA_GSTS_QIES), sts);
}
/* Set tail to 0 */
writel(0, iommu->iommu.reg + DMAR_IQT_REG);
/* Set IQA */
iommu->piommu_iqa = pkvm_virt_to_phys(desc) | (dw << 11) | qs;
writeq(iommu->piommu_iqa, iommu->iommu.reg + DMAR_IQA_REG);
/* Enable QI */
iommu->iommu.gcmd |= DMA_GCMD_QIE;
writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG,
readl, (sts & DMA_GSTS_QIES), sts);
}
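/*
 * Take over the invalidation queue from the host: wait for the host's
 * pending invalidation descriptors to drain, snapshot the virtual QI
 * registers, allocate pkvm's own descriptor ring and status array, and
 * enable QI on top of them.
 */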
static int create_qi_desc(struct pkvm_iommu *iommu)
{
struct pkvm_viommu *viommu = &iommu->viommu;
struct q_inval *qi = &iommu->qi;
void __iomem *reg = iommu->iommu.reg;
pkvm_spinlock_init(&iommu->qi_lock);
/*
* Before switching the descriptor ring, wait for any pending
* invalidation descriptors to complete. According to spec 6.5.2,
* the invalidation queue is considered quiesced when the queue
* is empty (head and tail registers equal) and the last
* completed descriptor is an Invalidation Wait Descriptor
* (which indicates no invalidation requests are pending in hardware).
*/
while (readq(reg + DMAR_IQH_REG) !=
readq(reg + DMAR_IQT_REG))
cpu_relax();
viommu->vreg.iqa = viommu->iqa = readq(reg + DMAR_IQA_REG);
viommu->vreg.iq_head = readq(reg + DMAR_IQH_REG);
viommu->vreg.iq_tail = readq(reg + DMAR_IQT_REG);
if (viommu->vreg.gsts & DMA_GSTS_QIES) {
struct qi_desc *wait_desc;
u64 iqa = viommu->iqa;
int shift = IQ_DESC_SHIFT(iqa);
int offset = ((viommu->vreg.iq_head >> shift) +
IQ_DESC_LEN(iqa) - 1) % IQ_DESC_LEN(iqa);
int *desc_status;
/* Find out the last descriptor */
wait_desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(iqa)) + (offset << shift);
pkvm_dbg("pkvm: viommu iqa 0x%llx head 0x%llx tail 0x%llx qw0 0x%llx qw1 0x%llx",
viommu->vreg.iqa, viommu->vreg.iq_head, viommu->vreg.iq_tail,
wait_desc->qw0, wait_desc->qw1);
if (QI_DESC_TYPE(wait_desc->qw0) != QI_IWD_TYPE) {
pkvm_err("pkvm: %s: expect wait desc but 0x%llx\n",
__func__, wait_desc->qw0);
return -EINVAL;
}
desc_status = pkvm_phys_to_virt(wait_desc->qw1);
/*
* Wait until the wait descriptor is completed.
*
* The desc_status comes from the host. Checking it in pkvm
* relies on the host IOMMU driver not releasing the
* desc_status after it is completed, which is guaranteed
* by the current Linux IOMMU driver.
*/
while (READ_ONCE(*desc_status) == QI_IN_USE)
cpu_relax();
}
qi->free_cnt = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc);
qi->desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE);
if (!qi->desc)
return -ENOMEM;
qi->desc_status = iommu_zalloc_pages(PKVM_QI_DESC_STATUS_ALIGNED_SIZE);
if (!qi->desc_status) {
iommu_put_page(qi->desc);
return -ENOMEM;
}
enable_qi(iommu);
return 0;
}
static int qi_check_fault(struct pkvm_iommu *iommu, int wait_index)
{
u32 fault;
struct q_inval *qi = &iommu->qi;
if (qi->desc_status[wait_index] == QI_ABORT)
return -EAGAIN;
fault = readl(iommu->iommu.reg + DMAR_FSTS_REG);
/*
* If IQE happens, the head points to the descriptor associated
* with the error. No new descriptors are fetched until the IQE
* is cleared.
*/
if (fault & DMA_FSTS_IQE) {
writel(DMA_FSTS_IQE, iommu->iommu.reg + DMAR_FSTS_REG);
pkvm_dbg("pkvm: Invalidation Queue Error (IQE) cleared\n");
}
/*
* If ITE happens, all pending wait_desc commands are aborted.
* No new descriptors are fetched until the ITE is cleared.
*/
if (fault & DMA_FSTS_ITE) {
writel(DMA_FSTS_ITE, iommu->iommu.reg + DMAR_FSTS_REG);
pkvm_dbg("pkvm: Invalidation Time-out Error (ITE) cleared\n");
}
if (fault & DMA_FSTS_ICE) {
writel(DMA_FSTS_ICE, iommu->iommu.reg + DMAR_FSTS_REG);
pkvm_dbg("pkvm: Invalidation Completion Error (ICE) cleared\n");
}
return 0;
}
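/*
 * Copy @count descriptors into pkvm's descriptor ring, append an
 * Invalidation Wait Descriptor, update the tail register and poll the wait
 * descriptor status until hardware completes or reports a fault.
 */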
static void submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count)
{
int len = IQ_DESC_LEN(iommu->piommu_iqa), i, wait_index;
int shift = IQ_DESC_SHIFT(iommu->piommu_iqa);
struct q_inval *qi = &iommu->qi;
struct qi_desc *to, *from;
int required_cnt = count + 2;
void *desc = qi->desc;
int *desc_status, rc;
pkvm_spin_lock(&iommu->qi_lock);
/*
* Detect if the free descriptor count is enough or not
*/
while (qi->free_cnt < required_cnt) {
u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift;
int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len;
int free_cnt = len - busy_cnt;
if (free_cnt >= required_cnt) {
qi->free_cnt = free_cnt;
break;
}
pkvm_spin_unlock(&iommu->qi_lock);
cpu_relax();
pkvm_spin_lock(&iommu->qi_lock);
}
for (i = 0; i < count; i++) {
from = base + i;
to = qi->desc + (((qi->free_head + i) % len) << shift);
to->qw0 = from->qw0;
to->qw1 = from->qw1;
}
wait_index = (qi->free_head + count) % len;
/* setup wait descriptor */
to = desc + (wait_index << shift);
to->qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
desc_status = &qi->desc_status[wait_index];
WRITE_ONCE(*desc_status, QI_IN_USE);
to->qw1 = pkvm_virt_to_phys(desc_status);
/* submit to hardware with wait descriptor */
qi->free_cnt -= count + 1;
qi->free_head = (qi->free_head + count + 1) % len;
writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG);
while (READ_ONCE(*desc_status) != QI_DONE) {
rc = qi_check_fault(iommu, wait_index);
if (rc)
break;
pkvm_spin_unlock(&iommu->qi_lock);
cpu_relax();
pkvm_spin_lock(&iommu->qi_lock);
}
if (*desc_status != QI_DONE)
pkvm_err("pkvm: %s: failed with status %d\n",
__func__, *desc_status);
/* release the free_cnt */
qi->free_cnt += count + 1;
pkvm_spin_unlock(&iommu->qi_lock);
}
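/*
 * Helpers that build a single invalidation descriptor (context-cache,
 * PASID-cache or IOTLB) and submit it through the pkvm-owned queue.
 */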
static void flush_context_cache(struct pkvm_iommu *iommu, u16 did,
u16 sid, u8 fm, u64 type)
{
struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) |
QI_CC_GRAN(type) | QI_CC_TYPE;
submit_qi(iommu, &desc, 1);
}
static void flush_pasid_cache(struct pkvm_iommu *iommu, u16 did,
u64 granu, u32 pasid)
{
struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0};
desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) |
QI_PC_GRAN(granu) | QI_PC_TYPE;
submit_qi(iommu, &desc, 1);
}
static void flush_iotlb(struct pkvm_iommu *iommu, u16 did, u64 addr,
unsigned int size_order, u64 type)
{
u8 dw = 0, dr = 0;
struct qi_desc desc = {.qw2 = 0, .qw3 = 0};
int ih = 0;
if (cap_write_drain(iommu->iommu.cap))
dw = 1;
if (cap_read_drain(iommu->iommu.cap))
dr = 1;
desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) |
QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE;
desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order);
submit_qi(iommu, &desc, 1);
}
static void set_root_table(struct pkvm_iommu *iommu)
{
u64 val = iommu->pgt.root_pa;
void __iomem *reg = iommu->iommu.reg;
u32 sts;
/* Set scalable mode */
if (ecap_smts(iommu->iommu.ecap))
val |= DMA_RTADDR_SMT;
writeq(val, reg + DMAR_RTADDR_REG);
/*
* The shadow root table provides identical remapping results compared
* with the previous guest root table, so it is allowed to switch while
* Translation Enable Status is still 1 according to IOMMU spec 6.6:
*
* "
* If software sets the root-table pointer while remapping hardware is
* active (TES=1 in Global Status register), software must ensure the
* structures referenced by the new root-table pointer provide identical
* remapping results as the structures referenced by the previous root-table
* pointer so that inflight requests are properly translated.
* "
*
* So TE does not need to be turned off before switching.
*/
writel(iommu->iommu.gcmd | DMA_GCMD_SRTP, reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts);
flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
if (ecap_smts(iommu->iommu.ecap))
flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
static void enable_translation(struct pkvm_iommu *iommu)
{
void __iomem *reg = iommu->iommu.reg;
u32 sts;
if (iommu->iommu.gcmd & DMA_GCMD_TE)
return;
iommu->iommu.gcmd |= DMA_GCMD_TE;
writel(iommu->iommu.gcmd, reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts);
}
static void initialize_viommu_reg(struct pkvm_iommu *iommu)
{
struct viommu_reg *vreg = &iommu->viommu.vreg;
void __iomem *reg_base = iommu->iommu.reg;
vreg->cap = readq(reg_base + DMAR_CAP_REG);
vreg->ecap = readq(reg_base + DMAR_ECAP_REG);
pkvm_update_iommu_virtual_caps(&vreg->cap, &vreg->ecap);
vreg->gsts = readl(reg_base + DMAR_GSTS_REG);
vreg->rta = readq(reg_base + DMAR_RTADDR_REG);
pkvm_dbg("%s: iommu phys reg 0x%llx cap 0x%llx ecap 0x%llx gsts 0x%x rta 0x%llx\n",
__func__, iommu->iommu.reg_phys, vreg->cap, vreg->ecap, vreg->gsts, vreg->rta);
/* Invalidation Queue regs are updated when the descriptor ring is created */
}
static int activate_iommu(struct pkvm_iommu *iommu)
{
unsigned long vaddr = 0, vaddr_end = IOMMU_MAX_VADDR;
int ret;
pkvm_dbg("%s: iommu%d\n", __func__, iommu->iommu.seq_id);
pkvm_spin_lock(&iommu->lock);
ret = initialize_iommu_pgt(iommu);
if (ret)
goto out;
initialize_viommu_reg(iommu);
ret = sync_shadow_id(iommu, vaddr, vaddr_end, 0);
if (ret)
goto out;
ret = create_qi_desc(iommu);
if (ret)
goto free_shadow;
set_root_table(iommu);
/*
* It is possible that some of the IOMMU units don't have memory
* remapping translation enabled by the host IOMMU driver during
* boot, so the pkvm IOMMU driver needs to make sure it is enabled
* to guarantee the IO isolation from the devices behind this
* IOMMU.
*/
enable_translation(iommu);
iommu->activated = true;
root_tbl_walk(iommu);
pkvm_spin_unlock(&iommu->lock);
return 0;
free_shadow:
free_shadow_id(iommu, vaddr, vaddr_end);
out:
pkvm_spin_unlock(&iommu->lock);
return ret;
}
static int context_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc)
{
u16 sid = QI_DESC_CC_SID(desc->qw0);
u16 did = ecap_smts(iommu->iommu.ecap) ? 0 : QI_DESC_CC_DID(desc->qw0);
u64 granu = QI_DESC_CC_GRANU(desc->qw0) << DMA_CCMD_INVL_GRANU_OFFSET;
unsigned long start, end;
int ret;
switch (granu) {
case DMA_CCMD_GLOBAL_INVL:
start = 0;
end = MAX_NUM_OF_ADDRESS_SPACE(iommu);
pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id);
ret = sync_shadow_id(iommu, start, end, 0);
break;
case DMA_CCMD_DOMAIN_INVL:
/*
* Domain-selective invalidation, which is processed by
* hardware as a global invalidation in scalable mode
* according to spec 6.5.2.1.
*/
start = 0;
end = MAX_NUM_OF_ADDRESS_SPACE(iommu);
pkvm_dbg("pkvm: %s: iommu%d: domain selective\n",
__func__, iommu->iommu.seq_id);
ret = sync_shadow_id(iommu, start, end, did);
break;
case DMA_CCMD_DEVICE_INVL:
if (ecap_smts(iommu->iommu.ecap)) {
start = (unsigned long)sid << DEVFN_SHIFT;
end = ((unsigned long)sid + 1) << DEVFN_SHIFT;
} else {
start = (unsigned long)sid << LM_DEVFN_SHIFT;
end = ((unsigned long)sid + 1) << LM_DEVFN_SHIFT;
}
pkvm_dbg("pkvm: %s: iommu%d: device selective sid 0x%x\n",
__func__, iommu->iommu.seq_id, sid);
ret = sync_shadow_id(iommu, start, end, did);
break;
default:
pkvm_err("pkvm: %s: iommu%d: invalidate granu %lld\n",
__func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET);
ret = -EINVAL;
break;
}
if (ret)
pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n",
__func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET, ret);
return ret;
}
static int pasid_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc)
{
int pasid = QI_DESC_PC_PASID(desc->qw0);
u16 did = QI_DESC_PC_DID(desc->qw0);
int granu = QI_DESC_PC_GRANU(desc->qw0);
unsigned long start, end;
int ret;
switch (granu) {
case QI_PC_ALL_PASIDS:
/*
* This is essentially a global invalidation, restricted
* to entries matching a specific did.
*/
pkvm_dbg("pkvm: %s: iommu%d: ALL_PASID did %d\n",
__func__, iommu->iommu.seq_id, did);
start = 0;
end = IOMMU_MAX_VADDR;
ret = sync_shadow_id(iommu, start, end, did);
break;
case QI_PC_PASID_SEL: {
/*
* Sync specific PASID entry for all contexts
*/
u64 bdf, end_bdf = 0x10000;
pkvm_dbg("pkvm: %s: iommu%d: PASID_SEL did %d pasid 0x%x\n",
__func__, iommu->iommu.seq_id, did, pasid);
for (bdf = 0; bdf < end_bdf; bdf++) {
start = (bdf << DEVFN_SHIFT) + pasid;
end = start + 1;
ret = sync_shadow_id(iommu, start, end, did);
if (ret)
break;
}
break;
}
case QI_PC_GLOBAL:
start = 0;
end = IOMMU_MAX_VADDR;
pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id);
ret = sync_shadow_id(iommu, start, end, 0);
break;
default:
pkvm_err("pkvm: %s: iommu%d: invalid granularity %d 0x%llx\n",
__func__, iommu->iommu.seq_id, granu, desc->qw0);
ret = -EINVAL;
break;
}
if (ret)
pkvm_err("pkvm: %s: iommu%d: granularity %d failed with ret %d\n",
__func__, iommu->iommu.seq_id, granu, ret);
return ret;
}
static int handle_descriptor(struct pkvm_iommu *iommu, struct qi_desc *desc)
{
int type = QI_DESC_TYPE(desc->qw0);
int ret = 0;
switch (type) {
/*
* TODO: is it necessary to intercept the
* PGRP_RESP & PSTRM_RESP?
*/
case QI_PGRP_RESP_TYPE:
case QI_PSTRM_RESP_TYPE:
case QI_IOTLB_TYPE:
case QI_DIOTLB_TYPE:
case QI_DEIOTLB_TYPE:
case QI_IEC_TYPE:
case QI_IWD_TYPE:
case QI_EIOTLB_TYPE:
break;
case QI_CC_TYPE:
ret = context_cache_invalidate(iommu, desc);
break;
case QI_PC_TYPE:
ret = pasid_cache_invalidate(iommu, desc);
break;
default:
pkvm_err("pkvm: %s: iommu%d: invalid type %d desc addr 0x%llx val 0x%llx\n",
__func__, iommu->iommu.seq_id, type, (u64)desc, desc->qw0);
ret = -EINVAL;
break;
}
return ret;
}
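/*
 * Forward descriptors that the host queued in its virtual invalidation
 * queue to the physical queue owned by pkvm, preserving their order. The
 * host's own wait descriptor is copied as-is so the host can keep polling
 * its status word directly.
 */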
static void handle_qi_submit(struct pkvm_iommu *iommu, void *vdesc, int vhead, int count)
{
struct pkvm_viommu *viommu = &iommu->viommu;
int vlen = IQ_DESC_LEN(viommu->iqa);
int vshift = IQ_DESC_SHIFT(viommu->iqa);
int len = IQ_DESC_LEN(iommu->piommu_iqa);
int shift = IQ_DESC_SHIFT(iommu->piommu_iqa);
struct q_inval *qi = &iommu->qi;
struct qi_desc *to, *from;
int required_cnt = count + 1, i;
pkvm_spin_lock(&iommu->qi_lock);
/*
* Detect if the free descriptor count is enough or not
*/
while (qi->free_cnt < required_cnt) {
u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift;
int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len;
int free_cnt = len - busy_cnt;
if (free_cnt >= required_cnt) {
qi->free_cnt = free_cnt;
break;
}
pkvm_spin_unlock(&iommu->qi_lock);
cpu_relax();
pkvm_spin_lock(&iommu->qi_lock);
}
for (i = 0; i < count; i++) {
from = vdesc + (((vhead + i) % vlen) << vshift);
to = qi->desc + (((qi->free_head + i) % len) << shift);
to->qw0 = from->qw0;
to->qw1 = from->qw1;
}
/*
* Reuse the desc_status from host so that host can poll
* the desc_status itself instead of waiting in pkvm.
*/
qi->free_cnt -= count;
qi->free_head = (qi->free_head + count) % len;
writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG);
pkvm_spin_unlock(&iommu->qi_lock);
}
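/*
 * Emulate a write to the invalidation queue tail register: walk the
 * descriptors the host just queued, sync the shadow page table for any
 * context-cache or PASID-cache invalidation, then forward the batch to the
 * physical queue. On error the host's wait descriptor status is set to
 * QI_ABORT so the host does not poll forever.
 */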
static int handle_qi_invalidation(struct pkvm_iommu *iommu, unsigned long val)
{
struct pkvm_viommu *viommu = &iommu->viommu;
u64 viommu_iqa = viommu->iqa;
struct qi_desc *wait_desc;
int len = IQ_DESC_LEN(viommu_iqa);
int shift = IQ_DESC_SHIFT(viommu_iqa);
int head = viommu->vreg.iq_head >> shift;
int count, i, ret = 0;
int *desc_status;
void *desc;
viommu->vreg.iq_tail = val;
desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(viommu_iqa));
count = ((val >> shift) + len - head) % len;
for (i = 0; i < count; i++) {
viommu->vreg.iq_head = ((head + i) % len) << shift;
ret = handle_descriptor(iommu, desc + viommu->vreg.iq_head);
if (ret)
break;
}
/* update iq_head */
viommu->vreg.iq_head = val;
if (likely(!ret)) {
/*
* Submit the descriptors to hardware. The desc_status
* will be taken care of by hardware.
*/
handle_qi_submit(iommu, desc, head, count);
} else {
pkvm_err("pkvm: %s: failed with ret %d\n", __func__, ret);
/*
* The descriptor seems invalid. Mark the desc_status as
* QI_ABORT to make sure host driver won't be blocked.
*/
wait_desc = desc + (((head + count - 1) % len) << shift);
if (QI_DESC_TYPE(wait_desc->qw0) == QI_IWD_TYPE) {
desc_status = pkvm_phys_to_virt(wait_desc->qw1);
WRITE_ONCE(*desc_status, QI_ABORT);
}
}
return ret;
}
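/*
 * Emulate the Translation Enable bit of the Global Command register by
 * syncing (enable) or freeing (disable) the shadow page table, followed by
 * global context/PASID/IOTLB flushes. The physical TE bit itself is left
 * enabled by pkvm.
 */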
static void handle_gcmd_te(struct pkvm_iommu *iommu, bool en)
{
unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu);
struct pkvm_viommu *viommu = &iommu->viommu;
if (en) {
viommu->vreg.gsts |= DMA_GSTS_TES;
/*
* Sync shadow page table to emulate Translation enable.
*/
if (sync_shadow_id(iommu, vaddr, vaddr_end, 0))
return;
pkvm_dbg("pkvm: %s: enable TE\n", __func__);
goto out;
}
/*
* Free the shadow entries to emulate Translation disable.
*
* Translation is not really disabled, as protection
* against the device is still needed.
*/
free_shadow_id(iommu, vaddr, vaddr_end);
viommu->vreg.gsts &= ~DMA_GSTS_TES;
pkvm_dbg("pkvm: %s: disable TE\n", __func__);
out:
flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
if (ecap_smts(iommu->iommu.ecap))
flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
root_tbl_walk(iommu);
}
static void handle_gcmd_srtp(struct pkvm_iommu *iommu)
{
struct viommu_reg *vreg = &iommu->viommu.vreg;
struct pkvm_pgtable *vpgt = &iommu->viommu.pgt;
vreg->gsts &= ~DMA_GSTS_RTPS;
/* Set the root table phys address from vreg */
vpgt->root_pa = vreg->rta & VTD_PAGE_MASK;
pkvm_dbg("pkvm: %s: set SRTP val 0x%llx\n", __func__, vreg->rta);
if (vreg->gsts & DMA_GSTS_TES) {
unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu);
/* TE is already enabled, sync shadow */
if (sync_shadow_id(iommu, vaddr, vaddr_end, 0))
return;
flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
if (ecap_smts(iommu->iommu.ecap))
flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
vreg->gsts |= DMA_GSTS_RTPS;
root_tbl_walk(iommu);
}
static void handle_gcmd_qie(struct pkvm_iommu *iommu, bool en)
{
struct viommu_reg *vreg = &iommu->viommu.vreg;
if (en) {
if (vreg->iq_tail != 0) {
pkvm_err("pkvm: Queue invalidation descriptor tail is not zero\n");
return;
}
/* Update the iqa from vreg */
iommu->viommu.iqa = vreg->iqa;
vreg->iq_head = 0;
vreg->gsts |= DMA_GSTS_QIES;
pkvm_dbg("pkvm: %s: enabled QI\n", __func__);
return;
}
if (vreg->iq_head != vreg->iq_tail) {
pkvm_err("pkvm: Queue invalidation descriptor is not empty yet\n");
return;
}
vreg->iq_head = 0;
vreg->gsts &= ~DMA_GSTS_QIES;
pkvm_dbg("pkvm: %s: disabled QI\n", __func__);
}
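/*
 * Handle the Global Command bits that pkvm does not emulate: bits in
 * DMAR_GCMD_DIRECT are written through to hardware (unless they fall into
 * the protected set), waiting for the corresponding status bit and keeping
 * the virtual Global Status register in sync.
 */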
static void handle_gcmd_direct(struct pkvm_iommu *iommu, u32 val)
{
struct viommu_reg *vreg = &iommu->viommu.vreg;
unsigned long changed = ((vreg->gsts ^ val) & DMAR_GCMD_DIRECT) &
DMAR_GSTS_EN_BITS;
unsigned long set = (val & DMAR_GCMD_DIRECT) & ~DMAR_GSTS_EN_BITS;
u32 cmd, gcmd, sts;
int bit;
if ((changed | set) & DMAR_GCMD_PROTECTED) {
pkvm_dbg("pkvm:%s touching protected bits changed 0x%lx set 0x%lx\n",
__func__, changed, set);
return;
}
if (changed) {
pkvm_dbg("pkvm: %s: changed 0x%lx\n", __func__, changed);
gcmd = READ_ONCE(iommu->iommu.gcmd);
for_each_set_bit(bit, &changed, BITS_PER_BYTE * sizeof(vreg->gsts)) {
cmd = 1 << bit;
if (val & cmd) {
/* enable */
gcmd |= cmd;
writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG,
readl, (sts & cmd), sts);
vreg->gsts |= cmd;
pkvm_dbg("pkvm: %s: enable cmd bit %d\n", __func__, bit);
} else {
/* disable */
gcmd &= ~cmd;
writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG,
readl, !(sts & cmd), sts);
vreg->gsts &= ~cmd;
pkvm_dbg("pkvm: %s: disable cmd bit %d\n", __func__, bit);
}
}
WRITE_ONCE(iommu->iommu.gcmd, gcmd);
}
if (set) {
pkvm_dbg("pkvm: %s: set 0x%lx\n", __func__, set);
gcmd = READ_ONCE(iommu->iommu.gcmd);
for_each_set_bit(bit, &set, BITS_PER_BYTE * sizeof(vreg->gsts)) {
cmd = 1 << bit;
vreg->gsts &= ~cmd;
writel(gcmd | cmd, iommu->iommu.reg + DMAR_GCMD_REG);
PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG,
readl, (sts & cmd), sts);
vreg->gsts |= cmd;
pkvm_dbg("pkvm: %s: set cmd bit %d\n", __func__, bit);
}
}
}
static void handle_global_cmd(struct pkvm_iommu *iommu, u32 val)
{
u32 changed = iommu->viommu.vreg.gsts ^ val;
pkvm_dbg("pkvm: iommu%d: handle gcmd val 0x%x gsts 0x%x changed 0x%x\n",
iommu->iommu.seq_id, val, iommu->viommu.vreg.gsts, changed);
if (changed & DMA_GCMD_TE)
handle_gcmd_te(iommu, !!(val & DMA_GCMD_TE));
if (val & DMA_GCMD_SRTP)
handle_gcmd_srtp(iommu);
if (changed & DMA_GCMD_QIE)
handle_gcmd_qie(iommu, !!(val & DMA_GCMD_QIE));
handle_gcmd_direct(iommu, val);
}
static struct pkvm_iommu *find_iommu_by_reg_phys(unsigned long phys)
{
struct pkvm_iommu *iommu;
for_each_valid_iommu(iommu) {
if ((phys >= iommu->iommu.reg_phys) &&
(phys < (iommu->iommu.reg_phys + iommu->iommu.reg_size)))
return iommu;
}
return NULL;
}
static unsigned long direct_access_iommu_mmio(struct pkvm_iommu *iommu,
bool is_read, int len,
unsigned long phys,
unsigned long val)
{
unsigned long offset = phys - iommu->iommu.reg_phys;
void *reg = iommu->iommu.reg + offset;
unsigned long ret = 0;
switch (len) {
case 4:
if (is_read)
ret = (unsigned long)readl(reg);
else
writel((u32)val, reg);
break;
case 8:
if (is_read)
ret = (unsigned long)readq(reg);
else
writeq((u64)val, reg);
break;
default:
pkvm_err("%s: %s: unsupported len %d\n", __func__,
is_read ? "read" : "write", len);
break;
}
return ret;
}
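/*
 * Emulate the trapped IOMMU registers (CAP/ECAP/GCMD/GSTS/RTADDR/IQ*) using
 * the virtual register state; anything else is passed straight through to
 * hardware.
 */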
static unsigned long access_iommu_mmio(struct pkvm_iommu *iommu, bool is_read,
int len, unsigned long phys,
unsigned long val)
{
struct pkvm_viommu *viommu = &iommu->viommu;
unsigned long offset = phys - iommu->iommu.reg_phys;
unsigned long ret = 0;
/* pkvm IOMMU driver is not activated yet, so directly access MMIO */
if (unlikely(!iommu->activated))
return direct_access_iommu_mmio(iommu, is_read, len, phys, val);
/* Only need to emulate part of the MMIO */
switch (offset) {
case DMAR_CAP_REG:
if (is_read)
ret = viommu->vreg.cap;
break;
case DMAR_ECAP_REG:
if (is_read)
ret = viommu->vreg.ecap;
break;
case DMAR_GCMD_REG:
if (is_read)
ret = 0;
else
handle_global_cmd(iommu, val);
break;
case DMAR_GSTS_REG:
if (is_read)
ret = viommu->vreg.gsts;
break;
case DMAR_RTADDR_REG:
if (is_read)
ret = viommu->vreg.rta;
else
viommu->vreg.rta = val;
break;
case DMAR_IQA_REG:
if (is_read)
ret = viommu->vreg.iqa;
else
viommu->vreg.iqa = val;
break;
case DMAR_IQH_REG:
if (is_read)
ret = viommu->vreg.iq_head;
break;
case DMAR_IQT_REG:
if (is_read)
ret = viommu->vreg.iq_tail;
else {
if (viommu->vreg.gsts & DMA_GSTS_QIES)
ret = handle_qi_invalidation(iommu, val);
else
viommu->vreg.iq_tail = val;
}
break;
default:
/* Non-emulated MMIO accesses go directly to hardware */
ret = direct_access_iommu_mmio(iommu, is_read, len, phys, val);
break;
}
return ret;
}
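/*
 * Entry point for trapped host MMIO accesses to an IOMMU register block:
 * locate the IOMMU by register physical address and emulate the access
 * under the per-IOMMU lock.
 */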
unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long phys, unsigned long val)
{
struct pkvm_iommu *pkvm_iommu = find_iommu_by_reg_phys(phys);
unsigned long ret;
if (!pkvm_iommu) {
pkvm_err("%s: cannot find pkvm iommu for reg 0x%lx\n",
__func__, phys);
return 0;
}
pkvm_spin_lock(&pkvm_iommu->lock);
ret = access_iommu_mmio(pkvm_iommu, is_read, len, phys, val);
pkvm_spin_unlock(&pkvm_iommu->lock);
return ret;
}
int pkvm_activate_iommu(void)
{
struct pkvm_iommu *iommu;
int ret = 0;
for_each_valid_iommu(iommu) {
ret = activate_iommu(iommu);
if (ret)
return ret;
}
return 0;
}
bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end)
{
struct pkvm_iommu *iommu;
for_each_valid_iommu(iommu) {
if (end < iommu->iommu.reg_phys ||
start > (iommu->iommu.reg_phys + iommu->iommu.reg_size - 1))
continue;
return true;
}
return false;
}
static struct pkvm_iommu *bdf_pasid_to_iommu(u16 bdf, u32 pasid)
{
struct pkvm_iommu *iommu, *find = NULL;
struct pkvm_ptdev *p;
for_each_valid_iommu(iommu) {
pkvm_spin_lock(&iommu->lock);
list_for_each_entry(p, &iommu->ptdev_head, iommu_node) {
if (match_ptdev(p, bdf, pasid)) {
find = iommu;
break;
}
}
pkvm_spin_unlock(&iommu->lock);
if (find)
break;
}
return find;
}
/*
* pkvm_iommu_sync() - Sync IOMMU context/pasid entry according to a ptdev
*
* @bdf/pasid: identify the IOMMU page table entry that needs to be synced.
*/
int pkvm_iommu_sync(u16 bdf, u32 pasid)
{
struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid);
unsigned long id_addr, id_addr_end;
struct pkvm_ptdev *ptdev;
u16 old_did;
int ret;
/*
* TODO:
* Currently assume that the bdf/pasid has been synced
* before so that the IOMMU can be found. If it has not,
* the iommu pointer will be NULL. To handle this case,
* the pKVM IOMMU driver needs to check the DMAR to know
* which IOMMU should be used for this bdf/pasid.
*/
if (!iommu)
return -ENODEV;
ptdev = pkvm_get_ptdev(bdf, pasid);
if (!ptdev)
return -ENODEV;
old_did = ptdev->did;
if (ecap_smts(iommu->iommu.ecap)) {
id_addr = ((unsigned long)bdf << DEVFN_SHIFT) |
((unsigned long)pasid & ((1UL << MAX_NR_PASID_BITS) - 1));
id_addr_end = id_addr + 1;
} else {
pkvm_err("%s: No support for legacy IOMMU.\n", __func__);
ret = -EOPNOTSUPP;
goto out;
}
pkvm_spin_lock(&iommu->lock);
ret = sync_shadow_id(iommu, id_addr, id_addr_end, 0);
if (!ret) {
if (old_did != ptdev->did) {
/* Flush pasid cache and IOTLB for the valid old_did */
if (ecap_smts(iommu->iommu.ecap))
flush_pasid_cache(iommu, old_did, QI_PC_PASID_SEL, pasid);
flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH);
}
/* Flush pasid cache and IOTLB to make sure no stale TLB for the new did */
if (ecap_smts(iommu->iommu.ecap))
flush_pasid_cache(iommu, ptdev->did, QI_PC_PASID_SEL, pasid);
flush_iotlb(iommu, ptdev->did, 0, 0, DMA_TLB_DSI_FLUSH);
}
pkvm_spin_unlock(&iommu->lock);
out:
pkvm_put_ptdev(ptdev);
return ret;
}