| /* |
| * SPDX-License-Identifier: GPL-2.0 |
| * Copyright (C) 2022 Intel Corporation |
| */ |
| #include <linux/intel-iommu.h> |
| #include <pkvm_spinlock.h> |
| #include <pkvm.h> |
| #include <gfp.h> |
| #include "pkvm_hyp.h" |
| #include "memory.h" |
| #include "mmu.h" |
| #include "ept.h" |
| #include "pgtable.h" |
| #include "iommu_internal.h" |
| #include "debug.h" |
| #include "ptdev.h" |
| |
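| /* |
| * Iterate over all IOMMU slots, skipping entries whose register base |
| * was never discovered (reg_phys == 0). |
| */ |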
| #define for_each_valid_iommu(p) \ |
| for ((p) = iommus; (p) < iommus + PKVM_MAX_IOMMU_NUM; (p)++) \ |
| if (!(p) || !(p)->iommu.reg_phys) { \ |
| continue; \ |
| } else |
| |
| static struct pkvm_iommu iommus[PKVM_MAX_IOMMU_NUM]; |
| |
| static struct pkvm_pool iommu_pool; |
| |
| /* |
| * Guest page table walking parameter. |
| * The pkvm IOMMU driver walks the guest page table when syncing it |
| * with the shadow page table. |
| */ |
| struct pgt_sync_walk_data { |
| struct pkvm_iommu *iommu; |
| /* |
| * Holds the shadow page table physical address used to |
| * sync shadow entries at each page table level. |
| */ |
| u64 shadow_pa[IOMMU_SM_LEVEL_NUM]; |
| /* |
| * When set to a non-zero did value, only the shadow page |
| * table entries matching this did are synced. |
| */ |
| u16 did; |
| }; |
| |
| #define DEFINE_PGT_SYNC_WALK_DATA(name, _iommu, domain_id) \ |
| struct pgt_sync_walk_data (name) = { \ |
| .iommu = (_iommu), \ |
| .shadow_pa = {0}, \ |
| .did = (domain_id), \ |
| } |
| |
| /* |
| * Used to configure a shadow page table entry at the |
| * root/context/pasid level. |
| */ |
| struct pgt_sync_data { |
| union { |
| u64 root_entry; |
| struct context_entry ct_entry; |
| struct pasid_dir_entry pd_entry; |
| struct pasid_entry p_entry; |
| }; |
| void *guest_ptep; |
| void *shadow_ptep; |
| int level; |
| u64 iommu_ecap; |
| u64 shadow_pa; |
| struct pkvm_pgtable *spgt; |
| unsigned long vaddr; |
| }; |
| |
| static inline void *iommu_zalloc_pages(size_t size) |
| { |
| return pkvm_alloc_pages(&iommu_pool, get_order(size)); |
| } |
| |
| static void *iommu_zalloc_page(void) |
| { |
| return pkvm_alloc_pages(&iommu_pool, 0); |
| } |
| |
| static void iommu_get_page(void *vaddr) |
| { |
| pkvm_get_page(&iommu_pool, vaddr); |
| } |
| |
| static void iommu_put_page(void *vaddr) |
| { |
| pkvm_put_page(&iommu_pool, vaddr); |
| } |
| |
| static void iommu_flush_cache(void *ptep, unsigned int size) |
| { |
| pkvm_clflush_cache_range(ptep, size); |
| } |
| |
| static struct pkvm_mm_ops viommu_mm_ops = { |
| .phys_to_virt = host_gpa2hva, |
| }; |
| |
| static struct pkvm_mm_ops iommu_pw_coherency_mm_ops = { |
| .phys_to_virt = pkvm_phys_to_virt, |
| .virt_to_phys = pkvm_virt_to_phys, |
| .zalloc_page = iommu_zalloc_page, |
| .get_page = iommu_get_page, |
| .put_page = iommu_put_page, |
| .page_count = pkvm_page_count, |
| }; |
| |
| static struct pkvm_mm_ops iommu_pw_noncoherency_mm_ops = { |
| .phys_to_virt = pkvm_phys_to_virt, |
| .virt_to_phys = pkvm_virt_to_phys, |
| .zalloc_page = iommu_zalloc_page, |
| .get_page = iommu_get_page, |
| .put_page = iommu_put_page, |
| .page_count = pkvm_page_count, |
| .flush_cache = iommu_flush_cache, |
| }; |
| |
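| /* |
| * Helpers shared by the legacy- and scalable-mode identity (root/context/ |
| * pasid) table walkers: bit 0 of the first quadword of an entry is the |
| * Present bit, and VTD_PAGE_MASK extracts the physical address of the |
| * table the entry points to. |
| */ |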
| static bool iommu_id_entry_present(void *ptep) |
| { |
| u64 val; |
| |
| val = *(u64 *)ptep; |
| return !!(val & 1); |
| } |
| |
| static unsigned long iommu_id_entry_to_phys(void *ptep) |
| { |
| u64 val = *(u64 *)ptep; |
| |
| return val & VTD_PAGE_MASK; |
| } |
| |
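| /* |
| * The walker's "vaddr" encodes which entry is being visited: the bus number |
| * in the highest bits (above SM_BUS_SHIFT), the devfn below it (above |
| * DEVFN_SHIFT) and the PASID in the lowest bits, so each table level simply |
| * extracts its own index field. |
| */ |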
| static int iommu_sm_id_entry_to_index(unsigned long vaddr, int level) |
| { |
| switch (level) { |
| case IOMMU_PASID_TABLE: |
| return vaddr & (BIT(PASIDDIR_BITS) - 1); |
| case IOMMU_PASID_DIR: |
| return (vaddr >> PASIDDIR_SHIFT) & (BIT(PASIDDIR_BITS) - 1); |
| case IOMMU_SM_CONTEXT: |
| return (vaddr >> DEVFN_SHIFT) & (BIT(SM_DEVFN_BITS) - 1); |
| case IOMMU_SM_ROOT: |
| return (vaddr >> SM_BUS_SHIFT) & (BIT(SM_BUS_BITS) - 1); |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static bool iommu_id_entry_is_leaf(void *ptep, int level) |
| { |
| if (LAST_LEVEL(level) || |
| !iommu_id_entry_present(ptep)) |
| return true; |
| |
| return false; |
| } |
| |
| static int iommu_sm_id_level_entry_size(int level) |
| { |
| switch (level) { |
| case IOMMU_PASID_TABLE: |
| return sizeof(struct pasid_entry); |
| case IOMMU_PASID_DIR: |
| return sizeof(struct pasid_dir_entry); |
| case IOMMU_SM_CONTEXT: |
| /* scalable mode uses 32-byte context entries */ |
| return sizeof(struct context_entry) * 2; |
| case IOMMU_SM_ROOT: |
| return sizeof(u64); |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static int iommu_sm_id_level_to_entries(int level) |
| { |
| switch (level) { |
| case IOMMU_PASID_TABLE: |
| return 1 << PASIDTAB_BITS; |
| case IOMMU_PASID_DIR: |
| return 1 << PASIDDIR_BITS; |
| case IOMMU_SM_CONTEXT: |
| return 1 << SM_DEVFN_BITS; |
| case IOMMU_SM_ROOT: |
| return 1 << SM_BUS_BITS; |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static unsigned long iommu_sm_id_level_to_size(int level) |
| { |
| switch (level) { |
| case IOMMU_PASID_TABLE: |
| return 1; |
| case IOMMU_PASID_DIR: |
| return 1 << PASIDDIR_SHIFT; |
| case IOMMU_SM_CONTEXT: |
| return 1 << DEVFN_SHIFT; |
| case IOMMU_SM_ROOT: |
| return 1 << SM_BUS_SHIFT; |
| default: |
| break; |
| } |
| |
| return 0; |
| } |
| |
| struct pkvm_pgtable_ops iommu_sm_id_ops = { |
| .pgt_entry_present = iommu_id_entry_present, |
| .pgt_entry_to_phys = iommu_id_entry_to_phys, |
| .pgt_entry_to_index = iommu_sm_id_entry_to_index, |
| .pgt_entry_is_leaf = iommu_id_entry_is_leaf, |
| .pgt_level_entry_size = iommu_sm_id_level_entry_size, |
| .pgt_level_to_entries = iommu_sm_id_level_to_entries, |
| .pgt_level_to_size = iommu_sm_id_level_to_size, |
| }; |
| |
| static int iommu_lm_id_entry_to_index(unsigned long vaddr, int level) |
| { |
| switch (level) { |
| case IOMMU_LM_CONTEXT: |
| return (vaddr >> LM_DEVFN_SHIFT) & (BIT(LM_DEVFN_BITS) - 1); |
| case IOMMU_LM_ROOT: |
| return (vaddr >> LM_BUS_SHIFT) & (BIT(LM_BUS_BITS) - 1); |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static int iommu_lm_id_level_entry_size(int level) |
| { |
| switch (level) { |
| case IOMMU_LM_CONTEXT: |
| return sizeof(struct context_entry); |
| case IOMMU_LM_ROOT: |
| return sizeof(struct root_entry); |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static int iommu_lm_id_level_to_entries(int level) |
| { |
| switch (level) { |
| case IOMMU_LM_CONTEXT: |
| return 1 << LM_DEVFN_BITS; |
| case IOMMU_LM_ROOT: |
| return 1 << LM_BUS_BITS; |
| default: |
| break; |
| } |
| |
| return -EINVAL; |
| } |
| |
| static unsigned long iommu_lm_id_level_to_size(int level) |
| { |
| switch (level) { |
| case IOMMU_LM_CONTEXT: |
| return 1 << LM_DEVFN_SHIFT; |
| case IOMMU_LM_ROOT: |
| return 1 << LM_BUS_SHIFT; |
| default: |
| break; |
| } |
| |
| return 0; |
| } |
| |
| struct pkvm_pgtable_ops iommu_lm_id_ops = { |
| .pgt_entry_present = iommu_id_entry_present, |
| .pgt_entry_to_phys = iommu_id_entry_to_phys, |
| .pgt_entry_to_index = iommu_lm_id_entry_to_index, |
| .pgt_entry_is_leaf = iommu_id_entry_is_leaf, |
| .pgt_level_entry_size = iommu_lm_id_level_entry_size, |
| .pgt_level_to_entries = iommu_lm_id_level_to_entries, |
| .pgt_level_to_size = iommu_lm_id_level_to_size, |
| }; |
| |
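| /* Walk [vaddr, vaddr_end); a zero root_pa means there is no table to walk */ |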
| static int iommu_pgtable_walk(struct pkvm_pgtable *pgt, unsigned long vaddr, |
| unsigned long vaddr_end, struct pkvm_pgtable_walker *walker) |
| { |
| if (!pgt->root_pa) |
| return 0; |
| |
| return pgtable_walk(pgt, vaddr, vaddr_end - vaddr, false, walker); |
| } |
| |
| /* Mark the root entry present when shadow_pa is valid, otherwise clear it */ |
| static bool sync_root_entry(struct pgt_sync_data *sdata) |
| { |
| u64 *sre = sdata->shadow_ptep; |
| u64 sre_val = sdata->shadow_pa ? (sdata->shadow_pa | 1) : 0; |
| |
| if (READ_ONCE(*sre) != sre_val) { |
| WRITE_ONCE(*sre, sre_val); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* Sync the context entry when guest_ptep and shadow_pa are valid, otherwise clear it */ |
| static bool sync_shadow_context_entry(struct pgt_sync_data *sdata) |
| { |
| struct context_entry *shadow_ce = sdata->shadow_ptep, tmp = {0}; |
| struct context_entry *guest_ce = sdata->guest_ptep; |
| bool updated = false; |
| u8 aw; |
| |
| if (sdata->guest_ptep && sdata->shadow_pa) { |
| tmp.hi = guest_ce->hi; |
| tmp.lo = sdata->shadow_pa | (guest_ce->lo & 0xfff); |
| |
| if (ecap_smts(sdata->iommu_ecap)) |
| /* Clear DTE to make sure device TLB is disabled for security */ |
| context_clear_dte(&tmp); |
| else { |
| /* |
| * Set translation type to CONTEXT_TT_MULTI_LEVEL to ensure using |
| * 2nd-level translation and to disable device TLB for security. |
| */ |
| context_lm_set_tt(&tmp, CONTEXT_TT_MULTI_LEVEL); |
| |
| /* |
| * For now, set the address width only when host IOMMU driver |
| * is using pass-through mode. |
| * FIXME: Once shadow 2nd-stage page tables are supported by pKVM, |
| * set the address width in all cases. |
| */ |
| if (sdata->shadow_pa == pkvm_hyp->host_vm.ept->root_pa) { |
| aw = (pkvm_hyp->ept_iommu_pgt_level == 4) ? 2 : 3; |
| context_lm_set_aw(&tmp, aw); |
| } |
| } |
| } |
| |
| if (READ_ONCE(shadow_ce->hi) != tmp.hi) { |
| WRITE_ONCE(shadow_ce->hi, tmp.hi); |
| updated = true; |
| } |
| |
| if (READ_ONCE(shadow_ce->lo) != tmp.lo) { |
| WRITE_ONCE(shadow_ce->lo, tmp.lo); |
| updated = true; |
| } |
| |
| return updated; |
| } |
| |
| /* Sync the pasid dir entry when guest_ptep and shadow_pa are valid, otherwise clear it */ |
| static bool sync_shadow_pasid_dir_entry(struct pgt_sync_data *sdata) |
| { |
| struct pasid_dir_entry *shadow_pde = sdata->shadow_ptep; |
| u64 val = 0; |
| |
| if (sdata->guest_ptep && sdata->shadow_pa) { |
| struct pasid_dir_entry *guest_pde = sdata->guest_ptep; |
| |
| val = guest_pde->val & (PASID_PTE_FPD | PASID_PTE_PRESENT); |
| val |= sdata->shadow_pa; |
| } |
| |
| if (READ_ONCE(shadow_pde->val) != val) { |
| WRITE_ONCE(shadow_pde->val, val); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static int iommu_audit_did(struct pkvm_iommu *iommu, u16 did, int shadow_vm_handle) |
| { |
| struct pkvm_ptdev *tmp; |
| int ret = 0; |
| |
| list_for_each_entry(tmp, &iommu->ptdev_head, iommu_node) { |
| if (tmp->shadow_vm_handle != shadow_vm_handle) { |
| if (tmp->did == did) { |
| /* |
| * Devices that belong to different VMs but sit behind |
| * the same IOMMU cannot share the same did. |
| */ |
| ret = -EPERM; |
| break; |
| } |
| } |
| } |
| |
| return ret; |
| } |
| |
| static struct pkvm_ptdev *iommu_find_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) |
| { |
| struct pkvm_ptdev *p; |
| |
| list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { |
| if (match_ptdev(p, bdf, pasid)) |
| return p; |
| } |
| |
| return NULL; |
| } |
| |
| static struct pkvm_ptdev *iommu_add_ptdev(struct pkvm_iommu *iommu, u16 bdf, u32 pasid) |
| { |
| struct pkvm_ptdev *ptdev = pkvm_get_ptdev(bdf, pasid); |
| |
| if (!ptdev) |
| return NULL; |
| |
| list_add_tail(&ptdev->iommu_node, &iommu->ptdev_head); |
| return ptdev; |
| } |
| |
| static void iommu_del_ptdev(struct pkvm_iommu *iommu, struct pkvm_ptdev *ptdev) |
| { |
| list_del_init(&ptdev->iommu_node); |
| pkvm_put_ptdev(ptdev); |
| } |
| |
| /* Sync the pasid table entry when guest_ptep is valid, otherwise clear it */ |
| static bool sync_shadow_pasid_table_entry(struct pgt_sync_data *sdata) |
| { |
| u16 bdf = sdata->vaddr >> DEVFN_SHIFT; |
| u32 pasid = sdata->vaddr & ((1UL << MAX_NR_PASID_BITS) - 1); |
| struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(sdata->spgt); |
| struct pkvm_ptdev *ptdev = iommu_find_ptdev(iommu, bdf, pasid); |
| struct pasid_entry *shadow_pte = sdata->shadow_ptep, tmp_pte = {0}; |
| struct pasid_entry *guest_pte; |
| bool synced = false; |
| u64 type, aw; |
| |
| if (!ptdev) { |
| ptdev = iommu_add_ptdev(iommu, bdf, pasid); |
| if (!ptdev) |
| return false; |
| } |
| |
| if (!sdata->guest_ptep) { |
| if (pasid_pte_is_present(shadow_pte)) { |
| /* |
| * Making a pasid entry non-present requires removing the |
| * corresponding ptdev from the IOMMU: reset the ptdev's |
| * vpgt/did and delete the ptdev from this iommu. |
| */ |
| pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL); |
| pkvm_setup_ptdev_did(ptdev, 0); |
| iommu_del_ptdev(iommu, ptdev); |
| |
| synced = pasid_copy_entry(shadow_pte, &tmp_pte); |
| } |
| return synced; |
| } |
| |
| guest_pte = sdata->guest_ptep; |
| type = pasid_pte_get_pgtt(guest_pte); |
| if (type == PASID_ENTRY_PGTT_FL_ONLY) { |
| struct pkvm_pgtable_cap cap; |
| |
| if (ptdev_attached_to_vm(ptdev)) |
| /* |
| * For an attached ptdev, use second-level only mode with |
| * ptdev->pgt so that the translation is fully controlled |
| * by pkvm. |
| */ |
| type = PASID_ENTRY_PGTT_SL_ONLY; |
| else |
| /* |
| * For any other ptdev, the pkvm IOMMU uses nested translation |
| * to add one more translation layer, the primary VM's EPT, |
| * to guarantee the protection. |
| */ |
| type = PASID_ENTRY_PGTT_NESTED; |
| |
| /* ptdev vpgt can be initialized with flptr */ |
| cap.level = pasid_get_flpm(guest_pte) == 0 ? 4 : 5; |
| cap.allowed_pgsz = pkvm_hyp->mmu_cap.allowed_pgsz; |
| pkvm_setup_ptdev_vpgt(ptdev, pasid_get_flptr(guest_pte), |
| &viommu_mm_ops, &mmu_ops, &cap); |
| } else if (type == PASID_ENTRY_PGTT_PT) { |
| /* |
| * When the host IOMMU driver is using pass-through mode, the pkvm |
| * IOMMU actually uses second-level only translation to guarantee |
| * the protection. This second level is also the EPT. |
| */ |
| type = PASID_ENTRY_PGTT_SL_ONLY; |
| } else { |
| /* |
| * The host IOMMU driver in a pkvm-enabled kernel is already |
| * configured to use first-level only or pass-through mode, so no |
| * other mode should show up here. In case it does, reset the ptdev |
| * vpgt/did while keeping the ptdev linked to this IOMMU, and clear |
| * the shadow entry so the mode is not supported. |
| */ |
| pkvm_setup_ptdev_vpgt(ptdev, 0, NULL, NULL, NULL); |
| pkvm_setup_ptdev_did(ptdev, 0); |
| |
| pkvm_err("pkvm: unsupported pasid type %lld\n", type); |
| |
| return pasid_copy_entry(shadow_pte, &tmp_pte); |
| } |
| |
| pkvm_setup_ptdev_did(ptdev, pasid_get_domain_id(guest_pte)); |
| |
| if (iommu_audit_did(iommu, ptdev->did, ptdev->shadow_vm_handle)) |
| /* |
| * It is possible that this ptdev is about to be attached to a |
| * protected VM, so the primary VM allocates the same did used by |
| * that protected VM and performs a TLB flush. At this moment the |
| * ptdev is not attached yet, so the audit fails. In this case, skip |
| * the sync of this pasid table entry; it will be synced again when |
| * the ptdev is attached. |
| * |
| * It is also possible that this ptdev has just been detached from a |
| * protected VM but still uses its previous did because the primary |
| * VM has not reconfigured it yet. The did of this ptdev is then the |
| * same as the did used by other ptdevs that are not detached yet. |
| * In this case, skip the sync of this pasid table entry; it will be |
| * synced again when the primary VM reconfigures this ptdev. |
| * |
| * If neither case applies and the primary VM does this on purpose, |
| * also skip the sync of the pasid table entry to guarantee the |
| * isolation. |
| */ |
| return false; |
| |
| /* |
| * ptdev->pgt will be used as the second-level translation table, |
| * which must be in EPT format. |
| */ |
| if (!is_pgt_ops_ept(ptdev->pgt)) |
| return false; |
| |
| /* |
| * Copy all the bits from guest_pte. The translation type is |
| * re-configured below, so although some bits are inherited from |
| * guest_pte, hardware will ignore the ones that do not apply to |
| * the new translation type. |
| */ |
| memcpy(&tmp_pte, guest_pte, sizeof(struct pasid_entry)); |
| |
| pasid_set_page_snoop(&tmp_pte, !!ecap_smpwc(sdata->iommu_ecap)); |
| if (ecap_sc_support(sdata->iommu_ecap)) |
| pasid_set_pgsnp(&tmp_pte); |
| |
| /* |
| * Modify the second-level related bits: |
| * - set PGTT/SLPTR/AW |
| * - clear SLADE/SLEE |
| * - reuse FPD/P |
| */ |
| pasid_set_translation_type(&tmp_pte, type); |
| pasid_set_slptr(&tmp_pte, ptdev->pgt->root_pa); |
| aw = (ptdev->pgt->level == 4) ? 2 : 3; |
| pasid_set_address_width(&tmp_pte, aw); |
| pasid_set_ssade(&tmp_pte, 0); |
| pasid_set_ssee(&tmp_pte, 0); |
| |
| return pasid_copy_entry(shadow_pte, &tmp_pte); |
| } |
| |
| static bool iommu_id_sync_entry(struct pgt_sync_data *sdata) |
| { |
| bool ret = false; |
| struct pkvm_pgtable *spgt = sdata->spgt; |
| |
| if (ecap_smts(sdata->iommu_ecap)) { |
| switch (sdata->level) { |
| case IOMMU_PASID_TABLE: |
| ret = sync_shadow_pasid_table_entry(sdata); |
| break; |
| case IOMMU_PASID_DIR: |
| ret = sync_shadow_pasid_dir_entry(sdata); |
| break; |
| case IOMMU_SM_CONTEXT: |
| ret = sync_shadow_context_entry(sdata); |
| break; |
| case IOMMU_SM_ROOT: |
| ret = sync_root_entry(sdata); |
| break; |
| default: |
| break; |
| } |
| } else { |
| switch (sdata->level) { |
| case IOMMU_LM_CONTEXT: |
| ret = sync_shadow_context_entry(sdata); |
| break; |
| case IOMMU_LM_ROOT: |
| ret = sync_root_entry(sdata); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| if (ret) { |
| int entry_size = spgt->pgt_ops->pgt_level_entry_size(sdata->level); |
| |
| if (entry_size && spgt->mm_ops->flush_cache) |
| spgt->mm_ops->flush_cache(sdata->shadow_ptep, entry_size); |
| } |
| |
| return ret; |
| } |
| |
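| /* |
| * Set up the two identity page tables of an IOMMU unit: the virtual one |
| * (viommu.pgt) rooted at the guest's DMAR_RTADDR_REG value and accessed |
| * through host GPA, and the shadow one (pgt) allocated from the pkvm pool |
| * and later programmed into the hardware root table pointer. |
| */ |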
| static int initialize_iommu_pgt(struct pkvm_iommu *iommu) |
| { |
| struct pkvm_pgtable *pgt = &iommu->pgt; |
| struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; |
| static struct pkvm_mm_ops *iommu_mm_ops; |
| struct pkvm_pgtable_ops *iommu_ops; |
| struct pkvm_pgtable_cap cap; |
| u64 grt_pa = readq(iommu->iommu.reg + DMAR_RTADDR_REG) & VTD_PAGE_MASK; |
| int ret; |
| |
| if (ecap_smts(iommu->iommu.ecap)) { |
| cap.level = IOMMU_SM_ROOT; |
| iommu_ops = &iommu_sm_id_ops; |
| } else { |
| cap.level = IOMMU_LM_ROOT; |
| iommu_ops = &iommu_lm_id_ops; |
| } |
| |
| vpgt->root_pa = grt_pa; |
| ret = pkvm_pgtable_init(vpgt, &viommu_mm_ops, iommu_ops, &cap, false); |
| if (ret) |
| return ret; |
| |
| /* |
| * An IOMMU without Page-Walk Coherency must use |
| * iommu_pw_noncoherency_mm_ops to flush the CPU cache when |
| * modifying any remapping structure entry. |
| * |
| * An IOMMU with Page-Walk Coherency can use |
| * iommu_pw_coherency_mm_ops and skip the CPU cache flushing. |
| */ |
| if (!ecap_coherent(iommu->iommu.ecap)) |
| iommu_mm_ops = &iommu_pw_noncoherency_mm_ops; |
| else |
| iommu_mm_ops = &iommu_pw_coherency_mm_ops; |
| |
| ret = pkvm_pgtable_init(pgt, iommu_mm_ops, iommu_ops, &cap, true); |
| if (!ret) { |
| /* |
| * Hold an additional reference count to make |
| * sure the root page won't be freed. |
| */ |
| void *root = pgt->mm_ops->phys_to_virt(pgt->root_pa); |
| |
| pgt->mm_ops->get_page(root); |
| } |
| return ret; |
| } |
| |
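| /* |
| * Probe every IOMMU unit described in pkvm_hyp->iommu_infos: map its MMIO |
| * registers into the hypervisor, cache CAP/ECAP and the enabled GCMD bits, |
| * then unmap the register range from the host EPT so the host can only |
| * reach it through the emulation path below. |
| */ |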
| int pkvm_init_iommu(unsigned long mem_base, unsigned long nr_pages) |
| { |
| struct pkvm_iommu_info *info = &pkvm_hyp->iommu_infos[0]; |
| struct pkvm_iommu *piommu = &iommus[0]; |
| int i, ret = pkvm_pool_init(&iommu_pool, mem_base >> PAGE_SHIFT, nr_pages, 0); |
| |
| if (ret) |
| return ret; |
| |
| for (i = 0; i < PKVM_MAX_IOMMU_NUM; piommu++, info++, i++) { |
| if (!info->reg_phys) |
| break; |
| |
| INIT_LIST_HEAD(&piommu->ptdev_head); |
| |
| pkvm_spinlock_init(&piommu->lock); |
| piommu->iommu.reg_phys = info->reg_phys; |
| piommu->iommu.reg_size = info->reg_size; |
| piommu->iommu.reg = pkvm_iophys_to_virt(info->reg_phys); |
| if ((unsigned long)piommu->iommu.reg == INVALID_ADDR) |
| return -ENOMEM; |
| piommu->iommu.seq_id = i; |
| |
| ret = pkvm_mmu_map((unsigned long)piommu->iommu.reg, |
| (unsigned long)info->reg_phys, |
| info->reg_size, 1 << PG_LEVEL_4K, |
| PKVM_PAGE_IO_NOCACHE); |
| if (ret) |
| return ret; |
| |
| piommu->iommu.cap = readq(piommu->iommu.reg + DMAR_CAP_REG); |
| piommu->iommu.ecap = readq(piommu->iommu.reg + DMAR_ECAP_REG); |
| /* cache the enabled features from Global Status register */ |
| piommu->iommu.gcmd = readl(piommu->iommu.reg + DMAR_GSTS_REG) & |
| DMAR_GSTS_EN_BITS; |
| |
| ret = pkvm_host_ept_unmap((unsigned long)info->reg_phys, |
| (unsigned long)info->reg_phys, |
| info->reg_size); |
| if (ret) |
| return ret; |
| } |
| |
| return 0; |
| } |
| |
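| /* |
| * Walker callback used when freeing shadow identity table entries: leaf |
| * entries are cleared first, and a non-leaf entry is cleared once its child |
| * page holds no other reference (refcount == 1). |
| */ |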
| static int free_shadow_id_cb(struct pkvm_pgtable *pgt, unsigned long vaddr, |
| unsigned long vaddr_end, int level, void *ptep, |
| unsigned long flags, struct pgt_flush_data *flush_data, |
| void *const arg) |
| { |
| struct pkvm_pgtable_ops *pgt_ops = pgt->pgt_ops; |
| struct pkvm_mm_ops *mm_ops = pgt->mm_ops; |
| struct pgt_sync_data sync_data = {0}; |
| struct pkvm_iommu *iommu = pgt_to_pkvm_iommu(pgt); |
| void *child_ptep; |
| |
| /* Nothing to do if the shadow entry is not present */ |
| if (!pgt_ops->pgt_entry_present(ptep)) |
| return 0; |
| |
| sync_data.shadow_ptep = ptep; |
| sync_data.level = level; |
| sync_data.spgt = pgt; |
| sync_data.iommu_ecap = iommu->iommu.ecap; |
| sync_data.vaddr = vaddr; |
| |
| /* Clear a present leaf (PASID table) entry */ |
| if (LAST_LEVEL(level)) { |
| if (iommu_id_sync_entry(&sync_data)) |
| mm_ops->put_page(ptep); |
| return 0; |
| } |
| |
| /* |
| * This is a present PASID dir, context or root entry. If its child |
| * page table has already been freed (refcount == 1), the entry |
| * itself can be cleared now as well. |
| */ |
| child_ptep = mm_ops->phys_to_virt(pgt_ops->pgt_entry_to_phys(ptep)); |
| if (mm_ops->page_count(child_ptep) == 1) { |
| if (iommu_id_sync_entry(&sync_data)) { |
| mm_ops->put_page(ptep); |
| mm_ops->put_page(child_ptep); |
| } |
| } |
| |
| return 0; |
| } |
| |
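| /* |
| * Snapshot the guest entry into sync_data (so later checks and the shadow |
| * update operate on a stable copy) and locate the matching shadow entry of |
| * this level from the shadow_pa recorded by the parent level. |
| */ |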
| /* sync_data != NULL, data != NULL */ |
| static int init_sync_id_data(struct pgt_sync_data *sync_data, |
| struct pgt_sync_walk_data *data, |
| struct pkvm_iommu *iommu, void *guest_ptep, |
| unsigned long vaddr, int level) |
| { |
| struct pkvm_pgtable *spgt = &iommu->pgt; |
| int idx = spgt->pgt_ops->pgt_entry_to_index(vaddr, level); |
| int entry_size = spgt->pgt_ops->pgt_level_entry_size(level); |
| |
| if (ecap_smts(iommu->iommu.ecap)) { |
| switch (level) { |
| case IOMMU_PASID_TABLE: |
| sync_data->p_entry = *((struct pasid_entry *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->p_entry; |
| break; |
| case IOMMU_PASID_DIR: |
| sync_data->pd_entry = *((struct pasid_dir_entry *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->pd_entry; |
| break; |
| case IOMMU_SM_CONTEXT: |
| sync_data->ct_entry = *((struct context_entry *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->ct_entry; |
| break; |
| case IOMMU_SM_ROOT: |
| sync_data->root_entry = *((u64 *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->root_entry; |
| break; |
| default: |
| return -EINVAL; |
| } |
| } else { |
| switch (level) { |
| case IOMMU_LM_CONTEXT: |
| sync_data->ct_entry = *((struct context_entry *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->ct_entry; |
| break; |
| case IOMMU_LM_ROOT: |
| sync_data->root_entry = *((u64 *)guest_ptep); |
| sync_data->guest_ptep = &sync_data->root_entry; |
| break; |
| default: |
| return -EINVAL; |
| } |
| } |
| |
| /* shadow_pa of current level must be there */ |
| if (!data->shadow_pa[level]) |
| return -EINVAL; |
| |
| /* get current shadow_ptep */ |
| sync_data->shadow_ptep = spgt->mm_ops->phys_to_virt(data->shadow_pa[level]); |
| sync_data->shadow_ptep += idx * entry_size; |
| |
| sync_data->level = level; |
| sync_data->spgt = spgt; |
| sync_data->iommu_ecap = iommu->iommu.ecap; |
| sync_data->shadow_pa = 0; |
| sync_data->vaddr = vaddr; |
| |
| return 0; |
| } |
| |
| static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, |
| unsigned long vaddr_end); |
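| /* |
| * Walker callback that brings one shadow identity table entry in sync with |
| * its guest counterpart: free the shadow when the guest entry went away, |
| * allocate or reuse the child shadow page for non-leaf levels, and rewrite |
| * the shadow entry itself. |
| */ |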
| static int sync_shadow_id_cb(struct pkvm_pgtable *vpgt, unsigned long vaddr, |
| unsigned long vaddr_end, int level, void *ptep, |
| unsigned long flags, struct pgt_flush_data *flush_data, |
| void *const arg) |
| { |
| struct pkvm_pgtable_ops *vpgt_ops = vpgt->pgt_ops; |
| struct pgt_sync_walk_data *data = arg; |
| struct pkvm_iommu *iommu = data->iommu; |
| struct pkvm_pgtable *spgt = &iommu->pgt; |
| struct pgt_sync_data sync_data; |
| void *shadow_ptep, *guest_ptep; |
| bool shadow_p, guest_p; |
| int ret = init_sync_id_data(&sync_data, data, iommu, ptep, vaddr, level); |
| |
| if (ret < 0) |
| return ret; |
| |
| guest_ptep = sync_data.guest_ptep; |
| shadow_ptep = sync_data.shadow_ptep; |
| |
| /* |
| * WALK_TABLE_PRE is for non-leaf entries and WALK_LEAF is for leaf |
| * entries. If the flag and the entry type don't match, the guest |
| * changed the entry concurrently; return -EAGAIN to re-walk the |
| * page table. |
| */ |
| if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE && |
| vpgt_ops->pgt_entry_is_leaf(guest_ptep, level)) || |
| (flags == PKVM_PGTABLE_WALK_LEAF && |
| !vpgt_ops->pgt_entry_is_leaf(guest_ptep, level))) |
| return -EAGAIN; |
| |
| shadow_p = spgt->pgt_ops->pgt_entry_present(shadow_ptep); |
| guest_p = vpgt_ops->pgt_entry_present(guest_ptep); |
| if (!guest_p) { |
| if (shadow_p) { |
| /* |
| * If the guest entry is not present but the shadow entry is, |
| * simply free the shadow to make them consistent. |
| */ |
| unsigned long new_vaddr_end = spgt->pgt_ops->pgt_level_to_size(level) + |
| vaddr; |
| /* |
| * Take a reference count before freeing to make sure the current |
| * page of this level and the pages of its parent levels won't be |
| * freed, as only this specific sub-level should be freed here. |
| */ |
| spgt->mm_ops->get_page(shadow_ptep); |
| free_shadow_id(iommu, vaddr, new_vaddr_end); |
| spgt->mm_ops->put_page(shadow_ptep); |
| } |
| /* |
| * Both the guest and shadow entries are now non-present, |
| * so nothing more needs to be done. |
| */ |
| return ret; |
| } |
| |
| if (LAST_LEVEL(level)) { |
| if (ecap_smts(iommu->iommu.ecap)) { |
| /* |
| * For the PASID table, a cache invalidation may only want to sync |
| * the PASID entries matching a specific did, so check this before |
| * syncing the entry. |
| * |
| * According to VT-d spec 6.2.2.1, software must not use a |
| * domain-id value of 0 when programming context-entries on |
| * implementations reporting CM=1 in the Capability register. |
| * |
| * So a non-zero did means a real did from host software. |
| */ |
| if (data->did && (pasid_get_domain_id(guest_ptep) != data->did)) |
| return ret; |
| |
| /* |
| * The shadow_pa used to configure the PASID table entry depends on |
| * the pgt used by the corresponding ptdev, so there is no need to |
| * set sync_data.shadow_pa here. |
| */ |
| } else { |
| switch (context_lm_get_tt(guest_ptep)) { |
| case CONTEXT_TT_MULTI_LEVEL: |
| case CONTEXT_TT_DEV_IOTLB: |
| /* |
| * For now, reference the guest's 2nd-stage paging table directly. |
| * FIXME: Reference a shadow paging table once shadowing of |
| * 2nd-stage paging tables is supported by pKVM, to ensure |
| * the memory protection. |
| */ |
| sync_data.shadow_pa = vpgt_ops->pgt_entry_to_phys(guest_ptep); |
| break; |
| case CONTEXT_TT_PASS_THROUGH: |
| /* |
| * When the host IOMMU driver is using pass-through mode, the pkvm |
| * IOMMU actually uses 2nd-level translation with the host EPT to |
| * guarantee the memory protection. |
| */ |
| sync_data.shadow_pa = pkvm_hyp->host_vm.ept->root_pa; |
| break; |
| default: |
| /* |
| * A context entry with an unsupported (reserved) Translation Type |
| * value is left non-present to the physical IOMMU. |
| */ |
| break; |
| } |
| } |
| } else if (!shadow_p) { |
| /* |
| * For a non-present non-leaf entry (root/context/pasid dir), a new |
| * page needs to be allocated to make this entry present. Root and |
| * context tables are always a single 4K page. As the supported |
| * PASID width is fixed to 15 bits, the pasid dir also fits in a |
| * single 4K page. |
| */ |
| void *shadow = spgt->mm_ops->zalloc_page(); |
| |
| if (!shadow) |
| return -ENOMEM; |
| /* Get the shadow page physical address of the child level */ |
| sync_data.shadow_pa = spgt->mm_ops->virt_to_phys(shadow); |
| } else |
| /* |
| * For a present non-leaf entry (root/context/pasid dir), get the |
| * shadow page physical address of its child level. |
| */ |
| sync_data.shadow_pa = spgt->pgt_ops->pgt_entry_to_phys(shadow_ptep); |
| |
| if (iommu_id_sync_entry(&sync_data)) { |
| if (!shadow_p) |
| /* |
| * A non-present to present transition requires taking a new |
| * reference count on the shadow page. |
| */ |
| spgt->mm_ops->get_page(shadow_ptep); |
| } |
| |
| if ((flags == PKVM_PGTABLE_WALK_TABLE_PRE) && (!LAST_LEVEL(level))) { |
| /* |
| * The guest page table walk descends to the child level next, so |
| * pass the child level's shadow page physical address along to sync. |
| */ |
| data->shadow_pa[level - 1] = sync_data.shadow_pa; |
| } |
| |
| return ret; |
| } |
| |
| static int free_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, |
| unsigned long vaddr_end) |
| { |
| struct pkvm_pgtable_walker walker = { |
| .cb = free_shadow_id_cb, |
| .flags = PKVM_PGTABLE_WALK_LEAF | |
| PKVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| |
| /* |
| * Freeing shadow entries is done by walking the shadow IOMMU |
| * page table itself. |
| */ |
| if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) |
| return 0; |
| |
| return iommu_pgtable_walk(&iommu->pgt, vaddr, vaddr_end, &walker); |
| } |
| |
| static int sync_shadow_id(struct pkvm_iommu *iommu, unsigned long vaddr, |
| unsigned long vaddr_end, u16 did) |
| { |
| DEFINE_PGT_SYNC_WALK_DATA(arg, iommu, did); |
| struct pkvm_pgtable_walker walker = { |
| .cb = sync_shadow_id_cb, |
| .flags = PKVM_PGTABLE_WALK_TABLE_PRE | |
| PKVM_PGTABLE_WALK_LEAF, |
| .arg = &arg, |
| }; |
| int ret, retry_cnt = 0; |
| |
| if (!(iommu->viommu.vreg.gsts & DMA_GSTS_TES)) |
| return 0; |
| |
| retry: |
| if (ecap_smts(iommu->iommu.ecap)) |
| arg.shadow_pa[IOMMU_SM_ROOT] = iommu->pgt.root_pa; |
| else |
| arg.shadow_pa[IOMMU_LM_ROOT] = iommu->pgt.root_pa; |
| /* |
| * Syncing the shadow IOMMU page table is done by walking the guest |
| * IOMMU page table. |
| */ |
| ret = iommu_pgtable_walk(&iommu->viommu.pgt, vaddr, vaddr_end, &walker); |
| if ((ret == -EAGAIN) && (retry_cnt++ < 5)) |
| goto retry; |
| |
| return ret; |
| } |
| |
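| /* |
| * Point the invalidation queue at pkvm's own descriptor ring: disable QI if |
| * it is running, program DMAR_IQA_REG with the new ring (descriptor width |
| * and queue size encoded in the low bits), then re-enable QI. |
| */ |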
| static void enable_qi(struct pkvm_iommu *iommu) |
| { |
| void *desc = iommu->qi.desc; |
| int dw, qs; |
| u32 sts; |
| |
| dw = !!ecap_smts(iommu->iommu.ecap); |
| qs = fls(iommu->qi.free_cnt >> (7 + !dw)) - 1; |
| |
| /* Disable QI */ |
| sts = readl(iommu->iommu.reg + DMAR_GSTS_REG); |
| if (sts & DMA_GSTS_QIES) { |
| iommu->iommu.gcmd &= ~DMA_GCMD_QIE; |
| writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); |
| PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, |
| readl, !(sts & DMA_GSTS_QIES), sts); |
| } |
| |
| /* Set tail to 0 */ |
| writel(0, iommu->iommu.reg + DMAR_IQT_REG); |
| |
| /* Set IQA */ |
| iommu->piommu_iqa = pkvm_virt_to_phys(desc) | (dw << 11) | qs; |
| writeq(iommu->piommu_iqa, iommu->iommu.reg + DMAR_IQA_REG); |
| |
| /* Enable QI */ |
| iommu->iommu.gcmd |= DMA_GCMD_QIE; |
| writel(iommu->iommu.gcmd, iommu->iommu.reg + DMAR_GCMD_REG); |
| PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, |
| readl, (sts & DMA_GSTS_QIES), sts); |
| } |
| |
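| /* |
| * Quiesce the invalidation queue currently owned by the host, record its |
| * registers into the viommu state, then allocate pkvm's own descriptor ring |
| * plus status array and switch the hardware over to them via enable_qi(). |
| */ |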
| static int create_qi_desc(struct pkvm_iommu *iommu) |
| { |
| struct pkvm_viommu *viommu = &iommu->viommu; |
| struct q_inval *qi = &iommu->qi; |
| void __iomem *reg = iommu->iommu.reg; |
| |
| pkvm_spinlock_init(&iommu->qi_lock); |
| /* |
| * Before switching the descriptor ring, wait for any pending |
| * invalidation descriptors to complete. According to spec 6.5.2, |
| * the invalidation queue is considered quiesced when the queue |
| * is empty (head and tail registers equal) and the last |
| * descriptor completed is an Invalidation Wait Descriptor |
| * (which indicates no invalidation requests are pending in hardware). |
| */ |
| while (readq(reg + DMAR_IQH_REG) != |
| readq(reg + DMAR_IQT_REG)) |
| cpu_relax(); |
| |
| viommu->vreg.iqa = viommu->iqa = readq(reg + DMAR_IQA_REG); |
| viommu->vreg.iq_head = readq(reg + DMAR_IQH_REG); |
| viommu->vreg.iq_tail = readq(reg + DMAR_IQT_REG); |
| |
| if (viommu->vreg.gsts & DMA_GSTS_QIES) { |
| struct qi_desc *wait_desc; |
| u64 iqa = viommu->iqa; |
| int shift = IQ_DESC_SHIFT(iqa); |
| int offset = ((viommu->vreg.iq_head >> shift) + |
| IQ_DESC_LEN(iqa) - 1) % IQ_DESC_LEN(iqa); |
| int *desc_status; |
| |
| /* Find out the last descriptor */ |
| wait_desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(iqa)) + (offset << shift); |
| |
| pkvm_dbg("pkvm: viommu iqa 0x%llx head 0x%llx tail 0x%llx qw0 0x%llx qw1 0x%llx", |
| viommu->vreg.iqa, viommu->vreg.iq_head, viommu->vreg.iq_tail, |
| wait_desc->qw0, wait_desc->qw1); |
| |
| if (QI_DESC_TYPE(wait_desc->qw0) != QI_IWD_TYPE) { |
| pkvm_err("pkvm: %s: expect wait desc but 0x%llx\n", |
| __func__, wait_desc->qw0); |
| return -EINVAL; |
| } |
| |
| desc_status = pkvm_phys_to_virt(wait_desc->qw1); |
| /* |
| * Wait until the wait descriptor is completed. |
| * |
| * The desc_status is provided by the host. Checking it in pkvm |
| * relies on the host IOMMU driver not releasing the desc_status |
| * before it is completed, which is guaranteed by the current |
| * Linux IOMMU driver. |
| */ |
| while (READ_ONCE(*desc_status) == QI_IN_USE) |
| cpu_relax(); |
| } |
| |
| qi->free_cnt = PKVM_QI_DESC_ALIGNED_SIZE / sizeof(struct qi_desc); |
| qi->desc = iommu_zalloc_pages(PKVM_QI_DESC_ALIGNED_SIZE); |
| if (!qi->desc) |
| return -ENOMEM; |
| |
| qi->desc_status = iommu_zalloc_pages(PKVM_QI_DESC_STATUS_ALIGNED_SIZE); |
| if (!qi->desc_status) { |
| iommu_put_page(qi->desc); |
| return -ENOMEM; |
| } |
| |
| enable_qi(iommu); |
| return 0; |
| } |
| |
| static int qi_check_fault(struct pkvm_iommu *iommu, int wait_index) |
| { |
| u32 fault; |
| struct q_inval *qi = &iommu->qi; |
| |
| if (qi->desc_status[wait_index] == QI_ABORT) |
| return -EAGAIN; |
| |
| fault = readl(iommu->iommu.reg + DMAR_FSTS_REG); |
| |
| /* |
| * If IQE happens, the head points to the descriptor associated |
| * with the error. No new descriptors are fetched until the IQE |
| * is cleared. |
| */ |
| if (fault & DMA_FSTS_IQE) { |
| writel(DMA_FSTS_IQE, iommu->iommu.reg + DMAR_FSTS_REG); |
| pkvm_dbg("pkvm: Invalidation Queue Error (IQE) cleared\n"); |
| } |
| |
| /* |
| * If ITE happens, all pending wait_desc commands are aborted. |
| * No new descriptors are fetched until the ITE is cleared. |
| */ |
| if (fault & DMA_FSTS_ITE) { |
| writel(DMA_FSTS_ITE, iommu->iommu.reg + DMAR_FSTS_REG); |
| pkvm_dbg("pkvm: Invalidation Time-out Error (ITE) cleared\n"); |
| } |
| |
| if (fault & DMA_FSTS_ICE) { |
| writel(DMA_FSTS_ICE, iommu->iommu.reg + DMAR_FSTS_REG); |
| pkvm_dbg("pkvm: Invalidation Completion Error (ICE) cleared\n"); |
| } |
| |
| return 0; |
| } |
| |
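| /* |
| * Queue @count descriptors plus one wait descriptor on the physical IOMMU |
| * and spin until the wait descriptor completes (or a fault aborts it). |
| * Used for the invalidations pkvm issues on its own behalf. |
| */ |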
| static void submit_qi(struct pkvm_iommu *iommu, struct qi_desc *base, int count) |
| { |
| int len = IQ_DESC_LEN(iommu->piommu_iqa), i, wait_index; |
| int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); |
| struct q_inval *qi = &iommu->qi; |
| struct qi_desc *to, *from; |
| int required_cnt = count + 2; |
| void *desc = qi->desc; |
| int *desc_status, rc; |
| |
| pkvm_spin_lock(&iommu->qi_lock); |
| /* |
| * Detect if the free descriptor count is enough or not |
| */ |
| while (qi->free_cnt < required_cnt) { |
| u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; |
| int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; |
| int free_cnt = len - busy_cnt; |
| |
| if (free_cnt >= required_cnt) { |
| qi->free_cnt = free_cnt; |
| break; |
| } |
| pkvm_spin_unlock(&iommu->qi_lock); |
| cpu_relax(); |
| pkvm_spin_lock(&iommu->qi_lock); |
| } |
| |
| for (i = 0; i < count; i++) { |
| from = base + i; |
| to = qi->desc + (((qi->free_head + i) % len) << shift); |
| to->qw0 = from->qw0; |
| to->qw1 = from->qw1; |
| } |
| |
| wait_index = (qi->free_head + count) % len; |
| /* setup wait descriptor */ |
| to = desc + (wait_index << shift); |
| to->qw0 = QI_IWD_STATUS_DATA(QI_DONE) | |
| QI_IWD_STATUS_WRITE | QI_IWD_TYPE; |
| |
| desc_status = &qi->desc_status[wait_index]; |
| WRITE_ONCE(*desc_status, QI_IN_USE); |
| to->qw1 = pkvm_virt_to_phys(desc_status); |
| |
| /* submit to hardware with wait descriptor */ |
| qi->free_cnt -= count + 1; |
| qi->free_head = (qi->free_head + count + 1) % len; |
| writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); |
| |
| while (READ_ONCE(*desc_status) != QI_DONE) { |
| rc = qi_check_fault(iommu, wait_index); |
| if (rc) |
| break; |
| pkvm_spin_unlock(&iommu->qi_lock); |
| cpu_relax(); |
| pkvm_spin_lock(&iommu->qi_lock); |
| } |
| |
| if (*desc_status != QI_DONE) |
| pkvm_err("pkvm: %s: failed with status %d\n", |
| __func__, *desc_status); |
| |
| /* release the free_cnt */ |
| qi->free_cnt += count + 1; |
| |
| pkvm_spin_unlock(&iommu->qi_lock); |
| } |
| |
| static void flush_context_cache(struct pkvm_iommu *iommu, u16 did, |
| u16 sid, u8 fm, u64 type) |
| { |
| struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; |
| |
| desc.qw0 = QI_CC_FM(fm) | QI_CC_SID(sid) | QI_CC_DID(did) | |
| QI_CC_GRAN(type) | QI_CC_TYPE; |
| |
| submit_qi(iommu, &desc, 1); |
| } |
| |
| static void flush_pasid_cache(struct pkvm_iommu *iommu, u16 did, |
| u64 granu, u32 pasid) |
| { |
| struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; |
| |
| desc.qw0 = QI_PC_PASID(pasid) | QI_PC_DID(did) | |
| QI_PC_GRAN(granu) | QI_PC_TYPE; |
| |
| submit_qi(iommu, &desc, 1); |
| } |
| |
| static void flush_iotlb(struct pkvm_iommu *iommu, u16 did, u64 addr, |
| unsigned int size_order, u64 type) |
| { |
| u8 dw = 0, dr = 0; |
| struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; |
| int ih = 0; |
| |
| if (cap_write_drain(iommu->iommu.cap)) |
| dw = 1; |
| |
| if (cap_read_drain(iommu->iommu.cap)) |
| dr = 1; |
| |
| desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) | |
| QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; |
| desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) | QI_IOTLB_AM(size_order); |
| |
| submit_qi(iommu, &desc, 1); |
| } |
| |
| static void set_root_table(struct pkvm_iommu *iommu) |
| { |
| u64 val = iommu->pgt.root_pa; |
| void __iomem *reg = iommu->iommu.reg; |
| u32 sts; |
| |
| /* Set scalable mode */ |
| if (ecap_smts(iommu->iommu.ecap)) |
| val |= DMA_RTADDR_SMT; |
| |
| writeq(val, reg + DMAR_RTADDR_REG); |
| |
| /* |
| * The shadow root table provides identical remapping results comparing |
| * with the previous guest root table, so it is allowed to switch if |
| * Translation Enable Status is still 1 according to IOMMU spec 6.6: |
| * |
| * " |
| * If software sets the root-table pointer while remapping hardware is |
| * active (TES=1 in Global Status register), software must ensure the |
| * structures referenced by the new root-table pointer provide identical |
| * remapping results as the structures referenced by the previous root-table |
| * pointer so that inflight requests are properly translated. |
| * " |
| * |
| * So don't need to turn off TE first before switching. |
| */ |
| writel(iommu->iommu.gcmd | DMA_GCMD_SRTP, reg + DMAR_GCMD_REG); |
| |
| PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts); |
| |
| flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); |
| if (ecap_smts(iommu->iommu.ecap)) |
| flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); |
| flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); |
| } |
| |
| static void enable_translation(struct pkvm_iommu *iommu) |
| { |
| void __iomem *reg = iommu->iommu.reg; |
| u32 sts; |
| |
| if (iommu->iommu.gcmd & DMA_GCMD_TE) |
| return; |
| |
| iommu->iommu.gcmd |= DMA_GCMD_TE; |
| |
| writel(iommu->iommu.gcmd, reg + DMAR_GCMD_REG); |
| |
| PKVM_IOMMU_WAIT_OP(reg + DMAR_GSTS_REG, readl, (sts & DMA_GSTS_TES), sts); |
| } |
| |
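| /* |
| * Snapshot the hardware registers into the virtual register file exposed to |
| * the host, with the capabilities filtered by pkvm_update_iommu_virtual_caps(). |
| */ |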
| static void initialize_viommu_reg(struct pkvm_iommu *iommu) |
| { |
| struct viommu_reg *vreg = &iommu->viommu.vreg; |
| void __iomem *reg_base = iommu->iommu.reg; |
| |
| vreg->cap = readq(reg_base + DMAR_CAP_REG); |
| vreg->ecap = readq(reg_base + DMAR_ECAP_REG); |
| pkvm_update_iommu_virtual_caps(&vreg->cap, &vreg->ecap); |
| |
| vreg->gsts = readl(reg_base + DMAR_GSTS_REG); |
| vreg->rta = readq(reg_base + DMAR_RTADDR_REG); |
| |
| pkvm_dbg("%s: iommu phys reg 0x%llx cap 0x%llx ecap 0x%llx gsts 0x%x rta 0x%llx\n", |
| __func__, iommu->iommu.reg_phys, vreg->cap, vreg->ecap, vreg->gsts, vreg->rta); |
| |
| /* Invalidation Queue regs are updated when the QI descriptor ring is created */ |
| } |
| |
| static int activate_iommu(struct pkvm_iommu *iommu) |
| { |
| unsigned long vaddr = 0, vaddr_end = IOMMU_MAX_VADDR; |
| int ret; |
| |
| pkvm_dbg("%s: iommu%d\n", __func__, iommu->iommu.seq_id); |
| |
| pkvm_spin_lock(&iommu->lock); |
| |
| ret = initialize_iommu_pgt(iommu); |
| if (ret) |
| goto out; |
| |
| initialize_viommu_reg(iommu); |
| |
| ret = sync_shadow_id(iommu, vaddr, vaddr_end, 0); |
| if (ret) |
| goto out; |
| |
| ret = create_qi_desc(iommu); |
| if (ret) |
| goto free_shadow; |
| |
| set_root_table(iommu); |
| |
| /* |
| * It is possible that some IOMMU units don't have memory remapping |
| * translation enabled by the host IOMMU driver during boot time, so |
| * the pkvm IOMMU driver needs to make sure it is enabled to |
| * guarantee the IO isolation from the devices behind this IOMMU. |
| */ |
| enable_translation(iommu); |
| |
| iommu->activated = true; |
| root_tbl_walk(iommu); |
| |
| pkvm_spin_unlock(&iommu->lock); |
| return 0; |
| |
| free_shadow: |
| free_shadow_id(iommu, vaddr, vaddr_end); |
| out: |
| pkvm_spin_unlock(&iommu->lock); |
| return ret; |
| } |
| |
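| /* |
| * Emulate a context-cache invalidation descriptor by re-syncing the shadow |
| * identity tables over the address range implied by its granularity. |
| */ |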
| static int context_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) |
| { |
| u16 sid = QI_DESC_CC_SID(desc->qw0); |
| u16 did = ecap_smts(iommu->iommu.ecap) ? 0 : QI_DESC_CC_DID(desc->qw0); |
| u64 granu = QI_DESC_CC_GRANU(desc->qw0) << DMA_CCMD_INVL_GRANU_OFFSET; |
| unsigned long start, end; |
| int ret; |
| |
| switch (granu) { |
| case DMA_CCMD_GLOBAL_INVL: |
| start = 0; |
| end = MAX_NUM_OF_ADDRESS_SPACE(iommu); |
| pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); |
| ret = sync_shadow_id(iommu, start, end, 0); |
| break; |
| case DMA_CCMD_DOMAIN_INVL: |
| /* |
| * Domain-selective invalidation, which is processed by hardware |
| * as a global invalidation in scalable mode according to |
| * spec 6.5.2.1. |
| */ |
| start = 0; |
| end = MAX_NUM_OF_ADDRESS_SPACE(iommu); |
| pkvm_dbg("pkvm: %s: iommu%d: domain selective\n", |
| __func__, iommu->iommu.seq_id); |
| ret = sync_shadow_id(iommu, start, end, did); |
| break; |
| case DMA_CCMD_DEVICE_INVL: |
| if (ecap_smts(iommu->iommu.ecap)) { |
| start = (unsigned long)sid << DEVFN_SHIFT; |
| end = ((unsigned long)sid + 1) << DEVFN_SHIFT; |
| } else { |
| start = (unsigned long)sid << LM_DEVFN_SHIFT; |
| end = ((unsigned long)sid + 1) << LM_DEVFN_SHIFT; |
| } |
| pkvm_dbg("pkvm: %s: iommu%d: device selective sid 0x%x\n", |
| __func__, iommu->iommu.seq_id, sid); |
| ret = sync_shadow_id(iommu, start, end, did); |
| break; |
| default: |
| pkvm_err("pkvm: %s: iommu%d: invalidate granu %lld\n", |
| __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET); |
| ret = -EINVAL; |
| break; |
| } |
| |
| if (ret) |
| pkvm_err("pkvm: %s: iommu%d: granularity %lld failed with ret %d\n", |
| __func__, iommu->iommu.seq_id, granu >> DMA_CCMD_INVL_GRANU_OFFSET, ret); |
| return ret; |
| } |
| |
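| /* |
| * Emulate a PASID-cache invalidation descriptor by re-syncing the affected |
| * shadow PASID table entries, optionally restricted to a domain ID. |
| */ |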
| static int pasid_cache_invalidate(struct pkvm_iommu *iommu, struct qi_desc *desc) |
| { |
| int pasid = QI_DESC_PC_PASID(desc->qw0); |
| u16 did = QI_DESC_PC_DID(desc->qw0); |
| int granu = QI_DESC_PC_GRANU(desc->qw0); |
| unsigned long start, end; |
| int ret; |
| |
| switch (granu) { |
| case QI_PC_ALL_PASIDS: |
| /* |
| * This is essentially a global invalidation, but restricted to |
| * entries matching a specific DID. |
| */ |
| pkvm_dbg("pkvm: %s: iommu%d: ALL_PASID did %d\n", |
| __func__, iommu->iommu.seq_id, did); |
| start = 0; |
| end = IOMMU_MAX_VADDR; |
| ret = sync_shadow_id(iommu, start, end, did); |
| break; |
| case QI_PC_PASID_SEL: { |
| /* |
| * Sync specific PASID entry for all contexts |
| */ |
| u64 bdf, end_bdf = 0x10000; |
| |
| pkvm_dbg("pkvm: %s: iommu%d: PASID_SEL did %d pasid 0x%x\n", |
| __func__, iommu->iommu.seq_id, did, pasid); |
| for (bdf = 0; bdf < end_bdf; bdf++) { |
| start = (bdf << DEVFN_SHIFT) + pasid; |
| end = start + 1; |
| ret = sync_shadow_id(iommu, start, end, did); |
| if (ret) |
| break; |
| } |
| break; |
| } |
| case QI_PC_GLOBAL: |
| start = 0; |
| end = IOMMU_MAX_VADDR; |
| pkvm_dbg("pkvm: %s: iommu%d: global\n", __func__, iommu->iommu.seq_id); |
| ret = sync_shadow_id(iommu, start, end, 0); |
| break; |
| default: |
| pkvm_err("pkvm: %s: iommu%d: invalid granularity %d 0x%llx\n", |
| __func__, iommu->iommu.seq_id, granu, desc->qw0); |
| ret = -EINVAL; |
| break; |
| } |
| |
| if (ret) |
| pkvm_err("pkvm: %s: iommu%d: granularity %d failed with ret %d\n", |
| __func__, iommu->iommu.seq_id, granu, ret); |
| |
| return ret; |
| } |
| |
| static int handle_descriptor(struct pkvm_iommu *iommu, struct qi_desc *desc) |
| { |
| int type = QI_DESC_TYPE(desc->qw0); |
| int ret = 0; |
| |
| switch (type) { |
| /* |
| * TODO: is it necessary to intercept the |
| * PGRP_RESP & PSTRM_RESP? |
| */ |
| case QI_PGRP_RESP_TYPE: |
| case QI_PSTRM_RESP_TYPE: |
| case QI_IOTLB_TYPE: |
| case QI_DIOTLB_TYPE: |
| case QI_DEIOTLB_TYPE: |
| case QI_IEC_TYPE: |
| case QI_IWD_TYPE: |
| case QI_EIOTLB_TYPE: |
| break; |
| case QI_CC_TYPE: |
| ret = context_cache_invalidate(iommu, desc); |
| break; |
| case QI_PC_TYPE: |
| ret = pasid_cache_invalidate(iommu, desc); |
| break; |
| default: |
| pkvm_err("pkvm: %s: iommu%d: invalid type %d desc addr 0x%llx val 0x%llx\n", |
| __func__, iommu->iommu.seq_id, type, (u64)desc, desc->qw0); |
| ret = -EINVAL; |
| break; |
| } |
| |
| return ret; |
| } |
| |
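| /* |
| * Copy @count guest descriptors, starting at index @vhead of the guest ring, |
| * into pkvm's physical ring and update the hardware tail. The guest's own |
| * wait descriptor and status slot are forwarded unchanged so the host polls |
| * completion by itself. |
| */ |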
| static void handle_qi_submit(struct pkvm_iommu *iommu, void *vdesc, int vhead, int count) |
| { |
| struct pkvm_viommu *viommu = &iommu->viommu; |
| int vlen = IQ_DESC_LEN(viommu->iqa); |
| int vshift = IQ_DESC_SHIFT(viommu->iqa); |
| int len = IQ_DESC_LEN(iommu->piommu_iqa); |
| int shift = IQ_DESC_SHIFT(iommu->piommu_iqa); |
| struct q_inval *qi = &iommu->qi; |
| struct qi_desc *to, *from; |
| int required_cnt = count + 1, i; |
| |
| pkvm_spin_lock(&iommu->qi_lock); |
| /* |
| * Detect if the free descriptor count is enough or not |
| */ |
| while (qi->free_cnt < required_cnt) { |
| u64 head = readq(iommu->iommu.reg + DMAR_IQH_REG) >> shift; |
| int busy_cnt = (READ_ONCE(qi->free_head) + len - head) % len; |
| int free_cnt = len - busy_cnt; |
| |
| if (free_cnt >= required_cnt) { |
| qi->free_cnt = free_cnt; |
| break; |
| } |
| pkvm_spin_unlock(&iommu->qi_lock); |
| cpu_relax(); |
| pkvm_spin_lock(&iommu->qi_lock); |
| } |
| |
| for (i = 0; i < count; i++) { |
| from = vdesc + (((vhead + i) % vlen) << vshift); |
| to = qi->desc + (((qi->free_head + i) % len) << shift); |
| |
| to->qw0 = from->qw0; |
| to->qw1 = from->qw1; |
| } |
| |
| /* |
| * Reuse the desc_status from the host so that the host can poll |
| * the desc_status itself instead of waiting in pkvm. |
| */ |
| qi->free_cnt -= count; |
| qi->free_head = (qi->free_head + count) % len; |
| writel(qi->free_head << shift, iommu->iommu.reg + DMAR_IQT_REG); |
| |
| pkvm_spin_unlock(&iommu->qi_lock); |
| } |
| |
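| /* |
| * Emulate a write to the virtual invalidation queue tail: walk the guest |
| * descriptors between the old head and the new tail, handle the ones that |
| * need shadowing (context/PASID cache invalidations), then forward the batch |
| * to hardware. On a bad descriptor the guest wait status is set to QI_ABORT. |
| */ |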
| static int handle_qi_invalidation(struct pkvm_iommu *iommu, unsigned long val) |
| { |
| struct pkvm_viommu *viommu = &iommu->viommu; |
| u64 viommu_iqa = viommu->iqa; |
| struct qi_desc *wait_desc; |
| int len = IQ_DESC_LEN(viommu_iqa); |
| int shift = IQ_DESC_SHIFT(viommu_iqa); |
| int head = viommu->vreg.iq_head >> shift; |
| int count, i, ret = 0; |
| int *desc_status; |
| void *desc; |
| |
| viommu->vreg.iq_tail = val; |
| desc = pkvm_phys_to_virt(IQ_DESC_BASE_PHYS(viommu_iqa)); |
| count = ((val >> shift) + len - head) % len; |
| |
| for (i = 0; i < count; i++) { |
| viommu->vreg.iq_head = ((head + i) % len) << shift; |
| ret = handle_descriptor(iommu, desc + viommu->vreg.iq_head); |
| if (ret) |
| break; |
| } |
| /* update iq_head */ |
| viommu->vreg.iq_head = val; |
| |
| if (likely(!ret)) { |
| /* |
| * Submit the descriptors to hardware. The desc_status |
| * will be taken care of by hardware. |
| */ |
| handle_qi_submit(iommu, desc, head, count); |
| } else { |
| pkvm_err("pkvm: %s: failed with ret %d\n", __func__, ret); |
| /* |
| * A descriptor seems invalid. Mark the desc_status as |
| * QI_ABORT to make sure the host driver won't be blocked. |
| */ |
| wait_desc = desc + (((head + count - 1) % len) << shift); |
| if (QI_DESC_TYPE(wait_desc->qw0) == QI_IWD_TYPE) { |
| desc_status = pkvm_phys_to_virt(wait_desc->qw1); |
| WRITE_ONCE(*desc_status, QI_ABORT); |
| } |
| } |
| |
| return ret; |
| } |
| |
| static void handle_gcmd_te(struct pkvm_iommu *iommu, bool en) |
| { |
| unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); |
| struct pkvm_viommu *viommu = &iommu->viommu; |
| |
| if (en) { |
| viommu->vreg.gsts |= DMA_GSTS_TES; |
| /* |
| * Sync shadow page table to emulate Translation enable. |
| */ |
| if (sync_shadow_id(iommu, vaddr, vaddr_end, 0)) |
| return; |
| pkvm_dbg("pkvm: %s: enable TE\n", __func__); |
| goto out; |
| } |
| |
| /* |
| * Free the shadow to emulate Translation disable. |
| * |
| * Translation is not really disabled, as pkvm still |
| * needs to protect against the device. |
| */ |
| free_shadow_id(iommu, vaddr, vaddr_end); |
| viommu->vreg.gsts &= ~DMA_GSTS_TES; |
| pkvm_dbg("pkvm: %s: disable TE\n", __func__); |
| out: |
| flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); |
| if (ecap_smts(iommu->iommu.ecap)) |
| flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); |
| flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); |
| |
| root_tbl_walk(iommu); |
| } |
| |
| static void handle_gcmd_srtp(struct pkvm_iommu *iommu) |
| { |
| struct viommu_reg *vreg = &iommu->viommu.vreg; |
| struct pkvm_pgtable *vpgt = &iommu->viommu.pgt; |
| |
| vreg->gsts &= ~DMA_GSTS_RTPS; |
| |
| /* Set the root table phys address from vreg */ |
| vpgt->root_pa = vreg->rta & VTD_PAGE_MASK; |
| |
| pkvm_dbg("pkvm: %s: set SRTP val 0x%llx\n", __func__, vreg->rta); |
| |
| if (vreg->gsts & DMA_GSTS_TES) { |
| unsigned long vaddr = 0, vaddr_end = MAX_NUM_OF_ADDRESS_SPACE(iommu); |
| |
| /* TE is already enabled, sync shadow */ |
| if (sync_shadow_id(iommu, vaddr, vaddr_end, 0)) |
| return; |
| |
| flush_context_cache(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); |
| if (ecap_smts(iommu->iommu.ecap)) |
| flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); |
| flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); |
| } |
| |
| vreg->gsts |= DMA_GSTS_RTPS; |
| |
| root_tbl_walk(iommu); |
| } |
| |
| static void handle_gcmd_qie(struct pkvm_iommu *iommu, bool en) |
| { |
| struct viommu_reg *vreg = &iommu->viommu.vreg; |
| |
| if (en) { |
| if (vreg->iq_tail != 0) { |
| pkvm_err("pkvm: Queue invalidation descriptor tail is not zero\n"); |
| return; |
| } |
| |
| /* Update the iqa from vreg */ |
| iommu->viommu.iqa = vreg->iqa; |
| vreg->iq_head = 0; |
| vreg->gsts |= DMA_GSTS_QIES; |
| pkvm_dbg("pkvm: %s: enabled QI\n", __func__); |
| return; |
| } |
| |
| if (vreg->iq_head != vreg->iq_tail) { |
| pkvm_err("pkvm: Queue invalidation descriptor is not empty yet\n"); |
| return; |
| } |
| |
| vreg->iq_head = 0; |
| vreg->gsts &= ~DMA_GSTS_QIES; |
| pkvm_dbg("pkvm: %s: disabled QI\n", __func__); |
| } |
| |
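| /* |
| * Forward GCMD bits that pkvm does not emulate directly to hardware, one bit |
| * at a time, waiting for the matching GSTS bit to follow; any request that |
| * touches protected bits is rejected. |
| */ |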
| static void handle_gcmd_direct(struct pkvm_iommu *iommu, u32 val) |
| { |
| struct viommu_reg *vreg = &iommu->viommu.vreg; |
| unsigned long changed = ((vreg->gsts ^ val) & DMAR_GCMD_DIRECT) & |
| DMAR_GSTS_EN_BITS; |
| unsigned long set = (val & DMAR_GCMD_DIRECT) & ~DMAR_GSTS_EN_BITS; |
| u32 cmd, gcmd, sts; |
| int bit; |
| |
| if ((changed | set) & DMAR_GCMD_PROTECTED) { |
| pkvm_dbg("pkvm:%s touching protected bits changed 0x%lx set 0x%lx\n", |
| __func__, changed, set); |
| return; |
| } |
| |
| if (changed) { |
| pkvm_dbg("pkvm: %s: changed 0x%lx\n", __func__, changed); |
| gcmd = READ_ONCE(iommu->iommu.gcmd); |
| for_each_set_bit(bit, &changed, BITS_PER_BYTE * sizeof(vreg->gsts)) { |
| cmd = 1 << bit; |
| if (val & cmd) { |
| /* enable */ |
| gcmd |= cmd; |
| writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); |
| PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, |
| readl, (sts & cmd), sts); |
| vreg->gsts |= cmd; |
| pkvm_dbg("pkvm: %s: enable cmd bit %d\n", __func__, bit); |
| } else { |
| /* disable */ |
| gcmd &= ~cmd; |
| writel(gcmd, iommu->iommu.reg + DMAR_GCMD_REG); |
| PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, |
| readl, !(sts & cmd), sts); |
| vreg->gsts &= ~cmd; |
| pkvm_dbg("pkvm: %s: disable cmd bit %d\n", __func__, bit); |
| } |
| } |
| WRITE_ONCE(iommu->iommu.gcmd, gcmd); |
| } |
| |
| if (set) { |
| pkvm_dbg("pkvm: %s: set 0x%lx\n", __func__, set); |
| gcmd = READ_ONCE(iommu->iommu.gcmd); |
| for_each_set_bit(bit, &set, BITS_PER_BYTE * sizeof(vreg->gsts)) { |
| cmd = 1 << bit; |
| vreg->gsts &= ~cmd; |
| writel(gcmd | cmd, iommu->iommu.reg + DMAR_GCMD_REG); |
| PKVM_IOMMU_WAIT_OP(iommu->iommu.reg + DMAR_GSTS_REG, |
| readl, (sts & cmd), sts); |
| vreg->gsts |= cmd; |
| pkvm_dbg("pkvm: %s: set cmd bit %d\n", __func__, bit); |
| } |
| } |
| } |
| |
| static void handle_global_cmd(struct pkvm_iommu *iommu, u32 val) |
| { |
| u32 changed = iommu->viommu.vreg.gsts ^ val; |
| |
| pkvm_dbg("pkvm: iommu%d: handle gcmd val 0x%x gsts 0x%x changed 0x%x\n", |
| iommu->iommu.seq_id, val, iommu->viommu.vreg.gsts, changed); |
| |
| if (changed & DMA_GCMD_TE) |
| handle_gcmd_te(iommu, !!(val & DMA_GCMD_TE)); |
| |
| if (val & DMA_GCMD_SRTP) |
| handle_gcmd_srtp(iommu); |
| |
| if (changed & DMA_GCMD_QIE) |
| handle_gcmd_qie(iommu, !!(val & DMA_GCMD_QIE)); |
| |
| handle_gcmd_direct(iommu, val); |
| } |
| |
| static struct pkvm_iommu *find_iommu_by_reg_phys(unsigned long phys) |
| { |
| struct pkvm_iommu *iommu; |
| |
| for_each_valid_iommu(iommu) { |
| if ((phys >= iommu->iommu.reg_phys) && |
| (phys < (iommu->iommu.reg_phys + iommu->iommu.reg_size))) |
| return iommu; |
| } |
| |
| return NULL; |
| } |
| |
| static unsigned long direct_access_iommu_mmio(struct pkvm_iommu *iommu, |
| bool is_read, int len, |
| unsigned long phys, |
| unsigned long val) |
| { |
| unsigned long offset = phys - iommu->iommu.reg_phys; |
| void *reg = iommu->iommu.reg + offset; |
| unsigned long ret = 0; |
| |
| switch (len) { |
| case 4: |
| if (is_read) |
| ret = (unsigned long)readl(reg); |
| else |
| writel((u32)val, reg); |
| break; |
| case 8: |
| if (is_read) |
| ret = (unsigned long)readq(reg); |
| else |
| writeq((u64)val, reg); |
| break; |
| default: |
| pkvm_err("%s: %s: unsupported len %d\n", __func__, |
| is_read ? "read" : "write", len); |
| break; |
| } |
| |
| return ret; |
| } |
| |
| static unsigned long access_iommu_mmio(struct pkvm_iommu *iommu, bool is_read, |
| int len, unsigned long phys, |
| unsigned long val) |
| { |
| struct pkvm_viommu *viommu = &iommu->viommu; |
| unsigned long offset = phys - iommu->iommu.reg_phys; |
| unsigned long ret = 0; |
| |
| /* pkvm IOMMU driver is not activated yet, so directly access MMIO */ |
| if (unlikely(!iommu->activated)) |
| return direct_access_iommu_mmio(iommu, is_read, len, phys, val); |
| |
| /* Only need to emulate part of the MMIO */ |
| switch (offset) { |
| case DMAR_CAP_REG: |
| if (is_read) |
| ret = viommu->vreg.cap; |
| break; |
| case DMAR_ECAP_REG: |
| if (is_read) |
| ret = viommu->vreg.ecap; |
| break; |
| case DMAR_GCMD_REG: |
| if (is_read) |
| ret = 0; |
| else |
| handle_global_cmd(iommu, val); |
| break; |
| case DMAR_GSTS_REG: |
| if (is_read) |
| ret = viommu->vreg.gsts; |
| break; |
| case DMAR_RTADDR_REG: |
| if (is_read) |
| ret = viommu->vreg.rta; |
| else |
| viommu->vreg.rta = val; |
| break; |
| case DMAR_IQA_REG: |
| if (is_read) |
| ret = viommu->vreg.iqa; |
| else |
| viommu->vreg.iqa = val; |
| break; |
| case DMAR_IQH_REG: |
| if (is_read) |
| ret = viommu->vreg.iq_head; |
| break; |
| case DMAR_IQT_REG: |
| if (is_read) |
| ret = viommu->vreg.iq_tail; |
| else { |
| if (viommu->vreg.gsts & DMA_GSTS_QIES) |
| ret = handle_qi_invalidation(iommu, val); |
| else |
| viommu->vreg.iq_tail = val; |
| } |
| break; |
| default: |
| /* Non-emulated MMIO can go directly to hardware */ |
| ret = direct_access_iommu_mmio(iommu, is_read, len, phys, val); |
| break; |
| } |
| |
| return ret; |
| } |
| |
| unsigned long pkvm_access_iommu(bool is_read, int len, unsigned long phys, unsigned long val) |
| { |
| struct pkvm_iommu *pkvm_iommu = find_iommu_by_reg_phys(phys); |
| unsigned long ret; |
| |
| if (!pkvm_iommu) { |
| pkvm_err("%s: cannot find pkvm iommu for reg 0x%lx\n", |
| __func__, phys); |
| return 0; |
| } |
| |
| pkvm_spin_lock(&pkvm_iommu->lock); |
| ret = access_iommu_mmio(pkvm_iommu, is_read, len, phys, val); |
| pkvm_spin_unlock(&pkvm_iommu->lock); |
| |
| return ret; |
| } |
| |
| int pkvm_activate_iommu(void) |
| { |
| struct pkvm_iommu *iommu; |
| int ret = 0; |
| |
| for_each_valid_iommu(iommu) { |
| ret = activate_iommu(iommu); |
| if (ret) |
| return ret; |
| } |
| |
| return 0; |
| } |
| |
| bool is_mem_range_overlap_iommu(unsigned long start, unsigned long end) |
| { |
| struct pkvm_iommu *iommu; |
| |
| for_each_valid_iommu(iommu) { |
| if (end < iommu->iommu.reg_phys || |
| start > (iommu->iommu.reg_phys + iommu->iommu.reg_size - 1)) |
| continue; |
| |
| return true; |
| } |
| |
| return false; |
| } |
| |
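| /* Find the IOMMU that a ptdev matching bdf/pasid is currently linked to */ |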
| static struct pkvm_iommu *bdf_pasid_to_iommu(u16 bdf, u32 pasid) |
| { |
| struct pkvm_iommu *iommu, *find = NULL; |
| struct pkvm_ptdev *p; |
| |
| for_each_valid_iommu(iommu) { |
| pkvm_spin_lock(&iommu->lock); |
| list_for_each_entry(p, &iommu->ptdev_head, iommu_node) { |
| if (match_ptdev(p, bdf, pasid)) { |
| find = iommu; |
| break; |
| } |
| } |
| pkvm_spin_unlock(&iommu->lock); |
| if (find) |
| break; |
| } |
| |
| return find; |
| } |
| |
| /* |
| * pkvm_iommu_sync() - Sync IOMMU context/pasid entry according to a ptdev |
| * |
| * @bdf: PCI bus/devfn whose IOMMU page table entry needs to be synced. |
| * @pasid: PASID whose IOMMU page table entry needs to be synced. |
| */ |
| int pkvm_iommu_sync(u16 bdf, u32 pasid) |
| { |
| struct pkvm_iommu *iommu = bdf_pasid_to_iommu(bdf, pasid); |
| unsigned long id_addr, id_addr_end; |
| struct pkvm_ptdev *ptdev; |
| u16 old_did; |
| int ret; |
| |
| /* |
| * TODO: |
| * Currently assume that the bdf/pasid has been synced before so |
| * that the IOMMU can be found. If it has not, the iommu pointer |
| * will be NULL. To handle this case, the pKVM IOMMU driver needs |
| * to check the DMAR tables to know which IOMMU should be used for |
| * this bdf/pasid. |
| */ |
| if (!iommu) |
| return -ENODEV; |
| |
| ptdev = pkvm_get_ptdev(bdf, pasid); |
| if (!ptdev) |
| return -ENODEV; |
| |
| old_did = ptdev->did; |
| |
| if (ecap_smts(iommu->iommu.ecap)) { |
| id_addr = ((unsigned long)bdf << DEVFN_SHIFT) | |
| ((unsigned long)pasid & ((1UL << MAX_NR_PASID_BITS) - 1)); |
| id_addr_end = id_addr + 1; |
| } else { |
| pkvm_err("%s: No support for legacy IOMMU.\n", __func__); |
| ret = -EOPNOTSUPP; |
| goto out; |
| } |
| |
| pkvm_spin_lock(&iommu->lock); |
| ret = sync_shadow_id(iommu, id_addr, id_addr_end, 0); |
| if (!ret) { |
| if (old_did != ptdev->did) { |
| /* Flush pasid cache and IOTLB for the valid old_did */ |
| if (ecap_smts(iommu->iommu.ecap)) |
| flush_pasid_cache(iommu, old_did, QI_PC_PASID_SEL, pasid); |
| flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH); |
| } |
| |
| /* Flush pasid cache and IOTLB to make sure no stale TLB for the new did */ |
| if (ecap_smts(iommu->iommu.ecap)) |
| flush_pasid_cache(iommu, ptdev->did, QI_PC_PASID_SEL, pasid); |
| flush_iotlb(iommu, ptdev->did, 0, 0, DMA_TLB_DSI_FLUSH); |
| } |
| pkvm_spin_unlock(&iommu->lock); |
| out: |
| pkvm_put_ptdev(ptdev); |
| return ret; |
| } |