Diffstat (limited to 'drivers/iommu/amd')
-rw-r--r--  drivers/iommu/amd/Kconfig            |   1
-rw-r--r--  drivers/iommu/amd/Makefile           |   2
-rw-r--r--  drivers/iommu/amd/amd_iommu.h        |  29
-rw-r--r--  drivers/iommu/amd/amd_iommu_types.h  |  47
-rw-r--r--  drivers/iommu/amd/init.c             | 110
-rw-r--r--  drivers/iommu/amd/io_pgtable.c       | 558
-rw-r--r--  drivers/iommu/amd/iommu.c            | 672
-rw-r--r--  drivers/iommu/amd/iommu_v2.c         |   4
8 files changed, 794 insertions, 629 deletions
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 626b97d0dd21..a3cbafb603f5 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -10,6 +10,7 @@ config AMD_IOMMU select IOMMU_API select IOMMU_IOVA select IOMMU_DMA + select IOMMU_IO_PGTABLE depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/Makefile b/drivers/iommu/amd/Makefile index dc5a2fa4fd37..a935f8f4b974 100644 --- a/drivers/iommu/amd/Makefile +++ b/drivers/iommu/amd/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o +obj-$(CONFIG_AMD_IOMMU) += iommu.o init.o quirks.o io_pgtable.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_AMD_IOMMU_V2) += iommu_v2.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 6b8cbdf71714..026ce7f8d993 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -36,6 +36,7 @@ extern void amd_iommu_disable(void); extern int amd_iommu_reenable(int); extern int amd_iommu_enable_faulting(void); extern int amd_iommu_guest_ir; +extern enum io_pgtable_fmt amd_iommu_pgtable; /* IOMMUv2 specific functions */ struct iommu_domain; @@ -56,6 +57,10 @@ extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); extern int amd_iommu_flush_page(struct iommu_domain *dom, u32 pasid, u64 address); +extern void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); +extern void amd_iommu_domain_update(struct protection_domain *domain); +extern void amd_iommu_domain_flush_complete(struct protection_domain *domain); +extern void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain); extern int amd_iommu_flush_tlb(struct iommu_domain *dom, u32 pasid); extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, u32 pasid, unsigned long cr3); @@ -84,12 +89,9 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } -static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) +static inline bool iommu_feature(struct amd_iommu *iommu, u64 mask) { - if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) - return false; - - return !!(iommu->features & f); + return !!(iommu->features & mask); } static inline u64 iommu_virt_to_phys(void *vaddr) @@ -102,6 +104,21 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) return phys_to_virt(__sme_clr(paddr)); } +static inline +void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) +{ + atomic64_set(&domain->iop.pt_root, root); + domain->iop.root = (u64 *)(root & PAGE_MASK); + domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ +} + +static inline +void amd_iommu_domain_clr_pt_root(struct protection_domain *domain) +{ + amd_iommu_domain_set_pt_root(domain, 0); +} + + extern bool translation_pre_enabled(struct amd_iommu *iommu); extern bool amd_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev); @@ -114,4 +131,6 @@ void amd_iommu_apply_ivrs_quirks(void); static inline void amd_iommu_apply_ivrs_quirks(void) { } #endif +extern void amd_iommu_domain_set_pgtable(struct protection_domain *domain, + u64 *root, int mode); #endif diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 553587827771..6937e3674a16 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ 
b/drivers/iommu/amd/amd_iommu_types.h @@ -15,6 +15,7 @@ #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/irqreturn.h> +#include <linux/io-pgtable.h> /* * Maximum number of IOMMUs supported @@ -252,6 +253,19 @@ #define GA_GUEST_NR 0x1 +#define IOMMU_IN_ADDR_BIT_SIZE 52 +#define IOMMU_OUT_ADDR_BIT_SIZE 52 + +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * 512GB Pages are not supported due to a hardware bug + */ +#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) + /* Bit value definition for dte irq remapping fields*/ #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) #define DTE_IRQ_REMAP_INTCTL_MASK (0x3ULL << 60) @@ -387,6 +401,10 @@ #define IOMMU_CAP_NPCACHE 26 #define IOMMU_CAP_EFR 27 +/* IOMMU IVINFO */ +#define IOMMU_IVINFO_OFFSET 36 +#define IOMMU_IVINFO_EFRSUP BIT(0) + /* IOMMU Feature Reporting Field (for IVHD type 10h */ #define IOMMU_FEAT_GASUP_SHIFT 6 @@ -466,6 +484,27 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) +#define io_pgtable_to_data(x) \ + container_of((x), struct amd_io_pgtable, iop) + +#define io_pgtable_ops_to_data(x) \ + io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) + +#define io_pgtable_ops_to_domain(x) \ + container_of(io_pgtable_ops_to_data(x), \ + struct protection_domain, iop) + +#define io_pgtable_cfg_to_data(x) \ + container_of((x), struct amd_io_pgtable, pgtbl_cfg) + +struct amd_io_pgtable { + struct io_pgtable_cfg pgtbl_cfg; + struct io_pgtable iop; + int mode; + u64 *root; + atomic64_t pt_root; /* pgtable root and pgtable mode */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -474,9 +513,9 @@ struct protection_domain { struct list_head dev_list; /* List of all devices in this domain */ struct iommu_domain domain; /* generic domain handle used by iommu core code */ + struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ - atomic64_t pt_root; /* pgtable root and pgtable mode */ int glx; /* Number of levels for GCR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ @@ -484,12 +523,6 @@ struct protection_domain { unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; -/* For decocded pt_root */ -struct domain_pgtable { - int mode; - u64 *root; -}; - /* * Structure where we save information about one hardware AMD IOMMU in the * system. 
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 6a1f7048dacc..9126efcbaf2c 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -12,6 +12,7 @@ #include <linux/acpi.h> #include <linux/list.h> #include <linux/bitmap.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/syscore_ops.h> #include <linux/interrupt.h> @@ -147,6 +148,8 @@ struct ivmd_header { bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; +enum io_pgtable_fmt amd_iommu_pgtable = AMD_IOMMU_V1; + int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; static int amd_iommu_xt_mode = IRQ_REMAP_XAPIC_MODE; @@ -254,9 +257,13 @@ static enum iommu_init_state init_state = IOMMU_START_STATE; static int amd_iommu_enable_interrupts(void); static int __init iommu_go_to_state(enum iommu_init_state state); static void init_device_table_dma(void); +static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, + u8 fxn, u64 *value, bool is_write); static bool amd_iommu_pre_enabled = true; +static u32 amd_iommu_ivinfo __initdata; + bool translation_pre_enabled(struct amd_iommu *iommu) { return (iommu->flags & AMD_IOMMU_FLAG_TRANS_PRE_ENABLED); @@ -296,6 +303,18 @@ int amd_iommu_get_num_iommus(void) return amd_iommus_present; } +/* + * For IVHD type 0x11/0x40, EFR is also available via IVHD. + * Default to IVHD EFR since it is available sooner + * (i.e. before PCI init). + */ +static void __init early_iommu_features_init(struct amd_iommu *iommu, + struct ivhd_header *h) +{ + if (amd_iommu_ivinfo & IOMMU_IVINFO_EFRSUP) + iommu->features = h->efr_reg; +} + /* Access to l1 and l2 indexed register spaces */ static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) @@ -1577,6 +1596,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE; + + early_iommu_features_init(iommu, h); + break; default: return -EINVAL; @@ -1695,13 +1717,11 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } -static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); - -static void init_iommu_perf_ctr(struct amd_iommu *iommu) +static void __init init_iommu_perf_ctr(struct amd_iommu *iommu) { + int retry; struct pci_dev *pdev = iommu->dev; - u64 val = 0xabcd, val2 = 0, save_reg = 0; + u64 val = 0xabcd, val2 = 0, save_reg, save_src; if (!iommu_feature(iommu, FEATURE_PC)) return; @@ -1709,17 +1729,39 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu) amd_iommu_pc_present = true; /* save the value to restore, if writable */ - if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, false)) + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, false) || + iommu_pc_get_set_reg(iommu, 0, 0, 8, &save_src, false)) goto pc_false; - /* Check if the performance counters can be written to */ - if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) || - (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) || - (val != val2)) + /* + * Disable power gating by programing the performance counter + * source to 20 (i.e. counts the reads and writes from/to IOMMU + * Reserved Register [MMIO Offset 1FF8h] that are ignored.), + * which never get incremented during this init phase. + * (Note: The event is also deprecated.) 
+ */ + val = 20; + if (iommu_pc_get_set_reg(iommu, 0, 0, 8, &val, true)) goto pc_false; + /* Check if the performance counters can be written to */ + val = 0xabcd; + for (retry = 5; retry; retry--) { + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true) || + iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false) || + val2) + break; + + /* Wait about 20 msec for power gating to disable and retry. */ + msleep(20); + } + /* restore */ - if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, true)) + if (iommu_pc_get_set_reg(iommu, 0, 0, 0, &save_reg, true) || + iommu_pc_get_set_reg(iommu, 0, 0, 8, &save_src, true)) + goto pc_false; + + if (val != val2) goto pc_false; pci_info(pdev, "IOMMU performance counters supported\n"); @@ -1770,6 +1812,35 @@ static const struct attribute_group *amd_iommu_groups[] = { NULL, }; +/* + * Note: IVHD 0x11 and 0x40 also contains exact copy + * of the IOMMU Extended Feature Register [MMIO Offset 0030h]. + * Default to EFR in IVHD since it is available sooner (i.e. before PCI init). + */ +static void __init late_iommu_features_init(struct amd_iommu *iommu) +{ + u64 features; + + if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) + return; + + /* read extended feature bits */ + features = readq(iommu->mmio_base + MMIO_EXT_FEATURES); + + if (!iommu->features) { + iommu->features = features; + return; + } + + /* + * Sanity check and warn if EFR values from + * IVHD and MMIO conflict. + */ + if (features != iommu->features) + pr_warn(FW_WARN "EFR mismatch. Use IVHD EFR (%#llx : %#llx\n).", + features, iommu->features); +} + static int __init iommu_init_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; @@ -1789,8 +1860,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) amd_iommu_iotlb_sup = false; - /* read extended feature bits */ - iommu->features = readq(iommu->mmio_base + MMIO_EXT_FEATURES); + late_iommu_features_init(iommu); if (iommu_feature(iommu, FEATURE_GT)) { int glxval; @@ -1883,7 +1953,7 @@ static void print_iommu_info(void) struct pci_dev *pdev = iommu->dev; int i; - pci_info(pdev, "Found IOMMU cap 0x%hx\n", iommu->cap_ptr); + pci_info(pdev, "Found IOMMU cap 0x%x\n", iommu->cap_ptr); if (iommu->cap & (1 << IOMMU_CAP_EFR)) { pci_info(pdev, "Extended features (%#llx):", @@ -1911,7 +1981,7 @@ static void print_iommu_info(void) static int __init amd_iommu_init_pci(void) { struct amd_iommu *iommu; - int ret = 0; + int ret; for_each_iommu(iommu) { ret = iommu_init_pci(iommu); @@ -2607,6 +2677,11 @@ static void __init free_dma_resources(void) free_unity_maps(); } +static void __init ivinfo_init(void *ivrs) +{ + amd_iommu_ivinfo = *((u32 *)(ivrs + IOMMU_IVINFO_OFFSET)); +} + /* * This is the hardware init function for AMD IOMMU in the system. 
* This function is called either from amd_iommu_init or from the interrupt @@ -2637,8 +2712,8 @@ static void __init free_dma_resources(void) static int __init early_amd_iommu_init(void) { struct acpi_table_header *ivrs_base; + int i, remap_cache_sz, ret; acpi_status status; - int i, remap_cache_sz, ret = 0; u32 pci_id; if (!amd_iommu_detected) @@ -2661,6 +2736,8 @@ static int __init early_amd_iommu_init(void) if (ret) goto out; + ivinfo_init(ivrs_base); + amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base); DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type); @@ -2780,7 +2857,6 @@ static int __init early_amd_iommu_init(void) out: /* Don't leak any ACPI memory */ acpi_put_table(ivrs_base); - ivrs_base = NULL; return ret; } diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c new file mode 100644 index 000000000000..1c4961e05c12 --- /dev/null +++ b/drivers/iommu/amd/io_pgtable.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CPU-agnostic AMD IO page table allocator. + * + * Copyright (C) 2020 Advanced Micro Devices, Inc. + * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> + */ + +#define pr_fmt(fmt) "AMD-Vi: " fmt +#define dev_fmt(fmt) pr_fmt(fmt) + +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/io-pgtable.h> +#include <linux/kernel.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/dma-mapping.h> + +#include <asm/barrier.h> + +#include "amd_iommu_types.h" +#include "amd_iommu.h" + +static void v1_tlb_flush_all(void *cookie) +{ +} + +static void v1_tlb_flush_walk(unsigned long iova, size_t size, + size_t granule, void *cookie) +{ +} + +static void v1_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, size_t granule, + void *cookie) +{ +} + +static const struct iommu_flush_ops v1_flush_ops = { + .tlb_flush_all = v1_tlb_flush_all, + .tlb_flush_walk = v1_tlb_flush_walk, + .tlb_add_page = v1_tlb_add_page, +}; + +/* + * Helper function to get the first pte of a large mapping + */ +static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, + unsigned long *count) +{ + unsigned long pte_mask, pg_size, cnt; + u64 *fpte; + + pg_size = PTE_PAGE_SIZE(*pte); + cnt = PAGE_SIZE_PTE_COUNT(pg_size); + pte_mask = ~((cnt << 3) - 1); + fpte = (u64 *)(((unsigned long)pte) & pte_mask); + + if (page_size) + *page_size = pg_size; + + if (count) + *count = cnt; + + return fpte; +} + +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +static void free_page_list(struct page *freelist) +{ + while (freelist != NULL) { + unsigned long p = (unsigned long)page_address(freelist); + + freelist = freelist->freelist; + free_page(p); + } +} + +static struct page *free_pt_page(unsigned long pt, struct page *freelist) +{ + struct page *p = virt_to_page((void *)pt); + + p->freelist = freelist; + + return p; +} + +#define DEFINE_FREE_PT_FN(LVL, FN) \ +static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist) \ +{ \ + unsigned long p; \ + u64 *pt; \ + int i; \ + \ + pt = (u64 *)__pt; \ + \ + for (i = 0; i < 512; ++i) { \ + /* PTE present? */ \ + if (!IOMMU_PTE_PRESENT(pt[i])) \ + continue; \ + \ + /* Large PTE? 
*/ \ + if (PM_PTE_LEVEL(pt[i]) == 0 || \ + PM_PTE_LEVEL(pt[i]) == 7) \ + continue; \ + \ + p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \ + freelist = FN(p, freelist); \ + } \ + \ + return free_pt_page((unsigned long)pt, freelist); \ +} + +DEFINE_FREE_PT_FN(l2, free_pt_page) +DEFINE_FREE_PT_FN(l3, free_pt_l2) +DEFINE_FREE_PT_FN(l4, free_pt_l3) +DEFINE_FREE_PT_FN(l5, free_pt_l4) +DEFINE_FREE_PT_FN(l6, free_pt_l5) + +static struct page *free_sub_pt(unsigned long root, int mode, + struct page *freelist) +{ + switch (mode) { + case PAGE_MODE_NONE: + case PAGE_MODE_7_LEVEL: + break; + case PAGE_MODE_1_LEVEL: + freelist = free_pt_page(root, freelist); + break; + case PAGE_MODE_2_LEVEL: + freelist = free_pt_l2(root, freelist); + break; + case PAGE_MODE_3_LEVEL: + freelist = free_pt_l3(root, freelist); + break; + case PAGE_MODE_4_LEVEL: + freelist = free_pt_l4(root, freelist); + break; + case PAGE_MODE_5_LEVEL: + freelist = free_pt_l5(root, freelist); + break; + case PAGE_MODE_6_LEVEL: + freelist = free_pt_l6(root, freelist); + break; + default: + BUG(); + } + + return freelist; +} + +void amd_iommu_domain_set_pgtable(struct protection_domain *domain, + u64 *root, int mode) +{ + u64 pt_root; + + /* lowest 3 bits encode pgtable mode */ + pt_root = mode & 7; + pt_root |= (u64)root; + + amd_iommu_domain_set_pt_root(domain, pt_root); +} + +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + unsigned long address, + gfp_t gfp) +{ + unsigned long flags; + bool ret = true; + u64 *pte; + + spin_lock_irqsave(&domain->lock, flags); + + if (address <= PM_LEVEL_SIZE(domain->iop.mode)) + goto out; + + ret = false; + if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL)) + goto out; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + goto out; + + *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root)); + + domain->iop.root = pte; + domain->iop.mode += 1; + amd_iommu_update_and_flush_device_table(domain); + amd_iommu_domain_flush_complete(domain); + + /* + * Device Table needs to be updated and flushed before the new root can + * be published. + */ + amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode); + + ret = true; + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + unsigned long page_size, + u64 **pte_page, + gfp_t gfp, + bool *updated) +{ + int level, end_lvl; + u64 *pte, *page; + + BUG_ON(!is_power_of_2(page_size)); + + while (address > PM_LEVEL_SIZE(domain->iop.mode)) { + /* + * Return an error if there is no memory to update the + * page-table. + */ + if (!increase_address_space(domain, address, gfp)) + return NULL; + } + + + level = domain->iop.mode - 1; + pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)]; + address = PAGE_SIZE_ALIGN(address, page_size); + end_lvl = PAGE_SIZE_LEVEL(page_size); + + while (level > end_lvl) { + u64 __pte, __npte; + int pte_level; + + __pte = *pte; + pte_level = PM_PTE_LEVEL(__pte); + + /* + * If we replace a series of large PTEs, we need + * to tear down all of them. 
+ */ + if (IOMMU_PTE_PRESENT(__pte) && + pte_level == PAGE_MODE_7_LEVEL) { + unsigned long count, i; + u64 *lpte; + + lpte = first_pte_l7(pte, NULL, &count); + + /* + * Unmap the replicated PTEs that still match the + * original large mapping + */ + for (i = 0; i < count; ++i) + cmpxchg64(&lpte[i], __pte, 0ULL); + + *updated = true; + continue; + } + + if (!IOMMU_PTE_PRESENT(__pte) || + pte_level == PAGE_MODE_NONE) { + page = (u64 *)get_zeroed_page(gfp); + + if (!page) + return NULL; + + __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); + + /* pte could have been changed somewhere. */ + if (cmpxchg64(pte, __pte, __npte) != __pte) + free_page((unsigned long)page); + else if (IOMMU_PTE_PRESENT(__pte)) + *updated = true; + + continue; + } + + /* No level skipping support yet */ + if (pte_level != level) + return NULL; + + level -= 1; + + pte = IOMMU_PTE_PAGE(__pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } + + return pte; +} + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct amd_io_pgtable *pgtable, + unsigned long address, + unsigned long *page_size) +{ + int level; + u64 *pte; + + *page_size = 0; + + if (address > PM_LEVEL_SIZE(pgtable->mode)) + return NULL; + + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + *page_size = PTE_LEVEL_PAGE_SIZE(level); + + while (level > 0) { + + /* Not Present */ + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + /* Large PTE */ + if (PM_PTE_LEVEL(*pte) == 7 || + PM_PTE_LEVEL(*pte) == 0) + break; + + /* No level skipping support yet */ + if (PM_PTE_LEVEL(*pte) != level) + return NULL; + + level -= 1; + + /* Walk to the next level */ + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; + *page_size = PTE_LEVEL_PAGE_SIZE(level); + } + + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) + pte = first_pte_l7(pte, page_size, NULL); + + return pte; +} + +static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist) +{ + unsigned long pt; + int mode; + + while (cmpxchg64(pte, pteval, 0) != pteval) { + pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); + pteval = *pte; + } + + if (!IOMMU_PTE_PRESENT(pteval)) + return freelist; + + pt = (unsigned long)IOMMU_PTE_PAGE(pteval); + mode = IOMMU_PTE_MODE(pteval); + + return free_sub_pt(pt, mode, freelist); +} + +/* + * Generic mapping functions. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. 
+ */ +static int iommu_v1_map_page(struct io_pgtable_ops *ops, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + struct protection_domain *dom = io_pgtable_ops_to_domain(ops); + struct page *freelist = NULL; + bool updated = false; + u64 __pte, *pte; + int ret, i, count; + + BUG_ON(!IS_ALIGNED(iova, size)); + BUG_ON(!IS_ALIGNED(paddr, size)); + + ret = -EINVAL; + if (!(prot & IOMMU_PROT_MASK)) + goto out; + + count = PAGE_SIZE_PTE_COUNT(size); + pte = alloc_pte(dom, iova, size, NULL, gfp, &updated); + + ret = -ENOMEM; + if (!pte) + goto out; + + for (i = 0; i < count; ++i) + freelist = free_clear_pte(&pte[i], pte[i], freelist); + + if (freelist != NULL) + updated = true; + + if (count > 1) { + __pte = PAGE_SIZE_PTE(__sme_set(paddr), size); + __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; + } else + __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC; + + if (prot & IOMMU_PROT_IR) + __pte |= IOMMU_PTE_IR; + if (prot & IOMMU_PROT_IW) + __pte |= IOMMU_PTE_IW; + + for (i = 0; i < count; ++i) + pte[i] = __pte; + + ret = 0; + +out: + if (updated) { + unsigned long flags; + + spin_lock_irqsave(&dom->lock, flags); + /* + * Flush domain TLB(s) and wait for completion. Any Device-Table + * Updates and flushing already happened in + * increase_address_space(). + */ + amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_complete(dom); + spin_unlock_irqrestore(&dom->lock, flags); + } + + /* Everything flushed out, free pages now */ + free_page_list(freelist); + + return ret; +} + +static unsigned long iommu_v1_unmap_page(struct io_pgtable_ops *ops, + unsigned long iova, + size_t size, + struct iommu_iotlb_gather *gather) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long long unmapped; + unsigned long unmap_size; + u64 *pte; + + BUG_ON(!is_power_of_2(size)); + + unmapped = 0; + + while (unmapped < size) { + pte = fetch_pte(pgtable, iova, &unmap_size); + if (pte) { + int i, count; + + count = PAGE_SIZE_PTE_COUNT(unmap_size); + for (i = 0; i < count; i++) + pte[i] = 0ULL; + } + + iova = (iova & ~(unmap_size - 1)) + unmap_size; + unmapped += unmap_size; + } + + BUG_ON(unmapped && !is_power_of_2(unmapped)); + + return unmapped; +} + +static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long offset_mask, pte_pgsize; + u64 *pte, __pte; + + if (pgtable->mode == PAGE_MODE_NONE) + return iova; + + pte = fetch_pte(pgtable, iova, &pte_pgsize); + + if (!pte || !IOMMU_PTE_PRESENT(*pte)) + return 0; + + offset_mask = pte_pgsize - 1; + __pte = __sme_clr(*pte & PM_ADDR_MASK); + + return (__pte & ~offset_mask) | (iova & offset_mask); +} + +/* + * ---------------------------------------------------- + */ +static void v1_free_pgtable(struct io_pgtable *iop) +{ + struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); + struct protection_domain *dom; + struct page *freelist = NULL; + unsigned long root; + + if (pgtable->mode == PAGE_MODE_NONE) + return; + + dom = container_of(pgtable, struct protection_domain, iop); + + /* Update data structure */ + amd_iommu_domain_clr_pt_root(dom); + + /* Make changes visible to IOMMUs */ + amd_iommu_domain_update(dom); + + /* Page-table is not visible to IOMMU anymore, so free it */ + BUG_ON(pgtable->mode < PAGE_MODE_NONE || + pgtable->mode > PAGE_MODE_6_LEVEL); + + root = (unsigned long)pgtable->root; + freelist = free_sub_pt(root, pgtable->mode, freelist); + + 
free_page_list(freelist); +} + +static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) +{ + struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); + + cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES, + cfg->ias = IOMMU_IN_ADDR_BIT_SIZE, + cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, + cfg->tlb = &v1_flush_ops; + + pgtable->iop.ops.map = iommu_v1_map_page; + pgtable->iop.ops.unmap = iommu_v1_unmap_page; + pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; + + return &pgtable->iop; +} + +struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { + .alloc = v1_alloc_pgtable, + .free = v1_free_pgtable, +}; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f0adbc48fd17..a69a8b573e40 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -31,6 +31,7 @@ #include <linux/irqdomain.h> #include <linux/percpu.h> #include <linux/iova.h> +#include <linux/io-pgtable.h> #include <asm/irq_remapping.h> #include <asm/io_apic.h> #include <asm/apic.h> @@ -57,16 +58,6 @@ #define HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -/* - * This bitmap is used to advertise the page sizes our hardware support - * to the IOMMU core, which will then use this information to split - * physically contiguous memory regions it is mapping into page sizes - * that we support. - * - * 512GB Pages are not supported due to a hardware bug - */ -#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) - #define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL static DEFINE_SPINLOCK(pd_bitmap_lock); @@ -96,10 +87,7 @@ struct iommu_cmd { struct kmem_cache *amd_iommu_irq_cache; -static void update_domain(struct protection_domain *domain); static void detach_device(struct device *dev); -static void update_and_flush_device_table(struct protection_domain *domain, - struct domain_pgtable *pgtable); /**************************************************************************** * @@ -151,37 +139,6 @@ static struct protection_domain *to_pdomain(struct iommu_domain *dom) return container_of(dom, struct protection_domain, domain); } -static void amd_iommu_domain_get_pgtable(struct protection_domain *domain, - struct domain_pgtable *pgtable) -{ - u64 pt_root = atomic64_read(&domain->pt_root); - - pgtable->root = (u64 *)(pt_root & PAGE_MASK); - pgtable->mode = pt_root & 7; /* lowest 3 bits encode pgtable mode */ -} - -static void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) -{ - atomic64_set(&domain->pt_root, root); -} - -static void amd_iommu_domain_clr_pt_root(struct protection_domain *domain) -{ - amd_iommu_domain_set_pt_root(domain, 0); -} - -static void amd_iommu_domain_set_pgtable(struct protection_domain *domain, - u64 *root, int mode) -{ - u64 pt_root; - - /* lowest 3 bits encode pgtable mode */ - pt_root = mode & 7; - pt_root |= (u64)root; - - amd_iommu_domain_set_pt_root(domain, pt_root); -} - static struct iommu_dev_data *alloc_dev_data(u16 devid) { struct iommu_dev_data *dev_data; @@ -437,29 +394,6 @@ static void amd_iommu_uninit_device(struct device *dev) */ } -/* - * Helper function to get the first pte of a large mapping - */ -static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, - unsigned long *count) -{ - unsigned long pte_mask, pg_size, cnt; - u64 *fpte; - - pg_size = PTE_PAGE_SIZE(*pte); - cnt = PAGE_SIZE_PTE_COUNT(pg_size); - pte_mask = ~((cnt << 3) - 1); - fpte = (u64 *)(((unsigned long)pte) & pte_mask); - - if (page_size) - *page_size = pg_size; - - if (count) - *count = cnt; - - return fpte; -} - 
/**************************************************************************** * * Interrupt handling functions @@ -1335,12 +1269,12 @@ static void domain_flush_pages(struct protection_domain *domain, } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void domain_flush_tlb_pde(struct protection_domain *domain) +void amd_iommu_domain_flush_tlb_pde(struct protection_domain *domain) { __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } -static void domain_flush_complete(struct protection_domain *domain) +void amd_iommu_domain_flush_complete(struct protection_domain *domain) { int i; @@ -1365,7 +1299,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, spin_lock_irqsave(&domain->lock, flags); domain_flush_pages(domain, iova, size); - domain_flush_complete(domain); + amd_iommu_domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } } @@ -1384,443 +1318,6 @@ static void domain_flush_devices(struct protection_domain *domain) /**************************************************************************** * - * The functions below are used the create the page table mappings for - * unity mapped regions. - * - ****************************************************************************/ - -static void free_page_list(struct page *freelist) -{ - while (freelist != NULL) { - unsigned long p = (unsigned long)page_address(freelist); - freelist = freelist->freelist; - free_page(p); - } -} - -static struct page *free_pt_page(unsigned long pt, struct page *freelist) -{ - struct page *p = virt_to_page((void *)pt); - - p->freelist = freelist; - - return p; -} - -#define DEFINE_FREE_PT_FN(LVL, FN) \ -static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist) \ -{ \ - unsigned long p; \ - u64 *pt; \ - int i; \ - \ - pt = (u64 *)__pt; \ - \ - for (i = 0; i < 512; ++i) { \ - /* PTE present? */ \ - if (!IOMMU_PTE_PRESENT(pt[i])) \ - continue; \ - \ - /* Large PTE? */ \ - if (PM_PTE_LEVEL(pt[i]) == 0 || \ - PM_PTE_LEVEL(pt[i]) == 7) \ - continue; \ - \ - p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \ - freelist = FN(p, freelist); \ - } \ - \ - return free_pt_page((unsigned long)pt, freelist); \ -} - -DEFINE_FREE_PT_FN(l2, free_pt_page) -DEFINE_FREE_PT_FN(l3, free_pt_l2) -DEFINE_FREE_PT_FN(l4, free_pt_l3) -DEFINE_FREE_PT_FN(l5, free_pt_l4) -DEFINE_FREE_PT_FN(l6, free_pt_l5) - -static struct page *free_sub_pt(unsigned long root, int mode, - struct page *freelist) -{ - switch (mode) { - case PAGE_MODE_NONE: - case PAGE_MODE_7_LEVEL: - break; - case PAGE_MODE_1_LEVEL: - freelist = free_pt_page(root, freelist); - break; - case PAGE_MODE_2_LEVEL: - freelist = free_pt_l2(root, freelist); - break; - case PAGE_MODE_3_LEVEL: - freelist = free_pt_l3(root, freelist); - break; - case PAGE_MODE_4_LEVEL: - freelist = free_pt_l4(root, freelist); - break; - case PAGE_MODE_5_LEVEL: - freelist = free_pt_l5(root, freelist); - break; - case PAGE_MODE_6_LEVEL: - freelist = free_pt_l6(root, freelist); - break; - default: - BUG(); - } - - return freelist; -} - -static void free_pagetable(struct domain_pgtable *pgtable) -{ - struct page *freelist = NULL; - unsigned long root; - - if (pgtable->mode == PAGE_MODE_NONE) - return; - - BUG_ON(pgtable->mode < PAGE_MODE_NONE || - pgtable->mode > PAGE_MODE_6_LEVEL); - - root = (unsigned long)pgtable->root; - freelist = free_sub_pt(root, pgtable->mode, freelist); - - free_page_list(freelist); -} - -/* - * This function is used to add another level to an IO page table. 
Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - unsigned long address, - gfp_t gfp) -{ - struct domain_pgtable pgtable; - unsigned long flags; - bool ret = true; - u64 *pte; - - spin_lock_irqsave(&domain->lock, flags); - - amd_iommu_domain_get_pgtable(domain, &pgtable); - - if (address <= PM_LEVEL_SIZE(pgtable.mode)) - goto out; - - ret = false; - if (WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL)) - goto out; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - goto out; - - *pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root)); - - pgtable.root = pte; - pgtable.mode += 1; - update_and_flush_device_table(domain, &pgtable); - domain_flush_complete(domain); - - /* - * Device Table needs to be updated and flushed before the new root can - * be published. - */ - amd_iommu_domain_set_pgtable(domain, pte, pgtable.mode); - - ret = true; - -out: - spin_unlock_irqrestore(&domain->lock, flags); - - return ret; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - unsigned long page_size, - u64 **pte_page, - gfp_t gfp, - bool *updated) -{ - struct domain_pgtable pgtable; - int level, end_lvl; - u64 *pte, *page; - - BUG_ON(!is_power_of_2(page_size)); - - amd_iommu_domain_get_pgtable(domain, &pgtable); - - while (address > PM_LEVEL_SIZE(pgtable.mode)) { - /* - * Return an error if there is no memory to update the - * page-table. - */ - if (!increase_address_space(domain, address, gfp)) - return NULL; - - /* Read new values to check if update was successful */ - amd_iommu_domain_get_pgtable(domain, &pgtable); - } - - - level = pgtable.mode - 1; - pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; - address = PAGE_SIZE_ALIGN(address, page_size); - end_lvl = PAGE_SIZE_LEVEL(page_size); - - while (level > end_lvl) { - u64 __pte, __npte; - int pte_level; - - __pte = *pte; - pte_level = PM_PTE_LEVEL(__pte); - - /* - * If we replace a series of large PTEs, we need - * to tear down all of them. - */ - if (IOMMU_PTE_PRESENT(__pte) && - pte_level == PAGE_MODE_7_LEVEL) { - unsigned long count, i; - u64 *lpte; - - lpte = first_pte_l7(pte, NULL, &count); - - /* - * Unmap the replicated PTEs that still match the - * original large mapping - */ - for (i = 0; i < count; ++i) - cmpxchg64(&lpte[i], __pte, 0ULL); - - *updated = true; - continue; - } - - if (!IOMMU_PTE_PRESENT(__pte) || - pte_level == PAGE_MODE_NONE) { - page = (u64 *)get_zeroed_page(gfp); - - if (!page) - return NULL; - - __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); - - /* pte could have been changed somewhere. */ - if (cmpxchg64(pte, __pte, __npte) != __pte) - free_page((unsigned long)page); - else if (IOMMU_PTE_PRESENT(__pte)) - *updated = true; - - continue; - } - - /* No level skipping support yet */ - if (pte_level != level) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(__pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. 
- */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, - unsigned long *page_size) -{ - struct domain_pgtable pgtable; - int level; - u64 *pte; - - *page_size = 0; - - amd_iommu_domain_get_pgtable(domain, &pgtable); - - if (address > PM_LEVEL_SIZE(pgtable.mode)) - return NULL; - - level = pgtable.mode - 1; - pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - - while (level > 0) { - - /* Not Present */ - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - /* Large PTE */ - if (PM_PTE_LEVEL(*pte) == 7 || - PM_PTE_LEVEL(*pte) == 0) - break; - - /* No level skipping support yet */ - if (PM_PTE_LEVEL(*pte) != level) - return NULL; - - level -= 1; - - /* Walk to the next level */ - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - *page_size = PTE_LEVEL_PAGE_SIZE(level); - } - - /* - * If we have a series of large PTEs, make - * sure to return a pointer to the first one. - */ - if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) - pte = first_pte_l7(pte, page_size, NULL); - - return pte; -} - -static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist) -{ - unsigned long pt; - int mode; - - while (cmpxchg64(pte, pteval, 0) != pteval) { - pr_warn("AMD-Vi: IOMMU pte changed since we read it\n"); - pteval = *pte; - } - - if (!IOMMU_PTE_PRESENT(pteval)) - return freelist; - - pt = (unsigned long)IOMMU_PTE_PAGE(pteval); - mode = IOMMU_PTE_MODE(pteval); - - return free_sub_pt(pt, mode, freelist); -} - -/* - * Generic mapping functions. It maps a physical address into a DMA - * address space. It allocates the page table pages if necessary. - * In the future it can be extended to a generic mapping function - * supporting all features of AMD IOMMU page tables like level skipping - * and full 64 bit address spaces. - */ -static int iommu_map_page(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - unsigned long page_size, - int prot, - gfp_t gfp) -{ - struct page *freelist = NULL; - bool updated = false; - u64 __pte, *pte; - int ret, i, count; - - BUG_ON(!IS_ALIGNED(bus_addr, page_size)); - BUG_ON(!IS_ALIGNED(phys_addr, page_size)); - - ret = -EINVAL; - if (!(prot & IOMMU_PROT_MASK)) - goto out; - - count = PAGE_SIZE_PTE_COUNT(page_size); - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); - - ret = -ENOMEM; - if (!pte) - goto out; - - for (i = 0; i < count; ++i) - freelist = free_clear_pte(&pte[i], pte[i], freelist); - - if (freelist != NULL) - updated = true; - - if (count > 1) { - __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); - __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC; - } else - __pte = __sme_set(phys_addr) | IOMMU_PTE_PR | IOMMU_PTE_FC; - - if (prot & IOMMU_PROT_IR) - __pte |= IOMMU_PTE_IR; - if (prot & IOMMU_PROT_IW) - __pte |= IOMMU_PTE_IW; - - for (i = 0; i < count; ++i) - pte[i] = __pte; - - ret = 0; - -out: - if (updated) { - unsigned long flags; - - spin_lock_irqsave(&dom->lock, flags); - /* - * Flush domain TLB(s) and wait for completion. Any Device-Table - * Updates and flushing already happened in - * increase_address_space(). 
- */ - domain_flush_tlb_pde(dom); - domain_flush_complete(dom); - spin_unlock_irqrestore(&dom->lock, flags); - } - - /* Everything flushed out, free pages now */ - free_page_list(freelist); - - return ret; -} - -static unsigned long iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long page_size) -{ - unsigned long long unmapped; - unsigned long unmap_size; - u64 *pte; - - BUG_ON(!is_power_of_2(page_size)); - - unmapped = 0; - - while (unmapped < page_size) { - - pte = fetch_pte(dom, bus_addr, &unmap_size); - - if (pte) { - int i, count; - - count = PAGE_SIZE_PTE_COUNT(unmap_size); - for (i = 0; i < count; i++) - pte[i] = 0ULL; - } - - bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size; - unmapped += unmap_size; - } - - BUG_ON(unmapped && !is_power_of_2(unmapped)); - - return unmapped; -} - -/**************************************************************************** - * * The next functions belong to the domain allocation. A domain is * allocated for every IOMMU as the default domain. If device isolation * is enabled, every device get its own domain. The most important thing @@ -1896,17 +1393,16 @@ static void free_gcr3_table(struct protection_domain *domain) } static void set_dte_entry(u16 devid, struct protection_domain *domain, - struct domain_pgtable *pgtable, bool ats, bool ppr) { u64 pte_root = 0; u64 flags = 0; u32 old_domid; - if (pgtable->mode != PAGE_MODE_NONE) - pte_root = iommu_virt_to_phys(pgtable->root); + if (domain->iop.mode != PAGE_MODE_NONE) + pte_root = iommu_virt_to_phys(domain->iop.root); - pte_root |= (pgtable->mode & DEV_ENTRY_MODE_MASK) + pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV; @@ -1979,7 +1475,6 @@ static void clear_dte_entry(u16 devid) static void do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { - struct domain_pgtable pgtable; struct amd_iommu *iommu; bool ats; @@ -1995,8 +1490,7 @@ static void do_attach(struct iommu_dev_data *dev_data, domain->dev_cnt += 1; /* Update device table */ - amd_iommu_domain_get_pgtable(domain, &pgtable); - set_dte_entry(dev_data->devid, domain, &pgtable, + set_dte_entry(dev_data->devid, domain, ats, dev_data->iommu_v2); clone_aliases(dev_data->pdev); @@ -2020,10 +1514,10 @@ static void do_detach(struct iommu_dev_data *dev_data) device_flush_dte(dev_data); /* Flush IOTLB */ - domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_tlb_pde(domain); /* Wait for the flushes to finish */ - domain_flush_complete(domain); + amd_iommu_domain_flush_complete(domain); /* decrease reference counters - needs to happen after the flushes */ domain->dev_iommu[iommu->index] -= 1; @@ -2156,9 +1650,9 @@ skip_ats_check: * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. 
*/ - domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_tlb_pde(domain); - domain_flush_complete(domain); + amd_iommu_domain_flush_complete(domain); out: spin_unlock(&dev_data->lock); @@ -2303,36 +1797,31 @@ static int amd_iommu_domain_get_attr(struct iommu_domain *domain, * *****************************************************************************/ -static void update_device_table(struct protection_domain *domain, - struct domain_pgtable *pgtable) +static void update_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { - set_dte_entry(dev_data->devid, domain, pgtable, + set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled, dev_data->iommu_v2); clone_aliases(dev_data->pdev); } } -static void update_and_flush_device_table(struct protection_domain *domain, - struct domain_pgtable *pgtable) +void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) { - update_device_table(domain, pgtable); + update_device_table(domain); domain_flush_devices(domain); } -static void update_domain(struct protection_domain *domain) +void amd_iommu_domain_update(struct protection_domain *domain) { - struct domain_pgtable pgtable; - /* Update device table */ - amd_iommu_domain_get_pgtable(domain, &pgtable); - update_and_flush_device_table(domain, &pgtable); + amd_iommu_update_and_flush_device_table(domain); /* Flush domain TLB(s) and wait for completion */ - domain_flush_tlb_pde(domain); - domain_flush_complete(domain); + amd_iommu_domain_flush_tlb_pde(domain); + amd_iommu_domain_flush_complete(domain); } int __init amd_iommu_init_api(void) @@ -2400,22 +1889,19 @@ static void cleanup_domain(struct protection_domain *domain) static void protection_domain_free(struct protection_domain *domain) { - struct domain_pgtable pgtable; - if (!domain) return; if (domain->id) domain_id_free(domain->id); - amd_iommu_domain_get_pgtable(domain, &pgtable); - amd_iommu_domain_clr_pt_root(domain); - free_pagetable(&pgtable); + if (domain->iop.pgtbl_cfg.tlb) + free_io_pgtable_ops(&domain->iop.iop.ops); kfree(domain); } -static int protection_domain_init(struct protection_domain *domain, int mode) +static int protection_domain_init_v1(struct protection_domain *domain, int mode) { u64 *pt_root = NULL; @@ -2438,34 +1924,55 @@ static int protection_domain_init(struct protection_domain *domain, int mode) return 0; } -static struct protection_domain *protection_domain_alloc(int mode) +static struct protection_domain *protection_domain_alloc(unsigned int type) { + struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; + int pgtable = amd_iommu_pgtable; + int mode = DEFAULT_PGTABLE_LEVEL; + int ret; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; - if (protection_domain_init(domain, mode)) + /* + * Force IOMMU v1 page table when iommu=pt and + * when allocating domain for pass-through devices. 
+ */ + if (type == IOMMU_DOMAIN_IDENTITY) { + pgtable = AMD_IOMMU_V1; + mode = PAGE_MODE_NONE; + } else if (type == IOMMU_DOMAIN_UNMANAGED) { + pgtable = AMD_IOMMU_V1; + } + + switch (pgtable) { + case AMD_IOMMU_V1: + ret = protection_domain_init_v1(domain, mode); + break; + default: + ret = -EINVAL; + } + + if (ret) goto out_err; - return domain; + pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); + if (!pgtbl_ops) + goto out_err; + return domain; out_err: kfree(domain); - return NULL; } static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) { struct protection_domain *domain; - int mode = DEFAULT_PGTABLE_LEVEL; - - if (type == IOMMU_DOMAIN_IDENTITY) - mode = PAGE_MODE_NONE; - domain = protection_domain_alloc(mode); + domain = protection_domain_alloc(type); if (!domain) return NULL; @@ -2580,12 +2087,12 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, gfp_t gfp) { struct protection_domain *domain = to_pdomain(dom); - struct domain_pgtable pgtable; + struct io_pgtable_ops *ops = &domain->iop.iop.ops; int prot = 0; - int ret; + int ret = -EINVAL; - amd_iommu_domain_get_pgtable(domain, &pgtable); - if (pgtable.mode == PAGE_MODE_NONE) + if ((amd_iommu_pgtable == AMD_IOMMU_V1) && + (domain->iop.mode == PAGE_MODE_NONE)) return -EINVAL; if (iommu_prot & IOMMU_READ) @@ -2593,9 +2100,10 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - ret = iommu_map_page(domain, iova, paddr, page_size, prot, gfp); - - domain_flush_np_cache(domain, iova, page_size); + if (ops->map) { + ret = ops->map(ops, iova, paddr, page_size, prot, gfp); + domain_flush_np_cache(domain, iova, page_size); + } return ret; } @@ -2605,36 +2113,22 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); - struct domain_pgtable pgtable; + struct io_pgtable_ops *ops = &domain->iop.iop.ops; - amd_iommu_domain_get_pgtable(domain, &pgtable); - if (pgtable.mode == PAGE_MODE_NONE) + if ((amd_iommu_pgtable == AMD_IOMMU_V1) && + (domain->iop.mode == PAGE_MODE_NONE)) return 0; - return iommu_unmap_page(domain, iova, page_size); + return (ops->unmap) ? 
ops->unmap(ops, iova, page_size, gather) : 0; } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, dma_addr_t iova) { struct protection_domain *domain = to_pdomain(dom); - unsigned long offset_mask, pte_pgsize; - struct domain_pgtable pgtable; - u64 *pte, __pte; - - amd_iommu_domain_get_pgtable(domain, &pgtable); - if (pgtable.mode == PAGE_MODE_NONE) - return iova; + struct io_pgtable_ops *ops = &domain->iop.iop.ops; - pte = fetch_pte(domain, iova, &pte_pgsize); - - if (!pte || !IOMMU_PTE_PRESENT(*pte)) - return 0; - - offset_mask = pte_pgsize - 1; - __pte = __sme_clr(*pte & PM_ADDR_MASK); - - return (__pte & ~offset_mask) | (iova & offset_mask); + return ops->iova_to_phys(ops, iova); } static bool amd_iommu_capable(enum iommu_cap cap) @@ -2720,8 +2214,8 @@ static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) unsigned long flags; spin_lock_irqsave(&dom->lock, flags); - domain_flush_tlb_pde(dom); - domain_flush_complete(dom); + amd_iommu_domain_flush_tlb_pde(dom); + amd_iommu_domain_flush_complete(dom); spin_unlock_irqrestore(&dom->lock, flags); } @@ -2799,22 +2293,12 @@ EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); void amd_iommu_domain_direct_map(struct iommu_domain *dom) { struct protection_domain *domain = to_pdomain(dom); - struct domain_pgtable pgtable; unsigned long flags; spin_lock_irqsave(&domain->lock, flags); - /* First save pgtable configuration*/ - amd_iommu_domain_get_pgtable(domain, &pgtable); - - /* Remove page-table from domain */ - amd_iommu_domain_clr_pt_root(domain); - - /* Make changes visible to IOMMUs */ - update_domain(domain); - - /* Page-table is not visible to IOMMU anymore, so free it */ - free_pagetable(&pgtable); + if (domain->iop.pgtbl_cfg.tlb) + free_io_pgtable_ops(&domain->iop.iop.ops); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2855,7 +2339,7 @@ int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) domain->glx = levels; domain->flags |= PD_IOMMUV2_MASK; - update_domain(domain); + amd_iommu_domain_update(domain); ret = 0; @@ -2892,7 +2376,7 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, } /* Wait until IOMMU TLB flushes are complete */ - domain_flush_complete(domain); + amd_iommu_domain_flush_complete(domain); /* Now flush device TLBs */ list_for_each_entry(dev_data, &domain->dev_list, list) { @@ -2918,7 +2402,7 @@ static int __flush_pasid(struct protection_domain *domain, u32 pasid, } /* Wait until all device TLBs are flushed */ - domain_flush_complete(domain); + amd_iommu_domain_flush_complete(domain); ret = 0; @@ -3003,11 +2487,9 @@ static u64 *__get_gcr3_pte(u64 *root, int level, u32 pasid, bool alloc) static int __set_gcr3(struct protection_domain *domain, u32 pasid, unsigned long cr3) { - struct domain_pgtable pgtable; u64 *pte; - amd_iommu_domain_get_pgtable(domain, &pgtable); - if (pgtable.mode != PAGE_MODE_NONE) + if (domain->iop.mode != PAGE_MODE_NONE) return -EINVAL; pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); @@ -3021,11 +2503,9 @@ static int __set_gcr3(struct protection_domain *domain, u32 pasid, static int __clear_gcr3(struct protection_domain *domain, u32 pasid) { - struct domain_pgtable pgtable; u64 *pte; - amd_iommu_domain_get_pgtable(domain, &pgtable); - if (pgtable.mode != PAGE_MODE_NONE) + if (domain->iop.mode != PAGE_MODE_NONE) return -EINVAL; pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 5ecc0bc608ec..f8d4ad421e07 100644 --- 
a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -77,7 +77,7 @@ struct fault { }; static LIST_HEAD(state_list); -static spinlock_t state_lock; +static DEFINE_SPINLOCK(state_lock); static struct workqueue_struct *iommu_wq; @@ -938,8 +938,6 @@ static int __init amd_iommu_v2_init(void) return 0; } - spin_lock_init(&state_lock); - ret = -ENOMEM; iommu_wq = alloc_workqueue("amd_iommu_v2", WQ_MEM_RECLAIM, 0); if (iommu_wq == NULL) |
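For reference, a minimal sketch (not part of the commit) of how the new AMD_IOMMU_V1 io-pgtable format introduced above is consumed: protection_domain_alloc() fills domain->iop.pgtbl_cfg through alloc_io_pgtable_ops(), and amd_iommu_map()/amd_iommu_unmap() then go through the generic ops installed by v1_alloc_pgtable(). Structure members, callbacks, and constants are taken from the diff; the example_* helpers themselves are hypothetical.

```c
#include <linux/io-pgtable.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

/* Hypothetical helper mirroring protection_domain_alloc() in the diff. */
static int example_setup_v1_pgtable(struct protection_domain *domain)
{
	struct io_pgtable_ops *ops;

	/*
	 * v1_alloc_pgtable() fills cfg->pgsize_bitmap, ias, oas and tlb,
	 * and wires iommu_v1_map_page()/iommu_v1_unmap_page() into ops.
	 */
	ops = alloc_io_pgtable_ops(AMD_IOMMU_V1, &domain->iop.pgtbl_cfg, domain);
	if (!ops)
		return -ENOMEM;

	return 0;
}

/* Hypothetical helper mirroring amd_iommu_map() after the conversion. */
static int example_map_one(struct protection_domain *domain,
			   unsigned long iova, phys_addr_t paddr, size_t size)
{
	struct io_pgtable_ops *ops = &domain->iop.iop.ops;

	if (!ops->map)
		return -EINVAL;

	/* Page-table pages are allocated on demand inside iommu_v1_map_page(). */
	return ops->map(ops, iova, paddr, size,
			IOMMU_PROT_IR | IOMMU_PROT_IW, GFP_KERNEL);
}
```

The format identifier AMD_IOMMU_V1 is the io_pgtable_fmt value this series registers (see amd_iommu_pgtable in init.c and io_pgtable_amd_iommu_v1_init_fns in io_pgtable.c); the teardown path goes through free_io_pgtable_ops(), which calls v1_free_pgtable() as shown in protection_domain_free().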