From 31c11db6bd93b0c051d2c835da4fa9bba636cfdb Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 26 May 2021 11:12:11 +0800 Subject: virtio_ring: Fix kernel-doc Fix function name in virtio_ring.c kernel-doc comment to remove a warning found by clang_w1. drivers/virtio/virtio_ring.c:1903: warning: expecting prototype for virtqueue_get_buf(). Prototype was for virtqueue_get_buf_ctx() instead Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1621998731-17445-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71e16b53e9c1..095a9a3afcba 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1875,7 +1875,7 @@ bool virtqueue_kick(struct virtqueue *vq) EXPORT_SYMBOL_GPL(virtqueue_kick); /** - * virtqueue_get_buf - get the next used buffer + * virtqueue_get_buf_ctx - get the next used buffer * @_vq: the struct virtqueue we're talking about. * @len: the length written into the buffer * @ctx: extra context for the token -- cgit v1.2.3 From 8d622d21d24803408b256d96463eac4574dcf067 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 13 Apr 2021 01:19:16 -0400 Subject: virtio: fix up virtio_disable_cb virtio_disable_cb is currently a nop for split ring with event index. This is because it used to be always called from a callback when we know device won't trigger more events until we update the index. However, now that we run with interrupts enabled a lot we also poll without a callback so that is different: disabling callbacks will help reduce the number of spurious interrupts. Further, if using event index with a packed ring, and if being called from a callback, we actually do disable interrupts which is unnecessary. Fix both issues by tracking whenever we get a callback. 
If that is the case disabling interrupts with event index can be a nop. If not the case disable interrupts. Note: with a split ring there's no explicit "no interrupts" value. For now we write a fixed value so our chance of triggering an interrupt is 1/ring size. It's probably better to write something related to the last used index there to reduce the chance even further. For now I'm keeping it simple. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 095a9a3afcba..992cb1cbec93 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -113,6 +113,9 @@ struct vring_virtqueue { /* Last used index we've seen. */ u16 last_used_idx; + /* Hint for event idx: already triggered no need to disable. */ + bool event_triggered; + union { /* Available for split ring */ struct { @@ -739,7 +742,10 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; - if (!vq->event) + if (vq->event) + /* TODO: this is a hack. Figure out a cleaner value to write. */ + vring_used_event(&vq->split.vring) = 0x0; + else vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); @@ -1605,6 +1611,7 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; + vq->event_triggered = false; vq->num_added = 0; vq->packed_ring = true; vq->use_dma_api = vring_use_dma_api(vdev); @@ -1919,6 +1926,12 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + /* If device triggered an event already it won't trigger one again: + * no need to disable. 
+ */ + if (vq->event_triggered) + return; + if (vq->packed_ring) virtqueue_disable_cb_packed(_vq); else @@ -1942,6 +1955,9 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + if (vq->event_triggered) + vq->event_triggered = false; + return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : virtqueue_enable_cb_prepare_split(_vq); } @@ -2005,6 +2021,9 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + if (vq->event_triggered) + vq->event_triggered = false; + return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); } @@ -2044,6 +2063,10 @@ irqreturn_t vring_interrupt(int irq, void *_vq) if (unlikely(vq->broken)) return IRQ_HANDLED; + /* Just a hint for performance: so it's ok that this can be racy! */ + if (vq->event) + vq->event_triggered = true; + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); @@ -2083,6 +2106,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->weak_barriers = weak_barriers; vq->broken = false; vq->last_used_idx = 0; + vq->event_triggered = false; vq->num_added = 0; vq->use_dma_api = vring_use_dma_api(vdev); #ifdef DEBUG -- cgit v1.2.3 From aeef9b4733c5c2356c75ba4f5c99e1a09ff1721d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:44 +0800 Subject: virtio-ring: maintain next in extra state for packed virtqueue This patch moves next from vring_desc_state_packed to vring_desc_extra_packed. This makes it simpler to let extra state be reused by split virtqueue. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-2-jasowang@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_ring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 992cb1cbec93..51d898667854 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -74,7 +74,6 @@ struct vring_desc_state_packed { void *data; /* Data for callback. */ struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ u16 num; /* Descriptor list length. */ - u16 next; /* The next desc state in a list. */ u16 last; /* The last desc state in a list. */ }; @@ -82,6 +81,7 @@ struct vring_desc_extra_packed { dma_addr_t addr; /* Buffer DMA addr. */ u32 len; /* Buffer length. */ u16 flags; /* Descriptor flags. */ + u16 next; /* The next desc state in a list. */ }; struct vring_virtqueue { @@ -1067,7 +1067,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; - vq->free_head = vq->packed.desc_state[id].next; + vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. */ vq->packed.desc_state[id].num = 1; @@ -1175,7 +1175,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, le16_to_cpu(flags); } prev = curr; - curr = vq->packed.desc_state[curr].next; + curr = vq->packed.desc_extra[curr].next; if ((unlikely(++i >= vq->packed.vring.num))) { i = 0; @@ -1296,7 +1296,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, /* Clear data ptr. 
*/ state->data = NULL; - vq->packed.desc_state[state->last].next = vq->free_head; + vq->packed.desc_extra[state->last].next = vq->free_head; vq->free_head = id; vq->vq.num_free += state->num; @@ -1305,7 +1305,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, for (i = 0; i < state->num; i++) { vring_unmap_state_packed(vq, &vq->packed.desc_extra[curr]); - curr = vq->packed.desc_state[curr].next; + curr = vq->packed.desc_extra[curr].next; } } @@ -1656,8 +1656,6 @@ static struct virtqueue *vring_create_virtqueue_packed( /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < num-1; i++) - vq->packed.desc_state[i].next = i + 1; vq->packed.desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra_packed), @@ -1668,6 +1666,9 @@ static struct virtqueue *vring_create_virtqueue_packed( memset(vq->packed.desc_extra, 0, num * sizeof(struct vring_desc_extra_packed)); + for (i = 0; i < num - 1; i++) + vq->packed.desc_extra[i].next = i + 1; + /* No callback? Tell other side not to bother us. */ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; -- cgit v1.2.3 From 1f28750f2e113132791161563c6e7b99eaa4c46b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:45 +0800 Subject: virtio_ring: rename vring_desc_extra_packed Rename vring_desc_extra_packed to vring_desc_extra since the structure are pretty generic which could be reused by split virtqueue as well. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-3-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 51d898667854..03caa19fca67 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -77,7 +77,7 @@ struct vring_desc_state_packed { u16 last; /* The last desc state in a list. 
*/ }; -struct vring_desc_extra_packed { +struct vring_desc_extra { dma_addr_t addr; /* Buffer DMA addr. */ u32 len; /* Buffer length. */ u16 flags; /* Descriptor flags. */ @@ -169,7 +169,7 @@ struct vring_virtqueue { /* Per-descriptor state. */ struct vring_desc_state_packed *desc_state; - struct vring_desc_extra_packed *desc_extra; + struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t ring_dma_addr; @@ -918,7 +918,7 @@ static struct virtqueue *vring_create_virtqueue_split( */ static void vring_unmap_state_packed(const struct vring_virtqueue *vq, - struct vring_desc_extra_packed *state) + struct vring_desc_extra *state) { u16 flags; @@ -1658,13 +1658,13 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->free_head = 0; vq->packed.desc_extra = kmalloc_array(num, - sizeof(struct vring_desc_extra_packed), + sizeof(struct vring_desc_extra), GFP_KERNEL); if (!vq->packed.desc_extra) goto err_desc_extra; memset(vq->packed.desc_extra, 0, - num * sizeof(struct vring_desc_extra_packed)); + num * sizeof(struct vring_desc_extra)); for (i = 0; i < num - 1; i++) vq->packed.desc_extra[i].next = i + 1; -- cgit v1.2.3 From 5a22242160201b819be2fe67e15cc9338f3ee582 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:46 +0800 Subject: virtio-ring: factor out desc_extra allocation A helper is introduced for the logic of allocating the descriptor extra data. This will be reused by split virtqueue. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-4-jasowang@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_ring.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 03caa19fca67..f2f4a3b635f3 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1556,6 +1556,25 @@ static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) return NULL; } +static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, + unsigned int num) +{ + struct vring_desc_extra *desc_extra; + unsigned int i; + + desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), + GFP_KERNEL); + if (!desc_extra) + return NULL; + + memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); + + for (i = 0; i < num - 1; i++) + desc_extra[i].next = i + 1; + + return desc_extra; +} + static struct virtqueue *vring_create_virtqueue_packed( unsigned int index, unsigned int num, @@ -1573,7 +1592,6 @@ static struct virtqueue *vring_create_virtqueue_packed( struct vring_packed_desc_event *driver, *device; dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; - unsigned int i; ring_size_in_bytes = num * sizeof(struct vring_packed_desc); @@ -1657,18 +1675,10 @@ static struct virtqueue *vring_create_virtqueue_packed( /* Put everything in free lists. */ vq->free_head = 0; - vq->packed.desc_extra = kmalloc_array(num, - sizeof(struct vring_desc_extra), - GFP_KERNEL); + vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); if (!vq->packed.desc_extra) goto err_desc_extra; - memset(vq->packed.desc_extra, 0, - num * sizeof(struct vring_desc_extra)); - - for (i = 0; i < num - 1; i++) - vq->packed.desc_extra[i].next = i + 1; - /* No callback? Tell other side not to bother us. 
*/ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; -- cgit v1.2.3 From 44593865b7c5f55bf587f297c72d682c671eea2b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:47 +0800 Subject: virtio_ring: secure handling of mapping errors We should not depend on the DMA address, length and flag of descriptor table since they could be wrote with arbitrary value by the device. So this patch switches to use the stored one in desc_extra. Note that the indirect descriptors are fine since they are read-only streaming mappings. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-5-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f2f4a3b635f3..00e54115e29b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1219,13 +1219,16 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, unmap_release: err_idx = i; i = head; + curr = vq->free_head; vq->packed.avail_used_flags = avail_used_flags; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; - vring_unmap_desc_packed(vq, &desc[i]); + vring_unmap_state_packed(vq, + &vq->packed.desc_extra[curr]); + curr = vq->packed.desc_extra[curr].next; i++; if (i >= vq->packed.vring.num) i = 0; -- cgit v1.2.3 From fe4c3862df630ec711133e686e023b4467da2ec1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:48 +0800 Subject: virtio_ring: introduce virtqueue_desc_add_split() This patch introduces a helper for storing descriptor in the descriptor table for split virtqueue. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-6-jasowang@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_ring.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 00e54115e29b..c7d9a6fcaee7 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -415,6 +415,20 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } +static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, + struct vring_desc *desc, + unsigned int i, + dma_addr_t addr, + unsigned int len, + u16 flags) +{ + desc[i].flags = cpu_to_virtio16(vq->vdev, flags); + desc[i].addr = cpu_to_virtio64(vq->vdev, addr); + desc[i].len = cpu_to_virtio32(vq->vdev, len); + + return virtio16_to_cpu(vq->vdev, desc[i].next); +} + static inline int virtqueue_add_split(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -487,11 +501,9 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, + VRING_DESC_F_NEXT); } } for (; n < (out_sgs + in_sgs); n++) { @@ -500,11 +512,11 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); - desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); - desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + i = virtqueue_add_desc_split(_vq, desc, i, addr, + sg->length, + VRING_DESC_F_NEXT | + VRING_DESC_F_WRITE); } } /* Last one doesn't continue. 
*/ @@ -518,13 +530,10 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, - VRING_DESC_F_INDIRECT); - vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, - addr); - - vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev, - total_sg * sizeof(struct vring_desc)); + virtqueue_add_desc_split(_vq, vq->split.vring.desc, + head, addr, + total_sg * sizeof(struct vring_desc), + VRING_DESC_F_INDIRECT); } /* We're using some buffers from the free list. */ -- cgit v1.2.3 From 5bc72234f7c65830e60806dbb73ae76bacd8a061 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:49 +0800 Subject: virtio: use err label in __vring_new_virtqueue() Using error label for unwind in __vring_new_virtqueue. This is useful for future refacotring. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-7-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c7d9a6fcaee7..5faa876df6c6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2161,10 +2161,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->split.desc_state = kmalloc_array(vring.num, sizeof(struct vring_desc_state_split), GFP_KERNEL); - if (!vq->split.desc_state) { - kfree(vq); - return NULL; - } + if (!vq->split.desc_state) + goto err_state; /* Put everything in free lists. 
*/ vq->free_head = 0; @@ -2175,6 +2173,10 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; + +err_state: + kfree(vq); + return NULL; } EXPORT_SYMBOL_GPL(__vring_new_virtqueue); -- cgit v1.2.3 From 72b5e8958738aaa453db5149e6ca3bcf416023b9 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 4 Jun 2021 13:53:50 +0800 Subject: virtio-ring: store DMA metadata in desc_extra for split virtqueue For split virtqueue, we used to depend on the address, length and flags stored in the descriptor ring for DMA unmapping. This is unsafe for the case since the device can manipulate the behavior of virtio driver, IOMMU drivers and swiotlb. For safety, maintain the DMA address, DMA length, descriptor flags and next filed of the non indirect descriptors in vring_desc_state_extra when DMA API is used for virtio as we did for packed virtqueue and use those metadata for performing DMA operations. Indirect descriptors should be safe since they are using streaming mappings. With this the descriptor ring is write only form the view of the driver. This slight increase the footprint of the drive but it's not noticed through pktgen (64B) test and netperf test in the case of virtio-net. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210604055350.58753-8-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 112 +++++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 25 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 5faa876df6c6..89bfe46a8a7f 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -133,6 +133,7 @@ struct vring_virtqueue { /* Per-descriptor state. 
*/ struct vring_desc_state_split *desc_state; + struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t queue_dma_addr; @@ -367,8 +368,8 @@ static int vring_mapping_error(const struct vring_virtqueue *vq, * Split ring specific functions - *_split(). */ -static void vring_unmap_one_split(const struct vring_virtqueue *vq, - struct vring_desc *desc) +static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, + struct vring_desc *desc) { u16 flags; @@ -392,6 +393,35 @@ static void vring_unmap_one_split(const struct vring_virtqueue *vq, } } +static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, + unsigned int i) +{ + struct vring_desc_extra *extra = vq->split.desc_extra; + u16 flags; + + if (!vq->use_dma_api) + goto out; + + flags = extra[i].flags; + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + extra[i].addr, + extra[i].len, + (flags & VRING_DESC_F_WRITE) ? 
+ DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + +out: + return extra[i].next; +} + static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) @@ -420,13 +450,28 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, unsigned int i, dma_addr_t addr, unsigned int len, - u16 flags) + u16 flags, + bool indirect) { + struct vring_virtqueue *vring = to_vvq(vq); + struct vring_desc_extra *extra = vring->split.desc_extra; + u16 next; + desc[i].flags = cpu_to_virtio16(vq->vdev, flags); desc[i].addr = cpu_to_virtio64(vq->vdev, addr); desc[i].len = cpu_to_virtio32(vq->vdev, len); - return virtio16_to_cpu(vq->vdev, desc[i].next); + if (!indirect) { + next = extra[i].next; + desc[i].next = cpu_to_virtio16(vq->vdev, next); + + extra[i].addr = addr; + extra[i].len = len; + extra[i].flags = flags; + } else + next = virtio16_to_cpu(vq->vdev, desc[i].next); + + return next; } static inline int virtqueue_add_split(struct virtqueue *_vq, @@ -502,8 +547,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, goto unmap_release; prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, - VRING_DESC_F_NEXT); + VRING_DESC_F_NEXT, + indirect); } } for (; n < (out_sgs + in_sgs); n++) { @@ -513,14 +562,21 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, goto unmap_release; prev = i; + /* Note that we trust indirect descriptor + * table since it use stream DMA mapping. + */ i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, VRING_DESC_F_NEXT | - VRING_DESC_F_WRITE); + VRING_DESC_F_WRITE, + indirect); } } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + if (!indirect && vq->use_dma_api) + vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags = + ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. 
*/ @@ -533,7 +589,8 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, virtqueue_add_desc_split(_vq, vq->split.vring.desc, head, addr, total_sg * sizeof(struct vring_desc), - VRING_DESC_F_INDIRECT); + VRING_DESC_F_INDIRECT, + false); } /* We're using some buffers from the free list. */ @@ -541,8 +598,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Update free pointer */ if (indirect) - vq->free_head = virtio16_to_cpu(_vq->vdev, - vq->split.vring.desc[head].next); + vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; @@ -587,8 +643,11 @@ unmap_release: for (n = 0; n < total_sg; n++) { if (i == err_idx) break; - vring_unmap_one_split(vq, &desc[i]); - i = virtio16_to_cpu(_vq->vdev, desc[i].next); + if (indirect) { + vring_unmap_one_split_indirect(vq, &desc[i]); + i = virtio16_to_cpu(_vq->vdev, desc[i].next); + } else + i = vring_unmap_one_split(vq, i); } if (indirect) @@ -642,14 +701,13 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, i = head; while (vq->split.vring.desc[i].flags & nextflag) { - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); - i = virtio16_to_cpu(vq->vq.vdev, vq->split.vring.desc[i].next); + vring_unmap_one_split(vq, i); + i = vq->split.desc_extra[i].next; vq->vq.num_free++; } - vring_unmap_one_split(vq, &vq->split.vring.desc[i]); - vq->split.vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, - vq->free_head); + vring_unmap_one_split(vq, i); + vq->split.desc_extra[i].next = vq->free_head; vq->free_head = head; /* Plus final descriptor */ @@ -664,15 +722,14 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, if (!indir_desc) return; - len = virtio32_to_cpu(vq->vq.vdev, - vq->split.vring.desc[head].len); + len = vq->split.desc_extra[head].len; - BUG_ON(!(vq->split.vring.desc[head].flags & - cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); + BUG_ON(!(vq->split.desc_extra[head].flags & + VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || 
len % sizeof(struct vring_desc)); for (j = 0; j < len / sizeof(struct vring_desc); j++) - vring_unmap_one_split(vq, &indir_desc[j]); + vring_unmap_one_split_indirect(vq, &indir_desc[j]); kfree(indir_desc); vq->split.desc_state[head].indir_desc = NULL; @@ -2108,7 +2165,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, void (*callback)(struct virtqueue *), const char *name) { - unsigned int i; struct vring_virtqueue *vq; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) @@ -2164,16 +2220,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, if (!vq->split.desc_state) goto err_state; + vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num); + if (!vq->split.desc_extra) + goto err_extra; + /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < vring.num-1; i++) - vq->split.vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); list_add_tail(&vq->vq.list, &vdev->vqs); return &vq->vq; +err_extra: + kfree(vq->split.desc_state); err_state: kfree(vq); return NULL; @@ -2257,8 +2317,10 @@ void vring_del_virtqueue(struct virtqueue *_vq) vq->split.queue_dma_addr); } } - if (!vq->packed_ring) + if (!vq->packed_ring) { kfree(vq->split.desc_state); + kfree(vq->split.desc_extra); + } list_del(&_vq->list); kfree(vq); } -- cgit v1.2.3 From 0140b3d07617e71a8d9509776434ced107572fc8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 2 Jun 2021 10:15:34 +0800 Subject: virtio-pci library: introduce vp_modern_get_driver_features() This patch introduce a helper to get driver/guest features from the device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210602021536.39525-3-jasowang@redhat.com Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_pci_modern_dev.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 54f297028586..e11ed748e661 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -383,6 +383,27 @@ u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) } EXPORT_SYMBOL_GPL(vp_modern_get_features); +/* + * vp_modern_get_driver_features - get driver features from device + * @mdev: the modern virtio-pci device + * + * Returns the driver features read from the device + */ +u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + u64 features; + + vp_iowrite32(0, &cfg->guest_feature_select); + features = vp_ioread32(&cfg->guest_feature); + vp_iowrite32(1, &cfg->guest_feature_select); + features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32); + + return features; +} +EXPORT_SYMBOL_GPL(vp_modern_get_driver_features); + /* * vp_modern_set_features - set features to device * @mdev: the modern virtio-pci device -- cgit v1.2.3 From efa08cb468cdd67855f63f341eac5f5f9ac93370 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 2 Jun 2021 10:15:36 +0800 Subject: virtio/vdpa: clear the virtqueue state during probe Clear the available index as part of the initialization process to clear any values that might be left from previous usage of the device. For example, if the device was previously used by vhost_vdpa and now probed by virtio_vdpa, you want to start with fresh indices. Fixes: c043b4a8cf3b ("virtio: introduce a vDPA based transport") Signed-off-by: Eli Cohen Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210602021536.39525-5-jasowang@redhat.com Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_vdpa.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index e28acf482e0c..e1a141135992 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -142,6 +142,8 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; + /* Assume split virtqueue, switch to packed if necessary */ + struct vdpa_vq_state state = {0}; unsigned long flags; u32 align, num; int err; @@ -191,6 +193,19 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, goto err_vq; } + /* reset virtqueue state index */ + if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { + struct vdpa_vq_state_packed *s = &state.packed; + + s->last_avail_counter = 1; + s->last_avail_idx = 0; + s->last_used_counter = 1; + s->last_used_idx = 0; + } + err = ops->set_vq_state(vdpa, index, &state); + if (err) + goto err_vq; + ops->set_vq_ready(vdpa, index, 1); vq->priv = info; -- cgit v1.2.3 From 500817bf5e110ad9b7138bc582971bb7ee77d6f7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:14 +0200 Subject: virtio-mem: don't read big block size in Sub Block Mode We are reading a Big Block Mode value while in Sub Block Mode when initializing. Fortunately, vm->bbm.bb_size maps to some counter in the vm->sbm.mb_count array, which is 0 at that point in time. No harm done; still, this was unintended and is not future-proof. Fixes: 4ba50cd3355d ("virtio-mem: Big Block Mode (BBM) memory hotplug") Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-2-david@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 10ec60d81e84..3bf08b5bb359 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2420,6 +2420,10 @@ static int virtio_mem_init(struct virtio_mem *vm) dev_warn(&vm->vdev->dev, "Some device memory is not addressable/pluggable. This can make some memory unusable.\n"); + /* Prepare the offline threshold - make sure we can add two blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), + VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); + /* * We want subblocks to span at least MAX_ORDER_NR_PAGES and * pageblock_nr_pages pages. This: @@ -2466,14 +2470,11 @@ static int virtio_mem_init(struct virtio_mem *vm) vm->bbm.bb_size - 1; vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr); vm->bbm.next_bb_id = vm->bbm.first_bb_id; - } - /* Prepare the offline threshold - make sure we can add two blocks. */ - vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(), - VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); - /* In BBM, we also want at least two big blocks. */ - vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, - vm->offline_threshold); + /* Make sure we can add two big blocks. */ + vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size, + vm->offline_threshold); + } dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr); dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size); -- cgit v1.2.3 From 49d42872d520365df619e5092ff7fb225e3079b3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:15 +0200 Subject: virtio-mem: use page_zonenum() in virtio_mem_fake_offline() Let's use page_zonenum() instead of zone_idx(page_zone()). Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-3-david@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 3bf08b5bb359..1d4b1e25ac8b 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1135,7 +1135,7 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) */ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) { - const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) == + const bool is_movable = page_zonenum(pfn_to_page(pfn)) == ZONE_MOVABLE; int rc, retry_count; -- cgit v1.2.3 From f4cf803dff4c87656cf25d9c5ec3cf828839efec Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:16 +0200 Subject: virtio-mem: simplify high-level plug handling in Sub Block Mode Let's simplify high-level memory block selection when plugging in Sub Block Mode. No need for two separate loops when selecting memory blocks for plugging memory. Avoid passing the "online" state by simply obtaining the state in virtio_mem_sbm_plug_any_sb(). Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-4-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 45 +++++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 28 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 1d4b1e25ac8b..c0e6ea6244e4 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1583,9 +1583,9 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm, * Note: Can fail after some subblocks were successfully plugged. 
*/ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb, - bool online) + unsigned long mb_id, uint64_t *nb_sb) { + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); unsigned long pfn, nr_pages; int sb_id, count; int rc; @@ -1607,7 +1607,7 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, if (rc) return rc; *nb_sb -= count; - if (!online) + if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) continue; /* fake-online the pages if the memory block is online */ @@ -1617,23 +1617,21 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, virtio_mem_fake_online(pfn, nr_pages); } - if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) { - if (online) - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE); - else - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE); - } + if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) + virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1); return 0; } static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + }; uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; - int rc; + int rc, i; if (!nb_sb) return 0; @@ -1641,22 +1639,13 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) /* Don't race with onlining/offlining */ mutex_lock(&vm->hotplug_mutex); - /* Try to plug subblocks of partially plugged online blocks. */ - virtio_mem_sbm_for_each_mb(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - /* Try to plug subblocks of partially plugged offline blocks. 
*/ - virtio_mem_sbm_for_each_mb(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + cond_resched(); + } } /* -- cgit v1.2.3 From 5304ca3dd70c586012fb93f4a6d74e3ab750902d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:17 +0200 Subject: virtio-mem: simplify high-level unplug handling in Sub Block Mode Let's simplify by introducing a new virtio_mem_sbm_unplug_any_sb(), similar to virtio_mem_sbm_plug_any_sb(), to simplify high-level memory block selection when unplugging in Sub Block Mode. Rename existing virtio_mem_sbm_unplug_any_sb() to virtio_mem_sbm_unplug_any_sb_raw(). The only change is that we now temporarily unlock the hotplug mutex around cond_resched() when processing offline memory blocks, which doesn't make a real difference as we already have to temporarily unlock in virtio_mem_sbm_unplug_any_sb_offline() when removing a memory block. Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-5-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 103 ++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 46 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index c0e6ea6244e4..d54bb34a7ed8 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -1453,8 +1453,8 @@ static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id) * * Note: can fail after some subblocks were unplugged. 
*/ -static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, - unsigned long mb_id, uint64_t *nb_sb) +static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm, + unsigned long mb_id, uint64_t *nb_sb) { int sb_id, count; int rc; @@ -1496,7 +1496,7 @@ static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id) { uint64_t nb_sb = vm->sbm.sbs_per_mb; - return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); + return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb); } /* @@ -1806,7 +1806,7 @@ static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm, { int rc; - rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb); + rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb); /* some subblocks might have been unplugged even on failure */ if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) @@ -1929,11 +1929,46 @@ unplugged: return 0; } +/* + * Unplug the desired number of plugged subblocks of a memory block that is + * already added to Linux. Will skip subblock of online memory blocks that are + * busy (by the OS). Will fail if any subblock that's not busy cannot get + * unplugged. + * + * Will modify the state of the memory block. Might temporarily drop the + * hotplug_mutex. + * + * Note: Can fail after some subblocks were successfully unplugged. Can + * return 0 even if subblocks were busy and could not get unplugged. 
+ */ +static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, + unsigned long mb_id, + uint64_t *nb_sb) +{ + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); + + switch (old_state) { + case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_ONLINE: + return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); + case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_OFFLINE: + return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb); + } + return -EINVAL; +} + static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { + const int mb_states[] = { + VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_OFFLINE, + VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_ONLINE, + }; uint64_t nb_sb = diff / vm->sbm.sb_size; unsigned long mb_id; - int rc; + int rc, i; if (!nb_sb) return 0; @@ -1945,47 +1980,23 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) */ mutex_lock(&vm->hotplug_mutex); - /* Try to unplug subblocks of partially plugged offline blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, - VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) { - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - /* Try to unplug subblocks of plugged offline blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) { - rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - cond_resched(); - } - - if (!unplug_online) { - mutex_unlock(&vm->hotplug_mutex); - return 0; - } - - /* Try to unplug subblocks of partially plugged online blocks. 
*/ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) { - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - mutex_unlock(&vm->hotplug_mutex); - cond_resched(); - mutex_lock(&vm->hotplug_mutex); - } - - /* Try to unplug subblocks of plugged online blocks. */ - virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) { - rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb); - if (rc || !nb_sb) - goto out_unlock; - mutex_unlock(&vm->hotplug_mutex); - cond_resched(); - mutex_lock(&vm->hotplug_mutex); + /* + * We try unplug from partially plugged blocks first, to try removing + * whole memory blocks along with metadata. + */ + for (i = 0; i < ARRAY_SIZE(mb_states); i++) { + virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { + rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb); + if (rc || !nb_sb) + goto out_unlock; + mutex_unlock(&vm->hotplug_mutex); + cond_resched(); + mutex_lock(&vm->hotplug_mutex); + } + if (!unplug_online && i == 1) { + mutex_unlock(&vm->hotplug_mutex); + return 0; + } } mutex_unlock(&vm->hotplug_mutex); -- cgit v1.2.3 From c740bb97cc84b88f160f32e0b5c80159e1c6fd9c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:18 +0200 Subject: virtio-mem: prioritize unplug from ZONE_MOVABLE in Sub Block Mode Until now, memory provided by a single virtio-mem device was usually either onlined completely to ZONE_MOVABLE (online_movable) or to ZONE_NORMAL (online_kernel); however, that will change in the future. There are two reasons why we want to track to which zone a memory blocks belongs to and prioritize ZONE_MOVABLE blocks: 1) Memory managed by ZONE_MOVABLE can more likely get unplugged, therefore, resulting in a faster memory hotunplug process. Further, we can more reliably unplug and remove complete memory blocks, removing metadata allocated for the whole memory block. 
2) We want to avoid corner cases where unplugging with the current scheme (highest to lowest address) could result in accidential zone imbalances, whereby we remove too much ZONE_NORMAL memory for ZONE_MOVABLE memory of the same device. Let's track the zone via memory block states and try unplug from ZONE_MOVABLE first. Rename VIRTIO_MEM_SBM_MB_ONLINE* to VIRTIO_MEM_SBM_MB_KERNEL* to avoid even longer state names. In commit 27f852795a06 ("virtio-mem: don't special-case ZONE_MOVABLE"), we removed slightly similar tracking for fully plugged memory blocks to support unplugging from ZONE_MOVABLE at all -- as we didn't allow partially plugged memory blocks in ZONE_MOVABLE before that. That commit already mentioned "In the future, we might want to remember the zone again and use the information when (un)plugging memory." Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-6-david@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_mem.c | 72 ++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 20 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index d54bb34a7ed8..156a79ceb9fc 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -75,10 +75,14 @@ enum virtio_mem_sbm_mb_state { VIRTIO_MEM_SBM_MB_OFFLINE, /* Partially plugged, fully added to Linux, offline. */ VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, - /* Fully plugged, fully added to Linux, online. */ - VIRTIO_MEM_SBM_MB_ONLINE, - /* Partially plugged, fully added to Linux, online. */ - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to a kernel zone. */ + VIRTIO_MEM_SBM_MB_KERNEL, + /* Partially plugged, fully added to Linux, online to a kernel zone */ + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + /* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. 
*/ + VIRTIO_MEM_SBM_MB_MOVABLE, + /* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */ + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, VIRTIO_MEM_SBM_MB_COUNT }; @@ -832,11 +836,13 @@ static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, unsigned long mb_id) { switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { - case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL); break; - case VIRTIO_MEM_SBM_MB_ONLINE: + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE: virtio_mem_sbm_set_mb_state(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE); break; @@ -847,21 +853,29 @@ static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm, } static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, - unsigned long mb_id) + unsigned long mb_id, + unsigned long start_pfn) { + const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == + ZONE_MOVABLE; + int new_state; + switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); + new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL; break; case VIRTIO_MEM_SBM_MB_OFFLINE: - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE); + new_state = VIRTIO_MEM_SBM_MB_KERNEL; + if (is_movable) + new_state = VIRTIO_MEM_SBM_MB_MOVABLE; break; default: BUG(); break; } + virtio_mem_sbm_set_mb_state(vm, mb_id, new_state); } static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm, @@ -1015,7 +1029,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb, break; case MEM_ONLINE: if (vm->in_sbm) - virtio_mem_sbm_notify_online(vm, id); + virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn); atomic64_sub(size, &vm->offline_size); /* @@ -1626,7 +1640,8 @@ static int 
virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm, static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff) { const int mb_states[] = { - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, }; uint64_t nb_sb = diff / vm->sbm.sb_size; @@ -1843,6 +1858,7 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, int count) { const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count; + const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); unsigned long start_pfn; int rc; @@ -1861,8 +1877,17 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm, return rc; } - virtio_mem_sbm_set_mb_state(vm, mb_id, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL); + switch (old_state) { + case VIRTIO_MEM_SBM_MB_KERNEL: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL); + break; + case VIRTIO_MEM_SBM_MB_MOVABLE: + virtio_mem_sbm_set_mb_state(vm, mb_id, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL); + break; + } + return 0; } @@ -1948,8 +1973,10 @@ static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm, const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id); switch (old_state) { - case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL: - case VIRTIO_MEM_SBM_MB_ONLINE: + case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL: + case VIRTIO_MEM_SBM_MB_KERNEL: + case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL: + case VIRTIO_MEM_SBM_MB_MOVABLE: return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb); case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL: case VIRTIO_MEM_SBM_MB_OFFLINE: @@ -1963,8 +1990,10 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) const int mb_states[] = { VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL, VIRTIO_MEM_SBM_MB_OFFLINE, - VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL, - VIRTIO_MEM_SBM_MB_ONLINE, + VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL, + VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL, + VIRTIO_MEM_SBM_MB_MOVABLE, + VIRTIO_MEM_SBM_MB_KERNEL, }; uint64_t nb_sb 
= diff / vm->sbm.sb_size; unsigned long mb_id; @@ -1982,7 +2011,10 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff) /* * We try unplug from partially plugged blocks first, to try removing - * whole memory blocks along with metadata. + * whole memory blocks along with metadata. We prioritize ZONE_MOVABLE + * as it's more reliable to unplug memory and remove whole memory + * blocks, and we don't want to trigger a zone imbalances by + * accidentially removing too much kernel memory. */ for (i = 0; i < ARRAY_SIZE(mb_states); i++) { virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) { -- cgit v1.2.3 From c6bc1422fa55033c1bd04c788203af8be2d5ce4c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:19 +0200 Subject: virtio-mem: simplify high-level unplug handling in Big Block Mode Let's simplify high-level big block selection when unplugging in Big Block Mode. Combine handling of offline and online blocks. We can get rid of virtio_mem_bbm_bb_is_offline() and simply use virtio_mem_bbm_offline_remove_and_unplug_bb(), as that already tolerates offline parts. We can race with concurrent onlining/offlining either way, so we don;t have to be super correct by failing if an offline big block we'd like to unplug just got (partially) onlined. Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-7-david@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 96 ++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 72 deletions(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 156a79ceb9fc..43199389c414 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -702,18 +702,6 @@ static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id) return virtio_mem_remove_memory(vm, addr, size); } -/* - * See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered - * by the big block. - */ -static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id) -{ - const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id); - const uint64_t size = vm->bbm.bb_size; - - return virtio_mem_remove_memory(vm, addr, size); -} - /* * Try offlining and removing memory from Linux. * @@ -2114,35 +2102,6 @@ rollback_safe_unplug: return rc; } -/* - * Try to remove a big block from Linux and unplug it. Will fail with - * -EBUSY if some memory is online. - * - * Will modify the state of the memory block. - */ -static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm, - unsigned long bb_id) -{ - int rc; - - if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) != - VIRTIO_MEM_BBM_BB_ADDED)) - return -EINVAL; - - rc = virtio_mem_bbm_remove_bb(vm, bb_id); - if (rc) - return -EBUSY; - - rc = virtio_mem_bbm_unplug_bb(vm, bb_id); - if (rc) - virtio_mem_bbm_set_bb_state(vm, bb_id, - VIRTIO_MEM_BBM_BB_PLUGGED); - else - virtio_mem_bbm_set_bb_state(vm, bb_id, - VIRTIO_MEM_BBM_BB_UNUSED); - return rc; -} - /* * Test if a big block is completely offline. */ @@ -2166,42 +2125,35 @@ static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { uint64_t nb_bb = diff / vm->bbm.bb_size; uint64_t bb_id; - int rc; + int rc, i; if (!nb_bb) return 0; - /* Try to unplug completely offline big blocks first. 
*/ - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { - cond_resched(); - /* - * As we're holding no locks, this check is racy as memory - * can get onlined in the meantime - but we'll fail gracefully. - */ - if (!virtio_mem_bbm_bb_is_offline(vm, bb_id)) - continue; - rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id); - if (rc == -EBUSY) - continue; - if (!rc) - nb_bb--; - if (rc || !nb_bb) - return rc; - } - - if (!unplug_online) - return 0; + /* + * Try to unplug big blocks. Similar to SBM, start with offline + * big blocks. + */ + for (i = 0; i < 2; i++) { + virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { + cond_resched(); - /* Try to unplug any big blocks. */ - virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { - cond_resched(); - rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); - if (rc == -EBUSY) - continue; - if (!rc) - nb_bb--; - if (rc || !nb_bb) - return rc; + /* + * As we're holding no locks, these checks are racy, + * but we don't care. + */ + if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) + continue; + rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); + if (rc == -EBUSY) + continue; + if (!rc) + nb_bb--; + if (rc || !nb_bb) + return rc; + } + if (i == 0 && !unplug_online) + return 0; } return nb_bb ? -EBUSY : 0; -- cgit v1.2.3 From db7b337709a15d33cc5e901d2ee35d3bb3e42b2f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jun 2021 20:57:20 +0200 Subject: virtio-mem: prioritize unplug from ZONE_MOVABLE in Big Block Mode Let's handle unplug in Big Block Mode similar to Sub Block Mode -- prioritize memory blocks onlined to ZONE_MOVABLE. We won't care further about big blocks with mixed zones, as it's rather a corner case that won't matter in practice. Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20210602185720.31821-8-david@redhat.com Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_mem.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'drivers/virtio') diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 43199389c414..d3e874b6b50b 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2121,6 +2121,29 @@ static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm, return true; } +/* + * Test if a big block is completely onlined to ZONE_MOVABLE (or offline). + */ +static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm, + unsigned long bb_id) +{ + const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id)); + const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size); + struct page *page; + unsigned long pfn; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; + pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + if (page_zonenum(page) != ZONE_MOVABLE) + return false; + } + + return true; +} + static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) { uint64_t nb_bb = diff / vm->bbm.bb_size; @@ -2134,7 +2157,7 @@ static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) * Try to unplug big blocks. Similar to SBM, start with offline * big blocks. */ - for (i = 0; i < 2; i++) { + for (i = 0; i < 3; i++) { virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) { cond_resched(); @@ -2144,6 +2167,8 @@ static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff) */ if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id)) continue; + if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id)) + continue; rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id); if (rc == -EBUSY) continue; -- cgit v1.2.3