diff options
Diffstat (limited to 'drivers/target/target_core_user.c')
-rw-r--r-- | drivers/target/target_core_user.c | 645 |
1 files changed, 511 insertions, 134 deletions
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index f615c3bbb73e..9045837f748b 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -2,6 +2,7 @@ * Copyright (C) 2013 Shaohua Li <shli@kernel.org> * Copyright (C) 2014 Red Hat, Inc. * Copyright (C) 2015 Arrikto, Inc. + * Copyright (C) 2017 Chinamobile, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -25,10 +26,13 @@ #include <linux/parser.h> #include <linux/vmalloc.h> #include <linux/uio_driver.h> +#include <linux/radix-tree.h> #include <linux/stringify.h> #include <linux/bitops.h> #include <linux/highmem.h> #include <linux/configfs.h> +#include <linux/mutex.h> +#include <linux/kthread.h> #include <net/genetlink.h> #include <scsi/scsi_common.h> #include <scsi/scsi_proto.h> @@ -63,17 +67,26 @@ * this may have a 'UAM' comment. */ - #define TCMU_TIME_OUT (30 * MSEC_PER_SEC) -#define DATA_BLOCK_BITS 256 -#define DATA_BLOCK_SIZE 4096 +/* For cmd area, the size is fixed 8MB */ +#define CMDR_SIZE (8 * 1024 * 1024) -#define CMDR_SIZE (16 * 4096) +/* + * For data area, the block size is PAGE_SIZE and + * the total size is 256K * PAGE_SIZE. + */ +#define DATA_BLOCK_SIZE PAGE_SIZE +#define DATA_BLOCK_BITS (256 * 1024) #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE) +#define DATA_BLOCK_INIT_BITS 128 +/* The total size of the ring is 8M + 256K * PAGE_SIZE */ #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) +/* Default maximum of the global data blocks(512K * PAGE_SIZE) */ +#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024) + static struct device *tcmu_root_device; struct tcmu_hba { @@ -83,6 +96,8 @@ struct tcmu_hba { #define TCMU_CONFIG_LEN 256 struct tcmu_dev { + struct list_head node; + struct se_device se_dev; char *name; @@ -94,6 +109,8 @@ struct tcmu_dev { struct uio_info uio_info; + struct inode *inode; + struct tcmu_mailbox *mb_addr; size_t dev_size; u32 cmdr_size; @@ -103,11 +120,14 @@ struct tcmu_dev { size_t data_off; size_t data_size; - DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS); - wait_queue_head_t wait_cmdr; - /* TODO should this be a mutex? */ - spinlock_t cmdr_lock; + struct mutex cmdr_lock; + + bool waiting_global; + uint32_t dbi_max; + uint32_t dbi_thresh; + DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS); + struct radix_tree_root data_blocks; struct idr commands; spinlock_t commands_lock; @@ -130,7 +150,9 @@ struct tcmu_cmd { /* Can't use se_cmd when cleaning up expired cmds, because if cmd has been completed then accessing se_cmd is off limits */ - DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS); + uint32_t dbi_cnt; + uint32_t dbi_cur; + uint32_t *dbi; unsigned long deadline; @@ -138,6 +160,13 @@ struct tcmu_cmd { unsigned long flags; }; +static struct task_struct *unmap_thread; +static wait_queue_head_t unmap_wait; +static DEFINE_MUTEX(root_udev_mutex); +static LIST_HEAD(root_udev); + +static atomic_t global_db_count = ATOMIC_INIT(0); + static struct kmem_cache *tcmu_cmd_cache; /* multicast group */ @@ -161,6 +190,114 @@ static struct genl_family tcmu_genl_family __ro_after_init = { .netnsok = true, }; +#define tcmu_cmd_set_dbi_cur(cmd, index) ((cmd)->dbi_cur = (index)) +#define tcmu_cmd_reset_dbi_cur(cmd) tcmu_cmd_set_dbi_cur(cmd, 0) +#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index)) +#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++]) + +static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + uint32_t i; + + for (i = 0; i < len; i++) + clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap); +} + +static inline bool tcmu_get_empty_block(struct tcmu_dev *udev, + struct tcmu_cmd *tcmu_cmd) +{ + struct page *page; + int ret, dbi; + + dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh); + if (dbi == udev->dbi_thresh) + return false; + + page = radix_tree_lookup(&udev->data_blocks, dbi); + if (!page) { + + if (atomic_add_return(1, &global_db_count) > + TCMU_GLOBAL_MAX_BLOCKS) { + atomic_dec(&global_db_count); + return false; + } + + /* try to get new page from the mm */ + page = alloc_page(GFP_KERNEL); + if (!page) + return false; + + ret = radix_tree_insert(&udev->data_blocks, dbi, page); + if (ret) { + __free_page(page); + return false; + } + + } + + if (dbi > udev->dbi_max) + udev->dbi_max = dbi; + + set_bit(dbi, udev->data_bitmap); + tcmu_cmd_set_dbi(tcmu_cmd, dbi); + + return true; +} + +static bool tcmu_get_empty_blocks(struct tcmu_dev *udev, + struct tcmu_cmd *tcmu_cmd) +{ + int i; + + udev->waiting_global = false; + + for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) { + if (!tcmu_get_empty_block(udev, tcmu_cmd)) + goto err; + } + return true; + +err: + udev->waiting_global = true; + /* Try to wake up the unmap thread */ + wake_up(&unmap_wait); + return false; +} + +static inline struct page * +tcmu_get_block_page(struct tcmu_dev *udev, uint32_t dbi) +{ + return radix_tree_lookup(&udev->data_blocks, dbi); +} + +static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd) +{ + kfree(tcmu_cmd->dbi); + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); +} + +static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd) +{ + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE); + + if (se_cmd->se_cmd_flags & SCF_BIDI) { + BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents)); + data_length += round_up(se_cmd->t_bidi_data_sg->length, + DATA_BLOCK_SIZE); + } + + return data_length; +} + +static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd) +{ + size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); + + return data_length / DATA_BLOCK_SIZE; +} + static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) { struct se_device *se_dev = se_cmd->se_dev; @@ -178,6 +315,15 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) tcmu_cmd->deadline = jiffies + msecs_to_jiffies(udev->cmd_time_out); + tcmu_cmd_reset_dbi_cur(tcmu_cmd); + tcmu_cmd->dbi_cnt = tcmu_cmd_get_block_cnt(tcmu_cmd); + tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), + GFP_KERNEL); + if (!tcmu_cmd->dbi) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + idr_preload(GFP_KERNEL); spin_lock_irq(&udev->commands_lock); cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, @@ -186,7 +332,7 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) idr_preload_end(); if (cmd_id < 0) { - kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + tcmu_free_cmd(tcmu_cmd); return NULL; } tcmu_cmd->cmd_id = cmd_id; @@ -248,10 +394,10 @@ static inline void new_iov(struct iovec **iov, int *iov_cnt, #define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) /* offset is relative to mb_addr */ -static inline size_t get_block_offset(struct tcmu_dev *dev, - int block, int remaining) +static inline size_t get_block_offset_user(struct tcmu_dev *dev, + int dbi, int remaining) { - return dev->data_off + block * DATA_BLOCK_SIZE + + return dev->data_off + dbi * DATA_BLOCK_SIZE + DATA_BLOCK_SIZE - remaining; } @@ -260,37 +406,45 @@ static inline size_t iov_tail(struct tcmu_dev *udev, struct iovec *iov) return (size_t)iov->iov_base + iov->iov_len; } -static void alloc_and_scatter_data_area(struct tcmu_dev *udev, - struct scatterlist *data_sg, unsigned int data_nents, - struct iovec **iov, int *iov_cnt, bool copy_data) +static int scatter_data_area(struct tcmu_dev *udev, + struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg, + unsigned int data_nents, struct iovec **iov, + int *iov_cnt, bool copy_data) { - int i, block; + int i, dbi; int block_remaining = 0; - void *from, *to; - size_t copy_bytes, to_offset; + void *from, *to = NULL; + size_t copy_bytes, to_offset, offset; struct scatterlist *sg; + struct page *page; for_each_sg(data_sg, sg, data_nents, i) { int sg_remaining = sg->length; from = kmap_atomic(sg_page(sg)) + sg->offset; while (sg_remaining > 0) { if (block_remaining == 0) { - block = find_first_zero_bit(udev->data_bitmap, - DATA_BLOCK_BITS); + if (to) + kunmap_atomic(to); + block_remaining = DATA_BLOCK_SIZE; - set_bit(block, udev->data_bitmap); + dbi = tcmu_cmd_get_dbi(tcmu_cmd); + page = tcmu_get_block_page(udev, dbi); + to = kmap_atomic(page); } + copy_bytes = min_t(size_t, sg_remaining, block_remaining); - to_offset = get_block_offset(udev, block, + to_offset = get_block_offset_user(udev, dbi, block_remaining); - to = (void *)udev->mb_addr + to_offset; + offset = DATA_BLOCK_SIZE - block_remaining; + to = (void *)(unsigned long)to + offset; + if (*iov_cnt != 0 && to_offset == iov_tail(udev, *iov)) { (*iov)->iov_len += copy_bytes; } else { new_iov(iov, iov_cnt, udev); - (*iov)->iov_base = (void __user *) to_offset; + (*iov)->iov_base = (void __user *)to_offset; (*iov)->iov_len = copy_bytes; } if (copy_data) { @@ -303,33 +457,29 @@ static void alloc_and_scatter_data_area(struct tcmu_dev *udev, } kunmap_atomic(from - sg->offset); } -} + if (to) + kunmap_atomic(to); -static void free_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd) -{ - bitmap_xor(udev->data_bitmap, udev->data_bitmap, cmd->data_bitmap, - DATA_BLOCK_BITS); + return 0; } static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, bool bidi) { struct se_cmd *se_cmd = cmd->se_cmd; - int i, block; + int i, dbi; int block_remaining = 0; - void *from, *to; - size_t copy_bytes, from_offset; + void *from = NULL, *to; + size_t copy_bytes, offset; struct scatterlist *sg, *data_sg; + struct page *page; unsigned int data_nents; - DECLARE_BITMAP(bitmap, DATA_BLOCK_BITS); - - bitmap_copy(bitmap, cmd->data_bitmap, DATA_BLOCK_BITS); + uint32_t count = 0; if (!bidi) { data_sg = se_cmd->t_data_sg; data_nents = se_cmd->t_data_nents; } else { - uint32_t count; /* * For bidi case, the first count blocks are for Data-Out @@ -337,30 +487,30 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, * the Data-Out buffer blocks should be discarded. */ count = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE); - while (count--) { - block = find_first_bit(bitmap, DATA_BLOCK_BITS); - clear_bit(block, bitmap); - } data_sg = se_cmd->t_bidi_data_sg; data_nents = se_cmd->t_bidi_data_nents; } + tcmu_cmd_set_dbi_cur(cmd, count); + for_each_sg(data_sg, sg, data_nents, i) { int sg_remaining = sg->length; to = kmap_atomic(sg_page(sg)) + sg->offset; while (sg_remaining > 0) { if (block_remaining == 0) { - block = find_first_bit(bitmap, - DATA_BLOCK_BITS); + if (from) + kunmap_atomic(from); + block_remaining = DATA_BLOCK_SIZE; - clear_bit(block, bitmap); + dbi = tcmu_cmd_get_dbi(cmd); + page = tcmu_get_block_page(udev, dbi); + from = kmap_atomic(page); } copy_bytes = min_t(size_t, sg_remaining, block_remaining); - from_offset = get_block_offset(udev, block, - block_remaining); - from = (void *) udev->mb_addr + from_offset; + offset = DATA_BLOCK_SIZE - block_remaining; + from = (void *)(unsigned long)from + offset; tcmu_flush_dcache_range(from, copy_bytes); memcpy(to + sg->length - sg_remaining, from, copy_bytes); @@ -370,12 +520,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd, } kunmap_atomic(to - sg->offset); } + if (from) + kunmap_atomic(from); } -static inline size_t spc_bitmap_free(unsigned long *bitmap) +static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh) { - return DATA_BLOCK_SIZE * (DATA_BLOCK_BITS - - bitmap_weight(bitmap, DATA_BLOCK_BITS)); + return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh)); } /* @@ -384,9 +535,12 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap) * * Called with ring lock held. */ -static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed) +static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd, + size_t cmd_size, size_t data_needed) { struct tcmu_mailbox *mb = udev->mb_addr; + uint32_t blocks_needed = (data_needed + DATA_BLOCK_SIZE - 1) + / DATA_BLOCK_SIZE; size_t space, cmd_needed; u32 cmd_head; @@ -410,35 +564,63 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d return false; } - space = spc_bitmap_free(udev->data_bitmap); + /* try to check and get the data blocks as needed */ + space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh); if (space < data_needed) { - pr_debug("no data space: only %zu available, but ask for %zu\n", - space, data_needed); - return false; + unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh; + unsigned long grow; + + if (blocks_left < blocks_needed) { + pr_debug("no data space: only %lu available, but ask for %zu\n", + blocks_left * DATA_BLOCK_SIZE, + data_needed); + return false; + } + + /* Try to expand the thresh */ + if (!udev->dbi_thresh) { + /* From idle state */ + uint32_t init_thresh = DATA_BLOCK_INIT_BITS; + + udev->dbi_thresh = max(blocks_needed, init_thresh); + } else { + /* + * Grow the data area by max(blocks needed, + * dbi_thresh / 2), but limited to the max + * DATA_BLOCK_BITS size. + */ + grow = max(blocks_needed, udev->dbi_thresh / 2); + udev->dbi_thresh += grow; + if (udev->dbi_thresh > DATA_BLOCK_BITS) + udev->dbi_thresh = DATA_BLOCK_BITS; + } } + if (!tcmu_get_empty_blocks(udev, cmd)) + return false; + return true; } -static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd) +static inline size_t tcmu_cmd_get_base_cmd_size(size_t iov_cnt) { - struct se_cmd *se_cmd = tcmu_cmd->se_cmd; - size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE); - - if (se_cmd->se_cmd_flags & SCF_BIDI) { - BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents)); - data_length += round_up(se_cmd->t_bidi_data_sg->length, - DATA_BLOCK_SIZE); - } - - return data_length; + return max(offsetof(struct tcmu_cmd_entry, req.iov[iov_cnt]), + sizeof(struct tcmu_cmd_entry)); } -static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd) +static inline size_t tcmu_cmd_get_cmd_size(struct tcmu_cmd *tcmu_cmd, + size_t base_command_size) { - size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t command_size; - return data_length / DATA_BLOCK_SIZE; + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), + TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + return command_size; } static sense_reason_t @@ -450,12 +632,11 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) struct tcmu_mailbox *mb; struct tcmu_cmd_entry *entry; struct iovec *iov; - int iov_cnt; + int iov_cnt, ret; uint32_t cmd_head; uint64_t cdb_off; bool copy_to_data_area; size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); - DECLARE_BITMAP(old_bitmap, DATA_BLOCK_BITS); if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -464,18 +645,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) * Must be a certain minimum size for response sense info, but * also may be larger if the iov array is large. * - * We prepare way too many iovs for potential uses here, because it's - * expensive to tell how many regions are freed in the bitmap - */ - base_command_size = max(offsetof(struct tcmu_cmd_entry, - req.iov[tcmu_cmd_get_block_cnt(tcmu_cmd)]), - sizeof(struct tcmu_cmd_entry)); - command_size = base_command_size - + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); - - WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + * We prepare as many iovs as possbile for potential uses here, + * because it's expensive to tell how many regions are freed in + * the bitmap & global data pool, as the size calculated here + * will only be used to do the checks. + * + * The size will be recalculated later as actually needed to save + * cmd area memories. + */ + base_command_size = tcmu_cmd_get_base_cmd_size(tcmu_cmd->dbi_cnt); + command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size); - spin_lock_irq(&udev->cmdr_lock); + mutex_lock(&udev->cmdr_lock); mb = udev->mb_addr; cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ @@ -484,18 +665,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu " "cmd ring/data area\n", command_size, data_length, udev->cmdr_size, udev->data_size); - spin_unlock_irq(&udev->cmdr_lock); + mutex_unlock(&udev->cmdr_lock); return TCM_INVALID_CDB_FIELD; } - while (!is_ring_space_avail(udev, command_size, data_length)) { + while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) { int ret; DEFINE_WAIT(__wait); prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); pr_debug("sleeping for ring space\n"); - spin_unlock_irq(&udev->cmdr_lock); + mutex_unlock(&udev->cmdr_lock); if (udev->cmd_time_out) ret = schedule_timeout( msecs_to_jiffies(udev->cmd_time_out)); @@ -507,7 +688,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } - spin_lock_irq(&udev->cmdr_lock); + mutex_lock(&udev->cmdr_lock); /* We dropped cmdr_lock, cmd_head is stale */ cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ @@ -534,20 +715,26 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) entry = (void *) mb + CMDR_OFF + cmd_head; tcmu_flush_dcache_range(entry, sizeof(*entry)); tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_CMD); - tcmu_hdr_set_len(&entry->hdr.len_op, command_size); entry->hdr.cmd_id = tcmu_cmd->cmd_id; entry->hdr.kflags = 0; entry->hdr.uflags = 0; - bitmap_copy(old_bitmap, udev->data_bitmap, DATA_BLOCK_BITS); - /* Handle allocating space from the data area */ + tcmu_cmd_reset_dbi_cur(tcmu_cmd); iov = &entry->req.iov[0]; iov_cnt = 0; copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE || se_cmd->se_cmd_flags & SCF_BIDI); - alloc_and_scatter_data_area(udev, se_cmd->t_data_sg, - se_cmd->t_data_nents, &iov, &iov_cnt, copy_to_data_area); + ret = scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, &iov, &iov_cnt, + copy_to_data_area); + if (ret) { + tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt); + mutex_unlock(&udev->cmdr_lock); + + pr_err("tcmu: alloc and scatter data failed\n"); + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + } entry->req.iov_cnt = iov_cnt; entry->req.iov_dif_cnt = 0; @@ -555,14 +742,29 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) if (se_cmd->se_cmd_flags & SCF_BIDI) { iov_cnt = 0; iov++; - alloc_and_scatter_data_area(udev, se_cmd->t_bidi_data_sg, - se_cmd->t_bidi_data_nents, &iov, &iov_cnt, - false); + ret = scatter_data_area(udev, tcmu_cmd, + se_cmd->t_bidi_data_sg, + se_cmd->t_bidi_data_nents, + &iov, &iov_cnt, false); + if (ret) { + tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt); + mutex_unlock(&udev->cmdr_lock); + + pr_err("tcmu: alloc and scatter bidi data failed\n"); + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + } entry->req.iov_bidi_cnt = iov_cnt; } - /* cmd's data_bitmap is what changed in process */ - bitmap_xor(tcmu_cmd->data_bitmap, old_bitmap, udev->data_bitmap, - DATA_BLOCK_BITS); + + /* + * Recalaulate the command's base size and size according + * to the actual needs + */ + base_command_size = tcmu_cmd_get_base_cmd_size(entry->req.iov_cnt + + entry->req.iov_bidi_cnt); + command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size); + + tcmu_hdr_set_len(&entry->hdr.len_op, command_size); /* All offsets relative to mb_addr, not start of entry! */ cdb_off = CMDR_OFF + cmd_head + base_command_size; @@ -572,8 +774,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); tcmu_flush_dcache_range(mb, sizeof(*mb)); - - spin_unlock_irq(&udev->cmdr_lock); + mutex_unlock(&udev->cmdr_lock); /* TODO: only if FLUSH and FUA? */ uio_event_notify(&udev->uio_info); @@ -604,7 +805,7 @@ tcmu_queue_cmd(struct se_cmd *se_cmd) idr_remove(&udev->commands, tcmu_cmd->cmd_id); spin_unlock_irq(&udev->commands_lock); - kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + tcmu_free_cmd(tcmu_cmd); } return ret; @@ -615,50 +816,45 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * struct se_cmd *se_cmd = cmd->se_cmd; struct tcmu_dev *udev = cmd->tcmu_dev; - if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { - /* - * cmd has been completed already from timeout, just reclaim - * data area space and free cmd - */ - free_data_area(udev, cmd); + /* + * cmd has been completed already from timeout, just reclaim + * data area space and free cmd + */ + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + goto out; - kmem_cache_free(tcmu_cmd_cache, cmd); - return; - } + tcmu_cmd_reset_dbi_cur(cmd); if (entry->hdr.uflags & TCMU_UFLAG_UNKNOWN_OP) { - free_data_area(udev, cmd); pr_warn("TCMU: Userspace set UNKNOWN_OP flag on se_cmd %p\n", cmd->se_cmd); entry->rsp.scsi_status = SAM_STAT_CHECK_CONDITION; } else if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, se_cmd->scsi_sense_length); - free_data_area(udev, cmd); } else if (se_cmd->se_cmd_flags & SCF_BIDI) { /* Get Data-In buffer before clean up */ gather_data_area(udev, cmd, true); - free_data_area(udev, cmd); } else if (se_cmd->data_direction == DMA_FROM_DEVICE) { gather_data_area(udev, cmd, false); - free_data_area(udev, cmd); } else if (se_cmd->data_direction == DMA_TO_DEVICE) { - free_data_area(udev, cmd); + /* TODO: */ } else if (se_cmd->data_direction != DMA_NONE) { pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); } target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); - cmd->se_cmd = NULL; - kmem_cache_free(tcmu_cmd_cache, cmd); +out: + cmd->se_cmd = NULL; + tcmu_cmd_free_data(cmd, cmd->dbi_cnt); + tcmu_free_cmd(cmd); } static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) { struct tcmu_mailbox *mb; - unsigned long flags; int handled = 0; if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { @@ -666,8 +862,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) return 0; } - spin_lock_irqsave(&udev->cmdr_lock, flags); - mb = udev->mb_addr; tcmu_flush_dcache_range(mb, sizeof(*mb)); @@ -708,8 +902,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) if (mb->cmd_tail == mb->cmd_head) del_timer(&udev->timeout); /* no more pending cmds */ - spin_unlock_irqrestore(&udev->cmdr_lock, flags); - wake_up(&udev->wait_cmdr); return handled; @@ -736,16 +928,14 @@ static void tcmu_device_timedout(unsigned long data) { struct tcmu_dev *udev = (struct tcmu_dev *)data; unsigned long flags; - int handled; - - handled = tcmu_handle_completions(udev); - - pr_warn("%d completions handled from timeout\n", handled); spin_lock_irqsave(&udev->commands_lock, flags); idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); spin_unlock_irqrestore(&udev->commands_lock, flags); + /* Try to wake up the ummap thread */ + wake_up(&unmap_wait); + /* * We don't need to wakeup threads on wait_cmdr since they have their * own timeout. @@ -790,7 +980,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) udev->cmd_time_out = TCMU_TIME_OUT; init_waitqueue_head(&udev->wait_cmdr); - spin_lock_init(&udev->cmdr_lock); + mutex_init(&udev->cmdr_lock); idr_init(&udev->commands); spin_lock_init(&udev->commands_lock); @@ -805,7 +995,9 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) { struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + mutex_lock(&tcmu_dev->cmdr_lock); tcmu_handle_completions(tcmu_dev); + mutex_unlock(&tcmu_dev->cmdr_lock); return 0; } @@ -827,6 +1019,60 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma) return -1; } +static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi) +{ + struct page *page; + int ret; + + mutex_lock(&udev->cmdr_lock); + page = tcmu_get_block_page(udev, dbi); + if (likely(page)) { + mutex_unlock(&udev->cmdr_lock); + return page; + } + + /* + * Normally it shouldn't be here: + * Only when the userspace has touched the blocks which + * are out of the tcmu_cmd's data iov[], and will return + * one zeroed page. + */ + pr_warn("Block(%u) out of cmd's iov[] has been touched!\n", dbi); + pr_warn("Mostly it will be a bug of userspace, please have a check!\n"); + + if (dbi >= udev->dbi_thresh) { + /* Extern the udev->dbi_thresh to dbi + 1 */ + udev->dbi_thresh = dbi + 1; + udev->dbi_max = dbi; + } + + page = radix_tree_lookup(&udev->data_blocks, dbi); + if (!page) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + mutex_unlock(&udev->cmdr_lock); + return NULL; + } + + ret = radix_tree_insert(&udev->data_blocks, dbi, page); + if (ret) { + mutex_unlock(&udev->cmdr_lock); + __free_page(page); + return NULL; + } + + /* + * Since this case is rare in page fault routine, here we + * will allow the global_db_count >= TCMU_GLOBAL_MAX_BLOCKS + * to reduce possible page fault call trace. + */ + atomic_inc(&global_db_count); + } + mutex_unlock(&udev->cmdr_lock); + + return page; +} + static int tcmu_vma_fault(struct vm_fault *vmf) { struct tcmu_dev *udev = vmf->vma->vm_private_data; @@ -845,11 +1091,20 @@ static int tcmu_vma_fault(struct vm_fault *vmf) */ offset = (vmf->pgoff - mi) << PAGE_SHIFT; - addr = (void *)(unsigned long)info->mem[mi].addr + offset; - if (info->mem[mi].memtype == UIO_MEM_LOGICAL) - page = virt_to_page(addr); - else + if (offset < udev->data_off) { + /* For the vmalloc()ed cmd area pages */ + addr = (void *)(unsigned long)info->mem[mi].addr + offset; page = vmalloc_to_page(addr); + } else { + uint32_t dbi; + + /* For the dynamically growing data area pages */ + dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE; + page = tcmu_try_get_block_page(udev, dbi); + if (!page) + return VM_FAULT_NOPAGE; + } + get_page(page); vmf->page = page; return 0; @@ -883,6 +1138,8 @@ static int tcmu_open(struct uio_info *info, struct inode *inode) if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) return -EBUSY; + udev->inode = inode; + pr_debug("open\n"); return 0; @@ -963,7 +1220,7 @@ static int tcmu_configure_device(struct se_device *dev) info->name = str; - udev->mb_addr = vzalloc(TCMU_RING_SIZE); + udev->mb_addr = vzalloc(CMDR_SIZE); if (!udev->mb_addr) { ret = -ENOMEM; goto err_vzalloc; @@ -972,8 +1229,11 @@ static int tcmu_configure_device(struct se_device *dev) /* mailbox fits in first part of CMDR space */ udev->cmdr_size = CMDR_SIZE - CMDR_OFF; udev->data_off = CMDR_SIZE; - udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + udev->data_size = DATA_SIZE; + udev->dbi_thresh = 0; /* Default in Idle state */ + udev->waiting_global = false; + /* Initialise the mailbox of the ring buffer */ mb = udev->mb_addr; mb->version = TCMU_MAILBOX_VERSION; mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC; @@ -984,12 +1244,14 @@ static int tcmu_configure_device(struct se_device *dev) WARN_ON(udev->data_size % PAGE_SIZE); WARN_ON(udev->data_size % DATA_BLOCK_SIZE); + INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL); + info->version = __stringify(TCMU_MAILBOX_VERSION); info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr; info->mem[0].size = TCMU_RING_SIZE; - info->mem[0].memtype = UIO_MEM_VIRTUAL; + info->mem[0].memtype = UIO_MEM_NONE; info->irqcontrol = tcmu_irqcontrol; info->irq = UIO_IRQ_CUSTOM; @@ -1015,6 +1277,10 @@ static int tcmu_configure_device(struct se_device *dev) if (ret) goto err_netlink; + mutex_lock(&root_udev_mutex); + list_add(&udev->node, &root_udev); + mutex_unlock(&root_udev_mutex); + return 0; err_netlink: @@ -1049,6 +1315,23 @@ static bool tcmu_dev_configured(struct tcmu_dev *udev) return udev->uio_info.uio_dev ? true : false; } +static void tcmu_blocks_release(struct tcmu_dev *udev) +{ + int i; + struct page *page; + + /* Try to release all block pages */ + mutex_lock(&udev->cmdr_lock); + for (i = 0; i <= udev->dbi_max; i++) { + page = radix_tree_delete(&udev->data_blocks, i); + if (page) { + __free_page(page); + atomic_dec(&global_db_count); + } + } + mutex_unlock(&udev->cmdr_lock); +} + static void tcmu_free_device(struct se_device *dev) { struct tcmu_dev *udev = TCMU_DEV(dev); @@ -1058,6 +1341,10 @@ static void tcmu_free_device(struct se_device *dev) del_timer_sync(&udev->timeout); + mutex_lock(&root_udev_mutex); + list_del(&udev->node); + mutex_unlock(&root_udev_mutex); + vfree(udev->mb_addr); /* Upper layer should drain all requests before calling this */ @@ -1070,6 +1357,8 @@ static void tcmu_free_device(struct se_device *dev) spin_unlock_irq(&udev->commands_lock); WARN_ON(!all_expired); + tcmu_blocks_release(udev); + if (tcmu_dev_configured(udev)) { tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, udev->uio_info.uio_dev->minor); @@ -1256,6 +1545,84 @@ static struct target_backend_ops tcmu_ops = { .tb_dev_attrib_attrs = NULL, }; +static int unmap_thread_fn(void *data) +{ + struct tcmu_dev *udev; + loff_t off; + uint32_t start, end, block; + struct page *page; + int i; + + while (1) { + DEFINE_WAIT(__wait); + + prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&unmap_wait, &__wait); + + if (kthread_should_stop()) + break; + + mutex_lock(&root_udev_mutex); + list_for_each_entry(udev, &root_udev, node) { + mutex_lock(&udev->cmdr_lock); + + /* Try to complete the finished commands first */ + tcmu_handle_completions(udev); + + /* Skip the udevs waiting the global pool or in idle */ + if (udev->waiting_global || !udev->dbi_thresh) { + mutex_unlock(&udev->cmdr_lock); + continue; + } + + end = udev->dbi_max + 1; + block = find_last_bit(udev->data_bitmap, end); + if (block == udev->dbi_max) { + /* + * The last bit is dbi_max, so there is + * no need to shrink any blocks. + */ + mutex_unlock(&udev->cmdr_lock); + continue; + } else if (block == end) { + /* The current udev will goto idle state */ + udev->dbi_thresh = start = 0; + udev->dbi_max = 0; + } else { + udev->dbi_thresh = start = block + 1; + udev->dbi_max = block; + } + + /* Here will truncate the data area from off */ + off = udev->data_off + start * DATA_BLOCK_SIZE; + unmap_mapping_range(udev->inode->i_mapping, off, 0, 1); + + /* Release the block pages */ + for (i = start; i < end; i++) { + page = radix_tree_delete(&udev->data_blocks, i); + if (page) { + __free_page(page); + atomic_dec(&global_db_count); + } + } + mutex_unlock(&udev->cmdr_lock); + } + + /* + * Try to wake up the udevs who are waiting + * for the global data pool. + */ + list_for_each_entry(udev, &root_udev, node) { + if (udev->waiting_global) + wake_up(&udev->wait_cmdr); + } + mutex_unlock(&root_udev_mutex); + } + + return 0; +} + static int __init tcmu_module_init(void) { int ret, i, len = 0; @@ -1301,8 +1668,17 @@ static int __init tcmu_module_init(void) if (ret) goto out_attrs; + init_waitqueue_head(&unmap_wait); + unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap"); + if (IS_ERR(unmap_thread)) { + ret = PTR_ERR(unmap_thread); + goto out_unreg_transport; + } + return 0; +out_unreg_transport: + target_backend_unregister(&tcmu_ops); out_attrs: kfree(tcmu_attrs); out_unreg_genl: @@ -1317,6 +1693,7 @@ out_free_cache: static void __exit tcmu_module_exit(void) { + kthread_stop(unmap_thread); target_backend_unregister(&tcmu_ops); kfree(tcmu_attrs); genl_unregister_family(&tcmu_genl_family); |