author     Linus Torvalds <torvalds@linux-foundation.org>  2020-04-01 15:33:12 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-04-01 15:33:12 -0700
commit     ffc1c20c46f74e24c3f03147688b4af6e429654a (patch)
tree       9db2b99d6d788761398e3a459b7d0924385b72e1 /drivers/md
parent     f365ab31efacb70bed1e821f7435626e0b2528a6 (diff)
parent     81d5553d1288c2ec0390f02f84d71ca0f0f9f137 (diff)
Merge tag 'for-5.7/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Add DM writecache "cleaner" policy feature that allows the cache to be
  flushed while userspace monitors for completion, so that use of caching
  can then be decommissioned (a sketch of the mechanism follows the
  change list below).
- Optimize DM writecache superblock writes, and yield the CPU while
  initializing writecache on large PMEM devices to avoid CPU stalls
  (a sketch follows the diffstat below).
- Various fixes to the DM integrity target while preparing for the
  ability to resize a DM integrity device. In addition to resize
  support, add optional discard support with the "allow_discards"
  feature (the discard tag handling and the new size probing are
  sketched after the change list below).
- Fix DM clone target's discard handling and overflow bugs that could
  cause data corruption (see the overflow and discard-rounding sketches
  after the change list).
- Fix memory leak in destructor for DM verity FEC support.
- Fix DM zoned target's redundant increment of nr_rnd_zones.
- Small cleanup in DM crypt to use crypt_integrity_aead() helper.
* tag 'for-5.7/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
dm clone metadata: Fix return type of dm_clone_nr_of_hydrated_regions()
dm clone: Add missing casts to prevent overflows and data corruption
dm clone: Add overflow check for number of regions
dm clone: Fix handling of partial region discards
dm writecache: add cond_resched to avoid CPU hangs
dm integrity: improve discard in journal mode
dm integrity: add optional discard support
dm integrity: allow resize of the integrity device
dm integrity: factor out get_provided_data_sectors()
dm integrity: don't replay journal data past the end of the device
dm integrity: remove sector type casts
dm integrity: fix a crash with unusually large tag size
dm zoned: remove duplicate nr_rnd_zones increase in dmz_init_zone()
dm verity fec: fix memory leak in verity_fec_dtr
dm writecache: optimize superblock write
dm writecache: implement gradual cleanup
dm writecache: implement the "cleaner" policy
dm writecache: do direct write if the cache is full
dm integrity: print device name in integrity_metadata() error message
dm crypt: use crypt_integrity_aead() helper
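
A few illustrative sketches of the techniques in this pull follow; they are
hedged userspace models written for this summary, not the kernel code itself.

The writecache "cleaner" policy above reuses the existing watermark
machinery: activate_cleaner() raises both freelist watermarks to the full
cache size, so writeback keeps draining until the cache is empty and
userspace can poll status until it is. A minimal model of that idea (names
modeled on, but not identical to, the driver's):

    #include <stddef.h>

    struct wc_model {
        size_t n_blocks;
        size_t freelist_high_watermark;  /* start writeback below this */
        size_t freelist_low_watermark;   /* write back until above this */
        int flush_on_suspend;
    };

    /* With both watermarks at n_blocks, the freelist is always "below
     * watermark", so writeback keeps running until every block is clean. */
    static void activate_cleaner(struct wc_model *wc)
    {
        wc->flush_on_suspend = 1;
        wc->freelist_high_watermark = wc->n_blocks;
        wc->freelist_low_watermark = wc->n_blocks;
    }

    int main(void)
    {
        struct wc_model wc = { .n_blocks = 65536 };

        activate_cleaner(&wc);
        return wc.freelist_low_watermark == wc.n_blocks ? 0 : 1;
    }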
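"dm clone: Add missing casts to prevent overflows and data corruption"
guards shifts that were done in 32-bit arithmetic before being widened to
sector_t. A minimal userspace sketch of the failure mode, with hypothetical
stand-in types (on a 32-bit kernel, unsigned long is 32 bits while sector_t
is 64):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;  /* 64-bit, as in the kernel */
    typedef uint32_t ulong32;   /* models "unsigned long" on a 32-bit kernel */

    static sector_t region_to_sector_buggy(ulong32 region_nr, unsigned shift)
    {
        return region_nr << shift;            /* shifted at 32 bits, high bits lost */
    }

    static sector_t region_to_sector_fixed(ulong32 region_nr, unsigned shift)
    {
        return (sector_t)region_nr << shift;  /* widened first, as the fix does */
    }

    int main(void)
    {
        ulong32 region_nr = 1u << 25;  /* large but valid region number */
        unsigned shift = 11;           /* 1 MiB regions = 2048 sectors */

        printf("buggy: %llu\n", (unsigned long long)region_to_sector_buggy(region_nr, shift));
        printf("fixed: %llu\n", (unsigned long long)region_to_sector_fixed(region_nr, shift));
        return 0;
    }

The buggy variant prints 0 here because the shift wraps in 32-bit
arithmetic before the implicit widening to 64 bits.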
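"dm clone: Fix handling of partial region discards" rounds a discard inward
so that only regions fully covered by it are affected: the start sector is
rounded up to a region boundary and the end sector down, which can
legitimately leave zero regions. A sketch of that arithmetic (the helper
name is invented for illustration; the kernel's equivalent is
bio_region_range()):

    #include <stdint.h>
    #include <stdio.h>

    static void sectors_to_full_regions(uint64_t start, uint64_t nr_sectors,
                                        unsigned region_shift,
                                        uint64_t *rs, uint64_t *nr_regions)
    {
        uint64_t region_size = 1ull << region_shift;
        uint64_t first = (start + region_size - 1) >> region_shift; /* round start up */
        uint64_t end = (start + nr_sectors) >> region_shift;        /* round end down */

        *rs = first;
        *nr_regions = first >= end ? 0 : end - first;  /* may cover no region at all */
    }

    int main(void)
    {
        uint64_t rs, nr;

        /* A discard of sectors [3, 10) with 8-sector regions covers no full region. */
        sectors_to_full_regions(3, 7, 3, &rs, &nr);
        printf("rs=%llu nr_regions=%llu\n",
               (unsigned long long)rs, (unsigned long long)nr);
        return 0;
    }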
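"dm integrity: add optional discard support" fills the tags of discarded
blocks with the byte 0xf6, so the read path must treat an all-filler tag as
"block was discarded" rather than as a checksum mismatch. A self-contained
sketch of that test; memchr_inv() is the kernel helper the driver uses,
reimplemented here so the example stands alone:

    #include <stddef.h>
    #include <stdio.h>

    #define DISCARD_FILLER 0xf6

    /* Kernel semantics: return the first byte that differs from c, or NULL. */
    static const void *memchr_inv(const void *s, int c, size_t n)
    {
        const unsigned char *p = s;

        for (size_t i = 0; i < n; i++)
            if (p[i] != (unsigned char)c)
                return p + i;
        return NULL;
    }

    int main(void)
    {
        unsigned char stored_tag[4] = { 0xf6, 0xf6, 0xf6, 0xf6 };

        if (!memchr_inv(stored_tag, DISCARD_FILLER, sizeof(stored_tag)))
            printf("all-filler tag: treat as discarded, not as corruption\n");
        return 0;
    }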
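"dm integrity: factor out get_provided_data_sectors()" (the basis of resize
support) computes the largest data size whose metadata still fits the
device by probing bits from high to low: set a bit, keep it only if the
limit check still passes. Assuming the check is monotone in the size, one
pass finds the maximum. A toy model; fits() is a made-up stand-in for the
kernel's calculate_device_limits():

    #include <stdint.h>
    #include <stdio.h>

    static int fits(uint64_t data_sectors, uint64_t device_sectors)
    {
        /* toy limit check: pretend metadata adds ~1/64 overhead */
        return data_sectors + data_sectors / 64 <= device_sectors;
    }

    static uint64_t max_provided_data_sectors(uint64_t device_sectors)
    {
        int top = 63 - __builtin_clzll(device_sectors);  /* like fls64() - 1 */
        uint64_t v = 0;

        for (int bit = top; bit >= 3; bit--) {
            uint64_t prev = v;

            v |= 1ull << bit;
            if (!fits(v, device_sectors))
                v = prev;  /* too big with this bit set: drop it, try lower bits */
        }
        return v;
    }

    int main(void)
    {
        printf("0x%llx\n", (unsigned long long)max_provided_data_sectors(1ull << 31));
        return 0;
    }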
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/dm-clone-metadata.c   |  15
-rw-r--r--  drivers/md/dm-clone-metadata.h   |   2
-rw-r--r--  drivers/md/dm-clone-target.c     |  66
-rw-r--r--  drivers/md/dm-crypt.c            |   6
-rw-r--r--  drivers/md/dm-integrity.c        | 304
-rw-r--r--  drivers/md/dm-verity-fec.c       |   1
-rw-r--r--  drivers/md/dm-writecache.c       | 138
-rw-r--r--  drivers/md/dm-zoned-metadata.c   |   1
8 files changed, 431 insertions(+), 102 deletions(-)
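
"dm writecache: add cond_resched to avoid CPU hangs" (visible in the
dm-writecache.c diff below) breaks up initialization loops that touch every
cache block and can otherwise monopolize a CPU on large PMEM devices. A
rough userspace analogue; sched_yield() merely stands in for the kernel's
cond_resched(), which is nearly free when no reschedule is pending:

    #include <sched.h>
    #include <stdlib.h>

    struct entry_model {       /* minimal stand-in for struct wc_entry */
        size_t index;
        int write_in_progress;
    };

    static void init_entries(struct entry_model *e, size_t n_blocks)
    {
        for (size_t b = 0; b < n_blocks; b++) {
            e[b].index = b;
            e[b].write_in_progress = 0;
            sched_yield();     /* the kernel loop calls cond_resched() here */
        }
    }

    int main(void)
    {
        size_t n = (size_t)1 << 16;
        struct entry_model *e = calloc(n, sizeof(*e));

        if (e) {
            init_entries(e, n);
            free(e);
        }
        return 0;
    }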
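"dm writecache: implement gradual cleanup" (also below) adds an optional
max_age: a timer re-queues writeback every max_age/16, and the writeback
path additionally selects entries older than max_age minus that slack, so
aged blocks drain steadily instead of all at once. A sketch of just the age
criterion, with jiffies modeled as a plain tick counter:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_AGE_DIV 16

    static int entry_needs_writeback(uint64_t now, uint64_t age_stamp,
                                     uint64_t max_age)
    {
        /* pick the entry slightly before it reaches max_age */
        return now - age_stamp >= max_age - max_age / MAX_AGE_DIV;
    }

    int main(void)
    {
        uint64_t max_age = 1000;  /* ticks */

        printf("%d\n", entry_needs_writeback(2000, 1100, max_age)); /* 0: still fresh */
        printf("%d\n", entry_needs_writeback(2000, 1000, max_age)); /* 1: old enough  */
        return 0;
    }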
diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index c05b12110456..17712456fa63 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -656,7 +656,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 	return (bit >= (start + nr_regions));
 }
 
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd)
 {
 	return bitmap_weight(cmd->region_map, cmd->nr_regions);
 }
@@ -850,6 +850,12 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
 	struct dirty_map *dmap;
 	unsigned long word, flags;
 
+	if (unlikely(region_nr >= cmd->nr_regions)) {
+		DMERR("Region %lu out of range (total number of regions %lu)",
+		      region_nr, cmd->nr_regions);
+		return -ERANGE;
+	}
+
 	word = region_nr / BITS_PER_LONG;
 
 	spin_lock_irqsave(&cmd->bitmap_lock, flags);
@@ -879,6 +885,13 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
 	struct dirty_map *dmap;
 	unsigned long word, region_nr;
 
+	if (unlikely(start >= cmd->nr_regions || (start + nr_regions) < start ||
+		     (start + nr_regions) > cmd->nr_regions)) {
+		DMERR("Invalid region range: start %lu, nr_regions %lu (total number of regions %lu)",
+		      start, nr_regions, cmd->nr_regions);
+		return -ERANGE;
+	}
+
 	spin_lock_irq(&cmd->bitmap_lock);
 
 	if (cmd->read_only) {
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 14af1ebd853f..d848b8799c07 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -156,7 +156,7 @@ bool dm_clone_is_range_hydrated(struct dm_clone_metadata *cmd,
 /*
  * Returns the number of hydrated regions.
  */
-unsigned long dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
+unsigned int dm_clone_nr_of_hydrated_regions(struct dm_clone_metadata *cmd);
 
 /*
  * Returns the first unhydrated region with region_nr >= @start
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index d1e1b5b56b1b..5ce96ddf1ce1 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -282,7 +282,7 @@ static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
 /* Get the address of the region in sectors */
 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
 {
-	return (region_nr << clone->region_shift);
+	return ((sector_t)region_nr << clone->region_shift);
 }
 
 /* Get the region number of the bio */
@@ -293,10 +293,17 @@ static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
 
 /* Get the region range covered by the bio */
 static void bio_region_range(struct clone *clone, struct bio *bio,
-			     unsigned long *rs, unsigned long *re)
+			     unsigned long *rs, unsigned long *nr_regions)
 {
+	unsigned long end;
+
 	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
-	*re = bio_end_sector(bio) >> clone->region_shift;
+	end = bio_end_sector(bio) >> clone->region_shift;
+
+	if (*rs >= end)
+		*nr_regions = 0;
+	else
+		*nr_regions = end - *rs;
 }
 
 /* Check whether a bio overwrites a region */
@@ -454,7 +461,7 @@ static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
 
 static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
 {
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 
 	/*
 	 * If the destination device supports discards, remap and trim the
 	 * discarded bio and pass it down. Otherwise complete the bio with
 	 * an error.
 	 */
 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
 		remap_to_dest(clone, bio);
-		bio_region_range(clone, bio, &rs, &re);
-		trim_bio(bio, rs << clone->region_shift,
-			 (re - rs) << clone->region_shift);
+		bio_region_range(clone, bio, &rs, &nr_regions);
+		trim_bio(bio, region_to_sector(clone, rs),
+			 nr_regions << clone->region_shift);
 		generic_make_request(bio);
 	} else
 		bio_endio(bio);
@@ -473,12 +480,21 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ
 
 static void process_discard_bio(struct clone *clone, struct bio *bio)
 {
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 
-	bio_region_range(clone, bio, &rs, &re);
-	BUG_ON(re > clone->nr_regions);
+	bio_region_range(clone, bio, &rs, &nr_regions);
+	if (!nr_regions) {
+		bio_endio(bio);
+		return;
+	}
 
-	if (unlikely(rs == re)) {
+	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
+		    (rs + nr_regions) > clone->nr_regions)) {
+		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
+		      clone_device_name(clone), rs, nr_regions,
+		      clone->nr_regions,
+		      (unsigned long long)bio->bi_iter.bi_sector,
+		      bio_sectors(bio));
 		bio_endio(bio);
 		return;
 	}
@@ -487,7 +503,7 @@ static void process_discard_bio(struct clone *clone, struct bio *bio)
 	 * The covered regions are already hydrated so we just need to pass
 	 * down the discard.
 	 */
-	if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
+	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
 		complete_discard_bio(clone, bio, true);
 		return;
 	}
@@ -788,11 +804,14 @@ static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr
 	struct dm_io_region from, to;
 	struct clone *clone = hd->clone;
 
+	if (WARN_ON(!nr_regions))
+		return;
+
 	region_size = clone->region_size;
 	region_start = hd->region_nr;
 	region_end = region_start + nr_regions - 1;
 
-	total_size = (nr_regions - 1) << clone->region_shift;
+	total_size = region_to_sector(clone, nr_regions - 1);
 
 	if (region_end == clone->nr_regions - 1) {
 		/*
@@ -1169,7 +1188,7 @@ static void process_deferred_discards(struct clone *clone)
 	int r = -EPERM;
 	struct bio *bio;
 	struct blk_plug plug;
-	unsigned long rs, re;
+	unsigned long rs, nr_regions;
 	struct bio_list discards = BIO_EMPTY_LIST;
 
 	spin_lock_irq(&clone->lock);
@@ -1185,14 +1204,13 @@ static void process_deferred_discards(struct clone *clone)
 
 	/* Update the metadata */
 	bio_list_for_each(bio, &discards) {
-		bio_region_range(clone, bio, &rs, &re);
+		bio_region_range(clone, bio, &rs, &nr_regions);
 		/*
 		 * A discard request might cover regions that have been already
 		 * hydrated. There is no need to update the metadata for these
 		 * regions.
 		 */
-		r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
-
+		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
 		if (unlikely(r))
 			break;
 	}
@@ -1455,7 +1473,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
 			goto error;
 		}
 
-		DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
+		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
 		       DM_CLONE_METADATA_BLOCK_SIZE,
 		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
 		       (unsigned long long)nr_metadata_blocks,
@@ -1775,6 +1793,7 @@ error:
 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	int r;
+	sector_t nr_regions;
 	struct clone *clone;
 	struct dm_arg_set as;
 
@@ -1816,7 +1835,16 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto out_with_source_dev;
 
 	clone->region_shift = __ffs(clone->region_size);
-	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+	nr_regions = dm_sector_div_up(ti->len, clone->region_size);
+
+	/* Check for overflow */
+	if (nr_regions != (unsigned long)nr_regions) {
+		ti->error = "Too many regions. Consider increasing the region size";
+		r = -EOVERFLOW;
+		goto out_with_source_dev;
+	}
+
+	clone->nr_regions = nr_regions;
 
 	r = validate_nr_regions(clone->nr_regions, &ti->error);
 	if (r)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index c6a529873d0f..3df90daba89e 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -230,6 +230,8 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static struct scatterlist *crypt_get_sg_data(struct crypt_config *cc,
 					     struct scatterlist *sg);
 
+static bool crypt_integrity_aead(struct crypt_config *cc);
+
 /*
  * Use this to access cipher attributes that are independent of the key.
  */
@@ -346,7 +348,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 	unsigned bs;
 	int log;
 
-	if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags))
+	if (crypt_integrity_aead(cc))
 		bs = crypto_aead_blocksize(any_tfm_aead(cc));
 	else
 		bs = crypto_skcipher_blocksize(any_tfm(cc));
@@ -712,7 +714,7 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv,
 static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) {
+	if (crypt_integrity_aead(cc)) {
 		ti->error = "AEAD transforms not supported for EBOIV";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 2f03fecd312d..b989d109d55d 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -39,6 +39,7 @@
 #define RECALC_WRITE_SUPER	16
 #define BITMAP_BLOCK_SIZE	4096	/* don't change it */
 #define BITMAP_FLUSH_INTERVAL	(10 * HZ)
+#define DISCARD_FILLER		0xf6
 
 /*
  * Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -257,6 +258,7 @@ struct dm_integrity_c {
 	bool just_formatted;
 	bool recalculate_flag;
 	bool fix_padding;
+	bool discard;
 
 	struct alg_spec internal_hash_alg;
 	struct alg_spec journal_crypt_alg;
@@ -284,7 +286,7 @@ struct dm_integrity_io {
 	struct work_struct work;
 
 	struct dm_integrity_c *ic;
-	bool write;
+	enum req_opf op;
 	bool fua;
 
 	struct dm_integrity_range range;
@@ -510,8 +512,8 @@ static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
 	if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
 		DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
-		       (unsigned long long)sector,
-		       (unsigned long long)n_sectors,
+		       sector,
+		       n_sectors,
 		       ic->sb->log2_sectors_per_block,
 		       ic->log2_blocks_per_bitmap_bit,
 		       mode);
@@ -1299,6 +1301,11 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
 static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
 			       unsigned *metadata_offset, unsigned total_size, int op)
 {
+#define MAY_BE_FILLER		1
+#define MAY_BE_HASH		2
+	unsigned hash_offset = 0;
+	unsigned may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+
 	do {
 		unsigned char *data, *dp;
 		struct dm_buffer *b;
@@ -1320,18 +1327,35 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
 		} else if (op == TAG_WRITE) {
 			memcpy(dp, tag, to_copy);
 			dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
-		} else {
+		} else {	/* e.g.: op == TAG_CMP */
-			if (unlikely(memcmp(dp, tag, to_copy))) {
-				unsigned i;
-				for (i = 0; i < to_copy; i++) {
-					if (dp[i] != tag[i])
-						break;
-					total_size--;
+			if (likely(is_power_of_2(ic->tag_size))) {
+				if (unlikely(memcmp(dp, tag, to_copy)))
+					if (unlikely(!ic->discard) ||
+					    unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy))) {
+						goto thorough_test;
+					}
+			} else {
+				unsigned i, ts;
+thorough_test:
+				ts = total_size;
+
+				for (i = 0; i < to_copy; i++, ts--) {
+					if (unlikely(dp[i] != tag[i]))
+						may_be &= ~MAY_BE_HASH;
+					if (likely(dp[i] != DISCARD_FILLER))
+						may_be &= ~MAY_BE_FILLER;
+					hash_offset++;
+					if (unlikely(hash_offset == ic->tag_size)) {
+						if (unlikely(!may_be)) {
+							dm_bufio_release(b);
+							return ts;
+						}
+						hash_offset = 0;
+						may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+					}
 				}
-				dm_bufio_release(b);
-				return total_size;
 			}
 		}
 		dm_bufio_release(b);
@@ -1342,10 +1366,17 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
 			(*metadata_block)++;
 			*metadata_offset = 0;
 		}
+
+		if (unlikely(!is_power_of_2(ic->tag_size))) {
+			hash_offset = (hash_offset + to_copy) % ic->tag_size;
+		}
+
 		total_size -= to_copy;
 	} while (unlikely(total_size));
 
 	return 0;
+#undef MAY_BE_FILLER
+#undef MAY_BE_HASH
 }
 
 static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
@@ -1428,7 +1459,7 @@ static void dec_in_flight(struct dm_integrity_io *dio)
 
 		remove_range(ic, &dio->range);
 
-		if (unlikely(dio->write))
+		if (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))
 			schedule_autocommit(ic);
 
 		bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
@@ -1519,15 +1550,20 @@ static void integrity_metadata(struct work_struct *w)
 		struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
 		char *checksums;
 		unsigned extra_space = unlikely(digest_size > ic->tag_size) ?
 				       digest_size - ic->tag_size : 0;
-		char checksums_onstack[HASH_MAX_DIGESTSIZE];
-		unsigned sectors_to_process = dio->range.n_sectors;
-		sector_t sector = dio->range.logical_sector;
+		char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+		sector_t sector;
+		unsigned sectors_to_process;
+		sector_t save_metadata_block;
+		unsigned save_metadata_offset;
 
 		if (unlikely(ic->mode == 'R'))
 			goto skip_io;
 
-		checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
-				    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+		if (likely(dio->op != REQ_OP_DISCARD))
+			checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
+					    GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
+		else
+			checksums = kmalloc(PAGE_SIZE, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
 		if (!checksums) {
 			checksums = checksums_onstack;
 			if (WARN_ON(extra_space &&
@@ -1537,6 +1573,43 @@ static void integrity_metadata(struct work_struct *w)
 			}
 		}
 
+		if (unlikely(dio->op == REQ_OP_DISCARD)) {
+			sector_t bi_sector = dio->bio_details.bi_iter.bi_sector;
+			unsigned bi_size = dio->bio_details.bi_iter.bi_size;
+			unsigned max_size = likely(checksums != checksums_onstack) ? PAGE_SIZE : HASH_MAX_DIGESTSIZE;
+			unsigned max_blocks = max_size / ic->tag_size;
+
+			memset(checksums, DISCARD_FILLER, max_size);
+
+			while (bi_size) {
+				unsigned this_step_blocks = bi_size >> (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+				this_step_blocks = min(this_step_blocks, max_blocks);
+				r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
+							this_step_blocks * ic->tag_size, TAG_WRITE);
+				if (unlikely(r)) {
+					if (likely(checksums != checksums_onstack))
+						kfree(checksums);
+					goto error;
+				}
+
+				/*if (bi_size < this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block)) {
+					printk("BUGG: bi_sector: %llx, bi_size: %u\n", bi_sector, bi_size);
+					printk("BUGG: this_step_blocks: %u\n", this_step_blocks);
+					BUG();
+				}*/
+				bi_size -= this_step_blocks << (SECTOR_SHIFT + ic->sb->log2_sectors_per_block);
+				bi_sector += this_step_blocks << ic->sb->log2_sectors_per_block;
+			}
+
+			if (likely(checksums != checksums_onstack))
+				kfree(checksums);
+			goto skip_io;
+		}
+
+		save_metadata_block = dio->metadata_block;
+		save_metadata_offset = dio->metadata_offset;
+		sector = dio->range.logical_sector;
+		sectors_to_process = dio->range.n_sectors;
+
 		__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
 			unsigned pos;
 			char *mem, *checksums_ptr;
@@ -1555,11 +1628,12 @@ again:
 			kunmap_atomic(mem);
 
 			r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
-						checksums_ptr - checksums, !dio->write ? TAG_CMP : TAG_WRITE);
+						checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
 			if (unlikely(r)) {
 				if (r > 0) {
-					DMERR_LIMIT("Checksum failed at sector 0x%llx",
-						    (unsigned long long)(sector - ((r + ic->tag_size - 1) / ic->tag_size)));
+					char b[BDEVNAME_SIZE];
+					DMERR_LIMIT("%s: Checksum failed at sector 0x%llx", bio_devname(bio, b),
+						    (sector - ((r + ic->tag_size - 1) / ic->tag_size)));
 					r = -EILSEQ;
 					atomic64_inc(&ic->number_of_mismatches);
 				}
@@ -1598,7 +1672,7 @@ again:
 				tag = lowmem_page_address(biv.bv_page) + biv.bv_offset;
 				this_len = min(biv.bv_len, data_to_process);
 				r = dm_integrity_rw_tag(ic, tag, &dio->metadata_block, &dio->metadata_offset,
-							this_len, !dio->write ? TAG_READ : TAG_WRITE);
+							this_len, dio->op == REQ_OP_READ ? TAG_READ : TAG_WRITE);
 				if (unlikely(r))
 					goto error;
 				data_to_process -= this_len;
@@ -1625,6 +1699,20 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 
 	dio->ic = ic;
 	dio->bi_status = 0;
+	dio->op = bio_op(bio);
+
+	if (unlikely(dio->op == REQ_OP_DISCARD)) {
+		if (ti->max_io_len) {
+			sector_t sec = dm_target_offset(ti, bio->bi_iter.bi_sector);
+			unsigned log2_max_io_len = __fls(ti->max_io_len);
+			sector_t start_boundary = sec >> log2_max_io_len;
+			sector_t end_boundary = (sec + bio_sectors(bio) - 1) >> log2_max_io_len;
+			if (start_boundary < end_boundary) {
+				sector_t len = ti->max_io_len - (sec & (ti->max_io_len - 1));
+				dm_accept_partial_bio(bio, len);
+			}
+		}
+	}
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		submit_flush_bio(ic, dio);
@@ -1632,8 +1720,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	}
 
 	dio->range.logical_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
-	dio->write = bio_op(bio) == REQ_OP_WRITE;
-	dio->fua = dio->write && bio->bi_opf & REQ_FUA;
+	dio->fua = dio->op == REQ_OP_WRITE && bio->bi_opf & REQ_FUA;
 	if (unlikely(dio->fua)) {
 		/*
 		 * Don't pass down the FUA flag because we have to flush
@@ -1643,18 +1730,18 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 	}
 	if (unlikely(dio->range.logical_sector + bio_sectors(bio) > ic->provided_data_sectors)) {
 		DMERR("Too big sector number: 0x%llx + 0x%x > 0x%llx",
-		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio),
-		      (unsigned long long)ic->provided_data_sectors);
+		      dio->range.logical_sector, bio_sectors(bio),
+		      ic->provided_data_sectors);
 		return DM_MAPIO_KILL;
 	}
 	if (unlikely((dio->range.logical_sector | bio_sectors(bio)) & (unsigned)(ic->sectors_per_block - 1))) {
 		DMERR("Bio not aligned on %u sectors: 0x%llx, 0x%x",
 		      ic->sectors_per_block,
-		      (unsigned long long)dio->range.logical_sector, bio_sectors(bio));
+		      dio->range.logical_sector, bio_sectors(bio));
 		return DM_MAPIO_KILL;
 	}
 
-	if (ic->sectors_per_block > 1) {
+	if (ic->sectors_per_block > 1 && likely(dio->op != REQ_OP_DISCARD)) {
 		struct bvec_iter iter;
 		struct bio_vec bv;
 		bio_for_each_segment(bv, bio, iter) {
@@ -1687,7 +1774,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
 		}
 	}
 
-	if (unlikely(ic->mode == 'R') && unlikely(dio->write))
+	if (unlikely(ic->mode == 'R') && unlikely(dio->op != REQ_OP_READ))
 		return DM_MAPIO_KILL;
 
 	get_area_and_offset(ic, dio->range.logical_sector, &area, &offset);
@@ -1717,13 +1804,13 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
 		bio_advance_iter(bio, &bio->bi_iter, bv.bv_len);
 retry_kmap:
 		mem = kmap_atomic(bv.bv_page);
-		if (likely(dio->write))
+		if (likely(dio->op == REQ_OP_WRITE))
 			flush_dcache_page(bv.bv_page);
 
 		do {
 			struct journal_entry *je = access_journal_entry(ic, journal_section, journal_entry);
 
-			if (unlikely(!dio->write)) {
+			if (unlikely(dio->op == REQ_OP_READ)) {
 				struct journal_sector *js;
 				char *mem_ptr;
 				unsigned s;
@@ -1748,12 +1835,12 @@ retry_kmap:
 				} while (++s < ic->sectors_per_block);
 #ifdef INTERNAL_VERIFY
 				if (ic->internal_hash) {
-					char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
+					char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
 
 					integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
 					if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
 						DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
-							    (unsigned long long)logical_sector);
+							    logical_sector);
 					}
 				}
 #endif
@@ -1770,7 +1857,7 @@ retry_kmap:
 					char *tag_addr;
 					BUG_ON(PageHighMem(biv.bv_page));
 					tag_addr = lowmem_page_address(biv.bv_page) + biv.bv_offset;
-					if (likely(dio->write))
+					if (likely(dio->op == REQ_OP_WRITE))
 						memcpy(tag_ptr, tag_addr, tag_now);
 					else
 						memcpy(tag_addr, tag_ptr, tag_now);
@@ -1778,12 +1865,12 @@ retry_kmap:
 					tag_ptr += tag_now;
 					tag_todo -= tag_now;
 				} while (unlikely(tag_todo)); else {
-					if (likely(dio->write))
+					if (likely(dio->op == REQ_OP_WRITE))
 						memset(tag_ptr, 0, tag_todo);
 				}
 			}
 
-		if (likely(dio->write)) {
+		if (likely(dio->op == REQ_OP_WRITE)) {
 			struct journal_sector *js;
 			unsigned s;
 
@@ -1819,12 +1906,12 @@ retry_kmap:
 			bv.bv_offset += ic->sectors_per_block << SECTOR_SHIFT;
 		} while (bv.bv_len -= ic->sectors_per_block << SECTOR_SHIFT);
 
-		if (unlikely(!dio->write))
+		if (unlikely(dio->op == REQ_OP_READ))
 			flush_dcache_page(bv.bv_page);
 		kunmap_atomic(mem);
 	} while (n_sectors);
 
-	if (likely(dio->write)) {
+	if (likely(dio->op == REQ_OP_WRITE)) {
 		smp_mb();
 		if (unlikely(waitqueue_active(&ic->copy_to_journal_wait)))
 			wake_up(&ic->copy_to_journal_wait);
@@ -1856,7 +1943,10 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map
 	unsigned journal_section, journal_entry;
 	unsigned journal_read_pos;
 	struct completion read_comp;
-	bool need_sync_io = ic->internal_hash && !dio->write;
+	bool discard_retried = false;
+	bool need_sync_io = ic->internal_hash && dio->op == REQ_OP_READ;
+	if (unlikely(dio->op == REQ_OP_DISCARD) && ic->mode != 'D')
+		need_sync_io = true;
 
 	if (need_sync_io && from_map) {
 		INIT_WORK(&dio->work, integrity_bio_wait);
@@ -1874,8 +1964,8 @@ retry:
 	}
 	dio->range.n_sectors = bio_sectors(bio);
 	journal_read_pos = NOT_FOUND;
-	if (likely(ic->mode == 'J')) {
-		if (dio->write) {
+	if (ic->mode == 'J' && likely(dio->op != REQ_OP_DISCARD)) {
+		if (dio->op == REQ_OP_WRITE) {
 			unsigned next_entry, i, pos;
 			unsigned ws, we, range_sectors;
 
@@ -1970,6 +2060,21 @@ offload_to_thread:
 			}
 		}
 	}
+	if (ic->mode == 'J' && likely(dio->op == REQ_OP_DISCARD) && !discard_retried) {
+		sector_t next_sector;
+		unsigned new_pos = find_journal_node(ic, dio->range.logical_sector, &next_sector);
+		if (unlikely(new_pos != NOT_FOUND) ||
+		    unlikely(next_sector < dio->range.logical_sector - dio->range.n_sectors)) {
+			remove_range_unlocked(ic, &dio->range);
+			spin_unlock_irq(&ic->endio_wait.lock);
+			queue_work(ic->commit_wq, &ic->commit_work);
+			flush_workqueue(ic->commit_wq);
+			queue_work(ic->writer_wq, &ic->writer_work);
+			flush_workqueue(ic->writer_wq);
+			discard_retried = true;
+			goto lock_retry;
+		}
+	}
 	spin_unlock_irq(&ic->endio_wait.lock);
 
 	if (unlikely(journal_read_pos != NOT_FOUND)) {
@@ -1978,7 +2083,7 @@ offload_to_thread:
 		goto journal_read_write;
 	}
 
-	if (ic->mode == 'B' && dio->write) {
+	if (ic->mode == 'B' && (dio->op == REQ_OP_WRITE || unlikely(dio->op == REQ_OP_DISCARD))) {
 		if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
 				     dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
 			struct bitmap_block_status *bbs;
@@ -2007,6 +2112,18 @@ offload_to_thread:
 	bio->bi_end_io = integrity_end_io;
 	bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT;
 
+	if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) {
+		integrity_metadata(&dio->work);
+		dm_integrity_flush_buffers(ic);
+
+		dio->in_flight = (atomic_t)ATOMIC_INIT(1);
+		dio->completion = NULL;
+
+		generic_make_request(bio);
+
+		return;
+	}
+
 	generic_make_request(bio);
 
 	if (need_sync_io) {
@@ -2193,6 +2310,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 				sec &= ~(sector_t)(ic->sectors_per_block - 1);
 			}
 		}
+		if (unlikely(sec >= ic->provided_data_sectors))
+			continue;
 		get_area_and_offset(ic, sec, &area, &offset);
 		restore_last_bytes(ic, access_journal_data(ic, i, j), je);
 		for (k = j + 1; k < ic->journal_section_entries; k++) {
@@ -2202,6 +2321,8 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
 				break;
 			BUG_ON(unlikely(journal_entry_is_inprogress(je2)) && !from_replay);
 			sec2 = journal_entry_get_sector(je2);
+			if (unlikely(sec2 >= ic->provided_data_sectors))
+				break;
 			get_area_and_offset(ic, sec2, &area2, &offset2);
 			if (area2 != area || offset2 != offset + ((k - j) << ic->sb->log2_sectors_per_block))
 				break;
@@ -2404,7 +2525,7 @@ next_chunk:
 		get_area_and_offset(ic, logical_sector, &area, &offset);
 	}
 
-	DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
+	DEBUG_print("recalculating: %llx, %llx\n", logical_sector, n_sectors);
 
 	if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
 		recalc_write_super(ic);
@@ -2828,9 +2949,29 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
 static void dm_integrity_resume(struct dm_target *ti)
 {
 	struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+	__u64 old_provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
 	int r;
+
 	DEBUG_print("resume\n");
 
+	if (ic->provided_data_sectors != old_provided_data_sectors) {
+		if (ic->provided_data_sectors > old_provided_data_sectors &&
+		    ic->mode == 'B' &&
+		    ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+			rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+			block_bitmap_op(ic, ic->journal, old_provided_data_sectors,
+					ic->provided_data_sectors - old_provided_data_sectors, BITMAP_OP_SET);
+			rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+					   ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+		}
+
+		ic->sb->provided_data_sectors = cpu_to_le64(ic->provided_data_sectors);
+		r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+		if (unlikely(r))
+			dm_integrity_io_error(ic, "writing superblock", r);
+	}
+
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
 		DEBUG_print("resume dirty_bitmap\n");
 		rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
@@ -2898,7 +3039,7 @@ static void dm_integrity_resume(struct dm_target *ti)
 	DEBUG_print("testing recalc: %x\n", ic->sb->flags);
 	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
 		__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
-		DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
+		DEBUG_print("recalc pos: %llx / %llx\n", recalc_pos, ic->provided_data_sectors);
 		if (recalc_pos < ic->provided_data_sectors) {
 			queue_work(ic->recalc_wq, &ic->recalc_work);
 		} else if (recalc_pos > ic->provided_data_sectors) {
@@ -2928,10 +3069,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 	switch (type) {
 	case STATUSTYPE_INFO:
 		DMEMIT("%llu %llu",
-		       (unsigned long long)atomic64_read(&ic->number_of_mismatches),
-		       (unsigned long long)ic->provided_data_sectors);
+		       atomic64_read(&ic->number_of_mismatches),
+		       ic->provided_data_sectors);
 		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
-			DMEMIT(" %llu", (unsigned long long)le64_to_cpu(ic->sb->recalc_sector));
+			DMEMIT(" %llu", le64_to_cpu(ic->sb->recalc_sector));
 		else
 			DMEMIT(" -");
 		break;
@@ -2944,6 +3085,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 		arg_count += !!ic->meta_dev;
 		arg_count += ic->sectors_per_block != 1;
 		arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+		arg_count += ic->discard;
 		arg_count += ic->mode == 'J';
 		arg_count += ic->mode == 'J';
 		arg_count += ic->mode == 'B';
@@ -2952,7 +3094,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 		arg_count += !!ic->journal_crypt_alg.alg_string;
 		arg_count += !!ic->journal_mac_alg.alg_string;
 		arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0;
-		DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start,
+		DMEMIT("%s %llu %u %c %u", ic->dev->name, ic->start,
 		       ic->tag_size, ic->mode, arg_count);
 		if (ic->meta_dev)
 			DMEMIT(" meta_device:%s", ic->meta_dev->name);
@@ -2960,6 +3102,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
 		if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
 			DMEMIT(" recalculate");
+		if (ic->discard)
+			DMEMIT(" allow_discards");
 		DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
 		DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
 		DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
@@ -2968,7 +3112,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" commit_time:%u", ic->autocommit_msec);
 		}
 		if (ic->mode == 'B') {
-			DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+			DMEMIT(" sectors_per_bit:%llu", (sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
 			DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
 		}
 		if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0)
@@ -3073,6 +3217,24 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
 	return 0;
 }
 
+static void get_provided_data_sectors(struct dm_integrity_c *ic)
+{
+	if (!ic->meta_dev) {
+		int test_bit;
+		ic->provided_data_sectors = 0;
+		for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
+			__u64 prev_data_sectors = ic->provided_data_sectors;
+
+			ic->provided_data_sectors |= (sector_t)1 << test_bit;
+			if (calculate_device_limits(ic))
+				ic->provided_data_sectors = prev_data_sectors;
+		}
+	} else {
+		ic->provided_data_sectors = ic->data_device_sectors;
+		ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+	}
+}
+
 static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sectors, unsigned interleave_sectors)
 {
 	unsigned journal_sections;
@@ -3100,20 +3262,15 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec
 		ic->sb->log2_interleave_sectors = max((__u8)MIN_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
 		ic->sb->log2_interleave_sectors = min((__u8)MAX_LOG2_INTERLEAVE_SECTORS, ic->sb->log2_interleave_sectors);
 
-		ic->provided_data_sectors = 0;
-		for (test_bit = fls64(ic->meta_device_sectors) - 1; test_bit >= 3; test_bit--) {
-			__u64 prev_data_sectors = ic->provided_data_sectors;
-
-			ic->provided_data_sectors |= (sector_t)1 << test_bit;
-			if (calculate_device_limits(ic))
-				ic->provided_data_sectors = prev_data_sectors;
-		}
+		get_provided_data_sectors(ic);
 		if (!ic->provided_data_sectors)
 			return -EINVAL;
 	} else {
 		ic->sb->log2_interleave_sectors = 0;
-		ic->provided_data_sectors = ic->data_device_sectors;
-		ic->provided_data_sectors &= ~(sector_t)(ic->sectors_per_block - 1);
+
+		get_provided_data_sectors(ic);
+		if (!ic->provided_data_sectors)
+			return -EINVAL;
 	}
 
 try_smaller_buffer:
 	ic->sb->journal_sections = cpu_to_le32(0);
@@ -3733,6 +3890,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			goto bad;
 		} else if (!strcmp(opt_string, "recalculate")) {
 			ic->recalculate_flag = true;
+		} else if (!strcmp(opt_string, "allow_discards")) {
+			ic->discard = true;
 		} else if (!strcmp(opt_string, "fix_padding")) {
 			ic->fix_padding = true;
 		} else {
@@ -3791,6 +3950,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
+	if (ic->discard && !ic->internal_hash) {
+		r = -EINVAL;
+		ti->error = "Discard can be only used with internal hash";
+		goto bad;
+	}
+
 	ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
 	ic->autocommit_msec = sync_msec;
 	timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3920,16 +4085,16 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 			goto bad;
 		}
 	}
-	ic->provided_data_sectors = le64_to_cpu(ic->sb->provided_data_sectors);
-	if (ic->provided_data_sectors != le64_to_cpu(ic->sb->provided_data_sectors)) {
-		/* test for overflow */
+	if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
 		r = -EINVAL;
-		ti->error = "The superblock has 64-bit device size, but the kernel was compiled with 32-bit sectors";
+		ti->error = "Journal mac mismatch";
 		goto bad;
 	}
-	if (!!(ic->sb->flags & cpu_to_le32(SB_FLAG_HAVE_JOURNAL_MAC)) != !!ic->journal_mac_alg.alg_string) {
+
+	get_provided_data_sectors(ic);
+	if (!ic->provided_data_sectors) {
 		r = -EINVAL;
-		ti->error = "Journal mac mismatch";
+		ti->error = "The device is too small";
 		goto bad;
 	}
@@ -3994,10 +4159,9 @@ try_smaller_buffer:
 	DEBUG_print("	initial_sectors 0x%x\n", ic->initial_sectors);
 	DEBUG_print("	metadata_run 0x%x\n", ic->metadata_run);
 	DEBUG_print("	log2_metadata_run %d\n", ic->log2_metadata_run);
-	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
-		    (unsigned long long)ic->provided_data_sectors);
+	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", ic->provided_data_sectors, ic->provided_data_sectors);
 	DEBUG_print("	log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
-	DEBUG_print("	bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
+	DEBUG_print("	bits_in_journal %llu\n", bits_in_journal);
 
 	if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
 		ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
@@ -4121,6 +4285,8 @@ try_smaller_buffer:
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
+	if (ic->discard)
+		ti->num_discard_bios = 1;
 
 	return 0;
 
@@ -4202,7 +4368,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
 	.name			= "integrity",
-	.version		= {1, 5, 0},
+	.version		= {1, 6, 0},
 	.module			= THIS_MODULE,
 	.features		= DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
 	.ctr			= dm_integrity_ctr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 3ceeb6b404ed..49147e634046 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -551,6 +551,7 @@ void verity_fec_dtr(struct dm_verity *v)
 	mempool_exit(&f->rs_pool);
 	mempool_exit(&f->prealloc_pool);
 	mempool_exit(&f->extra_pool);
+	mempool_exit(&f->output_pool);
 	kmem_cache_destroy(f->cache);
 
 	if (f->data_bufio)
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index a09bdc000e64..114927da9cc9 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -26,6 +26,8 @@
 #define AUTOCOMMIT_BLOCKS_SSD		65536
 #define AUTOCOMMIT_BLOCKS_PMEM		64
 #define AUTOCOMMIT_MSEC			1000
+#define MAX_AGE_DIV			16
+#define MAX_AGE_UNSPECIFIED		-1UL
 
 #define BITMAP_GRANULARITY	65536
 #if BITMAP_GRANULARITY < PAGE_SIZE
@@ -88,6 +90,7 @@ struct wc_entry {
 		:47
 #endif
 	;
+	unsigned long age;
 #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 	uint64_t original_sector;
 	uint64_t seq_count;
@@ -119,6 +122,7 @@ struct dm_writecache {
 	size_t writeback_size;
 	size_t freelist_high_watermark;
 	size_t freelist_low_watermark;
+	unsigned long max_age;
 
 	unsigned uncommitted_blocks;
 	unsigned autocommit_blocks;
@@ -130,6 +134,8 @@ struct dm_writecache {
 	struct timer_list autocommit_timer;
 	struct wait_queue_head freelist_wait;
 
+	struct timer_list max_age_timer;
+
 	atomic_t bio_in_progress[2];
 	struct wait_queue_head bio_in_progress_wait[2];
 
@@ -160,6 +166,7 @@ struct dm_writecache {
 	bool autocommit_time_set:1;
 	bool writeback_fua_set:1;
 	bool flush_on_suspend:1;
+	bool cleaner:1;
 
 	unsigned writeback_all;
 	struct workqueue_struct *writeback_wq;
@@ -502,6 +509,34 @@ static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 }
 
+static void ssd_commit_superblock(struct dm_writecache *wc)
+{
+	int r;
+	struct dm_io_region region;
+	struct dm_io_request req;
+
+	region.bdev = wc->ssd_dev->bdev;
+	region.sector = 0;
+	region.count = PAGE_SIZE;
+
+	if (unlikely(region.sector + region.count > wc->metadata_sectors))
+		region.count = wc->metadata_sectors - region.sector;
+
+	region.sector += wc->start_sector;
+
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = REQ_SYNC | REQ_FUA;
+	req.mem.type = DM_IO_VMA;
+	req.mem.ptr.vma = (char *)wc->memory_map;
+	req.client = wc->dm_io;
+	req.notify.fn = NULL;
+	req.notify.context = NULL;
+
+	r = dm_io(&req, 1, &region, NULL);
+	if (unlikely(r))
+		writecache_error(wc, r, "error writing superblock");
+}
+
 static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 {
 	if (WC_MODE_PMEM(wc))
@@ -596,6 +631,7 @@ static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *i
 	rb_link_node(&ins->rb_node, parent, node);
 	rb_insert_color(&ins->rb_node, &wc->tree);
 	list_add(&ins->lru, &wc->lru);
+	ins->age = jiffies;
 }
 
 static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
@@ -631,6 +667,16 @@ static inline void writecache_verify_watermark(struct dm_writecache *wc)
 		queue_work(wc->writeback_wq, &wc->writeback_work);
 }
 
+static void writecache_max_age_timer(struct timer_list *t)
+{
+	struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
+
+	if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+	}
+}
+
 static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
 	struct wc_entry *e;
@@ -741,8 +787,10 @@ static void writecache_flush(struct dm_writecache *wc)
 
 	wc->seq_count++;
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
-	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
-	writecache_commit_flushed(wc, false);
+	if (WC_MODE_PMEM(wc))
+		writecache_commit_flushed(wc, false);
+	else
+		ssd_commit_superblock(wc);
 
 	wc->overwrote_committed = false;
@@ -837,6 +885,7 @@ static void writecache_suspend(struct dm_target *ti)
 	bool flush_on_suspend;
 
 	del_timer_sync(&wc->autocommit_timer);
+	del_timer_sync(&wc->max_age_timer);
 
 	wc_lock(wc);
 	writecache_flush(wc);
@@ -876,6 +925,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc)
 		struct wc_entry *e = &wc->entries[b];
 		e->index = b;
 		e->write_in_progress = false;
+		cond_resched();
 	}
 
 	return 0;
@@ -930,6 +980,7 @@ static void writecache_resume(struct dm_target *ti)
 			e->original_sector = le64_to_cpu(wme.original_sector);
 			e->seq_count = le64_to_cpu(wme.seq_count);
 		}
+		cond_resched();
 	}
 #endif
 	for (b = 0; b < wc->n_blocks; b++) {
@@ -973,6 +1024,9 @@ erase_this:
 
 	writecache_verify_watermark(wc);
 
+	if (wc->max_age != MAX_AGE_UNSPECIFIED)
+		mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
+
 	wc_unlock(wc);
 }
 
@@ -1021,6 +1075,28 @@ static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_w
 	return 0;
 }
 
+static void activate_cleaner(struct dm_writecache *wc)
+{
+	wc->flush_on_suspend = true;
+	wc->cleaner = true;
+	wc->freelist_high_watermark = wc->n_blocks;
+	wc->freelist_low_watermark = wc->n_blocks;
+}
+
+static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	activate_cleaner(wc);
+	if (!dm_suspended(wc->ti))
+		writecache_verify_watermark(wc);
+	wc_unlock(wc);
+
+	return 0;
+}
+
 static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 			      char *result, unsigned maxlen)
 {
@@ -1031,6 +1107,8 @@ static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
 		r = process_flush_mesg(argc, argv, wc);
 	else if (!strcasecmp(argv[0], "flush_on_suspend"))
 		r = process_flush_on_suspend_mesg(argc, argv, wc);
+	else if (!strcasecmp(argv[0], "cleaner"))
+		r = process_cleaner_mesg(argc, argv, wc);
 	else
 		DMERR("unrecognised message received: %s", argv[0]);
 
@@ -1194,6 +1272,7 @@ read_next_block:
 		}
 	} else {
 		do {
+			bool found_entry = false;
 			if (writecache_has_error(wc))
 				goto unlock_error;
 			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
@@ -1204,9 +1283,25 @@ read_next_block:
 					wc->overwrote_committed = true;
 					goto bio_copy;
 				}
+				found_entry = true;
+			} else {
+				if (unlikely(wc->cleaner))
+					goto direct_write;
 			}
 			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
+				if (!found_entry) {
+direct_write:
+					e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+					if (e) {
+						sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+						BUG_ON(!next_boundary);
+						if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+							dm_accept_partial_bio(bio, next_boundary);
+						}
+					}
+					goto unlock_remap_origin;
+				}
 				writecache_wait_on_freelist(wc);
 				continue;
 			}
@@ -1619,7 +1714,9 @@ restart:
 	wbl.size = 0;
 	while (!list_empty(&wc->lru) &&
 	       (wc->writeback_all ||
-		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) {
+		wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
+		(jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
+		 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
 
 		n_walked++;
 		if (unlikely(n_walked > WRITEBACK_LATENCY) &&
@@ -1791,8 +1888,10 @@ static int init_memory(struct dm_writecache *wc)
 	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
 
-	for (b = 0; b < wc->n_blocks; b++)
+	for (b = 0; b < wc->n_blocks; b++) {
 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
+		cond_resched();
+	}
 
 	writecache_flush_all_metadata(wc);
 	writecache_commit_flushed(wc, false);
@@ -1882,9 +1981,11 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	wc->ti = ti;
 	mutex_init(&wc->lock);
+	wc->max_age = MAX_AGE_UNSPECIFIED;
 	writecache_poison_lists(wc);
 	init_waitqueue_head(&wc->freelist_wait);
 	timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
+	timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
 
 	for (i = 0; i < 2; i++) {
 		atomic_set(&wc->bio_in_progress[i], 0);
@@ -2058,6 +2159,16 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
 				goto invalid_optional;
 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
 			wc->autocommit_time_set = true;
+		} else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
+			unsigned max_age_msecs;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
+				goto invalid_optional;
+			if (max_age_msecs > 86400000)
+				goto invalid_optional;
+			wc->max_age = msecs_to_jiffies(max_age_msecs);
+		} else if (!strcasecmp(string, "cleaner")) {
+			wc->cleaner = true;
 		} else if (!strcasecmp(string, "fua")) {
 			if (WC_MODE_PMEM(wc)) {
 				wc->writeback_fua = true;
@@ -2235,6 +2346,9 @@ overflow:
 	do_div(x, 100);
 	wc->freelist_low_watermark = x;
 
+	if (wc->cleaner)
+		activate_cleaner(wc);
+
 	r = writecache_alloc_entries(wc);
 	if (r) {
 		ti->error = "Cannot allocate memory";
@@ -2278,9 +2392,9 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 		extra_args = 0;
 		if (wc->start_sector)
 			extra_args += 2;
-		if (wc->high_wm_percent_set)
+		if (wc->high_wm_percent_set && !wc->cleaner)
 			extra_args += 2;
-		if (wc->low_wm_percent_set)
+		if (wc->low_wm_percent_set && !wc->cleaner)
 			extra_args += 2;
 		if (wc->max_writeback_jobs_set)
 			extra_args += 2;
@@ -2288,19 +2402,21 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			extra_args += 2;
 		if (wc->autocommit_time_set)
 			extra_args += 2;
+		if (wc->cleaner)
+			extra_args++;
 		if (wc->writeback_fua_set)
 			extra_args++;
 
 		DMEMIT("%u", extra_args);
 		if (wc->start_sector)
 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
-		if (wc->high_wm_percent_set) {
+		if (wc->high_wm_percent_set && !wc->cleaner) {
 			x = (uint64_t)wc->freelist_high_watermark * 100;
 			x += wc->n_blocks / 2;
 			do_div(x, (size_t)wc->n_blocks);
 			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
 		}
-		if (wc->low_wm_percent_set) {
+		if (wc->low_wm_percent_set && !wc->cleaner) {
 			x = (uint64_t)wc->freelist_low_watermark * 100;
 			x += wc->n_blocks / 2;
 			do_div(x, (size_t)wc->n_blocks);
@@ -2312,6 +2428,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
 		if (wc->autocommit_time_set)
 			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+		if (wc->max_age != MAX_AGE_UNSPECIFIED)
+			DMEMIT(" max_age %u", jiffies_to_msecs(wc->max_age));
+		if (wc->cleaner)
+			DMEMIT(" cleaner");
 		if (wc->writeback_fua_set)
 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
 		break;
@@ -2320,7 +2440,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
 
 static struct target_type writecache_target = {
 	.name			= "writecache",
-	.version		= {1, 2, 0},
+	.version		= {1, 3, 0},
 	.module			= THIS_MODULE,
 	.ctr			= writecache_ctr,
 	.dtr			= writecache_dtr,
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 516c7b671d25..369de15c4e80 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1109,7 +1109,6 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
 	switch (blkz->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
 		set_bit(DMZ_RND, &zone->flags);
-		zmd->nr_rnd_zones++;
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
 	case BLK_ZONE_TYPE_SEQWRITE_PREF: