// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/file.h>
#include <linux/seq_file.h>
#include <trace/events/block.h>

#include "md.h"
#include "md-bitmap.h"

/*
 * #### Background
 *
 * Redundant data is used to enhance data fault tolerance, and the storage
 * method for redundant data varies depending on the RAID level; it is
 * important to maintain the consistency of this redundant data.
 *
 * The bitmap is used to record which data blocks have been synchronized and
 * which ones need to be resynchronized or recovered. Each bit in the bitmap
 * represents a segment of data in the array. When a bit is set, it indicates
 * that the multiple redundant copies of that data segment may not be
 * consistent. Data synchronization can be performed based on the bitmap after
 * a power failure or after re-adding a disk. If there is no bitmap, a full
 * disk synchronization is required.
 *
 * #### Key Features
 *
 * - The IO fast path is lockless; if a user issues lots of write IO to the
 * same bitmap bit in a short time, only the first write has the additional
 * overhead of updating the bitmap bit, and there is no additional overhead
 * for the following writes;
 * - Support resyncing or recovering only written data, which means that when
 * creating a new array or replacing with a new disk, there is no need to do
 * a full disk resync/recovery;
 *
 * #### Key Concepts
 *
 * ##### State Machine
 *
 * Each bit is one byte and holds one of 6 different states, see
 * llbitmap_state. There are 8 different actions, see llbitmap_action, that
 * can change the state:
 *
 * llbitmap state machine: transitions between states
 *
 * |           | Startwrite | Startsync | Endsync | Abortsync |
 * | --------- | ---------- | --------- | ------- | --------- |
 * | Unwritten | Dirty      | x         | x       | x         |
 * | Clean     | Dirty      | x         | x       | x         |
 * | Dirty     | x          | x         | x       | x         |
 * | NeedSync  | x          | Syncing   | x       | x         |
 * | Syncing   | x          | Syncing   | Dirty   | NeedSync  |
 *
 * |           | Reload   | Daemon | Discard   | Stale     |
 * | --------- | -------- | ------ | --------- | --------- |
 * | Unwritten | x        | x      | x         | x         |
 * | Clean     | x        | x      | Unwritten | NeedSync  |
 * | Dirty     | NeedSync | Clean  | Unwritten | NeedSync  |
 * | NeedSync  | x        | x      | Unwritten | x         |
 * | Syncing   | NeedSync | x      | Unwritten | NeedSync  |
 *
 * Typical scenarios:
 *
 * 1) Create a new array
 * All bits will be set to Unwritten by default; if --assume-clean is set,
 * all bits will be set to Clean instead.
 *
 * 2) Write data; raid1/raid10 have a full copy of the data, while raid456
 * doesn't and relies on xor data
 *
 * 2.1) write new data to raid1/raid10:
 * Unwritten --StartWrite--> Dirty
 *
 * 2.2) write new data to raid456:
 * Unwritten --StartWrite--> NeedSync
 *
 * Because the initial recovery for raid456 is skipped, the xor data is not
 * built yet; the bit must be set to NeedSync first, and after the lazy initial
 * recovery is finished, the bit will finally be set to Dirty (see 5.1 and 5.4);
 *
 * 2.3) overwrite existing data
 * Clean --StartWrite--> Dirty
 *
 * 3) daemon, if the array is not degraded:
 * Dirty --Daemon--> Clean
 *
 * 4) discard
 * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten
 *
 * 5) resync and recovery
 *
 * 5.1) common process
 * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean
 *
 * 5.2) resync after power failure
 * Dirty --Reload--> NeedSync
 *
 * 5.3) recovery while replacing with a new disk
 * By default, the old bitmap framework will recover all data, and llbitmap
 * implements this by a new helper, see llbitmap_skip_sync_blocks:
 *
 * skip recovery for bits other than dirty or clean;
 *
 * 5.4) lazy initial recovery for raid5:
 * By default, the old bitmap framework will only allow a new recovery when
 * there are spares (new disks); a new recovery flag MD_RECOVERY_LAZY_RECOVER
 * is added to perform raid456 lazy recovery for set bits (from 2.2).
 *
 * 6) special handling for degraded arrays:
 *
 * - Dirty bits will never be cleared; the daemon will just do nothing, so
 * that if a disk is re-added, Clean bits can be skipped during recovery;
 * - Dirty bits will convert to Syncing from start write, to do data recovery
 * for newly added disks;
 * - New writes will convert bits to NeedSync directly;
 *
 * ##### Bitmap IO
 *
 * ##### Chunksize
 *
 * The default bitmap size is 128k, including a 1k bitmap super block, and
 * the default size of the segment of array data covered by each bit (the
 * chunksize) is 64k; the chunksize is doubled each time while the total
 * number of bits is not less than 127k (see llbitmap_init).
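 *
 * A worked example (illustrative numbers, not taken from this driver): for a
 * 1TiB array with 127k of bitmap space, a 64k chunksize would need 16M bits,
 * so the chunksize keeps doubling (128k, 256k, ...) until, at 16M, the
 * resulting 1TiB / 16M = 64k bits fit within the 127k limit.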
 *
 * ##### READ
 *
 * While creating the bitmap, all pages will be allocated and read for the
 * llbitmap; there won't be any reads afterwards.
 *
 * ##### WRITE
 *
 * WRITE IO is divided into units of the array's logical_block_size, and the
 * dirty state of each block is tracked independently. For example:
 *
 * each page is 4k and contains 8 blocks; each block is 512 bytes and contains
 * 512 bits:
 *
 * | page0 | page1 | ... | page 31 |
 * |       |
 * |        \-----------------------\
 * |                                |
 * | block0 | block1 | ... | block 7 |
 * |        |
 * |         \-----------------\
 * |                           |
 * | bit0 | bit1 | ... | bit511 |
 *
 * From the IO path, if one bit is changed to Dirty or NeedSync, the
 * corresponding subpage will be marked dirty, and such a block must be
 * written first, before the IO is issued. This behaviour will affect IO
 * performance; to reduce the impact, if multiple bits are changed in the same
 * block in a short time, all bits in this block will be changed to
 * Dirty/NeedSync, so that there won't be any overhead
 * until the daemon clears dirty bits.
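 *
 * A worked mapping example (hypothetical bit number; it mirrors the
 * arithmetic in llbitmap_read() and llbitmap_set_page_dirty() below),
 * assuming a 4k page and 512-byte blocks, for bit number 5000:
 *
 *	pos    = 5000 + BITMAP_DATA_OFFSET	= 6024
 *	idx    = pos >> PAGE_SHIFT		= 1	(page 1)
 *	offset = offset_in_page(pos)		= 1928
 *	block  = offset / io_size		= 3	(block 3 of page 1)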
 *
 * ##### Dirty Bits Synchronization
 *
 * The IO fast path will set bits to dirty, and those dirty bits will be
 * cleared by the daemon after the IO is done. llbitmap_page_ctl is used to
 * synchronize between the IO path and the daemon;
 *
 * IO path:
 * 1) try to grab a reference; if this succeeds, set the expire time to 5s
 * later and return;
 * 2) if grabbing a reference fails, wait for the daemon to finish clearing
 * dirty bits;
 *
 * Daemon (woken up every daemon_sleep seconds):
 * For each page:
 * 1) check if the page expired; if not, skip this page; for an expired page:
 * 2) suspend the page and wait for inflight write IO to be done;
 * 3) change the dirty page to clean;
 * 4) resume the page;
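 *
 * A minimal sketch of the IO-path side (simplified from
 * llbitmap_raise_barrier() below, for illustration only):
 *
 *	if (percpu_ref_tryget_live(&pctl->active)) {
 *		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
 *		return;
 *	}
 *	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
 *	(and then retry the tryget)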
 */

#define BITMAP_DATA_OFFSET 1024

/* 64k is the max IO size of sync IO for raid1/raid10 */
#define MIN_CHUNK_SIZE (64 * 2)

/* By default, the daemon will be woken up every 30s */
#define DEFAULT_DAEMON_SLEEP 30

/*
 * Dirtied bits that have not been accessed for more than 5s will be cleared
 * by the daemon.
 */
#define DEFAULT_BARRIER_IDLE 5

enum llbitmap_state {
	/* No valid data, init state after assembling the array */
	BitUnwritten = 0,
	/* data is consistent */
	BitClean,
	/* data will be consistent after IO is done, set directly for writes */
	BitDirty,
	/*
	 * data needs to be resynchronized:
	 * 1) set directly for writes if the array is degraded, to prevent a
	 * full disk synchronization after re-adding a disk;
	 * 2) reassemble the array after power failure, and dirty bits are
	 * found after reloading the bitmap;
	 * 3) set for the first write for raid5, to build initial xor data
	 * lazily
	 */
	BitNeedSync,
	/* data is synchronizing */
	BitSyncing,
	BitStateCount,
	BitNone = 0xff,
};

enum llbitmap_action {
	/* User writes new data, this is the only action from the IO fast path */
	BitmapActionStartwrite = 0,
	/* Start recovery */
	BitmapActionStartsync,
	/* Finish recovery */
	BitmapActionEndsync,
	/* Failed recovery */
	BitmapActionAbortsync,
	/* Reassemble the array */
	BitmapActionReload,
	/* Daemon thread is trying to clear dirty bits */
	BitmapActionDaemon,
	/* Data is deleted */
	BitmapActionDiscard,
	/*
	 * Bitmap is stale, mark all bits other than BitUnwritten as
	 * BitNeedSync.
	 */
	BitmapActionStale,
	BitmapActionCount,
	/* Init state is BitUnwritten */
	BitmapActionInit,
};

enum llbitmap_page_state {
	LLPageFlush = 0,
	LLPageDirty,
};

struct llbitmap_page_ctl {
	char *state;
	struct page *page;
	unsigned long expire;
	unsigned long flags;
	wait_queue_head_t wait;
	struct percpu_ref active;
	/* Per block-size dirty state, maximum 64k page / 1 sector (512 bytes) = 128 */
	unsigned long dirty[];
};

struct llbitmap {
	struct mddev *mddev;
	struct llbitmap_page_ctl **pctl;

	unsigned int nr_pages;
	unsigned int io_size;
	unsigned int blocks_per_page;

	/* shift of one chunk */
	unsigned long chunkshift;
	/* size of one chunk in sectors */
	unsigned long chunksize;
	/* total number of chunks */
	unsigned long chunks;
	unsigned long last_end_sync;
	/*
	 * time in seconds after which dirty bits will be cleared if the page
	 * is not accessed.
	 */
	unsigned long barrier_idle;
	/* fires on first BitDirty state */
	struct timer_list pending_timer;
	struct work_struct daemon_work;

	unsigned long flags;
	__u64 events_cleared;

	/* for slow disks */
	atomic_t behind_writes;
	wait_queue_head_t behind_wait;
};

struct llbitmap_unplug_work {
	struct work_struct work;
	struct llbitmap *llbitmap;
	struct completion *done;
};

static struct workqueue_struct *md_llbitmap_io_wq;
static struct workqueue_struct *md_llbitmap_unplug_wq;

static char state_machine[BitStateCount][BitmapActionCount] = {
	[BitUnwritten] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitNone,
		[BitmapActionStale] = BitNone,
	},
	[BitClean] = {
		[BitmapActionStartwrite] = BitDirty,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitDirty] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitNone,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitClean,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
	[BitNeedSync] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitNone,
		[BitmapActionAbortsync] = BitNone,
		[BitmapActionReload] = BitNone,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNone,
	},
	[BitSyncing] = {
		[BitmapActionStartwrite] = BitNone,
		[BitmapActionStartsync] = BitSyncing,
		[BitmapActionEndsync] = BitDirty,
		[BitmapActionAbortsync] = BitNeedSync,
		[BitmapActionReload] = BitNeedSync,
		[BitmapActionDaemon] = BitNone,
		[BitmapActionDiscard] = BitUnwritten,
		[BitmapActionStale] = BitNeedSync,
	},
};
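
/*
 * For illustration only (this comment block is not referenced by the code):
 * the table is indexed as state_machine[current_state][action]. For example,
 * the first write to a clean bit yields
 * state_machine[BitClean][BitmapActionStartwrite] == BitDirty, and the daemon
 * later clears it via state_machine[BitDirty][BitmapActionDaemon] == BitClean.
 * BitNone means the action does not change the state.
 */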

static void __llbitmap_flush(struct mddev *mddev);

static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos)
{
	unsigned int idx;
	unsigned int offset;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	offset = offset_in_page(pos);

	return llbitmap->pctl[idx]->state[offset];
}

/* set all the bits in the subpage as dirty */
static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
				       struct llbitmap_page_ctl *pctl,
				       unsigned int block)
{
	bool level_456 = raid_is_456(llbitmap->mddev);
	unsigned int io_size = llbitmap->io_size;
	int pos;

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		switch (pctl->state[pos]) {
		case BitUnwritten:
			pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
			break;
		case BitClean:
			pctl->state[pos] = BitDirty;
			break;
		}
	}
}

static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
				    int offset)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
	unsigned int io_size = llbitmap->io_size;
	int block = offset / io_size;
	int pos;

	if (!test_bit(LLPageDirty, &pctl->flags))
		set_bit(LLPageDirty, &pctl->flags);

	/*
	 * For a degraded array, dirty bits will never be cleared, and we must
	 * resync all the dirty bits; hence skip infecting new dirty bits, to
	 * prevent resyncing unnecessary data.
	 */
	if (llbitmap->mddev->degraded) {
		set_bit(block, pctl->dirty);
		return;
	}

	/*
	 * The subpage usually contains a total of 512 bits. If any single bit
	 * within the subpage is marked as dirty, the entire sector will be
	 * written. To avoid impacting write performance, when multiple bits
	 * within the same sector are modified within llbitmap->barrier_idle,
	 * all bits in the sector will be collectively marked as dirty at once.
	 */
	if (test_and_set_bit(block, pctl->dirty)) {
		llbitmap_infect_dirty_bits(llbitmap, pctl, block);
		return;
	}

	for (pos = block * io_size; pos < (block + 1) * io_size; pos++) {
		if (pos == offset)
			continue;
		if (pctl->state[pos] == BitDirty ||
		    pctl->state[pos] == BitNeedSync) {
			llbitmap_infect_dirty_bits(llbitmap, pctl, block);
			return;
		}
	}
}

static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
			   loff_t pos)
{
	unsigned int idx;
	unsigned int bit;

	pos += BITMAP_DATA_OFFSET;
	idx = pos >> PAGE_SHIFT;
	bit = offset_in_page(pos);

	llbitmap->pctl[idx]->state[bit] = state;
	if (state == BitDirty || state == BitNeedSync)
		llbitmap_set_page_dirty(llbitmap, idx, bit);
}

static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
{
	struct mddev *mddev = llbitmap->mddev;
	struct page *page = NULL;
	struct md_rdev *rdev;

	if (llbitmap->pctl && llbitmap->pctl[idx])
		page = llbitmap->pctl[idx]->page;
	if (page)
		return page;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return ERR_PTR(-ENOMEM);

	rdev_for_each(rdev, mddev) {
		sector_t sector;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		sector = mddev->bitmap_info.offset +
			 (idx << PAGE_SECTORS_SHIFT);

		if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ,
				 true))
			return page;

		md_error(mddev, rdev);
	}

	__free_page(page);
	return ERR_PTR(-EIO);
}

static void llbitmap_write_page(struct llbitmap *llbitmap, int idx)
{
	struct page *page = llbitmap->pctl[idx]->page;
	struct mddev *mddev = llbitmap->mddev;
	struct md_rdev *rdev;
	int block;

	for (block = 0; block < llbitmap->blocks_per_page; block++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (!test_and_clear_bit(block, pctl->dirty))
			continue;

		rdev_for_each(rdev, mddev) {
			sector_t sector;
			sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT;

			if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
				continue;

			sector = mddev->bitmap_info.offset + rdev->sb_start +
				 (idx << PAGE_SECTORS_SHIFT) +
				 block * bit_sector;
			md_write_metadata(mddev, rdev, sector,
					  llbitmap->io_size, page,
					  block * llbitmap->io_size);
		}
	}
}

static void active_release(struct percpu_ref *ref)
{
	struct llbitmap_page_ctl *pctl =
		container_of(ref, struct llbitmap_page_ctl, active);

	wake_up(&pctl->wait);
}

static void llbitmap_free_pages(struct llbitmap *llbitmap)
{
	int i;

	if (!llbitmap->pctl)
		return;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		if (!pctl || !pctl->page)
			break;

		__free_page(pctl->page);
		percpu_ref_exit(&pctl->active);
	}

	kfree(llbitmap->pctl[0]);
	kfree(llbitmap->pctl);
	llbitmap->pctl = NULL;
}

static int llbitmap_cache_pages(struct llbitmap *llbitmap)
{
	struct llbitmap_page_ctl *pctl;
	unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks +
					     BITMAP_DATA_OFFSET, PAGE_SIZE);
	unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS(
						llbitmap->blocks_per_page));
	int i;

	llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *),
				       GFP_KERNEL | __GFP_ZERO);
	if (!llbitmap->pctl)
		return -ENOMEM;

	size = round_up(size, cache_line_size());
	pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO);
	if (!pctl) {
		kfree(llbitmap->pctl);
		return -ENOMEM;
	}

	llbitmap->nr_pages = nr_pages;

	for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) {
		struct page *page = llbitmap_read_page(llbitmap, i);

		llbitmap->pctl[i] = pctl;

		if (IS_ERR(page)) {
			llbitmap_free_pages(llbitmap);
			return PTR_ERR(page);
		}

		if (percpu_ref_init(&pctl->active, active_release,
				    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
			__free_page(page);
			llbitmap_free_pages(llbitmap);
			return -ENOMEM;
		}

		pctl->page = page;
		pctl->state = page_address(page);
		init_waitqueue_head(&pctl->wait);
	}

	return 0;
}

static void llbitmap_init_state(struct llbitmap *llbitmap)
{
	enum llbitmap_state state = BitUnwritten;
	unsigned long i;

	if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
		state = BitClean;

	for (i = 0; i < llbitmap->chunks; i++)
		llbitmap_write(llbitmap, state, i);
}

/* The return value is only used from resync, where @start == @end. */
static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
						  unsigned long start,
						  unsigned long end,
						  enum llbitmap_action action)
{
	struct mddev *mddev = llbitmap->mddev;
	enum llbitmap_state state = BitNone;
	bool level_456 = raid_is_456(llbitmap->mddev);
	bool need_resync = false;
	bool need_recovery = false;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return BitNone;

	if (action == BitmapActionInit) {
		llbitmap_init_state(llbitmap);
		return BitNone;
	}

	while (start <= end) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount) {
			pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n",
			       __func__, start, c, action);
			state = BitNeedSync;
			goto write_bitmap;
		}

		if (c == BitNeedSync)
			need_resync = !mddev->degraded;

		state = state_machine[c][action];

write_bitmap:
		if (unlikely(mddev->degraded)) {
			/* For a degraded array, mark new data as need sync. */
			if (state == BitDirty &&
			    action == BitmapActionStartwrite)
				state = BitNeedSync;
			/*
			 * For a degraded array, resync dirty data as well;
			 * note that if the array is still degraded after
			 * resync is done, all new data will still be dirty
			 * until the array is clean.
			 */
			else if (c == BitDirty &&
				 action == BitmapActionStartsync)
				state = BitSyncing;
		} else if (c == BitUnwritten && state == BitDirty &&
			   action == BitmapActionStartwrite && level_456) {
			/* Delay raid456 initial recovery to first write. */
			state = BitNeedSync;
		}

		if (state == BitNone) {
			start++;
			continue;
		}

		llbitmap_write(llbitmap, state, start);

		if (state == BitNeedSync)
			need_resync = !mddev->degraded;
		else if (state == BitDirty &&
			 !timer_pending(&llbitmap->pending_timer))
			mod_timer(&llbitmap->pending_timer,
				  jiffies + mddev->bitmap_info.daemon_sleep * HZ);

		start++;
	}

	if (need_resync && level_456)
		need_recovery = true;

	if (need_recovery) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	} else if (need_resync) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}

	return state;
}

static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

retry:
	if (likely(percpu_ref_tryget_live(&pctl->active))) {
		WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ);
		return;
	}

	wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));
	goto retry;
}

static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_put(&pctl->active);
}
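
/*
 * Typical pairing, as used by llbitmap_start_write()/llbitmap_end_write()
 * below (a simplified sketch, for illustration only):
 *
 *	llbitmap_raise_barrier(llbitmap, idx);
 *	... issue write IO covering bits in page idx ...
 *	llbitmap_release_barrier(llbitmap, idx);
 */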

static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	percpu_ref_kill(&pctl->active);

	if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
				llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
		return -ETIMEDOUT;

	return 0;
}

static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx)
{
	struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx];

	pctl->expire = LONG_MAX;
	percpu_ref_resurrect(&pctl->active);
	wake_up(&pctl->wait);
}
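
/*
 * Daemon-side pairing (a simplified sketch of md_llbitmap_daemon_fn() below):
 *
 *	if (llbitmap_suspend_timeout(llbitmap, idx) == 0) {
 *		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
 *		llbitmap_resume(llbitmap, idx);
 *	}
 */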

static int llbitmap_check_support(struct mddev *mddev)
{
	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
		pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n",
			  mdname(mddev));
		return -EBUSY;
	}

	if (mddev->bitmap_info.space == 0) {
		if (mddev->bitmap_info.default_space == 0) {
			pr_notice("md/llbitmap: %s: no space for bitmap\n",
				  mdname(mddev));
			return -ENOSPC;
		}
	}

	if (!mddev->persistent) {
		pr_notice("md/llbitmap: %s: array must be persistent\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.file) {
		pr_notice("md/llbitmap: %s: doesn't support bitmap file\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev->bitmap_info.external) {
		pr_notice("md/llbitmap: %s: doesn't support external metadata\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	if (mddev_is_dm(mddev)) {
		pr_notice("md/llbitmap: %s: doesn't support dm-raid\n",
			  mdname(mddev));
		return -EOPNOTSUPP;
	}

	return 0;
}

static int llbitmap_init(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	sector_t blocks = mddev->resync_max_sectors;
	unsigned long chunksize = MIN_CHUNK_SIZE;
	unsigned long chunks = DIV_ROUND_UP(blocks, chunksize);
	unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT;
	int ret;

	while (chunks > space) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;
	mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP;

	ret = llbitmap_cache_pages(llbitmap);
	if (ret)
		return ret;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionInit);
	/* flush initial llbitmap to disk */
	__llbitmap_flush(mddev);

	return 0;
}

static int llbitmap_read_sb(struct llbitmap *llbitmap)
{
	struct mddev *mddev = llbitmap->mddev;
	unsigned long daemon_sleep;
	unsigned long chunksize;
	unsigned long events;
	struct page *sb_page;
	bitmap_super_t *sb;
	int ret = -EINVAL;

	if (!mddev->bitmap_info.offset) {
		pr_err("md/llbitmap: %s: no super block found\n", mdname(mddev));
		return -EINVAL;
	}

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("md/llbitmap: %s: read super block failed\n",
		       mdname(mddev));
		return -EIO;
	}

	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_err("md/llbitmap: %s: invalid super block magic number\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) {
		pr_err("md/llbitmap: %s: invalid super block version\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (memcmp(sb->uuid, mddev->uuid, 16)) {
		pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (mddev->bitmap_info.space == 0) {
		int room = le32_to_cpu(sb->sectors_reserved);

		if (room)
			mddev->bitmap_info.space = room;
		else
			mddev->bitmap_info.space = mddev->bitmap_info.default_space;
	}
	llbitmap->flags = le32_to_cpu(sb->state);
	if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) {
		ret = llbitmap_init(llbitmap);
		goto out_put_page;
	}

	chunksize = le32_to_cpu(sb->chunksize);
	if (!is_power_of_2(chunksize)) {
		pr_err("md/llbitmap: %s: chunksize not a power of 2\n",
		       mdname(mddev));
		goto out_put_page;
	}

	if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors,
					      mddev->bitmap_info.space << SECTOR_SHIFT)) {
		pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu\n",
		       mdname(mddev), chunksize, mddev->resync_max_sectors,
		       mddev->bitmap_info.space);
		goto out_put_page;
	}

	daemon_sleep = le32_to_cpu(sb->daemon_sleep);
	if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) {
		pr_err("md/llbitmap: %s: daemon sleep period %lu out of range\n",
		       mdname(mddev), daemon_sleep);
		goto out_put_page;
	}

	events = le64_to_cpu(sb->events);
	if (events < mddev->events) {
		pr_warn("md/llbitmap: %s: bitmap file is out of date (%lu < %llu) -- forcing full recovery\n",
			mdname(mddev), events, mddev->events);
		set_bit(BITMAP_STALE, &llbitmap->flags);
	}

	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	mddev->bitmap_info.chunksize = chunksize;
	mddev->bitmap_info.daemon_sleep = daemon_sleep;

	llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE;
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize);
	llbitmap->chunkshift = ffz(~chunksize);
	ret = llbitmap_cache_pages(llbitmap);

out_put_page:
	kunmap_local(sb);
	__free_page(sb_page);
	return ret;
}

static void llbitmap_pending_timer_fn(struct timer_list *pending_timer)
{
	struct llbitmap *llbitmap =
		container_of(pending_timer, struct llbitmap, pending_timer);

	if (work_busy(&llbitmap->daemon_work)) {
		pr_warn("md/llbitmap: %s: daemon_work not finished in %lu seconds\n",
			mdname(llbitmap->mddev),
			llbitmap->mddev->bitmap_info.daemon_sleep);
		set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags);
		return;
	}

	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
}

static void md_llbitmap_daemon_fn(struct work_struct *work)
{
	struct llbitmap *llbitmap =
		container_of(work, struct llbitmap, daemon_work);
	unsigned long start;
	unsigned long end;
	bool restart;
	int idx;

	if (llbitmap->mddev->degraded)
		return;
retry:
	start = 0;
	end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1;
	restart = false;

	for (idx = 0; idx < llbitmap->nr_pages; idx++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];

		if (idx > 0) {
			start = end + 1;
			end = min(end + PAGE_SIZE, llbitmap->chunks - 1);
		}

		if (!test_bit(LLPageFlush, &pctl->flags) &&
		    time_before(jiffies, pctl->expire)) {
			restart = true;
			continue;
		}

		if (llbitmap_suspend_timeout(llbitmap, idx) < 0) {
			pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n",
				mdname(llbitmap->mddev), __func__, idx);
			continue;
		}

		llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon);
		llbitmap_resume(llbitmap, idx);
	}

	/*
	 * If the daemon took a long time to finish, retry to prevent missing
	 * the clearing of dirty bits.
	 */
	if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags))
		goto retry;

	/* If some page is dirty but not expired, set up the timer again */
	if (restart)
		mod_timer(&llbitmap->pending_timer,
			  jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ);
}

static int llbitmap_create(struct mddev *mddev)
{
	struct llbitmap *llbitmap;
	int ret;

	ret = llbitmap_check_support(mddev);
	if (ret)
		return ret;

	llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL);
	if (!llbitmap)
		return -ENOMEM;

	llbitmap->mddev = mddev;
	llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0);
	llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size;

	timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0);
	INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn);
	atomic_set(&llbitmap->behind_writes, 0);
	init_waitqueue_head(&llbitmap->behind_wait);

	mutex_lock(&mddev->bitmap_info.mutex);
	mddev->bitmap = llbitmap;
	ret = llbitmap_read_sb(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
	if (ret) {
		kfree(llbitmap);
		mddev->bitmap = NULL;
	}

	return ret;
}

static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long chunks;

	if (chunksize == 0)
		chunksize = llbitmap->chunksize;

	/* If there is enough space, leave the chunksize unchanged. */
	chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) {
		chunksize = chunksize << 1;
		chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize);
	}

	llbitmap->chunkshift = ffz(~chunksize);
	llbitmap->chunksize = chunksize;
	llbitmap->chunks = chunks;

	return 0;
}

static int llbitmap_load(struct mddev *mddev)
{
	enum llbitmap_action action = BitmapActionReload;
	struct llbitmap *llbitmap = mddev->bitmap;

	if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags))
		action = BitmapActionStale;

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action);
	return 0;
}

static void llbitmap_destroy(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	mutex_lock(&mddev->bitmap_info.mutex);

	timer_delete_sync(&llbitmap->pending_timer);
	flush_workqueue(md_llbitmap_io_wq);
	flush_workqueue(md_llbitmap_unplug_wq);

	mddev->bitmap = NULL;
	llbitmap_free_pages(llbitmap);
	kfree(llbitmap);
	mutex_unlock(&mddev->bitmap_info.mutex);
}

static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
			       unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = offset >> llbitmap->chunkshift;
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
				   unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);

	while (page_start <= page_end) {
		llbitmap_raise_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
				 unsigned long sectors)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize);
	unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift;
	int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
	int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;

	while (page_start <= page_end) {
		llbitmap_release_barrier(llbitmap, page_start);
		page_start++;
	}
}

static void llbitmap_unplug_fn(struct work_struct *work)
{
	struct llbitmap_unplug_work *unplug_work =
		container_of(work, struct llbitmap_unplug_work, work);
	struct llbitmap *llbitmap = unplug_work->llbitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);

	for (i = 0; i < llbitmap->nr_pages; i++) {
		if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) ||
		    !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			continue;

		llbitmap_write_page(llbitmap, i);
	}

	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
	complete(unplug_work->done);
}

static bool llbitmap_dirty(struct llbitmap *llbitmap)
{
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags))
			return true;

	return false;
}

static void llbitmap_unplug(struct mddev *mddev, bool sync)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct llbitmap *llbitmap = mddev->bitmap;
	struct llbitmap_unplug_work unplug_work = {
		.llbitmap = llbitmap,
		.done = &done,
	};

	if (!llbitmap_dirty(llbitmap))
		return;

	/*
	 * Issuing new bitmap IO under the submit_bio() context will deadlock:
	 * - the bio will wait for the bitmap bio to be done, before it can be
	 *   issued;
	 * - the bitmap bio will be added to current->bio_list and wait for
	 *   this bio to be issued;
	 */
	INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn);
	queue_work(md_llbitmap_unplug_wq, &unplug_work.work);
	wait_for_completion(&done);
	destroy_work_on_stack(&unplug_work.work);
}

/*
 * Force writing all bitmap pages to disk; called when stopping the array, or
 * every daemon_sleep seconds while the sync_thread is running.
 */
static void __llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* mark all blocks as dirty */
		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
		llbitmap_write_page(llbitmap, i);
	}
	blk_finish_plug(&plug);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_flush(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++)
		set_bit(LLPageFlush, &llbitmap->pctl[i]->flags);

	timer_delete_sync(&llbitmap->pending_timer);
	queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work);
	flush_work(&llbitmap->daemon_work);

	__llbitmap_flush(mddev);
}

/* This is used for raid5 lazy initial recovery */
static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	return c == BitClean || c == BitDirty;
}

static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;
	int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	enum llbitmap_state c = llbitmap_read(llbitmap, p);

	/* always skip unwritten blocks */
	if (c == BitUnwritten)
		return blocks;

	/* For a degraded array, don't skip */
	if (mddev->degraded)
		return 0;

	/* For resync, also skip clean/dirty blocks */
	if ((c == BitClean || c == BitDirty) &&
	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		return blocks;

	return 0;
}

static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
				sector_t *blocks, bool degraded)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	/*
	 * Handle one bit at a time, this is much simpler. And it doesn't
	 * matter if md_do_sync() loops a few more times.
	 */
	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	return llbitmap_state_machine(llbitmap, p, p,
				      BitmapActionStartsync) == BitSyncing;
}

/* Something went wrong, and the sync_thread stopped at @offset */
static void llbitmap_end_sync(struct mddev *mddev, sector_t offset,
			      sector_t *blocks)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long p = offset >> llbitmap->chunkshift;

	*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
	llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1,
			       BitmapActionAbortsync);
}

/* A full sync_thread is finished */
static void llbitmap_close_sync(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	int i;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		/* let daemon_fn clear dirty bits immediately */
		WRITE_ONCE(pctl->expire, jiffies);
	}

	llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
			       BitmapActionEndsync);
}

/*
 * The sync_thread has reached @sector; update the metadata every daemon_sleep
 * seconds, just in case the sync_thread has to restart after a power failure.
 */
static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
				   bool force)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (sector == 0) {
		llbitmap->last_end_sync = jiffies;
		return;
	}

	if (time_before(jiffies, llbitmap->last_end_sync +
				 HZ * mddev->bitmap_info.daemon_sleep))
		return;

	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	mddev->curr_resync_completed = sector;
	set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
	llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift,
			       BitmapActionEndsync);
	__llbitmap_flush(mddev);

	llbitmap->last_end_sync = jiffies;
	sysfs_notify_dirent_safe(mddev->sysfs_completed);
}

static bool llbitmap_enabled(void *data, bool flush)
{
	struct llbitmap *llbitmap = data;

	return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
}

static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s,
				unsigned long e)
{
	llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite);
}

static void llbitmap_write_sb(struct llbitmap *llbitmap)
{
	int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size);

	bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks);
	llbitmap_write_page(llbitmap, 0);
	md_super_wait(llbitmap->mddev);
}

static void llbitmap_update_sb(void *data)
{
	struct llbitmap *llbitmap = data;
	struct mddev *mddev = llbitmap->mddev;
	struct page *sb_page;
	bitmap_super_t *sb;

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags))
		return;

	sb_page = llbitmap_read_page(llbitmap, 0);
	if (IS_ERR(sb_page)) {
		pr_err("%s: %s: read super block failed\n", __func__,
		       mdname(mddev));
		set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags);
		return;
	}

	if (mddev->events < llbitmap->events_cleared)
		llbitmap->events_cleared = mddev->events;

	sb = kmap_local_page(sb_page);
	sb->events = cpu_to_le64(mddev->events);
	sb->state = cpu_to_le32(llbitmap->flags);
	sb->chunksize = cpu_to_le32(llbitmap->chunksize);
	sb->sync_size = cpu_to_le64(mddev->resync_max_sectors);
	sb->events_cleared = cpu_to_le64(llbitmap->events_cleared);
	sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space);
	sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep);

	kunmap_local(sb);
	llbitmap_write_sb(llbitmap);
}

static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
	struct llbitmap *llbitmap = data;

	memset(stats, 0, sizeof(*stats));

	stats->missing_pages = 0;
	stats->pages = llbitmap->nr_pages;
	stats->file_pages = llbitmap->nr_pages;

	stats->behind_writes = atomic_read(&llbitmap->behind_writes);
	stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait);
	stats->events_cleared = llbitmap->events_cleared;

	return 0;
}

/* just flag all pages as needing to be written */
static void llbitmap_write_all(struct mddev *mddev)
{
	int i;
	struct llbitmap *llbitmap = mddev->bitmap;

	for (i = 0; i < llbitmap->nr_pages; i++) {
		struct llbitmap_page_ctl *pctl = llbitmap->pctl[i];

		set_bit(LLPageDirty, &pctl->flags);
		bitmap_fill(pctl->dirty, llbitmap->blocks_per_page);
	}
}

static void llbitmap_start_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	atomic_inc(&llbitmap->behind_writes);
}

static void llbitmap_end_behind_write(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (atomic_dec_and_test(&llbitmap->behind_writes))
		wake_up(&llbitmap->behind_wait);
}

static void llbitmap_wait_behind_writes(struct mddev *mddev)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	if (!llbitmap)
		return;

	wait_event(llbitmap->behind_wait,
		   atomic_read(&llbitmap->behind_writes) == 0);
}

static ssize_t bits_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	int bits[BitStateCount] = {0};
	loff_t start = 0;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "bitmap io error\n");
	}

	while (start < llbitmap->chunks) {
		enum llbitmap_state c = llbitmap_read(llbitmap, start);

		if (c < 0 || c >= BitStateCount)
			pr_err("%s: invalid bit %llu state %d\n",
			       __func__, start, c);
		else
			bits[c]++;
		start++;
	}

	mutex_unlock(&mddev->bitmap_info.mutex);
	return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
		       bits[BitUnwritten], bits[BitClean], bits[BitDirty],
		       bits[BitNeedSync], bits[BitSyncing]);
}

static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);

static ssize_t metadata_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap;
	ssize_t ret;

	mutex_lock(&mddev->bitmap_info.mutex);
	llbitmap = mddev->bitmap;
	if (!llbitmap) {
		mutex_unlock(&mddev->bitmap_info.mutex);
		return sprintf(page, "no bitmap\n");
	}

	ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n",
		      llbitmap->chunksize, llbitmap->chunkshift,
		      llbitmap->chunks, mddev->bitmap_info.offset,
		      llbitmap->mddev->bitmap_info.daemon_sleep);
	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}

static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata);

static ssize_t
daemon_sleep_show(struct mddev *mddev, char *page)
{
	return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep);
}

static ssize_t
daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	mddev->bitmap_info.daemon_sleep = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep);

static ssize_t
barrier_idle_show(struct mddev *mddev, char *page)
{
	struct llbitmap *llbitmap = mddev->bitmap;

	return sprintf(page, "%lu\n", llbitmap->barrier_idle);
}

static ssize_t
barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap = mddev->bitmap;
	unsigned long timeout;
	int rv = kstrtoul(buf, 10, &timeout);

	if (rv)
		return rv;

	llbitmap->barrier_idle = timeout;
	return len;
}

static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);

static struct attribute *md_llbitmap_attrs[] = {
	&llbitmap_bits.attr,
	&llbitmap_metadata.attr,
	&llbitmap_daemon_sleep.attr,
	&llbitmap_barrier_idle.attr,
	NULL
};

static struct attribute_group md_llbitmap_group = {
	.name = "llbitmap",
	.attrs = md_llbitmap_attrs,
};
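
/*
 * With this group registered, the attributes appear under the array's md
 * sysfs directory, e.g. (hypothetical device name):
 *
 *	cat /sys/block/md0/md/llbitmap/bits
 *	echo 10 > /sys/block/md0/md/llbitmap/barrier_idle
 */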

static struct bitmap_operations llbitmap_ops = {
	.head = {
		.type = MD_BITMAP,
		.id = ID_LLBITMAP,
		.name = "llbitmap",
	},

	.enabled = llbitmap_enabled,
	.create = llbitmap_create,
	.resize = llbitmap_resize,
	.load = llbitmap_load,
	.destroy = llbitmap_destroy,

	.start_write = llbitmap_start_write,
	.end_write = llbitmap_end_write,
	.start_discard = llbitmap_start_discard,
	.end_discard = llbitmap_end_discard,
	.unplug = llbitmap_unplug,
	.flush = llbitmap_flush,

	.start_behind_write = llbitmap_start_behind_write,
	.end_behind_write = llbitmap_end_behind_write,
	.wait_behind_writes = llbitmap_wait_behind_writes,

	.blocks_synced = llbitmap_blocks_synced,
	.skip_sync_blocks = llbitmap_skip_sync_blocks,
	.start_sync = llbitmap_start_sync,
	.end_sync = llbitmap_end_sync,
	.close_sync = llbitmap_close_sync,
	.cond_end_sync = llbitmap_cond_end_sync,

	.update_sb = llbitmap_update_sb,
	.get_stats = llbitmap_get_stats,
	.dirty_bits = llbitmap_dirty_bits,
	.write_all = llbitmap_write_all,

	.group = &md_llbitmap_group,
};

int md_llbitmap_init(void)
{
	md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io",
					    WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_io_wq)
		return -ENOMEM;

	md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug",
						WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
	if (!md_llbitmap_unplug_wq) {
		destroy_workqueue(md_llbitmap_io_wq);
		md_llbitmap_io_wq = NULL;
		return -ENOMEM;
	}

	return register_md_submodule(&llbitmap_ops.head);
}

void md_llbitmap_exit(void)
{
	destroy_workqueue(md_llbitmap_io_wq);
	md_llbitmap_io_wq = NULL;
	destroy_workqueue(md_llbitmap_unplug_wq);
	md_llbitmap_unplug_wq = NULL;
	unregister_md_submodule(&llbitmap_ops.head);
}