/*
* Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
*
* Module Author: Heinz Mauelshagen <heinzm@redhat.com>
*
* This file is released under the GPL.
*
*
* Linux 2.6 Device Mapper RAID4 and RAID5 target.
*
* Supports:
* o RAID4 with dedicated and selectable parity device
* o RAID5 with rotating parity (left+right, symmetric+asymmetric)
* o recovery of an out-of-sync device for initial
* RAID set creation or after dead drive replacement
* o run time optimization of xor algorithm used to calculate parity
*
*
* Thanks to MD for:
* o the raid address calculation algorithm
* o the base of the biovec <-> page list copier.
*
*
* Uses a region hash to keep track of how many writes are in flight to
* regions, using the dirty log to keep state of the regions to recover:
*
* o clean regions (those which are synchronized
* and don't have write io in flight)
* o dirty regions (those with write io in flight)
*
*
* On startup, any dirty regions are migrated to the
* 'nosync' state and are subject to recovery by the daemon.
*
* See raid_ctr() for table definition.
*
* FIXME: recovery bandwidth
*/
static const char *version = "v0.2594b";
#include "dm.h"
#include "dm-memcache.h"
#include "dm-message.h"
#include "dm-raid45.h"
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/raid/xor.h>
#include <linux/bio.h>
#include <linux/dm-io.h>
#include <linux/dm-dirty-log.h>
#include "dm-region-hash.h"
/*
* Configurable parameters
*/
/* Minimum/maximum and default # of selectable stripes. */
#define STRIPES_MIN 8
#define STRIPES_MAX 16384
#define STRIPES_DEFAULT 80
/* Maximum and default chunk size in sectors if not set in constructor. */
#define CHUNK_SIZE_MIN 8
#define CHUNK_SIZE_MAX 16384
#define CHUNK_SIZE_DEFAULT 64
/* Default io size in sectors if not set in constructor. */
#define IO_SIZE_MIN CHUNK_SIZE_MIN
#define IO_SIZE_DEFAULT IO_SIZE_MIN
/* Recover io size default in sectors. */
#define RECOVER_IO_SIZE_MIN 64
#define RECOVER_IO_SIZE_DEFAULT 256
/* Default, minimum and maximum percentage of recover io bandwidth. */
#define BANDWIDTH_DEFAULT 10
#define BANDWIDTH_MIN 1
#define BANDWIDTH_MAX 100
/* # of parallel recovered regions */
#define RECOVERY_STRIPES_MIN 1
#define RECOVERY_STRIPES_MAX 64
#define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
/*
* END Configurable parameters
*/
#define TARGET "dm-raid45"
#define DAEMON "kraid45d"
#define DM_MSG_PREFIX TARGET
#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
/* Amount/size for __xor(). */
#define XOR_SIZE PAGE_SIZE
/* Check value in range. */
#define range_ok(i, min, max) ((i) >= (min) && (i) <= (max))
/* Check argument is power of 2. */
#define POWER_OF_2(a) (!((a) & ((a) - 1)))
/* Structure access macros. */
/* Derive raid_set from stripe_cache pointer. */
#define RS(x) container_of(x, struct raid_set, sc)
/* Page reference. */
#define PAGE(stripe, p) ((stripe)->obj[p].pl->page)
/* Stripe chunk reference. */
#define CHUNK(stripe, p) ((stripe)->chunk + p)
/* Bio list reference. */
#define BL(stripe, p, rw) (stripe->chunk[p].bl + rw)
#define BL_CHUNK(chunk, rw) (chunk->bl + rw)
/* Page list reference. */
#define PL(stripe, p) (stripe->obj[p].pl)
/* END: structure access macros. */
/* Factor out to dm-bio-list.h */
static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
{
bio->bi_next = bl->head;
bl->head = bio;
if (!bl->tail)
bl->tail = bio;
}
/* Factor out to dm.h */
#define TI_ERR_RET(str, ret) \
do { ti->error = str; return ret; } while (0);
#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
/* Macro to define inline access functions for IO flags. */
#define BITOPS(name, what, var, flag) \
static inline int TestClear ## name ## what(struct var *v) \
{ return test_and_clear_bit(flag, &v->io.flags); } \
static inline int TestSet ## name ## what(struct var *v) \
{ return test_and_set_bit(flag, &v->io.flags); } \
static inline void Clear ## name ## what(struct var *v) \
{ clear_bit(flag, &v->io.flags); } \
static inline void Set ## name ## what(struct var *v) \
{ set_bit(flag, &v->io.flags); } \
static inline int name ## what(struct var *v) \
{ return test_bit(flag, &v->io.flags); }
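/*
* Illustrative example: BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
* further below expands to these inline helpers, all operating on the
* CHUNK_DIRTY bit of a struct stripe_chunk's io.flags:
*
* TestClearChunkDirty(c) - test_and_clear_bit(CHUNK_DIRTY, &c->io.flags)
* TestSetChunkDirty(c)   - test_and_set_bit(CHUNK_DIRTY, &c->io.flags)
* ClearChunkDirty(c)     - clear_bit(CHUNK_DIRTY, &c->io.flags)
* SetChunkDirty(c)       - set_bit(CHUNK_DIRTY, &c->io.flags)
* ChunkDirty(c)          - test_bit(CHUNK_DIRTY, &c->io.flags)
*/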
/*-----------------------------------------------------------------
* Stripe cache
*
* Cache for all reads and writes to raid sets (operational or degraded)
*
* We need to run all data to and from a RAID set through this cache,
* because parity chunks need to get calculated from data chunks
* or, in the degraded/resynchronization case, missing chunks need
* to be reconstructed using the other chunks of the stripe.
*---------------------------------------------------------------*/
/* A chunk within a stripe (holds bios hanging off). */
/* IO status flags for chunks of a stripe. */
enum chunk_flags {
CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */
CHUNK_ERROR, /* IO error on any chunk page. */
CHUNK_IO, /* Allow/prohibit IO on chunk pages. */
CHUNK_LOCKED, /* Chunk pages locked during IO. */
CHUNK_MUST_IO, /* Chunk must do io. */
CHUNK_UNLOCK, /* Enforce chunk unlock. */
CHUNK_UPTODATE, /* Chunk pages are uptodate. */
};
#if READ != 0 || WRITE != 1
#error dm-raid45: READ/WRITE != 0/1 used as index!!!
#endif
enum bl_type {
WRITE_QUEUED = WRITE + 1,
WRITE_MERGED,
NR_BL_TYPES, /* Must be last one! */
};
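/*
* With READ == 0 and WRITE == 1 (enforced above), a chunk's bl[] array
* below is indexed as bl[READ] = 0, bl[WRITE] = 1, bl[WRITE_QUEUED] = 2
* and bl[WRITE_MERGED] = 3, i.e. NR_BL_TYPES == 4 bio lists per chunk.
*/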
struct stripe_chunk {
atomic_t cnt; /* Reference count. */
struct stripe *stripe; /* Backpointer to stripe for endio(). */
/* Bio lists for reads, writes, and writes merged. */
struct bio_list bl[NR_BL_TYPES];
struct {
unsigned long flags; /* IO status flags. */
} io;
};
/* Define chunk bit operations. */
BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
/*
* Stripe linked list indexes. Keep order, because the stripe
* and the stripe cache rely on the first 3!
*/
enum list_types {
LIST_FLUSH, /* Stripes to flush for io. */
LIST_ENDIO, /* Stripes to endio. */
LIST_LRU, /* Least recently used stripes. */
SC_NR_LISTS, /* # of lists in stripe cache. */
LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */
LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
STRIPE_NR_LISTS,/* To size array in struct stripe. */
};
/* Addressing region recovery. */
struct recover_addr {
struct dm_region *reg; /* Actual region to recover. */
sector_t pos; /* Position within region to recover. */
sector_t end; /* End of region to recover. */
};
/* A stripe: the io object to handle all reads and writes to a RAID set. */
struct stripe {
atomic_t cnt; /* Reference count. */
struct stripe_cache *sc; /* Backpointer to stripe cache. */
/*
* 4 linked lists:
* o io list to flush io
* o endio list
* o LRU list to put stripes w/o reference count on
* o stripe cache hash
*/
struct list_head lists[STRIPE_NR_LISTS];
sector_t key; /* Hash key. */
region_t region; /* Region stripe is mapped to. */
struct {
unsigned long flags; /* Stripe state flags (see below). */
/*
* Pending ios in flight:
*
* used to control move of stripe to endio list
*/
atomic_t pending;
/* Sectors to read and write for multi page stripe sets. */
unsigned size;
} io;
/* Address region recovery. */
struct recover_addr *recover;
/* Lock on stripe (Future: for clustering). */
void *lock;
struct {
unsigned short parity; /* Parity chunk index. */
short recover; /* Recovery chunk index. */
} idx;
/*
* This stripe's memory cache object (dm-mem-cache);
* i.e. the io chunk pages.
*/
struct dm_mem_cache_object *obj;
/* Array of stripe chunks (dynamically allocated). */
struct stripe_chunk chunk[0];
};
/* States stripes can be in (flags field). */
enum stripe_states {
STRIPE_ERROR, /* io error on stripe. */
STRIPE_MERGED, /* Writes got merged to be written. */
STRIPE_RBW, /* Read-before-write stripe. */
STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */
STRIPE_RECONSTRUCTED, /* Reconstruction of a missing chunk done. */
STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
};
/* Define stripe bit operations. */
BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
/* A stripe hash. */
struct stripe_hash {
struct list_head *hash;
unsigned buckets;
unsigned mask;
unsigned prime;
unsigned shift;
};
enum sc_lock_types {
LOCK_ENDIO, /* Protect endio list. */
LOCK_LRU, /* Protect LRU list. */
NR_LOCKS, /* To size array in struct stripe_cache. */
};
/* A stripe cache. */
struct stripe_cache {
/* Stripe hash. */
struct stripe_hash hash;
spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */
/* Stripes with io to flush, stripes to endio and LRU lists. */
struct list_head lists[SC_NR_LISTS];
/* Slab cache to allocate stripes from. */
struct {
struct kmem_cache *cache; /* Cache itself. */
char name[32]; /* Unique name. */
} kc;
struct dm_io_client *dm_io_client; /* dm-io client resource context. */
/* dm-mem-cache client resource context. */
struct dm_mem_cache_client *mem_cache_client;
int stripes_parm; /* # stripes parameter from constructor. */
atomic_t stripes; /* actual # of stripes in cache. */
atomic_t stripes_to_set; /* # of stripes to resize cache to. */
atomic_t stripes_last; /* last # of stripes in cache. */
atomic_t active_stripes; /* actual # of active stripes in cache. */
/* REMOVEME: */
atomic_t active_stripes_max; /* actual # of active stripes in cache. */
};
/* Flag specs for raid_dev. */
enum raid_dev_flags {
DEV_FAILED, /* Device failed. */
DEV_IO_QUEUED, /* Io got queued to device. */
};
/* The raid device in a set. */
struct raid_dev {
struct dm_dev *dev;
sector_t start; /* Offset to map to. */
struct { /* Using struct to be able to BITOPS(). */
unsigned long flags; /* raid_dev_flags. */
} io;
};
BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
/* Flags spec for raid_set. */
enum raid_set_flags {
RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
RS_DEAD, /* RAID set inoperative. */
RS_DEGRADED, /* Io errors on RAID device. */
RS_DEVEL_STATS, /* REMOVEME: display status information. */
RS_RECOVER, /* Do recovery. */
RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
RS_SC_BUSY, /* Stripe cache busy -> send an event. */
RS_SUSPEND, /* Suspend RAID set. */
};
/* REMOVEME: devel stats counters. */
enum stats_types {
S_BIOS_READ,
S_BIOS_ADDED_READ,
S_BIOS_ENDIO_READ,
S_BIOS_WRITE,
S_BIOS_ADDED_WRITE,
S_BIOS_ENDIO_WRITE,
S_CAN_MERGE,
S_CANT_MERGE,
S_CONGESTED,
S_DM_IO_READ,
S_DM_IO_WRITE,
S_BANDWIDTH,
S_BARRIER,
S_BIO_COPY_PL_NEXT,
S_DEGRADED,
S_DELAYED_BIOS,
S_FLUSHS,
S_HITS_1ST,
S_IOS_POST,
S_INSCACHE,
S_MAX_LOOKUP,
S_CHUNK_LOCKED,
S_NO_BANDWIDTH,
S_NOT_CONGESTED,
S_NO_RW,
S_NOSYNC,
S_OVERWRITE,
S_PROHIBITCHUNKIO,
S_RECONSTRUCT_EI,
S_RECONSTRUCT_DEV,
S_RECONSTRUCT_SET,
S_RECONSTRUCTED,
S_REQUEUE,
S_STRIPE_ERROR,
S_SUM_DELAYED_BIOS,
S_XORS,
S_NR_STATS, /* # of stats counters. Must be last! */
};
/* Status type -> string mappings. */
struct stats_map {
const enum stats_types type;
const char *str;
};
static struct stats_map stats_map[] = {
{ S_BIOS_READ, "r=" },
{ S_BIOS_ADDED_READ, "/" },
{ S_BIOS_ENDIO_READ, "/" },
{ S_BIOS_WRITE, " w=" },
{ S_BIOS_ADDED_WRITE, "/" },
{ S_BIOS_ENDIO_WRITE, "/" },
{ S_DM_IO_READ, " rc=" },
{ S_DM_IO_WRITE, " wc=" },
{ S_BANDWIDTH, "\nbw=" },
{ S_NO_BANDWIDTH, " no_bw=" },
{ S_BARRIER, "\nbarrier=" },
{ S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
{ S_CAN_MERGE, "\nmerge=" },
{ S_CANT_MERGE, "/no_merge=" },
{ S_CHUNK_LOCKED, "\nchunk_locked=" },
{ S_CONGESTED, "\ncgst=" },
{ S_NOT_CONGESTED, "/not_cgst=" },
{ S_DEGRADED, "\ndegraded=" },
{ S_DELAYED_BIOS, "\ndel_bios=" },
{ S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
{ S_FLUSHS, "\nflushs=" },
{ S_HITS_1ST, "\nhits_1st=" },
{ S_IOS_POST, " ios_post=" },
{ S_INSCACHE, " inscache=" },
{ S_MAX_LOOKUP, " maxlookup=" },
{ S_NO_RW, "\nno_rw=" },
{ S_NOSYNC, " nosync=" },
{ S_OVERWRITE, " ovr=" },
{ S_PROHIBITCHUNKIO, " prhbt_io=" },
{ S_RECONSTRUCT_EI, "\nrec_ei=" },
{ S_RECONSTRUCT_DEV, " rec_dev=" },
{ S_RECONSTRUCT_SET, " rec_set=" },
{ S_RECONSTRUCTED, " rec=" },
{ S_REQUEUE, " requeue=" },
{ S_STRIPE_ERROR, " stripe_err=" },
{ S_XORS, " xors=" },
};
/*
* A RAID set.
*/
#define dm_rh_client dm_region_hash
enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
typedef void (*xor_function_t)(unsigned count, unsigned long **data);
struct raid_set {
struct dm_target *ti; /* Target pointer. */
struct {
unsigned long flags; /* State flags. */
struct mutex in_lock; /* Protects central input list below. */
struct bio_list in; /* Pending ios (central input list). */
struct bio_list work; /* ios work set. */
wait_queue_head_t suspendq; /* suspend synchronization. */
atomic_t in_process; /* counter of queued bios (suspendq). */
atomic_t in_process_max;/* counter of queued bios max. */
/* io work. */
struct workqueue_struct *wq;
struct delayed_work dws_do_raid; /* For main worker. */
struct work_struct ws_do_table_event; /* For event worker. */
} io;
/* Stripe locking abstraction. */
struct dm_raid45_locking_type *locking;
struct stripe_cache sc; /* Stripe cache for this set. */
/* Xor optimization. */
struct {
struct xor_func *f;
unsigned chunks;
unsigned speed;
} xor;
/* Recovery parameters. */
struct recover {
struct dm_dirty_log *dl; /* Dirty log. */
struct dm_rh_client *rh; /* Region hash. */
struct dm_io_client *dm_io_client; /* recovery dm-io client. */
/* dm-mem-cache client resource context for recovery stripes. */
struct dm_mem_cache_client *mem_cache_client;
struct list_head stripes; /* List of recovery stripes. */
region_t nr_regions;
region_t nr_regions_to_recover;
region_t nr_regions_recovered;
unsigned long start_jiffies;
unsigned long end_jiffies;
unsigned bandwidth; /* Recovery bandwidth [%]. */
unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
unsigned bandwidth_parm; /* " constructor parm. */
unsigned io_size; /* recovery io size <= region size. */
unsigned io_size_parm; /* recovery io size ctr parameter. */
unsigned recovery; /* Recovery allowed/prohibited. */
unsigned recovery_stripes; /* # of parallel recovery stripes. */
/* recovery io throttling. */
atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
unsigned long last_jiffies;
} recover;
/* RAID set parameters. */
struct {
struct raid_type *raid_type; /* RAID type (eg, RAID4). */
unsigned raid_parms; /* # variable raid parameters. */
unsigned chunk_size; /* Sectors per chunk. */
unsigned chunk_size_parm;
unsigned chunk_shift; /* rsector chunk size shift. */
unsigned io_size; /* Sectors per io. */
unsigned io_size_parm;
unsigned io_mask; /* Mask for bio_copy_page_list(). */
unsigned io_inv_mask; /* Mask for raid_address(). */
sector_t sectors_per_dev; /* Sectors per device. */
atomic_t failed_devs; /* Amount of devices failed. */
/* Index of device to initialize. */
int dev_to_init;
int dev_to_init_parm;
/* Raid devices dynamically allocated. */
unsigned raid_devs; /* # of RAID devices below. */
unsigned data_devs; /* # of RAID data devices. */
int ei; /* index of failed RAID device. */
/* Index of dedicated parity device (i.e. RAID4). */
int pi;
int pi_parm; /* constructor parm for status output. */
} set;
/* REMOVEME: devel stats counters. */
atomic_t stats[S_NR_STATS];
/* Dynamically allocated temporary pointers for xor(). */
unsigned long **data;
/* Dynamically allocated RAID devices. Alignment? */
struct raid_dev dev[0];
};
/* Define RAID set bit operations. */
BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
BITOPS(RS, Dead, raid_set, RS_DEAD)
BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
BITOPS(RS, Recover, raid_set, RS_RECOVER)
BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
#undef BITOPS
/*-----------------------------------------------------------------
* Raid-4/5 set structures.
*---------------------------------------------------------------*/
/* RAID level definitions. */
enum raid_level {
raid4,
raid5,
};
/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
enum raid_algorithm {
none,
left_asym,
right_asym,
left_sym,
right_sym,
};
struct raid_type {
const char *name; /* RAID algorithm. */
const char *descr; /* Descriptor text for logging. */
const unsigned parity_devs; /* # of parity devices. */
const unsigned minimal_devs; /* minimal # of devices in set. */
const enum raid_level level; /* RAID level. */
const enum raid_algorithm algorithm; /* RAID algorithm. */
};
/* Supported raid types and properties. */
static struct raid_type raid_types[] = {
{"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
{"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym},
{"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym},
{"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym},
{"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym},
};
/* Address as calculated by raid_address(). */
struct raid_address {
sector_t key; /* Hash key (address of stripe % chunk_size). */
unsigned di, pi; /* Data and parity disks index. */
};
/* REMOVEME: reset statistics counters. */
static void stats_reset(struct raid_set *rs)
{
unsigned s = S_NR_STATS;
while (s--)
atomic_set(rs->stats + s, 0);
}
/*----------------------------------------------------------------
* RAID set management routines.
*--------------------------------------------------------------*/
/*
* Begin small helper functions.
*/
/* Dummy wakeup callback needed by the region hash (called at dm_rh_dec()). */
static void wake_dummy(void *context) {}
/* Return # of io references. */
static int io_ref(struct raid_set *rs)
{
return atomic_read(&rs->io.in_process);
}
/* Get an io reference. */
static void io_get(struct raid_set *rs)
{
int p = atomic_inc_return(&rs->io.in_process);
if (p > atomic_read(&rs->io.in_process_max))
atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
}
/* Put the io reference and conditionally wake io waiters. */
static void io_put(struct raid_set *rs)
{
/* Intel: rebuild data corrupter? */
if (atomic_dec_and_test(&rs->io.in_process))
wake_up(&rs->io.suspendq);
else
BUG_ON(io_ref(rs) < 0);
}
/* Wait until all io has been processed. */
static void wait_ios(struct raid_set *rs)
{
wait_event(rs->io.suspendq, !io_ref(rs));
}
/* Queue (optionally delayed) io work. */
static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
{
queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
}
/* Queue io work immediately (called from region hash too). */
static void wake_do_raid(void *context)
{
struct raid_set *rs = context;
queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
}
/* Calculate device sector offset. */
static sector_t _sector(struct raid_set *rs, struct bio *bio)
{
sector_t sector = bio->bi_sector;
sector_div(sector, rs->set.data_devs);
return sector;
}
/* Return # of active stripes in stripe cache. */
static int sc_active(struct stripe_cache *sc)
{
return atomic_read(&sc->active_stripes);
}
/* Stripe cache busy indicator. */
static int sc_busy(struct raid_set *rs)
{
return sc_active(&rs->sc) >
atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
}
/* Set chunk states. */
enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
{
switch (type) {
case CLEAN:
ClearChunkDirty(chunk);
break;
case DIRTY:
SetChunkDirty(chunk);
break;
case ERROR:
SetChunkError(chunk);
SetStripeError(chunk->stripe);
return;
default:
BUG();
}
SetChunkUptodate(chunk);
SetChunkIo(chunk);
ClearChunkError(chunk);
}
/* Return region state for a sector. */
static int region_state(struct raid_set *rs, sector_t sector,
enum dm_rh_region_states state)
{
struct dm_rh_client *rh = rs->recover.rh;
region_t region = dm_rh_sector_to_region(rh, sector);
return !!(dm_rh_get_state(rh, region, 1) & state);
}
/*
* Return true in case a chunk should be read/written
*
* Conditions to read/write:
* o chunk not uptodate
* o chunk dirty
*
* Conditions to avoid io:
* o io already ongoing on chunk
* o io explicitly prohibited
*/
static int chunk_io(struct stripe_chunk *chunk)
{
/* 2nd run optimization (flag set below on first run). */
if (TestClearChunkMustIo(chunk))
return 1;
/* Avoid io if prohibited or a locked chunk. */
if (!ChunkIo(chunk) || ChunkLocked(chunk))
return 0;
if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
SetChunkMustIo(chunk); /* 2nd run optimization. */
return 1;
}
return 0;
}
/* Call a function on each chunk needing io unless device failed. */
static unsigned for_each_io_dev(struct stripe *stripe,
void (*f_io)(struct stripe *stripe, unsigned p))
{
struct raid_set *rs = RS(stripe->sc);
unsigned p, r = 0;
for (p = 0; p < rs->set.raid_devs; p++) {
if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
f_io(stripe, p);
r++;
}
}
return r;
}
/*
* Index of device to calculate parity on.
*
* Either the parity device index *or* the selected
* device to init after a spare replacement.
*/
static int dev_for_parity(struct stripe *stripe, int *sync)
{
struct raid_set *rs = RS(stripe->sc);
int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);
*sync = !r;
/* Reconstruct a particular device? */
if (r && rs->set.dev_to_init > -1)
return rs->set.dev_to_init;
else if (rs->set.raid_type->level == raid4)
return rs->set.pi;
else if (!StripeRecover(stripe))
return stripe->idx.parity;
else
return -1;
}
/* RAID set congested function. */
static int rs_congested(void *congested_data, int bdi_bits)
{
int r;
unsigned p;
struct raid_set *rs = congested_data;
if (sc_busy(rs) || RSSuspend(rs))
r = 1;
else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
/* If any of our component devices are overloaded. */
struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
r |= bdi_congested(&q->backing_dev_info, bdi_bits);
}
/* REMOVEME: statistics. */
atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
return r;
}
/* RAID device degrade check. */
static void rs_check_degrade_dev(struct raid_set *rs,
struct stripe *stripe, unsigned p)
{
if (TestSetDevFailed(rs->dev + p))
return;
/* Throw an event in case of member device errors. */
if (atomic_inc_return(&rs->set.failed_devs) >
rs->set.raid_type->parity_devs &&
!TestSetRSDead(rs)) {
/* Display RAID set dead message once. */
unsigned i;
char buf[BDEVNAME_SIZE];
DMERR("FATAL: too many devices failed -> RAID set broken");
for (i = 0; i < rs->set.raid_devs; i++) {
if (DevFailed(rs->dev + i))
DMERR("device /dev/%s failed",
bdevname(rs->dev[i].dev->bdev, buf));
}
}
/* Only log the first member error. */
if (!TestSetRSDegraded(rs)) {
char buf[BDEVNAME_SIZE];
/* Store index for recovery. */
rs->set.ei = p;
DMERR("CRITICAL: %sio error on device /dev/%s "
"in region=%llu; DEGRADING RAID set\n",
stripe ? "" : "FAKED ",
bdevname(rs->dev[p].dev->bdev, buf),
(unsigned long long) (stripe ? stripe->key : 0));
DMERR("further device error messages suppressed");
}
schedule_work(&rs->io.ws_do_table_event);
}
/* RAID set degrade check. */
static void rs_check_degrade(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
unsigned p = rs->set.raid_devs;
while (p--) {
if (ChunkError(CHUNK(stripe, p)))
rs_check_degrade_dev(rs, stripe, p);
}
}
/* Lookup a RAID device by its block device's major:minor number. */
static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
{
unsigned p;
struct raid_dev *dev;
/*
* Must be an incremental loop, because the device array
* can have empty slots still on calls from raid_ctr()
*/
for (dev = rs->dev, p = 0;
dev->dev && p < rs->set.raid_devs;
dev++, p++) {
if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
return p;
}
return -ENODEV;
}
/*
* End small helper functions.
*/
/*
* Stripe hash functions
*/
/* Initialize/destroy stripe hash. */
static int hash_init(struct stripe_hash *hash, unsigned stripes)
{
unsigned buckets = 2, max_buckets = stripes >> 1;
static unsigned hash_primes[] = {
/* Table of primes for hash_fn/table size optimization. */
1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
1543, 3079, 6151, 12289, 24593, 49157, 98317,
};
/* Calculate number of buckets (smallest power of 2 >= stripes / 2). */
while (buckets < max_buckets)
buckets <<= 1;
/* Allocate stripe hash buckets. */
hash->hash = vmalloc(buckets * sizeof(*hash->hash));
if (!hash->hash)
return -ENOMEM;
hash->buckets = buckets;
hash->mask = buckets - 1;
hash->shift = ffs(buckets);
if (hash->shift >= ARRAY_SIZE(hash_primes))
hash->shift = ARRAY_SIZE(hash_primes) - 1;
BUG_ON(hash->shift < 2);
hash->prime = hash_primes[hash->shift];
/* Initialize buckets. */
while (buckets--)
INIT_LIST_HEAD(hash->hash + buckets);
return 0;
}
static void hash_exit(struct stripe_hash *hash)
{
if (hash->hash) {
vfree(hash->hash);
hash->hash = NULL;
}
}
static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
{
return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
}
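/*
* Worked example: for a cache of 128 stripes, hash_init() yields
* buckets = 64, mask = 63, shift = ffs(64) = 7 and
* prime = hash_primes[7] = 97, so hash_fn() maps a key to bucket
* ((key * 97) >> 7) & 63.
*/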
static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
{
return hash->hash + hash_fn(hash, key);
}
/* Insert an entry into a hash. */
static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
{
list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
}
/* Lookup an entry in the stripe hash. */
static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
{
unsigned look = 0;
struct stripe *stripe;
struct list_head *bucket = hash_bucket(&sc->hash, key);
list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
look++;
if (stripe->key == key) {
/* REMOVEME: statistics. */
if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
return stripe;
}
}
return NULL;
}
/* Resize the stripe cache hash on size changes. */
static int sc_hash_resize(struct stripe_cache *sc)
{
/* Resize indicated? */
if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
int r;
struct stripe_hash hash;
r = hash_init(&hash, atomic_read(&sc->stripes));
if (r)
return r;
if (sc->hash.hash) {
unsigned b = sc->hash.buckets;
struct list_head *pos, *tmp;
/* Walk old buckets and insert into new. */
while (b--) {
list_for_each_safe(pos, tmp, sc->hash.hash + b)
stripe_insert(&hash,
list_entry(pos, struct stripe,
lists[LIST_HASH]));
}
}
hash_exit(&sc->hash);
memcpy(&sc->hash, &hash, sizeof(sc->hash));
atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
}
return 0;
}
/* End stripe hash functions. */
/* List add, delete, push and pop functions. */
/* Delete a list entry if it is on a list. */
#define DEL_LIST(lh) \
do { \
if (!list_empty(lh)) \
list_del_init(lh); \
} while (0)
/* Delete stripe from hash. */
static void stripe_hash_del(struct stripe *stripe)
{
DEL_LIST(stripe->lists + LIST_HASH);
}
/* Return stripe reference count. */
static inline int stripe_ref(struct stripe *stripe)
{
return atomic_read(&stripe->cnt);
}
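/* Add stripe to flush list. */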
static void stripe_flush_add(struct stripe *stripe)
{
struct stripe_cache *sc = stripe->sc;
struct list_head *lh = stripe->lists + LIST_FLUSH;
if (!StripeReconstruct(stripe) && list_empty(lh))
list_add_tail(lh, sc->lists + LIST_FLUSH);
}
/*
* Add stripe to LRU (inactive) list.
*
* Need lock, because of concurrent access from message interface.
*/
static void stripe_lru_add(struct stripe *stripe)
{
if (!StripeRecover(stripe)) {
unsigned long flags;
struct list_head *lh = stripe->lists + LIST_LRU;
spinlock_t *lock = stripe->sc->locks + LOCK_LRU;
spin_lock_irqsave(lock, flags);
if (list_empty(lh))
list_add_tail(lh, stripe->sc->lists + LIST_LRU);
spin_unlock_irqrestore(lock, flags);
}
}
#define POP_LIST(list) \
do { \
if (list_empty(sc->lists + (list))) \
stripe = NULL; \
else { \
stripe = list_first_entry(sc->lists + (list), \
struct stripe, \
lists[(list)]); \
list_del_init(stripe->lists + (list)); \
} \
} while (0)
/* Pop an available stripe off the LRU list. */
static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
{
struct stripe *stripe;
spinlock_t *lock = sc->locks + LOCK_LRU;
spin_lock_irq(lock);
POP_LIST(LIST_LRU);
spin_unlock_irq(lock);
return stripe;
}
/* Pop an available stripe off the io list. */
static struct stripe *stripe_io_pop(struct stripe_cache *sc)
{
struct stripe *stripe;
POP_LIST(LIST_FLUSH);
return stripe;
}
/* Push a stripe safely onto the endio list to be handled by do_endios(). */
static void stripe_endio_push(struct stripe *stripe)
{
unsigned long flags;
struct stripe_cache *sc = stripe->sc;
struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
*sc_list = sc->lists + LIST_ENDIO;
spinlock_t *lock = sc->locks + LOCK_ENDIO;
/* This runs in parallel with do_endios(). */
spin_lock_irqsave(lock, flags);
if (list_empty(stripe_list))
list_add_tail(stripe_list, sc_list);
spin_unlock_irqrestore(lock, flags);
wake_do_raid(RS(sc)); /* Wake myself. */
}
/* Safely pop a stripe off the endio list. */
static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
{
struct stripe *stripe;
spinlock_t *lock = sc->locks + LOCK_ENDIO;
/* This runs in parallel with endio(). */
spin_lock_irq(lock);
POP_LIST(LIST_ENDIO);
spin_unlock_irq(lock);
return stripe;
}
#undef POP_LIST
/*
* Stripe cache locking functions
*/
/* Dummy lock function for single host RAID4+5. */
static void *no_lock(sector_t key, enum dm_lock_type type)
{
return &no_lock;
}
/* Dummy unlock function for single host RAID4+5. */
static void no_unlock(void *lock_handle)
{
}
/* No locking (for single host RAID 4+5). */
static struct dm_raid45_locking_type locking_none = {
.lock = no_lock,
.unlock = no_unlock,
};
/* Lock a stripe (for clustering). */
static int
stripe_lock(struct stripe *stripe, int rw, sector_t key)
{
stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ?
DM_RAID45_SHARED : DM_RAID45_EX);
return stripe->lock ? 0 : -EPERM;
}
/* Unlock a stripe (for clustering). */
static void stripe_unlock(struct stripe *stripe)
{
RS(stripe->sc)->locking->unlock(stripe->lock);
stripe->lock = NULL;
}
/* Test io pending on stripe. */
static int stripe_io_ref(struct stripe *stripe)
{
return atomic_read(&stripe->io.pending);
}
static void stripe_io_get(struct stripe *stripe)
{
if (atomic_inc_return(&stripe->io.pending) == 1)
/* REMOVEME: statistics */
atomic_inc(&stripe->sc->active_stripes);
else
BUG_ON(stripe_io_ref(stripe) < 0);
}
static void stripe_io_put(struct stripe *stripe)
{
if (atomic_dec_and_test(&stripe->io.pending)) {
if (unlikely(StripeRecover(stripe)))
/* Don't put recovery stripe on endio list. */
wake_do_raid(RS(stripe->sc));
else
/* Add regular stripe to endio list and wake daemon. */
stripe_endio_push(stripe);
/* REMOVEME: statistics */
atomic_dec(&stripe->sc->active_stripes);
} else
BUG_ON(stripe_io_ref(stripe) < 0);
}
/* Take out a reference on a stripe. */
static int stripe_get(struct stripe *stripe)
{
int r;
struct list_head *lh = stripe->lists + LIST_LRU;
spinlock_t *lock = stripe->sc->locks + LOCK_LRU;
/* Delete stripe from LRU (inactive) list if on. */
spin_lock_irq(lock);
DEL_LIST(lh);
spin_unlock_irq(lock);
BUG_ON(stripe_ref(stripe) < 0);
/* Lock stripe on first reference */
r = (atomic_inc_return(&stripe->cnt) == 1) ?
stripe_lock(stripe, WRITE, stripe->key) : 0;
return r;
}
#undef DEL_LIST
/* Return references on a chunk. */
static int chunk_ref(struct stripe_chunk *chunk)
{
return atomic_read(&chunk->cnt);
}
/* Take out reference on a chunk. */
static int chunk_get(struct stripe_chunk *chunk)
{
return atomic_inc_return(&chunk->cnt);
}
/* Drop reference on a chunk. */
static void chunk_put(struct stripe_chunk *chunk)
{
BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
}
/*
* Drop reference on a stripe.
*
* Move it to list of LRU stripes if zero.
*/
static void stripe_put(struct stripe *stripe)
{
if (atomic_dec_and_test(&stripe->cnt)) {
BUG_ON(stripe_io_ref(stripe));
stripe_unlock(stripe);
} else
BUG_ON(stripe_ref(stripe) < 0);
}
/* Helper needed by for_each_io_dev(). */
static void stripe_get_references(struct stripe *stripe, unsigned p)
{
/*
* Another one to reference the stripe in
* order to protect vs. LRU list moves.
*/
io_get(RS(stripe->sc)); /* Global io references. */
stripe_get(stripe);
stripe_io_get(stripe); /* One for each chunk io. */
}
/* Helper for endio() to put all taken references. */
static void stripe_put_references(struct stripe *stripe)
{
stripe_io_put(stripe); /* One for each chunk io. */
stripe_put(stripe);
io_put(RS(stripe->sc));
}
/*
* Stripe cache functions.
*/
/*
* Invalidate all chunks (i.e. their pages) of a stripe.
*
* I only keep state for the whole chunk.
*/
static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
{
chunk->io.flags = 0;
}
static void
stripe_chunks_invalidate(struct stripe *stripe)
{
unsigned p = RS(stripe->sc)->set.raid_devs;
while (p--)
stripe_chunk_invalidate(CHUNK(stripe, p));
}
/* Prepare stripe for (re)use. */
static void stripe_invalidate(struct stripe *stripe)
{
stripe->io.flags = 0;
stripe->idx.parity = stripe->idx.recover = -1;
stripe_chunks_invalidate(stripe);
}
/*
* Allow io on all chunks of a stripe.
* If not set, IO will not occur; i.e. it's prohibited.
*
* Actual IO submission for allowed chunks depends
* on their !uptodate or dirty state.
*/
static void stripe_allow_io(struct stripe *stripe)
{
unsigned p = RS(stripe->sc)->set.raid_devs;
while (p--)
SetChunkIo(CHUNK(stripe, p));
}
/* Initialize a stripe. */
static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
{
unsigned i, p = RS(sc)->set.raid_devs;
/* Work all io chunks. */
while (p--) {
struct stripe_chunk *chunk = CHUNK(stripe, p);
atomic_set(&chunk->cnt, 0);
chunk->stripe = stripe;
i = ARRAY_SIZE(chunk->bl);
while (i--)
bio_list_init(chunk->bl + i);
}
stripe->sc = sc;
i = ARRAY_SIZE(stripe->lists);
while (i--)
INIT_LIST_HEAD(stripe->lists + i);
stripe->io.size = RS(sc)->set.io_size;
atomic_set(&stripe->cnt, 0);
atomic_set(&stripe->io.pending, 0);
stripe_invalidate(stripe);
}
/* Number of pages per chunk. */
static inline unsigned chunk_pages(unsigned sectors)
{
return dm_div_up(sectors, SECTORS_PER_PAGE);
}
/* Number of pages per stripe. */
static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
{
return chunk_pages(io_size) * rs->set.raid_devs;
}
/* Initialize part of page_list (recovery). */
static void stripe_zero_pl_part(struct stripe *stripe, int p,
unsigned start, unsigned count)
{
unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
/* Get offset into the page_list. */
struct page_list *pl = pl_elem(PL(stripe, p), o);
BUG_ON(!pl);
while (pl && pages--) {
BUG_ON(!pl->page);
memset(page_address(pl->page), 0, PAGE_SIZE);
pl = pl->next;
}
}
/* Initialize parity chunk of stripe. */
static void stripe_zero_chunk(struct stripe *stripe, int p)
{
if (p > -1)
stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
}
/* Return dynamic stripe structure size. */
static size_t stripe_size(struct raid_set *rs)
{
return sizeof(struct stripe) +
rs->set.raid_devs * sizeof(struct stripe_chunk);
}
/* Allocate a stripe and its memory object. */
/* XXX adjust to cope with stripe cache and recovery stripe caches. */
enum grow { SC_GROW, SC_KEEP };
static struct stripe *stripe_alloc(struct stripe_cache *sc,
struct dm_mem_cache_client *mc,
enum grow grow)
{
int r;
struct stripe *stripe;
stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
if (stripe) {
/* Grow the dm-mem-cache by one object. */
if (grow == SC_GROW) {
r = dm_mem_cache_grow(mc, 1);
if (r)
goto err_free;
}
stripe->obj = dm_mem_cache_alloc(mc);
if (!stripe->obj)
goto err_shrink;
stripe_init(sc, stripe);
}
return stripe;
err_shrink:
if (grow == SC_GROW)
dm_mem_cache_shrink(mc, 1);
err_free:
kmem_cache_free(sc->kc.cache, stripe);
return NULL;
}
/*
* Free a stripe's memory object, shrink the
* memory cache and free the stripe itself.
*/
static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
{
dm_mem_cache_free(mc, stripe->obj);
dm_mem_cache_shrink(mc, 1);
kmem_cache_free(stripe->sc->kc.cache, stripe);
}
/* Free the recovery stripes. */
static void stripe_recover_free(struct raid_set *rs)
{
struct recover *rec = &rs->recover;
struct dm_mem_cache_client *mc;
mc = rec->mem_cache_client;
rec->mem_cache_client = NULL;
if (mc) {
struct stripe *stripe;
while (!list_empty(&rec->stripes)) {
stripe = list_first_entry(&rec->stripes, struct stripe,
lists[LIST_RECOVER]);
list_del(stripe->lists + LIST_RECOVER);
kfree(stripe->recover);
stripe_free(stripe, mc);
}
dm_mem_cache_client_destroy(mc);
dm_io_client_destroy(rec->dm_io_client);
rec->dm_io_client = NULL;
}
}
/* Grow stripe cache. */
static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
{
int r = 0;
/* Try to allocate this many (additional) stripes. */
while (stripes--) {
struct stripe *stripe =
stripe_alloc(sc, sc->mem_cache_client, grow);
if (likely(stripe)) {
stripe_lru_add(stripe);
atomic_inc(&sc->stripes);
} else {
r = -ENOMEM;
break;
}
}
return r ? r : sc_hash_resize(sc);
}
/* Shrink stripe cache. */
static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
{
int r = 0;
/* Try to get unused stripe from LRU list. */
while (stripes--) {
struct stripe *stripe;
stripe = stripe_lru_pop(sc);
if (stripe) {
/* An LRU stripe may never have ios pending! */
BUG_ON(stripe_io_ref(stripe));
BUG_ON(stripe_ref(stripe));
atomic_dec(&sc->stripes);
/* Remove from hash if on before deletion. */
stripe_hash_del(stripe);
stripe_free(stripe, sc->mem_cache_client);
} else {
r = -ENOENT;
break;
}
}
/* Check if stats are still sane. */
if (atomic_read(&sc->active_stripes_max) >
atomic_read(&sc->stripes))
atomic_set(&sc->active_stripes_max, 0);
if (r)
return r;
return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
}
/* Create stripe cache and recovery. */
static int sc_init(struct raid_set *rs, unsigned stripes)
{
unsigned i, r, rstripes;
struct stripe_cache *sc = &rs->sc;
struct stripe *stripe;
struct recover *rec = &rs->recover;
struct mapped_device *md;
struct gendisk *disk;
/* Initialize lists and locks. */
i = ARRAY_SIZE(sc->lists);
while (i--)
INIT_LIST_HEAD(sc->lists + i);
INIT_LIST_HEAD(&rec->stripes);
/* Initialize endio and LRU list locks. */
i = NR_LOCKS;
while (i--)
spin_lock_init(sc->locks + i);
/* Initialize atomic variables. */
atomic_set(&sc->stripes, 0);
atomic_set(&sc->stripes_to_set, 0);
atomic_set(&sc->active_stripes, 0);
atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
/*
* We need a runtime unique # to suffix the kmem cache name
* because we'll have one for each active RAID set.
*/
md = dm_table_get_md(rs->ti->table);
disk = dm_disk(md);
sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor);
dm_put(md);
sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
0, 0, NULL);
if (!sc->kc.cache)
return -ENOMEM;
/* Create memory cache client context for RAID stripe cache. */
sc->mem_cache_client =
dm_mem_cache_client_create(stripes, rs->set.raid_devs,
chunk_pages(rs->set.io_size));
if (IS_ERR(sc->mem_cache_client))
return PTR_ERR(sc->mem_cache_client);
/* Create memory cache client context for RAID recovery stripe(s). */
rstripes = rec->recovery_stripes;
rec->mem_cache_client =
dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
chunk_pages(rec->io_size));
if (IS_ERR(rec->mem_cache_client))
return PTR_ERR(rec->mem_cache_client);
/* Create dm-io client context for IO stripes. */
sc->dm_io_client =
dm_io_client_create((stripes > 32 ? 32 : stripes) *
rs->set.raid_devs *
chunk_pages(rs->set.io_size));
if (IS_ERR(sc->dm_io_client))
return PTR_ERR(sc->dm_io_client);
/* FIXME: intermingled with stripe cache initialization. */
/* Create dm-io client context for recovery stripes. */
rec->dm_io_client =
dm_io_client_create(rstripes * rs->set.raid_devs *
chunk_pages(rec->io_size));
if (IS_ERR(rec->dm_io_client))
return PTR_ERR(rec->dm_io_client);
/* Allocate stripes for set recovery. */
while (rstripes--) {
stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
if (!stripe)
return -ENOMEM;
stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
if (!stripe->recover) {
stripe_free(stripe, rec->mem_cache_client);
return -ENOMEM;
}
SetStripeRecover(stripe);
stripe->io.size = rec->io_size;
list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
/* Don't add recovery stripes to LRU list! */
}
/*
* Allocate the stripe objects from the
* cache and add them to the LRU list.
*/
r = sc_grow(sc, stripes, SC_KEEP);
if (!r)
atomic_set(&sc->stripes_last, stripes);
return r;
}
/* Destroy the stripe cache. */
static void sc_exit(struct stripe_cache *sc)
{
struct raid_set *rs = RS(sc);
if (sc->kc.cache) {
stripe_recover_free(rs);
BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
kmem_cache_destroy(sc->kc.cache);
sc->kc.cache = NULL;
if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
dm_mem_cache_client_destroy(sc->mem_cache_client);
if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
dm_io_client_destroy(sc->dm_io_client);
hash_exit(&sc->hash);
}
}
/*
* Calculate RAID address
*
* Delivers a tuple with the index of the data disk holding the chunk
* in the set, the parity disk's index and the start of the stripe
* within the address space of the set (used as the stripe cache hash key).
*/
/* thx MD. */
static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
struct raid_address *addr)
{
sector_t stripe, tmp;
/*
* chunk_number = sector / chunk_size
* stripe_number = chunk_number / data_devs
* di = stripe % data_devs;
*/
stripe = sector >> rs->set.chunk_shift;
addr->di = sector_div(stripe, rs->set.data_devs);
switch (rs->set.raid_type->level) {
case raid4:
addr->pi = rs->set.pi;
goto check_shift_di;
case raid5:
tmp = stripe;
addr->pi = sector_div(tmp, rs->set.raid_devs);
switch (rs->set.raid_type->algorithm) {
case left_asym: /* Left asymmetric. */
addr->pi = rs->set.data_devs - addr->pi;
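/* Fall through to the right asymmetric data index adjustment. */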
case right_asym: /* Right asymmetric. */
check_shift_di:
if (addr->di >= addr->pi)
addr->di++;
break;
case left_sym: /* Left symmetric. */
addr->pi = rs->set.data_devs - addr->pi;
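/* Fall through to the right symmetric data index calculation. */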
case right_sym: /* Right symmetric. */
addr->di = (addr->pi + addr->di + 1) %
rs->set.raid_devs;
break;
case none: /* Ain't happen: RAID4 algorithm placeholder. */
BUG();
}
}
/*
* Start offset of the stripe's chunk on any single device of the RAID
* set, adjusted in case io size differs from chunk size.
*/
addr->key = (stripe << rs->set.chunk_shift) +
(sector & rs->set.io_inv_mask);
return addr;
}
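/*
* Worked example (assumed geometry, for illustration only): a raid5_ls
* set with 4 devices (3 data + 1 parity) and a chunk size of 8 sectors
* (chunk_shift = 3).  For sector 100: chunk_number = 100 >> 3 = 12,
* stripe = 12 / 3 = 4, di = 12 % 3 = 0.  Left symmetric then gives
* pi = 3 - (4 % 4) = 3 and di = (3 + 0 + 1) % 4 = 0, i.e. parity on the
* last device and the data chunk on device 0.  The hash key is the
* stripe start (4 << 3 = 32) plus the io-size-aligned offset within the
* chunk selected by io_inv_mask.
*/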
/*
* Copy data across between stripe pages and bio vectors.
*
* Pay attention to data alignment in stripe and bio pages.
*/
static void bio_copy_page_list(int rw, struct stripe *stripe,
struct page_list *pl, struct bio *bio)
{
unsigned i, page_offset;
void *page_addr;
struct raid_set *rs = RS(stripe->sc);
struct bio_vec *bv;
/* Get start page in page list for this sector. */
i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
pl = pl_elem(pl, i);
BUG_ON(!pl);
BUG_ON(!pl->page);
page_addr = page_address(pl->page);
page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
/* Walk all segments and copy data across between bio_vecs and pages. */
bio_for_each_segment(bv, bio, i) {
int len = bv->bv_len, size;
unsigned bio_offset = 0;
void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
redo:
size = (page_offset + len > PAGE_SIZE) ?
PAGE_SIZE - page_offset : len;
if (rw == READ)
memcpy(bio_addr + bio_offset,
page_addr + page_offset, size);
else
memcpy(page_addr + page_offset,
bio_addr + bio_offset, size);
page_offset += size;
if (page_offset == PAGE_SIZE) {
/*
* We reached the end of the chunk page ->
* need to refer to the next one to copy more data.
*/
len -= size;
if (len) {
/* Get next page. */
pl = pl->next;
BUG_ON(!pl);
BUG_ON(!pl->page);
page_addr = page_address(pl->page);
page_offset = 0;
bio_offset += size;
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
goto redo;
}
}
__bio_kunmap_atomic(bio_addr, KM_USER0);
}
}
/*
* Xor optimization macros.
*/
/* Xor data pointer declaration and initialization macros. */
#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
#define DECLARE_3 DECLARE_2, *d2 = data[2]
#define DECLARE_4 DECLARE_3, *d3 = data[3]
#define DECLARE_5 DECLARE_4, *d4 = data[4]
#define DECLARE_6 DECLARE_5, *d5 = data[5]
#define DECLARE_7 DECLARE_6, *d6 = data[6]
#define DECLARE_8 DECLARE_7, *d7 = data[7]
/* Xor unroll macros. */
#define D2(n) d0[n] = d0[n] ^ d1[n]
#define D3(n) D2(n) ^ d2[n]
#define D4(n) D3(n) ^ d3[n]
#define D5(n) D4(n) ^ d4[n]
#define D6(n) D5(n) ^ d5[n]
#define D7(n) D6(n) ^ d6[n]
#define D8(n) D7(n) ^ d7[n]
#define X_2(macro, offset) macro(offset); macro(offset + 1);
#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
/* Define a _xor_#chunks_#xors_per_run() function. */
#define _XOR(chunks, xors_per_run) \
static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
{ \
unsigned end = XOR_SIZE / sizeof(data[0]), i; \
DECLARE_ ## chunks; \
\
for (i = 0; i < end; i += xors_per_run) { \
X_ ## xors_per_run(D ## chunks, i); \
} \
}
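/*
* Illustrative expansion: _XOR(2, 8) roughly becomes
*
*	static void _xor2_8(unsigned long **data)
*	{
*		unsigned end = XOR_SIZE / sizeof(data[0]), i;
*		unsigned long *d0 = data[0], *d1 = data[1];
*
*		for (i = 0; i < end; i += 8) {
*			d0[i] ^= d1[i];
*			d0[i + 1] ^= d1[i + 1];
*			...
*			d0[i + 7] ^= d1[i + 7];
*		}
*	}
*
* i.e. an 8-way unrolled xor of the second chunk into the first.
*/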
/* Define xor functions for 2 - 8 chunks and xors per run. */
#define MAKE_XOR_PER_RUN(xors_per_run) \
_XOR(2, xors_per_run); _XOR(3, xors_per_run); \
_XOR(4, xors_per_run); _XOR(5, xors_per_run); \
_XOR(6, xors_per_run); _XOR(7, xors_per_run); \
_XOR(8, xors_per_run);
MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */
MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */
MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */
MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */
#define MAKE_XOR(xors_per_run) \
struct { \
void (*f)(unsigned long **); \
} static xor_funcs ## xors_per_run[] = { \
{ NULL }, /* NULL pointers to optimize indexing in xor(). */ \
{ NULL }, \
{ _xor2_ ## xors_per_run }, \
{ _xor3_ ## xors_per_run }, \
{ _xor4_ ## xors_per_run }, \
{ _xor5_ ## xors_per_run }, \
{ _xor6_ ## xors_per_run }, \
{ _xor7_ ## xors_per_run }, \
{ _xor8_ ## xors_per_run }, \
}; \
\
static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
{ \
/* Call respective function for amount of chunks. */ \
xor_funcs ## xors_per_run[n].f(data); \
}
/* Define xor_8() - xor_64() functions. */
MAKE_XOR(8)
MAKE_XOR(16)
MAKE_XOR(32)
MAKE_XOR(64)
/* Maximum number of chunks, which can be xor'ed in one go. */
#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
static void xor_blocks_wrapper(unsigned n, unsigned long **data)
{
BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
}
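/*
* Note on the wrapper above: data[0] is the xor destination which
* accumulates the result, while data[1..n-1] are handed to the kernel's
* xor_blocks() as the n - 1 source blocks.
*/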
struct xor_func {
xor_function_t f;
const char *name;
} static xor_funcs[] = {
{ xor_8, "xor_8" },
{ xor_16, "xor_16" },
{ xor_32, "xor_32" },
{ xor_64, "xor_64" },
{ xor_blocks_wrapper, "xor_blocks" },
};
/*
* Check whether a chunk has to be xored in/out:
*
* o if writes are queued
* o if writes are merged
* o if stripe is to be reconstructed
* o if recovery stripe
*/
static inline int chunk_must_xor(struct stripe_chunk *chunk)
{
if (ChunkUptodate(chunk)) {
BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
!bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));
if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
!bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
return 1;
if (StripeReconstruct(chunk->stripe) ||
StripeRecover(chunk->stripe))
return 1;
}
return 0;
}
/*
* Calculate xor parity.
*
* This indexes into the chunks of a stripe and their pages.
*
* All chunks will be xored into the indexed (@pi)
* chunk in maximum groups of xor.chunks.
*
*/
static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
{
struct raid_set *rs = RS(stripe->sc);
unsigned max_chunks = rs->xor.chunks, n = 1,
o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
p = rs->set.raid_devs;
unsigned long **d = rs->data;
xor_function_t xor_f = rs->xor.f->f;
BUG_ON(sector > stripe->io.size);
/* Address of parity page to xor into. */
d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
while (p--) {
/* Preset pointers to data pages. */
if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
/* If max chunks -> xor. */
if (n == max_chunks) {
xor_f(n, d);
n = 1;
}
}
/* If chunks -> xor. */
if (n > 1)
xor_f(n, d);
}
/* Common xor loop through all stripe page lists. */
static void common_xor(struct stripe *stripe, sector_t count,
unsigned off, unsigned pi)
{
unsigned sector;
BUG_ON(!count);
for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
xor(stripe, pi, sector);
/* Set parity page uptodate and clean. */
chunk_set(CHUNK(stripe, pi), CLEAN);
atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
}
/*
* Calculate parity sectors on intact stripes.
*
* Need to calculate the raid address for the recover stripe, because its
* io size differs and is typically larger than the regular io chunk size.
*/
static void parity_xor(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
xor_size = chunk_size > io_size ? io_size : chunk_size;
sector_t off;
/* This can be the recover stripe with a larger io size. */
for (off = 0; off < io_size; off += xor_size) {
/*
* Recover stripe is likely bigger than regular io
* ones and has no precalculated parity disk index ->
* need to calculate RAID address.
*/
if (unlikely(StripeRecover(stripe))) {
struct raid_address addr;
raid_address(rs,
(stripe->key + off) * rs->set.data_devs,
&addr);
stripe->idx.parity = addr.pi;
stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
}
common_xor(stripe, xor_size, off, stripe->idx.parity);
chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
}
}
/* Reconstruct missing chunk. */
static void stripe_reconstruct(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
int p = rs->set.raid_devs, pr = stripe->idx.recover;
BUG_ON(pr < 0);
/* Check if all but the chunk to be reconstructed are uptodate. */
while (p--)
BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));
/* REMOVEME: statistics. */
atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
S_RECONSTRUCT_DEV));
/* Zero chunk to be reconstructed. */
stripe_zero_chunk(stripe, pr);
common_xor(stripe, stripe->io.size, 0, pr);
stripe->idx.recover = -1;
}
/*
* Recovery io throttling
*/
/* Conditionally reset io counters. */
static int recover_io_reset(struct raid_set *rs)
{
unsigned long j = jiffies;
/* Pay attention to jiffies overflows. */
if (j > rs->recover.last_jiffies + HZ / 20 ||
j < rs->recover.last_jiffies) {
atomic_set(rs->recover.io_count + IO_WORK, 0);
atomic_set(rs->recover.io_count + IO_RECOVER, 0);
rs->recover.last_jiffies = j;
return 1;
}
return 0;
}
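/*
* The HZ / 20 window above means these io counters get zeroed roughly
* every 50 ms (or on a jiffies wrap), so the recovery bandwidth logic
* presumably looks at a small sliding window of recent io only.
*/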
/* Count ios. */
static void recover_io_count(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
recover_io_reset(rs);
atomic_inc(rs->recover.io_count +
(StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
}
/* Try getting a stripe either from the hash or from the LRU list. */
static struct stripe *stripe_find(struct raid_set *rs,
struct raid_address *addr)
{
int r;
struct stripe_cache *sc = &rs->sc;
struct stripe *stripe;
/* Try stripe from hash. */
stripe = stripe_lookup(sc, addr->key);
if (stripe) {
r = stripe_get(stripe);
if (r)
goto get_lock_failed;
atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
} else {
/* Not in hash -> try to get an LRU stripe. */
stripe = stripe_lru_pop(sc);
if (stripe) {
/*
* An LRU stripe may not be referenced
* and may never have ios pending!
*/
BUG_ON(stripe_ref(stripe));
BUG_ON(stripe_io_ref(stripe));
/* Remove from hash if on before reuse. */
stripe_hash_del(stripe);
/* Invalidate before reinserting with changed key. */
stripe_invalidate(stripe);
stripe->key = addr->key;
stripe->region = dm_rh_sector_to_region(rs->recover.rh,
addr->key);
stripe->idx.parity = addr->pi;
r = stripe_get(stripe);
if (r)
goto get_lock_failed;
/* Insert stripe into the stripe hash. */
stripe_insert(&sc->hash, stripe);
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_INSCACHE);
}
}
return stripe;
get_lock_failed:
stripe_put(stripe);
return NULL;
}
/*
* Process end io
*
* I need to do it here, because I can't do it in interrupt context.
*/
/* End io all bios on a bio list. */
static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
int p, int error)
{
struct raid_set *rs = RS(stripe->sc);
struct bio *bio;
struct page_list *pl = PL(stripe, p);
struct stripe_chunk *chunk = CHUNK(stripe, p);
/* Update region counters. */
while ((bio = bio_list_pop(bl))) {
if (bio_data_dir(bio) == WRITE)
/* Drop io pending count for any writes. */
dm_rh_dec(rs->recover.rh, stripe->region);
else if (!error)
/* Copy data across. */
bio_copy_page_list(READ, stripe, pl, bio);
bio_endio(bio, error);
/* REMOVEME: statistics. */
atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));
chunk_put(chunk);
stripe_put(stripe);
io_put(rs); /* Wake any suspend waiters on last bio. */
}
}
/*
* End io all reads/writes on a stripe, copying
* read data across from stripe to bios and
* decrementing region counters for writes.
*
* Processing of ios depending on state:
* o no chunk error -> endio ok
* o degraded:
*   - chunk error and read -> ignore to be requeued
*   - chunk error and write -> endio ok
* o dead (more than parity_devs failed) and chunk error -> endio failed
*/
static void stripe_endio(int rw, struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
unsigned p = rs->set.raid_devs;
int write = (rw != READ);
while (p--) {
struct stripe_chunk *chunk = CHUNK(stripe, p);
struct bio_list *bl;
BUG_ON(ChunkLocked(chunk));
bl = BL_CHUNK(chunk, rw);
if (bio_list_empty(bl))
continue;
if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
/* RAID set dead. */
if (unlikely(RSDead(rs)))
bio_list_endio(stripe, bl, p, -EIO);
/* RAID set degraded. */
else if (write)
bio_list_endio(stripe, bl, p, 0);
} else {
BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
bio_list_endio(stripe, bl, p, 0);
}
}
}
/* Fail all ios hanging off all bio lists of a stripe. */
static void stripe_fail_io(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
unsigned p = rs->set.raid_devs;
while (p--) {
struct stripe_chunk *chunk = CHUNK(stripe, p);
int i = ARRAY_SIZE(chunk->bl);
/* Fail all bios on all bio lists of the stripe. */
while (i--) {
struct bio_list *bl = chunk->bl + i;
if (!bio_list_empty(bl))
bio_list_endio(stripe, bl, p, -EIO);
}
}
/* Put stripe on LRU list. */
BUG_ON(stripe_io_ref(stripe));
BUG_ON(stripe_ref(stripe));
}
/* Unlock all required chunks. */
static void stripe_chunks_unlock(struct stripe *stripe)
{
unsigned p = RS(stripe->sc)->set.raid_devs;
struct stripe_chunk *chunk;
while (p--) {
chunk = CHUNK(stripe, p);
if (TestClearChunkUnlock(chunk))
ClearChunkLocked(chunk);
}
}
/*
* Queue reads and writes to a stripe by hanging
* their bios off the stripe chunks' read/write bio lists.
*/
static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
struct bio_list *reject)
{
struct raid_address addr;
struct stripe *stripe;
stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
if (stripe) {
int r = 0, rw = bio_data_dir(bio);
/* Distinguish reads and writes. */
bio_list_add(BL(stripe, addr.di, rw), bio);
if (rw == READ)
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_BIOS_ADDED_READ);
else {
/* Increment pending write count on region. */
dm_rh_inc(rs->recover.rh, stripe->region);
r = 1;
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
}
/*
* Put on io (flush) list in case of
* initial bio queued to chunk.
*/
if (chunk_get(CHUNK(stripe, addr.di)) == 1)
stripe_flush_add(stripe);
return r;
}
/* Got no stripe from cache or failed to lock it -> reject bio. */
bio_list_add(reject, bio);
atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
return 0;
}
/*
* Handle all stripes by handing them to the daemon, because we can't
* map their chunk pages to copy the data in interrupt context.
*
* We don't want to handle them here either, while interrupts are disabled.
*/
/* Read/write endio function for dm-io (interrupt context). */
static void endio(unsigned long error, void *context)
{
struct stripe_chunk *chunk = context;
if (unlikely(error)) {
chunk_set(chunk, ERROR);
/* REMOVEME: statistics. */
atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
} else
chunk_set(chunk, CLEAN);
/*
* For recovery stripes, I need to reset the locked flag
* here, because those aren't processed in do_endios().
*/
if (unlikely(StripeRecover(chunk->stripe)))
ClearChunkLocked(chunk);
else
SetChunkUnlock(chunk);
/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
stripe_put_references(chunk->stripe);
}
/* Read/Write a chunk asynchronously. */
static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
{
struct stripe_cache *sc = stripe->sc;
struct raid_set *rs = RS(sc);
struct dm_mem_cache_object *obj = stripe->obj + p;
struct page_list *pl = obj->pl;
struct stripe_chunk *chunk = CHUNK(stripe, p);
struct raid_dev *dev = rs->dev + p;
struct dm_io_region io = {
.bdev = dev->dev->bdev,
.sector = stripe->key,
.count = stripe->io.size,
};
struct dm_io_request control = {
.bi_rw = ChunkDirty(chunk) ? WRITE : READ,
.mem = {
.type = DM_IO_PAGE_LIST,
.ptr.pl = pl,
.offset = 0,
},
.notify = {
.fn = endio,
.context = chunk,
},
.client = StripeRecover(stripe) ? rs->recover.dm_io_client :
sc->dm_io_client,
};
BUG_ON(ChunkLocked(chunk));
BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
/*
* Don't rw past end of device, which can happen, because
* typically sectors_per_dev isn't divisible by io_size.
*/
if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
io.count = rs->set.sectors_per_dev - io.sector;
BUG_ON(!io.count);
io.sector += dev->start; /* Add <offset>. */
if (RSRecover(rs))
recover_io_count(stripe); /* Recovery io accounting. */
/* REMOVEME: statistics. */
atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
S_DM_IO_READ));
SetChunkLocked(chunk);
SetDevIoQueued(dev);
BUG_ON(dm_io(&control, 1, &io, NULL));
}
/*
* Write dirty or read not uptodate page lists of a stripe.
*/
static int stripe_chunks_rw(struct stripe *stripe)
{
int r;
struct raid_set *rs = RS(stripe->sc);
/*
* Increment the pending count on the stripe
* first, so that we don't race in endio().
*
* An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
*
* o not uptodate
* o dirtied by writes merged
* o dirtied by parity calculations
*/
r = for_each_io_dev(stripe, stripe_get_references);
if (r) {
/* Io needed: chunks are either not uptodate or dirty. */
int max; /* REMOVEME: */
struct stripe_cache *sc = &rs->sc;
/* Submit actual io. */
for_each_io_dev(stripe, stripe_chunk_rw);
/* REMOVEME: statistics */
max = sc_active(sc);
if (atomic_read(&sc->active_stripes_max) < max)
atomic_set(&sc->active_stripes_max, max);
atomic_inc(rs->stats + S_FLUSHS);
/* END REMOVEME: statistics */
}
return r;
}
/* Merge in all writes hence dirtying respective chunks. */
static void stripe_merge_writes(struct stripe *stripe)
{
unsigned p = RS(stripe->sc)->set.raid_devs;
while (p--) {
struct stripe_chunk *chunk = CHUNK(stripe, p);
struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
if (!bio_list_empty(write)) {
struct bio *bio;
struct page_list *pl = stripe->obj[p].pl;
/*
* We can play with the lists without holding a lock,
* because it is just us accessing them anyway.
*/
bio_list_for_each(bio, write)
bio_copy_page_list(WRITE, stripe, pl, bio);
bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
bio_list_init(write);
chunk_set(chunk, DIRTY);
}
}
}
/* Queue all writes to get merged. */
static int stripe_queue_writes(struct stripe *stripe)
{
int r = 0;
unsigned p = RS(stripe->sc)->set.raid_devs;
while (p--) {
struct stripe_chunk *chunk = CHUNK(stripe, p);
struct bio_list *write = BL_CHUNK(chunk, WRITE);
if (!bio_list_empty(write)) {
bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
bio_list_init(write);
SetChunkIo(chunk);
r = 1;
}
}
return r;
}
/* Check whether a chunk gets completely overwritten. */
static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
{
unsigned sectors = 0;
struct bio *bio;
struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
bio_list_for_each(bio, bl)
sectors += bio_sectors(bio);
BUG_ON(sectors > RS(stripe->sc)->set.io_size);
return sectors == RS(stripe->sc)->set.io_size;
}
/*
* Avoid io on broken/reconstructed drive in order to
* reconstruct data on endio.
*
* (*1*) We set StripeReconstruct() in here, so that _do_endios()
* will trigger a reconstruct call before resetting it.
*/
static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
{
struct stripe_chunk *chunk = CHUNK(stripe, pr);
/*
* Allow io on all chunks but the indexed one,
* because we're either degraded or prohibit it
* on the one for later reconstruction.
*/
/* Includes ClearChunkIo(), ClearChunkUptodate(). */
stripe_chunk_invalidate(chunk);
stripe->idx.recover = pr;
SetStripeReconstruct(stripe);
/* REMOVEME: statistics. */
atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
return -EPERM;
}
/* Chunk locked/uptodate and device failed tests. */
static struct stripe_chunk *
stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
{
struct raid_set *rs = RS(stripe->sc);
struct stripe_chunk *chunk = CHUNK(stripe, p);
/* Can't access active chunks. */
if (ChunkLocked(chunk)) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_CHUNK_LOCKED);
return NULL;
}
/* Can't access broken device. */
if (ChunkError(chunk) || DevFailed(rs->dev + p))
return NULL;
/* Can access uptodate chunks. */
if (ChunkUptodate(chunk)) {
(*chunks_uptodate)++;
return NULL;
}
return chunk;
}
/*
* Degraded/reconstruction mode.
*
* Check stripe state to figure which chunks don't need IO.
*
* Returns 0 for fully operational, -EBUSY while a reconstruct is
* already pending and -EPERM for degraded/resynchronizing.
*/
static int stripe_check_reconstruct(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
if (RSDead(rs)) {
ClearStripeReconstruct(stripe);
ClearStripeReconstructed(stripe);
stripe_allow_io(stripe);
return 0;
}
/* Avoid further reconstruction setting, when already set. */
if (StripeReconstruct(stripe)) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_RECONSTRUCT_SET);
return -EBUSY;
}
/* Initially allow io on all chunks. */
stripe_allow_io(stripe);
/* Return if stripe is already reconstructed. */
if (StripeReconstructed(stripe)) {
atomic_inc(rs->stats + S_RECONSTRUCTED);
return 0;
}
/*
* Degraded/reconstruction mode (device failed) ->
* avoid io on the failed device.
*/
if (unlikely(RSDegraded(rs))) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_DEGRADED);
/* Allow IO on all devices but the dead one. */
BUG_ON(rs->set.ei < 0);
return stripe_chunk_set_io_flags(stripe, rs->set.ei);
} else {
int sync, pi = dev_for_parity(stripe, &sync);
/*
* Reconstruction mode (ie. a particular (replaced) device or
* some (rotating) parity chunk is being resynchronized) ->
* o make sure all needed chunks are read in
* o writes are allowed to go through
*/
if (!sync) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_NOSYNC);
/* Allow IO on all devs but the one to reconstruct. */
return stripe_chunk_set_io_flags(stripe, pi);
}
}
return 0;
}
/*
* Check whether the stripe is ready to merge writes,
* i.e. whether all chunks are present to allow merging of bios.
*
* We prohibit io on:
*
* o chunks without bios
* o chunks which get completely written over
*/
static int stripe_merge_possible(struct stripe *stripe, int nosync)
{
struct raid_set *rs = RS(stripe->sc);
unsigned chunks_overwrite = 0, chunks_prohibited = 0,
chunks_uptodate = 0, p = rs->set.raid_devs;
/* Walk all chunks. */
while (p--) {
struct stripe_chunk *chunk;
/* Prohibit io on broken devices. */
if (DevFailed(rs->dev + p)) {
chunk = CHUNK(stripe, p);
goto prohibit_io;
}
/* We can't optimize any further if no chunk. */
chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
if (!chunk || nosync)
continue;
/*
* We have a chunk, which is not uptodate.
*
* If this is not parity and we don't have
* reads queued, we can optimize further.
*/
if (p != stripe->idx.parity &&
bio_list_empty(BL_CHUNK(chunk, READ)) &&
bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
goto prohibit_io;
else if (RSCheckOverwrite(rs) &&
stripe_check_chunk_overwrite(stripe, p))
/* Completely overwritten chunk. */
chunks_overwrite++;
}
/* Allow io for chunks with bios and overwritten ones. */
SetChunkIo(chunk);
continue;
prohibit_io:
/* No io for broken devices or for chunks w/o bios. */
ClearChunkIo(chunk);
chunks_prohibited++;
/* REMOVEME: statistics. */
atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
}
/* All data chunks will get written over. */
if (chunks_overwrite == rs->set.data_devs)
atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
/* We don't have enough chunks to merge. */
atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
return -EPERM;
}
/*
* If we have all chunks up to date or overwrite them, we
* just zero the parity chunk and let stripe_rw() recreate it.
*/
if (chunks_uptodate == rs->set.raid_devs ||
chunks_overwrite == rs->set.data_devs) {
stripe_zero_chunk(stripe, stripe->idx.parity);
BUG_ON(StripeReconstruct(stripe));
SetStripeReconstruct(stripe); /* Enforce xor in caller. */
} else {
/*
* With less chunks, we xor parity out.
*
* (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
* so that only chunks with queued or merged writes
* are being xored.
*/
parity_xor(stripe);
}
/*
* We do have enough chunks to merge.
* All chunks are uptodate or get written over.
*/
atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
return 0;
}
/*
* Avoid reading chunks in case we're fully operational.
*
* We prohibit io on any chunks without bios but the parity chunk.
*/
static void stripe_avoid_reads(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
unsigned dummy = 0, p = rs->set.raid_devs;
/* Walk all chunks. */
while (p--) {
struct stripe_chunk *chunk =
stripe_chunk_check(stripe, p, &dummy);
if (!chunk)
continue;
/* If parity or any bios pending -> allow io. */
if (chunk_ref(chunk) || p == stripe->idx.parity)
SetChunkIo(chunk);
else {
ClearChunkIo(chunk);
/* REMOVEME: statistics. */
atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
}
}
}
/*
* Read/write a stripe.
*
* All stripe read/write activity goes through this function
* except during recovery, which has to call stripe_chunk_rw() directly.
*
* Make sure we don't try already merged stripes in order
* to avoid data corruption.
*
* Check the state of the RAID set and if degraded (or
* resynchronizing for reads), read in all other chunks but
* the one on the dead/resynchronizing device in order to be
* able to reconstruct the missing one in _do_endios().
*
* Can be called on active stripes in order
* to dispatch new io on inactive chunks.
*
* States to cover:
* o stripe to read and/or write
* o stripe with error to reconstruct
*/
static void stripe_rw(struct stripe *stripe)
{
int nosync, r;
struct raid_set *rs = RS(stripe->sc);
/*
* Check whether a chunk needs to be reconstructed
* because of a degraded set or a region out of sync.
*/
nosync = stripe_check_reconstruct(stripe);
switch (nosync) {
case -EBUSY:
return; /* Wait for stripe reconstruction to finish. */
case -EPERM:
goto io;
}
/*
* If we don't have merged writes pending, we can schedule
* queued writes to be merged next without corrupting data.
*/
if (!StripeMerged(stripe)) {
r = stripe_queue_writes(stripe);
if (r)
/* Writes got queued -> flag RBW. */
SetStripeRBW(stripe);
}
/*
* Merge all writes hanging off uptodate/overwritten
* chunks of the stripe.
*/
if (StripeRBW(stripe)) {
r = stripe_merge_possible(stripe, nosync);
if (!r) { /* Merge possible. */
struct stripe_chunk *chunk;
/*
* I rely on valid parity in order
* to xor a fraction of chunks out
* of parity and back in.
*/
stripe_merge_writes(stripe); /* Merge writes in. */
parity_xor(stripe); /* Update parity. */
ClearStripeReconstruct(stripe); /* Reset xor enforce. */
SetStripeMerged(stripe); /* Writes merged. */
ClearStripeRBW(stripe); /* Disable RBW. */
/*
* REMOVEME: sanity check on parity chunk
* states after writes got merged.
*/
chunk = CHUNK(stripe, stripe->idx.parity);
BUG_ON(ChunkLocked(chunk));
BUG_ON(!ChunkUptodate(chunk));
BUG_ON(!ChunkDirty(chunk));
BUG_ON(!ChunkIo(chunk));
}
} else if (!nosync && !StripeMerged(stripe))
/* Read avoidance if not degraded/resynchronizing/merged. */
stripe_avoid_reads(stripe);
io:
/* Now submit any reads/writes for non-uptodate or dirty chunks. */
r = stripe_chunks_rw(stripe);
if (!r) {
/*
* No io submitted because of chunk io
* prohibited or locked chunks/failed devices
* -> push to end io list for processing.
*/
stripe_endio_push(stripe);
atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
}
}
/*
* Recovery functions
*/
/* Read a stripe off a raid set for recovery. */
static int stripe_recover_read(struct stripe *stripe, int pi)
{
BUG_ON(stripe_io_ref(stripe));
/* Invalidate all chunks so that they get read in. */
stripe_chunks_invalidate(stripe);
stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
/*
* If we are reconstructing a particular device, we can avoid
* reading the respective chunk in, because we're going to
* reconstruct it anyway.
*
* We can't do that for resynchronization of rotating parity,
* because the recovery stripe chunk size is typically larger
* than the set's chunk size.
*/
if (pi > -1)
ClearChunkIo(CHUNK(stripe, pi));
return stripe_chunks_rw(stripe);
}
/* Write a stripe to a raid set for recovery. */
static int stripe_recover_write(struct stripe *stripe, int pi)
{
BUG_ON(stripe_io_ref(stripe));
/*
* If this is a reconstruct of a particular device, then
* reconstruct the respective chunk, else create parity chunk.
*/
if (pi > -1) {
stripe_zero_chunk(stripe, pi);
common_xor(stripe, stripe->io.size, 0, pi);
chunk_set(CHUNK(stripe, pi), DIRTY);
} else
parity_xor(stripe);
return stripe_chunks_rw(stripe);
}
/* Read/write a recovery stripe. */
static int stripe_recover_rw(struct stripe *stripe)
{
int r = 0, sync = 0;
/* Read/write flip-flop. */
if (TestClearStripeRBW(stripe)) {
SetStripeMerged(stripe);
stripe->key = stripe->recover->pos;
r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
BUG_ON(!r);
} else if (TestClearStripeMerged(stripe)) {
r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
BUG_ON(!r);
}
BUG_ON(sync);
return r;
}
/* Recovery bandwidth available? */
static int recover_bandwidth(struct raid_set *rs)
{
int r, work;
/* On reset or when bios delayed -> allow recovery. */
r = recover_io_reset(rs);
if (r || RSBandwidth(rs))
goto out;
work = atomic_read(rs->recover.io_count + IO_WORK);
if (work) {
/* Pay attention to larger recover stripe size. */
int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
rs->recover.io_size / rs->set.io_size;
/*
* Don't use more than given bandwidth
* of the work io for recovery.
*/
if (recover > work / rs->recover.bandwidth_work) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_NO_BANDWIDTH);
return 0;
}
}
out:
atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
return 1;
}
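/*
* Illustrative arithmetic for the bandwidth check above (assumed
* numbers, not measured): with bandwidth = 10%, bandwidth_work is
* 100 / 10 = 10. If 200 application work ios have been counted and
* recover.io_size / io_size = 256 / 64 = 4, then more than
* 200 / 10 = 20 scaled recovery ios (i.e. more than 5 recovery stripe
* ios) throttle recovery until more application io gets accounted.
*/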
/* Try to get a region to recover. */
static int stripe_recover_get_region(struct stripe *stripe)
{
struct raid_set *rs = RS(stripe->sc);
struct recover *rec = &rs->recover;
struct recover_addr *addr = stripe->recover;
struct dm_dirty_log *dl = rec->dl;
struct dm_rh_client *rh = rec->rh;
BUG_ON(!dl);
BUG_ON(!rh);
/* Return that we already have a region to finish first during suspension. */
if (addr->reg)
return 1;
if (RSSuspend(rs))
return -EPERM;
if (dl->type->get_sync_count(dl) >= rec->nr_regions)
return -ENOENT;
/* If we don't have enough bandwidth, don't proceed with recovery. */
if (!recover_bandwidth(rs))
return -EAGAIN;
/* Start quiescing a region. */
dm_rh_recovery_prepare(rh);
addr->reg = dm_rh_recovery_start(rh);
if (!addr->reg)
return -EAGAIN;
addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
addr->end = addr->pos + dm_rh_get_region_size(rh);
/*
* Take one global io reference out for the
* whole region, which is going to be released
* when the region is completely done with.
*/
io_get(rs);
return 0;
}
/* Update region hash state. */
enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
static void recover_rh_update(struct stripe *stripe, enum recover_type success)
{
struct recover_addr *addr = stripe->recover;
struct raid_set *rs = RS(stripe->sc);
struct recover *rec = &rs->recover;
if (!addr->reg) {
DMERR("%s- Called w/o region", __func__);
return;
}
dm_rh_recovery_end(addr->reg, success);
if (success)
rec->nr_regions_recovered++;
addr->reg = NULL;
/*
* Completely done with this region ->
* release the 1st io reference.
*/
io_put(rs);
}
/* Set start of recovery state. */
static void set_start_recovery(struct raid_set *rs)
{
/* Initialize recovery. */
rs->recover.start_jiffies = jiffies;
rs->recover.end_jiffies = 0;
}
/* Set end of recovery state. */
static void set_end_recovery(struct raid_set *rs)
{
ClearRSRecover(rs);
rs->set.dev_to_init = -1;
/* Check for jiffies overrun. */
rs->recover.end_jiffies = jiffies;
if (rs->recover.end_jiffies < rs->recover.start_jiffies)
rs->recover.end_jiffies = ~0;
}
/* Handle recovery on one recovery stripe. */
static int _do_recovery(struct stripe *stripe)
{
int r;
struct raid_set *rs = RS(stripe->sc);
struct recover_addr *addr = stripe->recover;
/* If recovery is active -> return. */
if (stripe_io_ref(stripe))
return 1;
/* IO error is fatal for recovery -> stop it. */
if (unlikely(StripeError(stripe)))
goto err;
/* Recovery end required. */
if (!RSRecover(rs))
goto err;
/* Get a region to recover. */
r = stripe_recover_get_region(stripe);
switch (r) {
case 0: /* Got a new region: flag initial read before write. */
SetStripeRBW(stripe);
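/* Fall through: region is in the works now. */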
case 1: /* Have a region in the works. */
break;
case -EAGAIN:
/* No bandwidth/quiesced region yet, try later. */
if (!io_ref(rs))
wake_do_raid_delayed(rs, HZ / 4);
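/* Fall through: return like the suspend case does. */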
case -EPERM:
/* Suspend. */
return 1;
case -ENOENT: /* No more regions to recover. */
schedule_work(&rs->io.ws_do_table_event);
return 0;
default:
BUG();
}
/* Read/write a recover stripe. */
r = stripe_recover_rw(stripe);
if (r)
/* IO initiated. */
return 1;
/* Read and write finished -> update recovery position within region. */
addr->pos += stripe->io.size;
/* If we're at end of region, update region hash. */
if (addr->pos >= addr->end ||
addr->pos >= rs->set.sectors_per_dev)
recover_rh_update(stripe, REC_SUCCESS);
else
/* Prepare to read next region segment. */
SetStripeRBW(stripe);
/* Schedule myself for another round... */
wake_do_raid(rs);
return 1;
err:
/* FIXME: rather try recovering other regions on error? */
rs_check_degrade(stripe);
recover_rh_update(stripe, REC_FAILURE);
/* Check state of partially recovered array. */
if (RSDegraded(rs) && !RSDead(rs) &&
rs->set.dev_to_init != -1 &&
rs->set.ei != rs->set.dev_to_init)
/* Broken drive != drive to recover -> FATAL. */
SetRSDead(rs);
if (StripeError(stripe)) {
char buf[BDEVNAME_SIZE];
DMERR("stopping recovery due to "
"ERROR on /dev/%s, stripe at offset %llu",
bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
(unsigned long long) stripe->key);
}
/* Make sure that all quiesced regions get released. */
while (addr->reg) {
dm_rh_recovery_end(addr->reg, -EIO);
addr->reg = dm_rh_recovery_start(rs->recover.rh);
}
return 0;
}
/* Called by main io daemon to recover regions. */
static void do_recovery(struct raid_set *rs)
{
if (RSRecover(rs)) {
int r = 0;
struct stripe *stripe;
list_for_each_entry(stripe, &rs->recover.stripes,
lists[LIST_RECOVER])
r += _do_recovery(stripe);
if (!r) {
set_end_recovery(rs);
stripe_recover_free(rs);
}
}
}
/*
* END recovery functions
*/
/* End io process all stripes handed in by endio() callback. */
static void _do_endios(struct raid_set *rs, struct stripe *stripe,
struct list_head *flush_list)
{
/* First unlock all required chunks. */
stripe_chunks_unlock(stripe);
/*
* If an io error on a stripe occurred, degrade the RAID set
* and try to endio as many bios as possible. If any bios can't
* be endio processed, requeue the stripe (stripe_ref() != 0).
*/
if (TestClearStripeError(stripe)) {
/*
* FIXME: if read, rewrite the failed chunk after reconstruction
* in order to trigger disk bad sector relocation.
*/
rs_check_degrade(stripe); /* Resets ChunkError(). */
ClearStripeReconstruct(stripe);
ClearStripeReconstructed(stripe);
}
/* Got to reconstruct a missing chunk. */
if (StripeReconstruct(stripe)) {
/*
* (*2*) We use StripeReconstruct() to allow for
* all chunks to be xored into the reconstructed
* one (see chunk_must_xor()).
*/
stripe_reconstruct(stripe);
/*
* (*3*) Now we reset StripeReconstruct() and flag
* StripeReconstructed() to show to stripe_rw(),
* that we have reconstructed a missing chunk.
*/
ClearStripeReconstruct(stripe);
SetStripeReconstructed(stripe);
/* FIXME: reschedule to be written in case of read. */
// if (!StripeRBW(stripe)) {
// chunk_set(CHUNK(stripe, pr), DIRTY);
// stripe_chunks_rw(stripe);
// }
}
/*
* Now that we eventually got a complete stripe, we
* can process the rest of the end ios on reads.
*/
stripe_endio(READ, stripe);
/* End io all merged writes. */
if (TestClearStripeMerged(stripe))
stripe_endio(WRITE_MERGED, stripe);
/* If RAID set is dead -> fail any ios to dead drives. */
if (RSDead(rs)) {
DMERR_LIMIT("RAID set dead: failing ios to dead devices");
stripe_fail_io(stripe);
}
/*
* We have stripe references still,
* because of read before writes or IO errors ->
* got to put on flush list for processing.
*/
if (stripe_ref(stripe)) {
BUG_ON(!list_empty(stripe->lists + LIST_LRU));
list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
} else
stripe_lru_add(stripe);
}
/* Pop any endio stripes off of the endio list and belabour them. */
static void do_endios(struct raid_set *rs)
{
struct stripe_cache *sc = &rs->sc;
struct stripe *stripe;
/* IO flush list for sorted requeued stripes. */
struct list_head flush_list;
INIT_LIST_HEAD(&flush_list);
while ((stripe = stripe_endio_pop(sc))) {
/* Avoid endio on stripes with newly io'ed chunks. */
if (!stripe_io_ref(stripe))
_do_endios(rs, stripe, &flush_list);
}
/*
* Insert any requeued stripes in the proper
* order at the beginning of the io (flush) list.
*/
list_splice(&flush_list, sc->lists + LIST_FLUSH);
}
/* Flush any stripes on the io list. */
static void do_flush(struct raid_set *rs)
{
struct stripe *stripe;
while ((stripe = stripe_io_pop(&rs->sc)))
stripe_rw(stripe); /* Read/write stripe. */
}
/* Stripe cache resizing. */
static void do_sc_resize(struct raid_set *rs)
{
unsigned set = atomic_read(&rs->sc.stripes_to_set);
if (set) {
unsigned cur = atomic_read(&rs->sc.stripes);
int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
sc_shrink(&rs->sc, cur - set);
/* Flag end of resizing if ok. */
if (!r)
atomic_set(&rs->sc.stripes_to_set, 0);
}
}
/*
* Process all ios
*
* We do different things with the io depending
* on the state of the region that it is in:
*
* o reads: hang off stripe cache or postpone if full
*
* o writes:
*
* CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
* In case stripe cache is full or busy, postpone the io.
*
* RECOVERING: delay the io until recovery of the region completes.
*
*/
static void do_ios(struct raid_set *rs, struct bio_list *ios)
{
int r;
unsigned flush = 0, delay = 0;
sector_t sector;
struct dm_rh_client *rh = rs->recover.rh;
struct bio *bio;
struct bio_list reject;
bio_list_init(&reject);
/*
* Classify each io:
* o delay writes to recovering regions (let reads go through)
* o queue io to all other regions
*/
while ((bio = bio_list_pop(ios))) {
/*
* In case we get a barrier bio, push it back onto
* the input queue unless all work queues are empty
* and the stripe cache is inactive.
*/
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
/* REMOVEME: statistics. */
atomic_inc(rs->stats + S_BARRIER);
if (delay ||
!list_empty(rs->sc.lists + LIST_FLUSH) ||
!bio_list_empty(&reject) ||
sc_active(&rs->sc)) {
bio_list_push(ios, bio);
break;
}
}
/* Check for recovering regions. */
sector = _sector(rs, bio);
r = region_state(rs, sector, DM_RH_RECOVERING);
if (unlikely(r && bio_data_dir(bio) == WRITE)) {
delay++;
/* Delay writes to recovering regions. */
dm_rh_delay_by_region(rh, bio,
dm_rh_sector_to_region(rh,
sector));
/* REMOVEME: statistics.*/
atomic_inc(rs->stats + S_DELAYED_BIOS);
atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
/* Force bandwidth tests in recovery. */
SetRSBandwidth(rs);
} else {
/*
* Process ios to non-recovering regions by queueing
* them to stripes (does dm_rh_inc() for writes).
*/
flush += stripe_queue_bio(rs, bio, &reject);
}
}
if (flush) {
/* FIXME: better error handling. */
r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
if (r)
DMERR_LIMIT("dirty log flush");
}
/* Merge any rejected bios back to the head of the input list. */
bio_list_merge_head(ios, &reject);
}
/* Unplug: let any queued io roll on the set's devices. */
static void do_unplug(struct raid_set *rs)
{
struct raid_dev *dev = rs->dev + rs->set.raid_devs;
while (dev-- > rs->dev) {
/* Only call any device unplug function, if io got queued. */
if (TestClearDevIoQueued(dev))
blk_unplug(bdev_get_queue(dev->dev->bdev));
}
}
/* Send an event in case we're getting too busy. */
static void do_busy_event(struct raid_set *rs)
{
if (sc_busy(rs)) {
if (!TestSetRSScBusy(rs))
schedule_work(&rs->io.ws_do_table_event);
}
ClearRSScBusy(rs);
}
/* Throw an event. */
static void do_table_event(struct work_struct *ws)
{
struct raid_set *rs = container_of(ws, struct raid_set,
io.ws_do_table_event);
dm_table_event(rs->ti->table);
}
/*-----------------------------------------------------------------
* RAID daemon
*---------------------------------------------------------------*/
/*
* o belabour all end ios
* o update the region hash states
* o optionally resize the stripe cache
* o optionally do recovery
* o unplug any component raid devices with queued bios
* o grab the input queue
* o work on all requeued or new ios and perform stripe cache flushes
* o unplug any component raid devices with queued bios
* o check whether the stripe cache gets too busy and throw an event if so
*/
static void do_raid(struct work_struct *ws)
{
struct raid_set *rs = container_of(ws, struct raid_set,
io.dws_do_raid.work);
struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
/*
* We always need to end io, so that ios can get errored in
* case the set failed and the region counters get decremented
* before we update region hash states and go any further.
*/
do_endios(rs);
dm_rh_update_states(rs->recover.rh, 1);
/*
* Now that we've end io'd, which may have put stripes on the LRU list
* to allow for shrinking, we resize the stripe cache if requested.
*/
do_sc_resize(rs);
/* Try to recover regions. */
do_recovery(rs);
do_unplug(rs); /* Unplug the set's device queues. */
/* Quickly grab all new ios queued and add them to the work list. */
mutex_lock(&rs->io.in_lock);
bio_list_merge(ios, ios_in);
bio_list_init(ios_in);
mutex_unlock(&rs->io.in_lock);
if (!bio_list_empty(ios))
do_ios(rs, ios); /* Got ios to work into the cache. */
do_flush(rs); /* Flush any stripes on io list. */
do_unplug(rs); /* Unplug the set's device queues. */
do_busy_event(rs); /* Check if we got too busy. */
}
/*
* Callback for region hash to dispatch
* delayed bios queued to recovered regions
* (gets called via dm_rh_update_states()).
*/
static void dispatch_delayed_bios(void *context, struct bio_list *bl)
{
struct raid_set *rs = context;
struct bio *bio;
/* REMOVEME: statistics; decrement pending delayed bios counter. */
bio_list_for_each(bio, bl)
atomic_dec(rs->stats + S_DELAYED_BIOS);
/* Merge region hash private list to work list. */
bio_list_merge_head(&rs->io.work, bl);
bio_list_init(bl);
ClearRSBandwidth(rs);
}
/*************************************************************
* Constructor helpers
*************************************************************/
/* Calculate MB/sec. */
static unsigned mbpers(struct raid_set *rs, unsigned speed)
{
return to_bytes(speed * rs->set.data_devs *
rs->recover.io_size * HZ >> 10) >> 10;
}
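/*
* Illustrative numbers (assumed, not measured): with HZ = 250,
* 3 data devices, recover.io_size = 256 sectors and speed = 4 xors
* per tick, mbpers() yields to_bytes(4 * 3 * 256 * 250 >> 10) >> 10 =
* to_bytes(750) >> 10 = 384000 >> 10 = 375 MB/s.
*/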
/*
* Discover fastest xor algorithm and # of chunks combination.
*/
/* Calculate speed for algorithm and # of chunks. */
static unsigned xor_speed(struct stripe *stripe)
{
unsigned r = 0;
unsigned long j;
/* Wait for next tick. */
for (j = jiffies; j == jiffies; )
;
/* Do xors for a full tick. */
for (j = jiffies; j == jiffies; ) {
mb();
common_xor(stripe, stripe->io.size, 0, 0);
mb();
r++;
}
return r;
}
/* Optimize xor algorithm for this RAID set. */
static unsigned xor_optimize(struct raid_set *rs)
{
unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
struct stripe *stripe;
BUG_ON(list_empty(&rs->recover.stripes));
stripe = list_first_entry(&rs->recover.stripes, struct stripe,
lists[LIST_RECOVER]);
/* Must set uptodate so that xor() will belabour chunks. */
while (p--)
SetChunkUptodate(CHUNK(stripe, p));
/* Try all xor functions. */
while (f-- > xor_funcs) {
unsigned speed;
/* Set actual xor function for common_xor(). */
rs->xor.f = f;
rs->xor.chunks = (f->f == xor_blocks_wrapper ?
(MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;
while (rs->xor.chunks-- > 2) {
speed = xor_speed(stripe);
if (speed > speed_max) {
speed_max = speed;
chunks_max = rs->xor.chunks;
f_max = f;
}
}
}
/* Memorize optimum parameters. */
rs->xor.f = f_max;
rs->xor.chunks = chunks_max;
return speed_max;
}
/*
* Allocate a RAID context (a RAID set)
*/
/* Structure for variable RAID parameters. */
struct variable_parms {
int bandwidth;
int bandwidth_parm;
int chunk_size;
int chunk_size_parm;
int io_size;
int io_size_parm;
int stripes;
int stripes_parm;
int recover_io_size;
int recover_io_size_parm;
int raid_parms;
int recovery;
int recovery_stripes;
int recovery_stripes_parm;
};
static struct raid_set *
context_alloc(struct raid_type *raid_type, struct variable_parms *p,
unsigned raid_devs, sector_t sectors_per_dev,
struct dm_target *ti, unsigned dl_parms, char **argv)
{
int r;
size_t len;
sector_t region_size, ti_len;
struct raid_set *rs = NULL;
struct dm_dirty_log *dl;
struct recover *rec;
/*
* Create the dirty log
*
* We need to change the length for the dirty log constructor,
* because we want the number of regions for all stripes derived
* from the single device size, so that we can keep region
* size = 2^^n independent of the number of devices
*/
ti_len = ti->len;
ti->len = sectors_per_dev;
dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2);
ti->len = ti_len;
if (!dl)
goto bad_dirty_log;
/* Chunk size *must* be smaller than region size. */
region_size = dl->type->get_region_size(dl);
if (p->chunk_size > region_size)
goto bad_chunk_size;
/* Recover io size *must* be smaller than region size as well. */
if (p->recover_io_size > region_size)
goto bad_recover_io_size;
/* Size and allocate the RAID set structure. */
len = sizeof(*rs->data) + sizeof(*rs->dev);
if (dm_array_too_big(sizeof(*rs), len, raid_devs))
goto bad_array;
len = sizeof(*rs) + raid_devs * len;
rs = kzalloc(len, GFP_KERNEL);
if (!rs)
goto bad_alloc;
rec = &rs->recover;
atomic_set(&rs->io.in_process, 0);
atomic_set(&rs->io.in_process_max, 0);
rec->io_size = p->recover_io_size;
/* Pointer to data array. */
rs->data = (unsigned long **)
((void *) rs->dev + raid_devs * sizeof(*rs->dev));
rec->dl = dl;
rs->set.raid_devs = raid_devs;
rs->set.data_devs = raid_devs - raid_type->parity_devs;
rs->set.raid_type = raid_type;
rs->set.raid_parms = p->raid_parms;
rs->set.chunk_size_parm = p->chunk_size_parm;
rs->set.io_size_parm = p->io_size_parm;
rs->sc.stripes_parm = p->stripes_parm;
rec->io_size_parm = p->recover_io_size_parm;
rec->bandwidth_parm = p->bandwidth_parm;
rec->recovery = p->recovery;
rec->recovery_stripes = p->recovery_stripes;
/*
* Set chunk and io size and respective shifts
* (used to avoid divisions)
*/
rs->set.chunk_size = p->chunk_size;
rs->set.chunk_shift = ffs(p->chunk_size) - 1;
rs->set.io_size = p->io_size;
rs->set.io_mask = p->io_size - 1;
/* Mask to adjust address key in case io_size != chunk_size. */
rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
rs->set.sectors_per_dev = sectors_per_dev;
rs->set.ei = -1; /* Indicate no failed device. */
atomic_set(&rs->set.failed_devs, 0);
rs->ti = ti;
atomic_set(rec->io_count + IO_WORK, 0);
atomic_set(rec->io_count + IO_RECOVER, 0);
/* Initialize io lock and queues. */
mutex_init(&rs->io.in_lock);
bio_list_init(&rs->io.in);
bio_list_init(&rs->io.work);
init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
wake_dummy, wake_do_raid, 0, p->recovery_stripes,
dl, region_size, rec->nr_regions);
if (IS_ERR(rec->rh))
goto bad_rh;
/* Initialize stripe cache. */
r = sc_init(rs, p->stripes);
if (r)
goto bad_sc;
/* REMOVEME: statistics. */
stats_reset(rs);
ClearRSDevelStats(rs); /* Disable development status. */
return rs;
bad_dirty_log:
TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
bad_chunk_size:
dm_dirty_log_destroy(dl);
TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
bad_recover_io_size:
dm_dirty_log_destroy(dl);
TI_ERR_RET("Recover stripe io size larger than region size",
ERR_PTR(-EINVAL));
bad_array:
dm_dirty_log_destroy(dl);
TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL));
bad_alloc:
dm_dirty_log_destroy(dl);
TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
bad_rh:
dm_dirty_log_destroy(dl);
ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
goto free_rs;
bad_sc:
dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
sc_exit(&rs->sc);
ti->error = DM_MSG_PREFIX "Error creating stripe cache";
free_rs:
kfree(rs);
return ERR_PTR(-ENOMEM);
}
/* Free a RAID context (a RAID set). */
static void context_free(struct raid_set *rs, unsigned p)
{
while (p--)
dm_put_device(rs->ti, rs->dev[p].dev);
sc_exit(&rs->sc);
dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
kfree(rs);
}
/* Create work queue and initialize delayed work. */
static int rs_workqueue_init(struct raid_set *rs)
{
struct dm_target *ti = rs->ti;
rs->io.wq = create_singlethread_workqueue(DAEMON);
if (!rs->io.wq)
TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
return 0;
}
/* Return pointer to raid_type structure for raid name. */
static struct raid_type *get_raid_type(char *name)
{
struct raid_type *r = ARRAY_END(raid_types);
while (r-- > raid_types) {
if (!strcmp(r->name, name))
return r;
}
return NULL;
}
/* FIXME: factor out to dm core. */
static int multiple(sector_t a, sector_t b, sector_t *n)
{
sector_t r = a;
sector_div(r, b);
*n = r;
return a == r * b;
}
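/*
* E.g. multiple(600, 200, &n) sets n = 3 and returns true, whereas
* multiple(601, 200, &n) sets n = 3 and returns false.
*/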
/* Log RAID set information to kernel log. */
static void rs_log(struct raid_set *rs, unsigned speed)
{
unsigned p;
char buf[BDEVNAME_SIZE];
for (p = 0; p < rs->set.raid_devs; p++)
DMINFO("/dev/%s is raid disk %u%s",
bdevname(rs->dev[p].dev->bdev, buf), p,
(p == rs->set.pi) ? " (parity)" : "");
DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
"algorithm \"%s\", %u chunks with %uMB/s\n"
"%s set with net %u/%u devices",
rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
atomic_read(&rs->sc.stripes),
rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed),
rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
}
/* Get all devices and offsets. */
static int dev_parms(struct raid_set *rs, char **argv, int *p)
{
struct dm_target *ti = rs->ti;
for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
int r;
unsigned long long tmp;
struct raid_dev *dev = rs->dev + *p;
/* Get offset and device. */
if (sscanf(argv[1], "%llu", &tmp) != 1 ||
tmp > rs->set.sectors_per_dev)
TI_ERR("Invalid RAID device offset parameter");
dev->start = tmp;
r = dm_get_device(ti, *argv, dev->start,
rs->set.sectors_per_dev,
dm_table_get_mode(ti->table), &dev->dev);
if (r)
TI_ERR_RET("RAID device lookup failure", r);
r = raid_dev_lookup(rs, dev);
if (r != -ENODEV && r < *p) {
(*p)++; /* Ensure dm_put_device() on actual device. */
TI_ERR_RET("Duplicate RAID device", -ENXIO);
}
}
return 0;
}
/* Set recovery bandwidth. */
static void
recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
{
rs->recover.bandwidth = bandwidth;
rs->recover.bandwidth_work = 100 / bandwidth;
}
/* Handle variable number of RAID parameters. */
static int get_raid_variable_parms(struct dm_target *ti, char **argv,
struct variable_parms *vp)
{
int p, value;
struct {
int action; /* -1: skip, 0: no power2 check, 1: power2 check */
char *errmsg;
int min, max;
int *var, *var2, *var3;
} argctr[] = {
{ 1,
"Invalid chunk size; must be -1 or 2^^n and <= 16384",
IO_SIZE_MIN, CHUNK_SIZE_MAX,
&vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
{ 0,
"Invalid number of stripes: must be -1 or >= 8 and <= 16384",
STRIPES_MIN, STRIPES_MAX,
&vp->stripes_parm, &vp->stripes, NULL },
{ 1,
"Invalid io size; must -1 or >= 8, 2^^n and less equal "
"min(BIO_MAX_SECTORS/2, chunk size)",
IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
&vp->io_size_parm, &vp->io_size, NULL },
{ 1,
"Invalid recovery io size; must be -1 or "
"2^^n and less equal BIO_MAX_SECTORS/2",
RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
&vp->recover_io_size_parm, &vp->recover_io_size, NULL },
{ 0,
"Invalid recovery bandwidth percentage; "
"must be -1 or > 0 and <= 100",
BANDWIDTH_MIN, BANDWIDTH_MAX,
&vp->bandwidth_parm, &vp->bandwidth, NULL },
/* Handle sync argument separately in loop. */
{ -1,
"Invalid recovery switch; must be \"sync\" or \"nosync\"" },
{ 0,
"Invalid number of recovery stripes;"
"must be -1, > 0 and <= 16384",
RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
&vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
}, *varp;
/* Fetch # of variable raid parameters. */
if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
!range_ok(vp->raid_parms, 0, 7))
TI_ERR("Bad variable raid parameters number");
/* Preset variable RAID parameters. */
vp->chunk_size = CHUNK_SIZE_DEFAULT;
vp->io_size = IO_SIZE_DEFAULT;
vp->stripes = STRIPES_DEFAULT;
vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
vp->bandwidth = BANDWIDTH_DEFAULT;
vp->recovery = 1;
vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
/* Walk the array of argument constraints for all given ones. */
for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
BUG_ON(varp >= ARRAY_END(argctr));
/* Special case for "[no]sync" string argument. */
if (varp->action < 0) {
if (!strcmp(*argv, "sync"))
;
else if (!strcmp(*argv, "nosync"))
vp->recovery = 0;
else
TI_ERR(varp->errmsg);
argv++;
continue;
}
/*
* Special case for io_size depending
* on previously set chunk size.
*/
if (p == 2)
varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
if (sscanf(*(argv++), "%d", &value) != 1 ||
(value != -1 &&
((varp->action && !POWER_OF_2(value)) ||
!range_ok(value, varp->min, varp->max))))
TI_ERR(varp->errmsg);
*varp->var = value;
if (value != -1) {
if (varp->var2)
*varp->var2 = value;
if (varp->var3)
*varp->var3 = value;
}
}
return 0;
}
/* Parse optional locking parameters. */
static int get_raid_locking_parms(struct dm_target *ti, char **argv,
int *locking_parms,
struct dm_raid45_locking_type **locking_type)
{
if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
char *lckstr = argv[1];
size_t lcksz = strlen(lckstr);
if (!strnicmp(lckstr, "none", lcksz)) {
*locking_type = &locking_none;
*locking_parms = 2;
} else if (!strnicmp(lckstr, "cluster", lcksz)) {
DMERR("locking type \"%s\" not yet implemented",
lckstr);
return -EINVAL;
} else {
DMERR("unknown locking type \"%s\"", lckstr);
return -EINVAL;
}
}
*locking_parms = 0;
*locking_type = &locking_none;
return 0;
}
/* Set backing device read ahead properties of RAID set. */
static void rs_set_read_ahead(struct raid_set *rs,
unsigned sectors, unsigned stripes)
{
unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
struct mapped_device *md = dm_table_get_md(rs->ti->table);
struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
/* Set read-ahead for the RAID set and the component devices. */
if (ra_pages) {
unsigned p = rs->set.raid_devs;
bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
while (p--) {
struct request_queue *q =
bdev_get_queue(rs->dev[p].dev->bdev);
q->backing_dev_info.ra_pages = ra_pages;
}
}
dm_put(md);
}
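/*
* Rough example (assuming PAGE_SIZE = 4096, i.e. SECTORS_PER_PAGE = 8):
* the constructor calls this with sectors = 2 * chunk_size and
* stripes = 4, so chunk_size = 64 and 3 data devices give
* ra_pages = dm_div_up(128, 8) = 16, a set read ahead of
* 4 * 16 * 3 = 192 pages and 16 pages on each component device.
*/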
/* Set congested function. */
static void rs_set_congested_fn(struct raid_set *rs)
{
struct mapped_device *md = dm_table_get_md(rs->ti->table);
struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
/* Set congested function and data. */
bdi->congested_fn = rs_congested;
bdi->congested_data = rs;
dm_put(md);
}
/*
* Construct a RAID4/5 mapping:
*
* log_type #log_params <log_params> \
* raid_type [#parity_dev] #raid_variable_params <raid_params> \
* [locking "none"/"cluster"]
* #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
*
* log_type = "core"/"disk",
* #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
* log_params = [dirty_log_path] region_size [[no]sync]
*
* raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
*
* #parity_dev = N if raid_type = "raid4"
* o N = -1: pick default = last device
* o N >= 0 and < #raid_devs: parity device index
*
* #raid_variable_params = 0-7; raid_params (-1 = default):
* [chunk_size [#stripes [io_size [recover_io_size \
* [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
* o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
* and <= CHUNK_SIZE_MAX)
* o #stripes is number of stripes allocated to stripe cache
* (must be > 1 and < STRIPES_MAX)
* o io_size (io unit size per device in sectors; must be 2^^n and > 8)
* o recover_io_size (io unit size per device for recovery in sectors;
must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
* o %recovery_bandwith is the maximum amount spend for recovery during
* application io (1-100%)
* o recovery switch = [sync|nosync]
* o #recovery_stripes is the number of recovery stripes used for
* parallel recovery of the RAID set
* If raid_variable_params = 0, defaults will be used.
* Any raid_variable_param can be set to -1 to apply a default
*
* #raid_devs = N (N >= 3)
*
* #dev_to_initialize = N
* -1: initialize parity on all devices
* >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
* of a failed device's content after replacement
*
* <dev_path> = device_path (eg, /dev/sdd1)
* <offset> = begin at offset on <dev_path>
*
*/
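/*
* Illustrative example table line (hypothetical devices and sizes;
* all variable raid parameters left at their defaults):
*
* 0 2097152 raid45 core 2 8192 nosync \
* raid5_ls 0 3 -1 /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0
*
* i.e. a 1 GiB raid5_ls set over three devices (two data plus one
* rotating parity) with a core dirty log using 8192 sector regions
* and no initial resynchronization.
*/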
#define MIN_PARMS 13
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int dev_to_init, dl_parms, i, locking_parms,
parity_parm, pi = -1, r, raid_devs;
unsigned speed;
sector_t tmp, sectors_per_dev;
struct dm_raid45_locking_type *locking;
struct raid_set *rs;
struct raid_type *raid_type;
struct variable_parms parms;
/* Ensure minimum number of parameters. */
if (argc < MIN_PARMS)
TI_ERR("Not enough parameters");
/* Fetch # of dirty log parameters. */
if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
!range_ok(dl_parms, 1, 4711)) /* ;-) */
TI_ERR("Bad dirty log parameters number");
/* Check raid_type. */
raid_type = get_raid_type(argv[dl_parms + 2]);
if (!raid_type)
TI_ERR("Bad raid type");
/* In case of RAID4, parity drive is selectable. */
parity_parm = !!(raid_type->level == raid4);
/* Handle variable number of RAID parameters. */
r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
&parms);
if (r)
return r;
/* Handle any locking parameters. */
r = get_raid_locking_parms(ti,
argv + dl_parms + parity_parm +
parms.raid_parms + 4,
&locking_parms, &locking);
if (r)
return r;
/* # of raid devices. */
i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
raid_devs < raid_type->minimal_devs)
TI_ERR("Invalid number of raid devices");
/* In case of RAID4, check that the parity drive index is within limits. */
if (raid_type->level == raid4) {
/* Fetch index of parity device. */
if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
(pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
TI_ERR("Invalid RAID4 parity device index");
}
/*
* Index of device to initialize starts at 0
*
* o -1 -> don't initialize a selected device;
* initialize parity conforming to algorithm
* o 0..raid_devs-1 -> initialize respective device
* (used for reconstruction of a replaced device)
*/
if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
locking_parms + 5], "%d", &dev_to_init) != 1 ||
!range_ok(dev_to_init, -1, raid_devs - 1))
TI_ERR("Invalid number for raid device to initialize");
/* Check # of raid device arguments. */
if (argc - dl_parms - parity_parm - parms.raid_parms -
locking_parms - 6 != 2 * raid_devs)
TI_ERR("Wrong number of raid device/offset arguments");
/*
* Check that the table length is divisible
* w/o rest by (raid_devs - parity_devs)
*/
if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
&sectors_per_dev))
TI_ERR("Target length not divisible by number of data devices");
/*
* Check that the device size is
* divisible w/o rest by chunk size
*/
if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
TI_ERR("Device length not divisible by chunk_size");
/****************************************************************
* Now that we checked the constructor arguments ->
* let's allocate the RAID set
****************************************************************/
rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
ti, dl_parms, argv);
if (IS_ERR(rs))
return PTR_ERR(rs);
rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
rs->set.pi = rs->set.pi_parm = pi;
/* Set RAID4 parity drive index. */
if (raid_type->level == raid4)
rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
recover_set_bandwidth(rs, parms.bandwidth);
/* Use locking type to lock stripe access. */
rs->locking = locking;
/* Get the device/offset tuples. */
argv += dl_parms + 6 + parity_parm + parms.raid_parms + locking_parms;
r = dev_parms(rs, argv, &i);
if (r)
goto err;
/* Set backing device information (eg. read ahead). */
rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */);
rs_set_congested_fn(rs); /* Set congested function. */
SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
speed = xor_optimize(rs); /* Select best xor algorithm. */
/* Set for recovery of any nosync regions. */
if (parms.recovery)
SetRSRecover(rs);
else {
/*
* Need to free recovery stripe(s) here in case
* of nosync, because xor_optimize uses one.
*/
set_start_recovery(rs);
set_end_recovery(rs);
stripe_recover_free(rs);
}
/*
* Make sure that dm core only hands maximum io size
* length down and pays attention to io boundaries.
*/
ti->split_io = rs->set.io_size;
ti->private = rs;
/* Initialize work queue to handle this RAID set's io. */
r = rs_workqueue_init(rs);
if (r)
goto err;
rs_log(rs, speed); /* Log information about RAID set. */
return 0;
err:
context_free(rs, i);
return r;
}
/*
* Destruct a raid mapping
*/
static void raid_dtr(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
destroy_workqueue(rs->io.wq);
context_free(rs, rs->set.raid_devs);
}
/* Raid mapping function. */
static int raid_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
/* I don't want to waste stripe cache capacity. */
if (bio_rw(bio) == READA)
return -EIO;
else {
struct raid_set *rs = ti->private;
/*
* Get an io reference to wait for it to drop
* to zero on device suspension/destruction.
*/
io_get(rs);
bio->bi_sector -= ti->begin; /* Remap sector. */
/* Queue io to RAID set. */
mutex_lock(&rs->io.in_lock);
bio_list_add(&rs->io.in, bio);
mutex_unlock(&rs->io.in_lock);
/* Wake daemon to process input list. */
wake_do_raid(rs);
/* REMOVEME: statistics. */
atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
S_BIOS_READ : S_BIOS_WRITE));
return DM_MAPIO_SUBMITTED; /* Handle later. */
}
}
/* Device suspend. */
static void raid_presuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
struct dm_dirty_log *dl = rs->recover.dl;
SetRSSuspend(rs);
if (RSRecover(rs))
dm_rh_stop_recovery(rs->recover.rh);
cancel_delayed_work(&rs->io.dws_do_raid);
flush_workqueue(rs->io.wq);
wait_ios(rs); /* Wait for completion of all ios being processed. */
if (dl->type->presuspend && dl->type->presuspend(dl))
/* FIXME: need better error handling. */
DMWARN("log presuspend failed");
}
static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
struct dm_dirty_log *dl = rs->recover.dl;
if (dl->type->postsuspend && dl->type->postsuspend(dl))
/* FIXME: need better error handling. */
DMWARN("log postsuspend failed");
}
/* Device resume. */
static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
struct recover *rec = &rs->recover;
struct dm_dirty_log *dl = rec->dl;
if (dl->type->resume && dl->type->resume(dl))
/* Resume dirty log. */
/* FIXME: need better error handling. */
DMWARN("log resume failed");
rec->nr_regions_to_recover =
rec->nr_regions - dl->type->get_sync_count(dl);
/* Restart any unfinished recovery. */
if (RSRecover(rs)) {
set_start_recovery(rs);
dm_rh_start_recovery(rec->rh);
}
ClearRSSuspend(rs);
wake_do_raid(rs);
}
/* Return stripe cache size. */
static unsigned sc_size(struct raid_set *rs)
{
return to_sector(atomic_read(&rs->sc.stripes) *
(sizeof(struct stripe) +
(sizeof(struct stripe_chunk) +
(sizeof(struct page_list) +
to_bytes(rs->set.io_size) *
rs->set.raid_devs)) +
(rs->recover.end_jiffies ?
0 : rs->recover.recovery_stripes *
to_bytes(rs->set.raid_devs * rs->recover.io_size))));
}
/* REMOVEME: status output for development. */
static void raid_devel_stats(struct dm_target *ti, char *result,
unsigned *size, unsigned maxlen)
{
unsigned sz = *size;
unsigned long j;
char buf[BDEVNAME_SIZE], *p;
struct stats_map *sm;
struct raid_set *rs = ti->private;
struct recover *rec = &rs->recover;
struct timespec ts;
DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
DMEMIT("act_ios=%d ", io_ref(rs));
DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
DMEMIT("act_stripes_max=%d\n",
atomic_read(&rs->sc.active_stripes_max));
for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
atomic_read(&rs->sc.stripes), rs->set.io_size,
rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
sc_size(rs));
j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
rec->start_jiffies;
jiffies_to_timespec(j, &ts);
sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
p = strchr(buf, '.');
p[3] = 0;
DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
(unsigned long long) rec->nr_regions_recovered,
(unsigned long long) rec->nr_regions_to_recover,
(unsigned long long) rec->nr_regions, rec->bandwidth, buf);
*size = sz;
}
static int raid_status(struct dm_target *ti, status_type_t type,
char *result, unsigned maxlen)
{
unsigned p, sz = 0;
char buf[BDEVNAME_SIZE];
struct raid_set *rs = ti->private;
int raid_parms[] = {
rs->set.chunk_size_parm,
rs->sc.stripes_parm,
rs->set.io_size_parm,
rs->recover.io_size_parm,
rs->recover.bandwidth_parm,
-2,
rs->recover.recovery_stripes,
};
switch (type) {
case STATUSTYPE_INFO:
/* REMOVEME: statistics. */
if (RSDevelStats(rs))
raid_devel_stats(ti, result, &sz, maxlen);
DMEMIT("%u ", rs->set.raid_devs);
for (p = 0; p < rs->set.raid_devs; p++)
DMEMIT("%s ",
format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
DMEMIT("1 ");
for (p = 0; p < rs->set.raid_devs; p++) {
DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
if (p == rs->set.pi)
DMEMIT("p");
if (rs->set.dev_to_init == p)
DMEMIT("i");
}
break;
case STATUSTYPE_TABLE:
sz = rs->recover.dl->type->status(rs->recover.dl, type,
result, maxlen);
DMEMIT("%s %u ", rs->set.raid_type->name,
rs->set.raid_parms);
for (p = 0; p < rs->set.raid_parms; p++) {
if (raid_parms[p] > -2)
DMEMIT("%d ", raid_parms[p]);
else
DMEMIT("%s ", rs->recover.recovery ?
"sync" : "nosync");
}
DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
for (p = 0; p < rs->set.raid_devs; p++)
DMEMIT("%s %llu ",
format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
(unsigned long long) rs->dev[p].start);
}
return 0;
}
/*
* Message interface
*/
enum raid_msg_actions {
act_bw, /* Recovery bandwidth switch. */
act_dev, /* Device failure switch. */
act_overwrite, /* Stripe overwrite check. */
act_stats, /* Development statistics switch. */
act_sc, /* Stripe cache switch. */
act_on, /* Set entity on. */
act_off, /* Set entity off. */
act_reset, /* Reset entity. */
act_set = act_on, /* Set # absolute. */
act_grow = act_off, /* Grow # by an amount. */
act_shrink = act_reset, /* Shrink # by an amount. */
};
/* Turn a delta into an absolute value. */
static int _absolute(unsigned long action, int act, int r)
{
/* Make delta absolute. */
if (test_bit(act_set, &action))
;
else if (test_bit(act_grow, &action))
r += act;
else if (test_bit(act_shrink, &action))
r = act - r;
else
r = -EINVAL;
return r;
}
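/*
* E.g. with a current bandwidth of 50 passed in as act, a
* "bandwidth grow 15" message yields 15 + 50 = 65, "shrink 15"
* yields 50 - 15 = 35 and "set 15" just 15.
*/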
/* Change recovery io bandwidth. */
static int bandwidth_change(struct dm_msg *msg, void *context)
{
struct raid_set *rs = context;
int act = rs->recover.bandwidth;
int bandwidth = DM_MSG_INT_ARG(msg);
if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
/* Make delta bandwidth absolute. */
bandwidth = _absolute(msg->action, act, bandwidth);
/* Check range. */
if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
recover_set_bandwidth(rs, bandwidth);
return 0;
}
}
set_bit(dm_msg_ret_arg, &msg->ret);
set_bit(dm_msg_ret_inval, &msg->ret);
return -EINVAL;
}
/* Set/reset development feature flags. */
static int devel_flags(struct dm_msg *msg, void *context)
{
struct raid_set *rs = context;
if (test_bit(act_on, &msg->action))
return test_and_set_bit(msg->spec->parm,
&rs->io.flags) ? -EPERM : 0;
else if (test_bit(act_off, &msg->action))
return test_and_clear_bit(msg->spec->parm,
&rs->io.flags) ? 0 : -EPERM;
else if (test_bit(act_reset, &msg->action)) {
if (test_bit(act_stats, &msg->action)) {
stats_reset(rs);
goto on;
} else if (test_bit(act_overwrite, &msg->action)) {
on:
set_bit(msg->spec->parm, &rs->io.flags);
return 0;
}
}
return -EINVAL;
}
/* Resize the stripe cache. */
static int sc_resize(struct dm_msg *msg, void *context)
{
int act, stripes;
struct raid_set *rs = context;
/* Deny permission in case the daemon is still resizing. */
if (atomic_read(&rs->sc.stripes_to_set))
return -EPERM;
stripes = DM_MSG_INT_ARG(msg);
if (stripes > 0) {
act = atomic_read(&rs->sc.stripes);
/* Make delta stripes absolute. */
stripes = _absolute(msg->action, act, stripes);
/*
* Check range and that the # of stripes changes.
* We leave the resizing to the worker.
*/
if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
stripes != atomic_read(&rs->sc.stripes)) {
atomic_set(&rs->sc.stripes_to_set, stripes);
wake_do_raid(rs);
return 0;
}
}
set_bit(dm_msg_ret_arg, &msg->ret);
set_bit(dm_msg_ret_inval, &msg->ret);
return -EINVAL;
}
/* Parse the RAID message action. */
/*
* 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
* 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
* 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
* 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
*
*/
static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
{
/* Variables to store the parsed parameters in. */
static int i[2];
static unsigned long *i_arg[] = {
(unsigned long *) i + 0,
(unsigned long *) i + 1,
};
/* Declare all message option strings. */
static char *str_sgs[] = { "set", "grow", "shrink" };
static char *str_oor[] = { "on", "off", "reset" };
/* Declare all actions. */
static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
static unsigned long act_oor[] = { act_on, act_off, act_reset };
/* Bandwidth option. */
static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
static struct dm_message_argument bw_args = {
1, i_arg, { dm_msg_int_t }
};
static struct dm_message_argument null_args = {
0, NULL, { dm_msg_int_t }
};
/* Overwrite and statistics option. */
static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
/* Stripecache option. */
static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
/* Declare messages. */
static struct dm_msg_spec specs[] = {
{ "bandwidth", act_bw, &bw_opt, &bw_args,
0, bandwidth_change },
{ "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
RS_CHECK_OVERWRITE, devel_flags },
{ "statistics", act_stats, &ovr_stats_opt, &null_args,
RS_DEVEL_STATS, devel_flags },
{ "stripecache", act_sc, &stripe_opt, &bw_args,
0, sc_resize },
};
/* The message for the parser. */
struct dm_msg msg = {
.num_specs = ARRAY_SIZE(specs),
.specs = specs,
};
return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
}
/*
* END message interface
*/
static struct target_type raid_target = {
.name = "raid45",
.version = {1, 0, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
.map = raid_map,
.presuspend = raid_presuspend,
.postsuspend = raid_postsuspend,
.resume = raid_resume,
.status = raid_status,
.message = raid_message,
};
static void init_exit(const char *bad_msg, const char *good_msg, int r)
{
if (r)
DMERR("Failed to %sregister target [%d]", bad_msg, r);
else
DMINFO("%s %s", good_msg, version);
}
static int __init dm_raid_init(void)
{
int r = dm_register_target(&raid_target);
init_exit("", "initialized", r);
return r;
}
static void __exit dm_raid_exit(void)
{
dm_unregister_target(&raid_target);
init_exit("un", "exit", 0);
}
/* Module hooks. */
module_init(dm_raid_init);
module_exit(dm_raid_exit);
MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");