| /*[A[A |
| * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. |
| * |
| * Module Author: Heinz Mauelshagen <heinzm@redhat.com> |
| * |
| * This file is released under the GPL. |
| * |
| * |
| * Linux 2.6 Device Mapper RAID4 and RAID5 target. |
| * |
| * Supports: |
| * o RAID4 with dedicated and selectable parity device |
| * o RAID5 with rotating parity (left+right, symmetric+asymmetric) |
 * o recovery of an out-of-sync device for initial
 *   RAID set creation or after dead drive replacement
 * o run-time optimization of the xor algorithm used to calculate parity
| * |
| * |
| * Thanks to MD for: |
| * o the raid address calculation algorithm |
| * o the base of the biovec <-> page list copier. |
| * |
| * |
 * Uses a region hash to keep track of how many writes are in flight to
 * regions, and a dirty log to keep the state of regions to recover:
| * |
| * o clean regions (those which are synchronized |
| * and don't have write io in flight) |
| * o dirty regions (those with write io in flight) |
| * |
| * |
| * On startup, any dirty regions are migrated to the |
| * 'nosync' state and are subject to recovery by the daemon. |
| * |
| * See raid_ctr() for table definition. |
| * |
| * FIXME: recovery bandwidth |
| */ |
| |
| static const char *version = "v0.2594b"; |
| |
| #include "dm.h" |
| #include "dm-memcache.h" |
| #include "dm-message.h" |
| #include "dm-raid45.h" |
| |
| #include <linux/kernel.h> |
| #include <linux/vmalloc.h> |
| #include <linux/raid/xor.h> |
| |
| #include <linux/bio.h> |
| #include <linux/dm-io.h> |
| #include <linux/dm-dirty-log.h> |
| #include "dm-region-hash.h" |
| |
| |
| /* |
| * Configurable parameters |
| */ |
| |
| /* Minimum/maximum and default # of selectable stripes. */ |
| #define STRIPES_MIN 8 |
| #define STRIPES_MAX 16384 |
| #define STRIPES_DEFAULT 80 |
| |
| /* Maximum and default chunk size in sectors if not set in constructor. */ |
| #define CHUNK_SIZE_MIN 8 |
| #define CHUNK_SIZE_MAX 16384 |
| #define CHUNK_SIZE_DEFAULT 64 |
| |
| /* Default io size in sectors if not set in constructor. */ |
| #define IO_SIZE_MIN CHUNK_SIZE_MIN |
| #define IO_SIZE_DEFAULT IO_SIZE_MIN |
| |
| /* Recover io size default in sectors. */ |
| #define RECOVER_IO_SIZE_MIN 64 |
| #define RECOVER_IO_SIZE_DEFAULT 256 |
| |
| /* Default, minimum and maximum percentage of recover io bandwidth. */ |
| #define BANDWIDTH_DEFAULT 10 |
| #define BANDWIDTH_MIN 1 |
| #define BANDWIDTH_MAX 100 |
| |
| /* # of parallel recovered regions */ |
| #define RECOVERY_STRIPES_MIN 1 |
| #define RECOVERY_STRIPES_MAX 64 |
| #define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN |
| /* |
| * END Configurable parameters |
| */ |
| |
| #define TARGET "dm-raid45" |
| #define DAEMON "kraid45d" |
| #define DM_MSG_PREFIX TARGET |
| |
| #define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT) |
| |
| /* Amount/size for __xor(). */ |
| #define XOR_SIZE PAGE_SIZE |
| |
| /* Check value in range. */ |
#define range_ok(i, min, max)	((i) >= (min) && (i) <= (max))
| |
| /* Check argument is power of 2. */ |
#define POWER_OF_2(a)	(!((a) & ((a) - 1)))
| |
| /* Structure access macros. */ |
| /* Derive raid_set from stripe_cache pointer. */ |
| #define RS(x) container_of(x, struct raid_set, sc) |
| |
| /* Page reference. */ |
| #define PAGE(stripe, p) ((stripe)->obj[p].pl->page) |
| |
| /* Stripe chunk reference. */ |
| #define CHUNK(stripe, p) ((stripe)->chunk + p) |
| |
| /* Bio list reference. */ |
| #define BL(stripe, p, rw) (stripe->chunk[p].bl + rw) |
| #define BL_CHUNK(chunk, rw) (chunk->bl + rw) |
| |
| /* Page list reference. */ |
| #define PL(stripe, p) (stripe->obj[p].pl) |
| /* END: structure access macros. */ |
| |
| /* Factor out to dm-bio-list.h */ |
| static inline void bio_list_push(struct bio_list *bl, struct bio *bio) |
| { |
| bio->bi_next = bl->head; |
| bl->head = bio; |
| |
| if (!bl->tail) |
| bl->tail = bio; |
| } |
| |
| /* Factor out to dm.h */ |
| #define TI_ERR_RET(str, ret) \ |
	do { ti->error = str; return ret; } while (0)
| #define TI_ERR(str) TI_ERR_RET(str, -EINVAL) |
| |
/* Macro to define IO flags access inline functions. */
| #define BITOPS(name, what, var, flag) \ |
| static inline int TestClear ## name ## what(struct var *v) \ |
| { return test_and_clear_bit(flag, &v->io.flags); } \ |
| static inline int TestSet ## name ## what(struct var *v) \ |
| { return test_and_set_bit(flag, &v->io.flags); } \ |
| static inline void Clear ## name ## what(struct var *v) \ |
| { clear_bit(flag, &v->io.flags); } \ |
| static inline void Set ## name ## what(struct var *v) \ |
| { set_bit(flag, &v->io.flags); } \ |
| static inline int name ## what(struct var *v) \ |
| { return test_bit(flag, &v->io.flags); } |
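/*
 * For example, "BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)" below
 * expands to the inline functions ChunkDirty(), SetChunkDirty(),
 * ClearChunkDirty(), TestSetChunkDirty() and TestClearChunkDirty(),
 * all operating on the CHUNK_DIRTY bit in a struct stripe_chunk's
 * io.flags member.
 */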
| |
| /*----------------------------------------------------------------- |
| * Stripe cache |
| * |
| * Cache for all reads and writes to raid sets (operational or degraded) |
| * |
| * We need to run all data to and from a RAID set through this cache, |
| * because parity chunks need to get calculated from data chunks |
| * or, in the degraded/resynchronization case, missing chunks need |
| * to be reconstructed using the other chunks of the stripe. |
| *---------------------------------------------------------------*/ |
| /* A chunk within a stripe (holds bios hanging off). */ |
| /* IO status flags for chunks of a stripe. */ |
| enum chunk_flags { |
| CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */ |
| CHUNK_ERROR, /* IO error on any chunk page. */ |
| CHUNK_IO, /* Allow/prohibit IO on chunk pages. */ |
| CHUNK_LOCKED, /* Chunk pages locked during IO. */ |
| CHUNK_MUST_IO, /* Chunk must io. */ |
| CHUNK_UNLOCK, /* Enforce chunk unlock. */ |
| CHUNK_UPTODATE, /* Chunk pages are uptodate. */ |
| }; |
| |
| #if READ != 0 || WRITE != 1 |
| #error dm-raid45: READ/WRITE != 0/1 used as index!!! |
| #endif |
| |
| enum bl_type { |
| WRITE_QUEUED = WRITE + 1, |
| WRITE_MERGED, |
| NR_BL_TYPES, /* Must be last one! */ |
| }; |
| struct stripe_chunk { |
| atomic_t cnt; /* Reference count. */ |
| struct stripe *stripe; /* Backpointer to stripe for endio(). */ |
| /* Bio lists for reads, writes, and writes merged. */ |
| struct bio_list bl[NR_BL_TYPES]; |
| struct { |
| unsigned long flags; /* IO status flags. */ |
| } io; |
| }; |
| |
| /* Define chunk bit operations. */ |
| BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) |
| BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR) |
| BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO) |
| BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED) |
| BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO) |
| BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK) |
| BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE) |
| |
| /* |
| * Stripe linked list indexes. Keep order, because the stripe |
| * and the stripe cache rely on the first 3! |
| */ |
| enum list_types { |
| LIST_FLUSH, /* Stripes to flush for io. */ |
| LIST_ENDIO, /* Stripes to endio. */ |
| LIST_LRU, /* Least recently used stripes. */ |
| SC_NR_LISTS, /* # of lists in stripe cache. */ |
| LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */ |
| LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */ |
| STRIPE_NR_LISTS,/* To size array in struct stripe. */ |
| }; |
| |
/* Addressing region recovery. */
| struct recover_addr { |
| struct dm_region *reg; /* Actual region to recover. */ |
| sector_t pos; /* Position within region to recover. */ |
| sector_t end; /* End of region to recover. */ |
| }; |
| |
| /* A stripe: the io object to handle all reads and writes to a RAID set. */ |
| struct stripe { |
| atomic_t cnt; /* Reference count. */ |
| struct stripe_cache *sc; /* Backpointer to stripe cache. */ |
| |
| /* |
| * 4 linked lists: |
| * o io list to flush io |
| * o endio list |
| * o LRU list to put stripes w/o reference count on |
| * o stripe cache hash |
| */ |
| struct list_head lists[STRIPE_NR_LISTS]; |
| |
| sector_t key; /* Hash key. */ |
| region_t region; /* Region stripe is mapped to. */ |
| |
| struct { |
| unsigned long flags; /* Stripe state flags (see below). */ |
| |
| /* |
| * Pending ios in flight: |
| * |
| * used to control move of stripe to endio list |
| */ |
| atomic_t pending; |
| |
| /* Sectors to read and write for multi page stripe sets. */ |
| unsigned size; |
| } io; |
| |
| /* Address region recovery. */ |
| struct recover_addr *recover; |
| |
| /* Lock on stripe (Future: for clustering). */ |
| void *lock; |
| |
| struct { |
| unsigned short parity; /* Parity chunk index. */ |
| short recover; /* Recovery chunk index. */ |
| } idx; |
| |
| /* |
| * This stripe's memory cache object (dm-mem-cache); |
| * i.e. the io chunk pages. |
| */ |
| struct dm_mem_cache_object *obj; |
| |
| /* Array of stripe sets (dynamically allocated). */ |
| struct stripe_chunk chunk[0]; |
| }; |
| |
| /* States stripes can be in (flags field). */ |
| enum stripe_states { |
| STRIPE_ERROR, /* io error on stripe. */ |
| STRIPE_MERGED, /* Writes got merged to be written. */ |
| STRIPE_RBW, /* Read-before-write stripe. */ |
| STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */ |
	STRIPE_RECONSTRUCTED,	/* Missing chunk reconstructed. */
| STRIPE_RECOVER, /* Stripe used for RAID set recovery. */ |
| }; |
| |
| /* Define stripe bit operations. */ |
| BITOPS(Stripe, Error, stripe, STRIPE_ERROR) |
| BITOPS(Stripe, Merged, stripe, STRIPE_MERGED) |
| BITOPS(Stripe, RBW, stripe, STRIPE_RBW) |
| BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT) |
| BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED) |
| BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER) |
| |
| /* A stripe hash. */ |
| struct stripe_hash { |
| struct list_head *hash; |
| unsigned buckets; |
| unsigned mask; |
| unsigned prime; |
| unsigned shift; |
| }; |
| |
| enum sc_lock_types { |
| LOCK_ENDIO, /* Protect endio list. */ |
| LOCK_LRU, /* Protect LRU list. */ |
| NR_LOCKS, /* To size array in struct stripe_cache. */ |
| }; |
| |
| /* A stripe cache. */ |
| struct stripe_cache { |
| /* Stripe hash. */ |
| struct stripe_hash hash; |
| |
| spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */ |
| |
| /* Stripes with io to flush, stripes to endio and LRU lists. */ |
| struct list_head lists[SC_NR_LISTS]; |
| |
| /* Slab cache to allocate stripes from. */ |
| struct { |
| struct kmem_cache *cache; /* Cache itself. */ |
| char name[32]; /* Unique name. */ |
| } kc; |
| |
| struct dm_io_client *dm_io_client; /* dm-io client resource context. */ |
| |
| /* dm-mem-cache client resource context. */ |
| struct dm_mem_cache_client *mem_cache_client; |
| |
| int stripes_parm; /* # stripes parameter from constructor. */ |
| atomic_t stripes; /* actual # of stripes in cache. */ |
| atomic_t stripes_to_set; /* # of stripes to resize cache to. */ |
| atomic_t stripes_last; /* last # of stripes in cache. */ |
| atomic_t active_stripes; /* actual # of active stripes in cache. */ |
| |
| /* REMOVEME: */ |
	atomic_t active_stripes_max; /* Maximum # of active stripes in cache. */
| }; |
| |
/* Flag specs for raid_dev. */
| enum raid_dev_flags { |
| DEV_FAILED, /* Device failed. */ |
| DEV_IO_QUEUED, /* Io got queued to device. */ |
| }; |
| |
| /* The raid device in a set. */ |
| struct raid_dev { |
| struct dm_dev *dev; |
| sector_t start; /* Offset to map to. */ |
| struct { /* Using struct to be able to BITOPS(). */ |
| unsigned long flags; /* raid_dev_flags. */ |
| } io; |
| }; |
| |
| BITOPS(Dev, Failed, raid_dev, DEV_FAILED) |
| BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED) |
| |
| /* Flags spec for raid_set. */ |
| enum raid_set_flags { |
| RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */ |
| RS_DEAD, /* RAID set inoperational. */ |
| RS_DEGRADED, /* Io errors on RAID device. */ |
| RS_DEVEL_STATS, /* REMOVEME: display status information. */ |
| RS_RECOVER, /* Do recovery. */ |
| RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */ |
| RS_SC_BUSY, /* Stripe cache busy -> send an event. */ |
| RS_SUSPEND, /* Suspend RAID set. */ |
| }; |
| |
| /* REMOVEME: devel stats counters. */ |
| enum stats_types { |
| S_BIOS_READ, |
| S_BIOS_ADDED_READ, |
| S_BIOS_ENDIO_READ, |
| S_BIOS_WRITE, |
| S_BIOS_ADDED_WRITE, |
| S_BIOS_ENDIO_WRITE, |
| S_CAN_MERGE, |
| S_CANT_MERGE, |
| S_CONGESTED, |
| S_DM_IO_READ, |
| S_DM_IO_WRITE, |
| S_BANDWIDTH, |
| S_BARRIER, |
| S_BIO_COPY_PL_NEXT, |
| S_DEGRADED, |
| S_DELAYED_BIOS, |
| S_FLUSHS, |
| S_HITS_1ST, |
| S_IOS_POST, |
| S_INSCACHE, |
| S_MAX_LOOKUP, |
| S_CHUNK_LOCKED, |
| S_NO_BANDWIDTH, |
| S_NOT_CONGESTED, |
| S_NO_RW, |
| S_NOSYNC, |
| S_OVERWRITE, |
| S_PROHIBITCHUNKIO, |
| S_RECONSTRUCT_EI, |
| S_RECONSTRUCT_DEV, |
| S_RECONSTRUCT_SET, |
| S_RECONSTRUCTED, |
| S_REQUEUE, |
| S_STRIPE_ERROR, |
| S_SUM_DELAYED_BIOS, |
| S_XORS, |
| S_NR_STATS, /* # of stats counters. Must be last! */ |
| }; |
| |
| /* Status type -> string mappings. */ |
| struct stats_map { |
| const enum stats_types type; |
| const char *str; |
| }; |
| |
| static struct stats_map stats_map[] = { |
| { S_BIOS_READ, "r=" }, |
| { S_BIOS_ADDED_READ, "/" }, |
| { S_BIOS_ENDIO_READ, "/" }, |
| { S_BIOS_WRITE, " w=" }, |
| { S_BIOS_ADDED_WRITE, "/" }, |
| { S_BIOS_ENDIO_WRITE, "/" }, |
| { S_DM_IO_READ, " rc=" }, |
| { S_DM_IO_WRITE, " wc=" }, |
| { S_BANDWIDTH, "\nbw=" }, |
| { S_NO_BANDWIDTH, " no_bw=" }, |
| { S_BARRIER, "\nbarrier=" }, |
| { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" }, |
| { S_CAN_MERGE, "\nmerge=" }, |
| { S_CANT_MERGE, "/no_merge=" }, |
| { S_CHUNK_LOCKED, "\nchunk_locked=" }, |
| { S_CONGESTED, "\ncgst=" }, |
| { S_NOT_CONGESTED, "/not_cgst=" }, |
| { S_DEGRADED, "\ndegraded=" }, |
| { S_DELAYED_BIOS, "\ndel_bios=" }, |
| { S_SUM_DELAYED_BIOS, "/sum_del_bios=" }, |
| { S_FLUSHS, "\nflushs=" }, |
| { S_HITS_1ST, "\nhits_1st=" }, |
| { S_IOS_POST, " ios_post=" }, |
| { S_INSCACHE, " inscache=" }, |
| { S_MAX_LOOKUP, " maxlookup=" }, |
| { S_NO_RW, "\nno_rw=" }, |
| { S_NOSYNC, " nosync=" }, |
| { S_OVERWRITE, " ovr=" }, |
| { S_PROHIBITCHUNKIO, " prhbt_io=" }, |
| { S_RECONSTRUCT_EI, "\nrec_ei=" }, |
| { S_RECONSTRUCT_DEV, " rec_dev=" }, |
| { S_RECONSTRUCT_SET, " rec_set=" }, |
| { S_RECONSTRUCTED, " rec=" }, |
| { S_REQUEUE, " requeue=" }, |
| { S_STRIPE_ERROR, " stripe_err=" }, |
| { S_XORS, " xors=" }, |
| }; |
| |
| /* |
| * A RAID set. |
| */ |
| #define dm_rh_client dm_region_hash |
| enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT }; |
| typedef void (*xor_function_t)(unsigned count, unsigned long **data); |
| struct raid_set { |
| struct dm_target *ti; /* Target pointer. */ |
| |
| struct { |
| unsigned long flags; /* State flags. */ |
| struct mutex in_lock; /* Protects central input list below. */ |
| struct bio_list in; /* Pending ios (central input list). */ |
| struct bio_list work; /* ios work set. */ |
| wait_queue_head_t suspendq; /* suspend synchronization. */ |
| atomic_t in_process; /* counter of queued bios (suspendq). */ |
| atomic_t in_process_max;/* counter of queued bios max. */ |
| |
| /* io work. */ |
| struct workqueue_struct *wq; |
| struct delayed_work dws_do_raid; /* For main worker. */ |
| struct work_struct ws_do_table_event; /* For event worker. */ |
| } io; |
| |
| /* Stripe locking abstraction. */ |
| struct dm_raid45_locking_type *locking; |
| |
| struct stripe_cache sc; /* Stripe cache for this set. */ |
| |
| /* Xor optimization. */ |
| struct { |
| struct xor_func *f; |
| unsigned chunks; |
| unsigned speed; |
| } xor; |
| |
| /* Recovery parameters. */ |
| struct recover { |
| struct dm_dirty_log *dl; /* Dirty log. */ |
| struct dm_rh_client *rh; /* Region hash. */ |
| |
| struct dm_io_client *dm_io_client; /* recovery dm-io client. */ |
| /* dm-mem-cache client resource context for recovery stripes. */ |
| struct dm_mem_cache_client *mem_cache_client; |
| |
| struct list_head stripes; /* List of recovery stripes. */ |
| |
| region_t nr_regions; |
| region_t nr_regions_to_recover; |
| region_t nr_regions_recovered; |
| unsigned long start_jiffies; |
| unsigned long end_jiffies; |
| |
| unsigned bandwidth; /* Recovery bandwidth [%]. */ |
| unsigned bandwidth_work; /* Recovery bandwidth [factor]. */ |
| unsigned bandwidth_parm; /* " constructor parm. */ |
| unsigned io_size; /* recovery io size <= region size. */ |
| unsigned io_size_parm; /* recovery io size ctr parameter. */ |
| unsigned recovery; /* Recovery allowed/prohibited. */ |
| unsigned recovery_stripes; /* # of parallel recovery stripes. */ |
| |
| /* recovery io throttling. */ |
| atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/ |
| unsigned long last_jiffies; |
| } recover; |
| |
| /* RAID set parameters. */ |
| struct { |
| struct raid_type *raid_type; /* RAID type (eg, RAID4). */ |
| unsigned raid_parms; /* # variable raid parameters. */ |
| |
| unsigned chunk_size; /* Sectors per chunk. */ |
| unsigned chunk_size_parm; |
| unsigned chunk_shift; /* rsector chunk size shift. */ |
| |
| unsigned io_size; /* Sectors per io. */ |
| unsigned io_size_parm; |
| unsigned io_mask; /* Mask for bio_copy_page_list(). */ |
| unsigned io_inv_mask; /* Mask for raid_address(). */ |
| |
| sector_t sectors_per_dev; /* Sectors per device. */ |
| |
| atomic_t failed_devs; /* Amount of devices failed. */ |
| |
| /* Index of device to initialize. */ |
| int dev_to_init; |
| int dev_to_init_parm; |
| |
| /* Raid devices dynamically allocated. */ |
| unsigned raid_devs; /* # of RAID devices below. */ |
| unsigned data_devs; /* # of RAID data devices. */ |
| |
| int ei; /* index of failed RAID device. */ |
| |
| /* Index of dedicated parity device (i.e. RAID4). */ |
| int pi; |
| int pi_parm; /* constructor parm for status output. */ |
| } set; |
| |
| /* REMOVEME: devel stats counters. */ |
| atomic_t stats[S_NR_STATS]; |
| |
| /* Dynamically allocated temporary pointers for xor(). */ |
| unsigned long **data; |
| |
| /* Dynamically allocated RAID devices. Alignment? */ |
| struct raid_dev dev[0]; |
| }; |
| |
| /* Define RAID set bit operations. */ |
| BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH) |
| BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE) |
| BITOPS(RS, Dead, raid_set, RS_DEAD) |
| BITOPS(RS, Degraded, raid_set, RS_DEGRADED) |
| BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS) |
| BITOPS(RS, Recover, raid_set, RS_RECOVER) |
| BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY) |
| BITOPS(RS, Suspend, raid_set, RS_SUSPEND) |
| #undef BITOPS |
| |
| /*----------------------------------------------------------------- |
| * Raid-4/5 set structures. |
| *---------------------------------------------------------------*/ |
| /* RAID level definitions. */ |
| enum raid_level { |
| raid4, |
| raid5, |
| }; |
| |
| /* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */ |
| enum raid_algorithm { |
| none, |
| left_asym, |
| right_asym, |
| left_sym, |
| right_sym, |
| }; |
| |
| struct raid_type { |
| const char *name; /* RAID algorithm. */ |
| const char *descr; /* Descriptor text for logging. */ |
| const unsigned parity_devs; /* # of parity devices. */ |
| const unsigned minimal_devs; /* minimal # of devices in set. */ |
| const enum raid_level level; /* RAID level. */ |
| const enum raid_algorithm algorithm; /* RAID algorithm. */ |
| }; |
| |
| /* Supported raid types and properties. */ |
| static struct raid_type raid_types[] = { |
| {"raid4", "RAID4 (dedicated parity disk)", 1, 3, raid4, none}, |
| {"raid5_la", "RAID5 (left asymmetric)", 1, 3, raid5, left_asym}, |
| {"raid5_ra", "RAID5 (right asymmetric)", 1, 3, raid5, right_asym}, |
| {"raid5_ls", "RAID5 (left symmetric)", 1, 3, raid5, left_sym}, |
| {"raid5_rs", "RAID5 (right symmetric)", 1, 3, raid5, right_sym}, |
| }; |
| |
| /* Address as calculated by raid_address(). */ |
| struct raid_address { |
| sector_t key; /* Hash key (address of stripe % chunk_size). */ |
| unsigned di, pi; /* Data and parity disks index. */ |
| }; |
| |
| /* REMOVEME: reset statistics counters. */ |
| static void stats_reset(struct raid_set *rs) |
| { |
| unsigned s = S_NR_STATS; |
| |
| while (s--) |
| atomic_set(rs->stats + s, 0); |
| } |
| |
| /*---------------------------------------------------------------- |
| * RAID set management routines. |
| *--------------------------------------------------------------*/ |
| /* |
| * Begin small helper functions. |
| */ |
| /* No need to be called from region hash indirectly at dm_rh_dec(). */ |
| static void wake_dummy(void *context) {} |
| |
| /* Return # of io reference. */ |
| static int io_ref(struct raid_set *rs) |
| { |
| return atomic_read(&rs->io.in_process); |
| } |
| |
| /* Get an io reference. */ |
| static void io_get(struct raid_set *rs) |
| { |
| int p = atomic_inc_return(&rs->io.in_process); |
| |
| if (p > atomic_read(&rs->io.in_process_max)) |
| atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */ |
| } |
| |
| /* Put the io reference and conditionally wake io waiters. */ |
| static void io_put(struct raid_set *rs) |
| { |
| /* Intel: rebuild data corrupter? */ |
| if (atomic_dec_and_test(&rs->io.in_process)) |
| wake_up(&rs->io.suspendq); |
| else |
| BUG_ON(io_ref(rs) < 0); |
| } |
| |
| /* Wait until all io has been processed. */ |
| static void wait_ios(struct raid_set *rs) |
| { |
| wait_event(rs->io.suspendq, !io_ref(rs)); |
| } |
| |
| /* Queue (optionally delayed) io work. */ |
| static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay) |
| { |
| queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay); |
| } |
| |
| /* Queue io work immediately (called from region hash too). */ |
| static void wake_do_raid(void *context) |
| { |
| struct raid_set *rs = context; |
| |
| queue_work(rs->io.wq, &rs->io.dws_do_raid.work); |
| } |
| |
| /* Calculate device sector offset. */ |
| static sector_t _sector(struct raid_set *rs, struct bio *bio) |
| { |
| sector_t sector = bio->bi_sector; |
| |
| sector_div(sector, rs->set.data_devs); |
| return sector; |
| } |
| |
| /* Return # of active stripes in stripe cache. */ |
| static int sc_active(struct stripe_cache *sc) |
| { |
| return atomic_read(&sc->active_stripes); |
| } |
| |
| /* Stripe cache busy indicator. */ |
| static int sc_busy(struct raid_set *rs) |
| { |
| return sc_active(&rs->sc) > |
| atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2); |
| } |
| |
| /* Set chunks states. */ |
| enum chunk_dirty_type { CLEAN, DIRTY, ERROR }; |
| static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type) |
| { |
| switch (type) { |
| case CLEAN: |
| ClearChunkDirty(chunk); |
| break; |
| case DIRTY: |
| SetChunkDirty(chunk); |
| break; |
| case ERROR: |
| SetChunkError(chunk); |
| SetStripeError(chunk->stripe); |
| return; |
| default: |
| BUG(); |
| } |
| |
| SetChunkUptodate(chunk); |
| SetChunkIo(chunk); |
| ClearChunkError(chunk); |
| } |
| |
| /* Return region state for a sector. */ |
| static int region_state(struct raid_set *rs, sector_t sector, |
| enum dm_rh_region_states state) |
| { |
| struct dm_rh_client *rh = rs->recover.rh; |
| region_t region = dm_rh_sector_to_region(rh, sector); |
| |
| return !!(dm_rh_get_state(rh, region, 1) & state); |
| } |
| |
| /* |
| * Return true in case a chunk should be read/written |
| * |
| * Conditions to read/write: |
| * o chunk not uptodate |
| * o chunk dirty |
| * |
 * Conditions to avoid io:
 * o io already ongoing on chunk
 * o io explicitly prohibited
| */ |
| static int chunk_io(struct stripe_chunk *chunk) |
| { |
| /* 2nd run optimization (flag set below on first run). */ |
| if (TestClearChunkMustIo(chunk)) |
| return 1; |
| |
| /* Avoid io if prohibited or a locked chunk. */ |
| if (!ChunkIo(chunk) || ChunkLocked(chunk)) |
| return 0; |
| |
| if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) { |
| SetChunkMustIo(chunk); /* 2nd run optimization. */ |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* Call a function on each chunk needing io unless device failed. */ |
| static unsigned for_each_io_dev(struct stripe *stripe, |
| void (*f_io)(struct stripe *stripe, unsigned p)) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned p, r = 0; |
| |
| for (p = 0; p < rs->set.raid_devs; p++) { |
| if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) { |
| f_io(stripe, p); |
| r++; |
| } |
| } |
| |
| return r; |
| } |
| |
| /* |
| * Index of device to calculate parity on. |
| * |
| * Either the parity device index *or* the selected |
| * device to init after a spare replacement. |
| */ |
| static int dev_for_parity(struct stripe *stripe, int *sync) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING); |
| |
| *sync = !r; |
| |
	/* Reconstruct a particular device? */
| if (r && rs->set.dev_to_init > -1) |
| return rs->set.dev_to_init; |
| else if (rs->set.raid_type->level == raid4) |
| return rs->set.pi; |
| else if (!StripeRecover(stripe)) |
| return stripe->idx.parity; |
| else |
| return -1; |
| } |
| |
| /* RAID set congested function. */ |
| static int rs_congested(void *congested_data, int bdi_bits) |
| { |
| int r; |
| unsigned p; |
| struct raid_set *rs = congested_data; |
| |
| if (sc_busy(rs) || RSSuspend(rs)) |
| r = 1; |
| else for (r = 0, p = rs->set.raid_devs; !r && p--; ) { |
| /* If any of our component devices are overloaded. */ |
| struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev); |
| |
| r |= bdi_congested(&q->backing_dev_info, bdi_bits); |
| } |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED)); |
| return r; |
| } |
| |
| /* RAID device degrade check. */ |
| static void rs_check_degrade_dev(struct raid_set *rs, |
| struct stripe *stripe, unsigned p) |
| { |
| if (TestSetDevFailed(rs->dev + p)) |
| return; |
| |
	/* Throw an event in case of member device errors. */
| if (atomic_inc_return(&rs->set.failed_devs) > |
| rs->set.raid_type->parity_devs && |
| !TestSetRSDead(rs)) { |
| /* Display RAID set dead message once. */ |
| unsigned p; |
| char buf[BDEVNAME_SIZE]; |
| |
| DMERR("FATAL: too many devices failed -> RAID set broken"); |
| for (p = 0; p < rs->set.raid_devs; p++) { |
| if (DevFailed(rs->dev + p)) |
| DMERR("device /dev/%s failed", |
| bdevname(rs->dev[p].dev->bdev, buf)); |
| } |
| } |
| |
| /* Only log the first member error. */ |
| if (!TestSetRSDegraded(rs)) { |
| char buf[BDEVNAME_SIZE]; |
| |
| /* Store index for recovery. */ |
| rs->set.ei = p; |
| DMERR("CRITICAL: %sio error on device /dev/%s " |
| "in region=%llu; DEGRADING RAID set\n", |
| stripe ? "" : "FAKED ", |
| bdevname(rs->dev[p].dev->bdev, buf), |
| (unsigned long long) (stripe ? stripe->key : 0)); |
| DMERR("further device error messages suppressed"); |
| } |
| |
| schedule_work(&rs->io.ws_do_table_event); |
| } |
| |
| /* RAID set degrade check. */ |
| static void rs_check_degrade(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned p = rs->set.raid_devs; |
| |
| while (p--) { |
| if (ChunkError(CHUNK(stripe, p))) |
| rs_check_degrade_dev(rs, stripe, p); |
| } |
| } |
| |
| /* Lookup a RAID device by name or by major:minor number. */ |
| static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup) |
| { |
| unsigned p; |
| struct raid_dev *dev; |
| |
| /* |
| * Must be an incremental loop, because the device array |
	 * can still have empty slots on calls from raid_ctr().
| */ |
| for (dev = rs->dev, p = 0; |
| dev->dev && p < rs->set.raid_devs; |
| dev++, p++) { |
| if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev) |
| return p; |
| } |
| |
| return -ENODEV; |
| } |
| /* |
| * End small helper functions. |
| */ |
| |
| /* |
| * Stripe hash functions |
| */ |
| /* Initialize/destroy stripe hash. */ |
| static int hash_init(struct stripe_hash *hash, unsigned stripes) |
| { |
| unsigned buckets = 2, max_buckets = stripes >> 1; |
| static unsigned hash_primes[] = { |
| /* Table of primes for hash_fn/table size optimization. */ |
| 1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769, |
| 1543, 3079, 6151, 12289, 24593, 49157, 98317, |
| }; |
| |
	/* Calculate number of buckets (2^n <= stripes / 2). */
| while (buckets < max_buckets) |
| buckets <<= 1; |
| |
| /* Allocate stripe hash buckets. */ |
| hash->hash = vmalloc(buckets * sizeof(*hash->hash)); |
| if (!hash->hash) |
| return -ENOMEM; |
| |
| hash->buckets = buckets; |
| hash->mask = buckets - 1; |
| hash->shift = ffs(buckets); |
	if (hash->shift >= ARRAY_SIZE(hash_primes))
| hash->shift = ARRAY_SIZE(hash_primes) - 1; |
| |
| BUG_ON(hash->shift < 2); |
| hash->prime = hash_primes[hash->shift]; |
| |
| /* Initialize buckets. */ |
| while (buckets--) |
| INIT_LIST_HEAD(hash->hash + buckets); |
| return 0; |
| } |
| |
| static void hash_exit(struct stripe_hash *hash) |
| { |
| if (hash->hash) { |
| vfree(hash->hash); |
| hash->hash = NULL; |
| } |
| } |
| |
| static unsigned hash_fn(struct stripe_hash *hash, sector_t key) |
| { |
| return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask); |
| } |
| |
| static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key) |
| { |
| return hash->hash + hash_fn(hash, key); |
| } |
| |
| /* Insert an entry into a hash. */ |
| static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe) |
| { |
| list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key)); |
| } |
| |
| /* Lookup an entry in the stripe hash. */ |
| static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key) |
| { |
| unsigned look = 0; |
| struct stripe *stripe; |
| struct list_head *bucket = hash_bucket(&sc->hash, key); |
| |
| list_for_each_entry(stripe, bucket, lists[LIST_HASH]) { |
| look++; |
| |
| if (stripe->key == key) { |
			/* REMOVEME: statistics. */
| if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP)) |
| atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look); |
| return stripe; |
| } |
| } |
| |
| return NULL; |
| } |
| |
| /* Resize the stripe cache hash on size changes. */ |
| static int sc_hash_resize(struct stripe_cache *sc) |
| { |
| /* Resize indicated ? */ |
| if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) { |
| int r; |
| struct stripe_hash hash; |
| |
| r = hash_init(&hash, atomic_read(&sc->stripes)); |
| if (r) |
| return r; |
| |
| if (sc->hash.hash) { |
| unsigned b = sc->hash.buckets; |
| struct list_head *pos, *tmp; |
| |
| /* Walk old buckets and insert into new. */ |
| while (b--) { |
| list_for_each_safe(pos, tmp, sc->hash.hash + b) |
| stripe_insert(&hash, |
| list_entry(pos, struct stripe, |
| lists[LIST_HASH])); |
| } |
| |
| } |
| |
| hash_exit(&sc->hash); |
| memcpy(&sc->hash, &hash, sizeof(sc->hash)); |
| atomic_set(&sc->stripes_last, atomic_read(&sc->stripes)); |
| } |
| |
| return 0; |
| } |
/* End stripe hash functions. */
| |
| /* List add, delete, push and pop functions. */ |
/* Delete an entry from a list if it actually is on one. */
#define DEL_LIST(lh) \
	do { \
		if (!list_empty(lh)) \
			list_del_init(lh); \
	} while (0)
| |
| /* Delete stripe from hash. */ |
| static void stripe_hash_del(struct stripe *stripe) |
| { |
| DEL_LIST(stripe->lists + LIST_HASH); |
| } |
| |
| /* Return stripe reference count. */ |
| static inline int stripe_ref(struct stripe *stripe) |
| { |
| return atomic_read(&stripe->cnt); |
| } |
| |
/* Add stripe to flush list. */
static void stripe_flush_add(struct stripe *stripe)
| { |
| struct stripe_cache *sc = stripe->sc; |
| struct list_head *lh = stripe->lists + LIST_FLUSH; |
| |
| if (!StripeReconstruct(stripe) && list_empty(lh)) |
| list_add_tail(lh, sc->lists + LIST_FLUSH); |
| } |
| |
| /* |
| * Add stripe to LRU (inactive) list. |
| * |
| * Need lock, because of concurrent access from message interface. |
| */ |
| static void stripe_lru_add(struct stripe *stripe) |
| { |
| if (!StripeRecover(stripe)) { |
| unsigned long flags; |
| struct list_head *lh = stripe->lists + LIST_LRU; |
| spinlock_t *lock = stripe->sc->locks + LOCK_LRU; |
| |
| spin_lock_irqsave(lock, flags); |
| if (list_empty(lh)) |
| list_add_tail(lh, stripe->sc->lists + LIST_LRU); |
| spin_unlock_irqrestore(lock, flags); |
| } |
| } |
| |
| #define POP_LIST(list) \ |
| do { \ |
| if (list_empty(sc->lists + (list))) \ |
| stripe = NULL; \ |
| else { \ |
| stripe = list_first_entry(sc->lists + (list), \ |
| struct stripe, \ |
| lists[(list)]); \ |
| list_del_init(stripe->lists + (list)); \ |
| } \ |
} while (0)
| |
| /* Pop an available stripe off the LRU list. */ |
| static struct stripe *stripe_lru_pop(struct stripe_cache *sc) |
| { |
| struct stripe *stripe; |
| spinlock_t *lock = sc->locks + LOCK_LRU; |
| |
| spin_lock_irq(lock); |
| POP_LIST(LIST_LRU); |
| spin_unlock_irq(lock); |
| |
| return stripe; |
| } |
| |
| /* Pop an available stripe off the io list. */ |
| static struct stripe *stripe_io_pop(struct stripe_cache *sc) |
| { |
| struct stripe *stripe; |
| |
| POP_LIST(LIST_FLUSH); |
| return stripe; |
| } |
| |
| /* Push a stripe safely onto the endio list to be handled by do_endios(). */ |
| static void stripe_endio_push(struct stripe *stripe) |
| { |
| unsigned long flags; |
| struct stripe_cache *sc = stripe->sc; |
| struct list_head *stripe_list = stripe->lists + LIST_ENDIO, |
| *sc_list = sc->lists + LIST_ENDIO; |
| spinlock_t *lock = sc->locks + LOCK_ENDIO; |
| |
| /* This runs in parallel with do_endios(). */ |
| spin_lock_irqsave(lock, flags); |
| if (list_empty(stripe_list)) |
| list_add_tail(stripe_list, sc_list); |
| spin_unlock_irqrestore(lock, flags); |
| |
| wake_do_raid(RS(sc)); /* Wake myself. */ |
| } |
| |
/* Safely pop a stripe off the endio list. */
| static struct stripe *stripe_endio_pop(struct stripe_cache *sc) |
| { |
| struct stripe *stripe; |
| spinlock_t *lock = sc->locks + LOCK_ENDIO; |
| |
| /* This runs in parallel with endio(). */ |
| spin_lock_irq(lock); |
	POP_LIST(LIST_ENDIO);
| spin_unlock_irq(lock); |
| return stripe; |
| } |
| #undef POP_LIST |
| |
| /* |
| * Stripe cache locking functions |
| */ |
| /* Dummy lock function for single host RAID4+5. */ |
| static void *no_lock(sector_t key, enum dm_lock_type type) |
| { |
| return &no_lock; |
| } |
| |
| /* Dummy unlock function for single host RAID4+5. */ |
| static void no_unlock(void *lock_handle) |
| { |
| } |
| |
| /* No locking (for single host RAID 4+5). */ |
| static struct dm_raid45_locking_type locking_none = { |
| .lock = no_lock, |
| .unlock = no_unlock, |
| }; |
| |
| /* Lock a stripe (for clustering). */ |
| static int |
| stripe_lock(struct stripe *stripe, int rw, sector_t key) |
| { |
	stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ?
						     DM_RAID45_SHARED :
						     DM_RAID45_EX);
| return stripe->lock ? 0 : -EPERM; |
| } |
| |
| /* Unlock a stripe (for clustering). */ |
| static void stripe_unlock(struct stripe *stripe) |
| { |
| RS(stripe->sc)->locking->unlock(stripe->lock); |
| stripe->lock = NULL; |
| } |
| |
| /* Test io pending on stripe. */ |
| static int stripe_io_ref(struct stripe *stripe) |
| { |
| return atomic_read(&stripe->io.pending); |
| } |
| |
| static void stripe_io_get(struct stripe *stripe) |
| { |
| if (atomic_inc_return(&stripe->io.pending) == 1) |
| /* REMOVEME: statistics */ |
| atomic_inc(&stripe->sc->active_stripes); |
| else |
| BUG_ON(stripe_io_ref(stripe) < 0); |
| } |
| |
| static void stripe_io_put(struct stripe *stripe) |
| { |
| if (atomic_dec_and_test(&stripe->io.pending)) { |
| if (unlikely(StripeRecover(stripe))) |
| /* Don't put recovery stripe on endio list. */ |
| wake_do_raid(RS(stripe->sc)); |
| else |
| /* Add regular stripe to endio list and wake daemon. */ |
| stripe_endio_push(stripe); |
| |
| /* REMOVEME: statistics */ |
| atomic_dec(&stripe->sc->active_stripes); |
| } else |
| BUG_ON(stripe_io_ref(stripe) < 0); |
| } |
| |
| /* Take stripe reference out. */ |
| static int stripe_get(struct stripe *stripe) |
| { |
| int r; |
| struct list_head *lh = stripe->lists + LIST_LRU; |
| spinlock_t *lock = stripe->sc->locks + LOCK_LRU; |
| |
| /* Delete stripe from LRU (inactive) list if on. */ |
| spin_lock_irq(lock); |
| DEL_LIST(lh); |
| spin_unlock_irq(lock); |
| |
| BUG_ON(stripe_ref(stripe) < 0); |
| |
| /* Lock stripe on first reference */ |
| r = (atomic_inc_return(&stripe->cnt) == 1) ? |
| stripe_lock(stripe, WRITE, stripe->key) : 0; |
| |
| return r; |
| } |
| #undef DEL_LIST |
| |
| /* Return references on a chunk. */ |
| static int chunk_ref(struct stripe_chunk *chunk) |
| { |
| return atomic_read(&chunk->cnt); |
| } |
| |
| /* Take out reference on a chunk. */ |
| static int chunk_get(struct stripe_chunk *chunk) |
| { |
| return atomic_inc_return(&chunk->cnt); |
| } |
| |
| /* Drop reference on a chunk. */ |
| static void chunk_put(struct stripe_chunk *chunk) |
| { |
| BUG_ON(atomic_dec_return(&chunk->cnt) < 0); |
| } |
| |
| /* |
| * Drop reference on a stripe. |
| * |
| * Move it to list of LRU stripes if zero. |
| */ |
| static void stripe_put(struct stripe *stripe) |
| { |
| if (atomic_dec_and_test(&stripe->cnt)) { |
| BUG_ON(stripe_io_ref(stripe)); |
| stripe_unlock(stripe); |
| } else |
| BUG_ON(stripe_ref(stripe) < 0); |
| } |
| |
| /* Helper needed by for_each_io_dev(). */ |
| static void stripe_get_references(struct stripe *stripe, unsigned p) |
| { |
| |
| /* |
| * Another one to reference the stripe in |
| * order to protect vs. LRU list moves. |
| */ |
| io_get(RS(stripe->sc)); /* Global io references. */ |
| stripe_get(stripe); |
| stripe_io_get(stripe); /* One for each chunk io. */ |
| } |
| |
/* Helper for endio() to put all taken references. */
| static void stripe_put_references(struct stripe *stripe) |
| { |
| stripe_io_put(stripe); /* One for each chunk io. */ |
| stripe_put(stripe); |
| io_put(RS(stripe->sc)); |
| } |
| |
| /* |
| * Stripe cache functions. |
| */ |
| /* |
| * Invalidate all chunks (i.e. their pages) of a stripe. |
| * |
| * I only keep state for the whole chunk. |
| */ |
| static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk) |
| { |
| chunk->io.flags = 0; |
| } |
| |
| static void |
| stripe_chunks_invalidate(struct stripe *stripe) |
| { |
| unsigned p = RS(stripe->sc)->set.raid_devs; |
| |
| while (p--) |
| stripe_chunk_invalidate(CHUNK(stripe, p)); |
| } |
| |
| /* Prepare stripe for (re)use. */ |
| static void stripe_invalidate(struct stripe *stripe) |
| { |
| stripe->io.flags = 0; |
| stripe->idx.parity = stripe->idx.recover = -1; |
| stripe_chunks_invalidate(stripe); |
| } |
| |
| /* |
| * Allow io on all chunks of a stripe. |
| * If not set, IO will not occur; i.e. it's prohibited. |
| * |
| * Actual IO submission for allowed chunks depends |
| * on their !uptodate or dirty state. |
| */ |
| static void stripe_allow_io(struct stripe *stripe) |
| { |
| unsigned p = RS(stripe->sc)->set.raid_devs; |
| |
| while (p--) |
| SetChunkIo(CHUNK(stripe, p)); |
| } |
| |
| /* Initialize a stripe. */ |
| static void stripe_init(struct stripe_cache *sc, struct stripe *stripe) |
| { |
| unsigned i, p = RS(sc)->set.raid_devs; |
| |
| /* Work all io chunks. */ |
| while (p--) { |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| |
| atomic_set(&chunk->cnt, 0); |
| chunk->stripe = stripe; |
| i = ARRAY_SIZE(chunk->bl); |
| while (i--) |
| bio_list_init(chunk->bl + i); |
| } |
| |
| stripe->sc = sc; |
| |
| |
| i = ARRAY_SIZE(stripe->lists); |
| while (i--) |
| INIT_LIST_HEAD(stripe->lists + i); |
| |
| stripe->io.size = RS(sc)->set.io_size; |
| atomic_set(&stripe->cnt, 0); |
| atomic_set(&stripe->io.pending, 0); |
| stripe_invalidate(stripe); |
| } |
| |
| /* Number of pages per chunk. */ |
| static inline unsigned chunk_pages(unsigned sectors) |
| { |
| return dm_div_up(sectors, SECTORS_PER_PAGE); |
| } |
| |
| /* Number of pages per stripe. */ |
| static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size) |
| { |
| return chunk_pages(io_size) * rs->set.raid_devs; |
| } |
| |
| /* Initialize part of page_list (recovery). */ |
| static void stripe_zero_pl_part(struct stripe *stripe, int p, |
| unsigned start, unsigned count) |
| { |
| unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count); |
| /* Get offset into the page_list. */ |
| struct page_list *pl = pl_elem(PL(stripe, p), o); |
| |
| BUG_ON(!pl); |
| while (pl && pages--) { |
| BUG_ON(!pl->page); |
| memset(page_address(pl->page), 0, PAGE_SIZE); |
| pl = pl->next; |
| } |
| } |
| |
| /* Initialize parity chunk of stripe. */ |
| static void stripe_zero_chunk(struct stripe *stripe, int p) |
| { |
| if (p > -1) |
| stripe_zero_pl_part(stripe, p, 0, stripe->io.size); |
| } |
| |
| /* Return dynamic stripe structure size. */ |
| static size_t stripe_size(struct raid_set *rs) |
| { |
| return sizeof(struct stripe) + |
| rs->set.raid_devs * sizeof(struct stripe_chunk); |
| } |
| |
| /* Allocate a stripe and its memory object. */ |
| /* XXX adjust to cope with stripe cache and recovery stripe caches. */ |
| enum grow { SC_GROW, SC_KEEP }; |
| static struct stripe *stripe_alloc(struct stripe_cache *sc, |
| struct dm_mem_cache_client *mc, |
| enum grow grow) |
| { |
| int r; |
| struct stripe *stripe; |
| |
| stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL); |
| if (stripe) { |
| /* Grow the dm-mem-cache by one object. */ |
| if (grow == SC_GROW) { |
| r = dm_mem_cache_grow(mc, 1); |
| if (r) |
| goto err_free; |
| } |
| |
| stripe->obj = dm_mem_cache_alloc(mc); |
| if (!stripe->obj) |
| goto err_shrink; |
| |
| stripe_init(sc, stripe); |
| } |
| |
| return stripe; |
| |
| err_shrink: |
| if (grow == SC_GROW) |
| dm_mem_cache_shrink(mc, 1); |
| err_free: |
| kmem_cache_free(sc->kc.cache, stripe); |
| return NULL; |
| } |
| |
| /* |
 * Free a stripe's memory object, shrink the
| * memory cache and free the stripe itself. |
| */ |
| static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc) |
| { |
| dm_mem_cache_free(mc, stripe->obj); |
| dm_mem_cache_shrink(mc, 1); |
| kmem_cache_free(stripe->sc->kc.cache, stripe); |
| } |
| |
/* Free the recovery stripes. */
| static void stripe_recover_free(struct raid_set *rs) |
| { |
| struct recover *rec = &rs->recover; |
| struct dm_mem_cache_client *mc; |
| |
| mc = rec->mem_cache_client; |
| rec->mem_cache_client = NULL; |
| if (mc) { |
| struct stripe *stripe; |
| |
| while (!list_empty(&rec->stripes)) { |
| stripe = list_first_entry(&rec->stripes, struct stripe, |
| lists[LIST_RECOVER]); |
| list_del(stripe->lists + LIST_RECOVER); |
| kfree(stripe->recover); |
| stripe_free(stripe, mc); |
| } |
| |
| dm_mem_cache_client_destroy(mc); |
| dm_io_client_destroy(rec->dm_io_client); |
| rec->dm_io_client = NULL; |
| } |
| } |
| |
| /* Grow stripe cache. */ |
| static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow) |
| { |
| int r = 0; |
| |
| /* Try to allocate this many (additional) stripes. */ |
| while (stripes--) { |
| struct stripe *stripe = |
| stripe_alloc(sc, sc->mem_cache_client, grow); |
| |
| if (likely(stripe)) { |
| stripe_lru_add(stripe); |
| atomic_inc(&sc->stripes); |
| } else { |
| r = -ENOMEM; |
| break; |
| } |
| } |
| |
| return r ? r : sc_hash_resize(sc); |
| } |
| |
| /* Shrink stripe cache. */ |
| static int sc_shrink(struct stripe_cache *sc, unsigned stripes) |
| { |
| int r = 0; |
| |
| /* Try to get unused stripe from LRU list. */ |
| while (stripes--) { |
| struct stripe *stripe; |
| |
| stripe = stripe_lru_pop(sc); |
| if (stripe) { |
| /* An LRU stripe may never have ios pending! */ |
| BUG_ON(stripe_io_ref(stripe)); |
| BUG_ON(stripe_ref(stripe)); |
| atomic_dec(&sc->stripes); |
| /* Remove from hash if on before deletion. */ |
| stripe_hash_del(stripe); |
| stripe_free(stripe, sc->mem_cache_client); |
| } else { |
| r = -ENOENT; |
| break; |
| } |
| } |
| |
| /* Check if stats are still sane. */ |
| if (atomic_read(&sc->active_stripes_max) > |
| atomic_read(&sc->stripes)) |
| atomic_set(&sc->active_stripes_max, 0); |
| |
| if (r) |
| return r; |
| |
| return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0; |
| } |
| |
| /* Create stripe cache and recovery. */ |
| static int sc_init(struct raid_set *rs, unsigned stripes) |
| { |
| unsigned i, r, rstripes; |
| struct stripe_cache *sc = &rs->sc; |
| struct stripe *stripe; |
| struct recover *rec = &rs->recover; |
| struct mapped_device *md; |
| struct gendisk *disk; |
| |
| /* Initialize lists and locks. */ |
| i = ARRAY_SIZE(sc->lists); |
| while (i--) |
| INIT_LIST_HEAD(sc->lists + i); |
| |
| INIT_LIST_HEAD(&rec->stripes); |
| |
| /* Initialize endio and LRU list locks. */ |
| i = NR_LOCKS; |
| while (i--) |
| spin_lock_init(sc->locks + i); |
| |
| /* Initialize atomic variables. */ |
| atomic_set(&sc->stripes, 0); |
| atomic_set(&sc->stripes_to_set, 0); |
| atomic_set(&sc->active_stripes, 0); |
| atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */ |
| |
| /* |
| * We need a runtime unique # to suffix the kmem cache name |
| * because we'll have one for each active RAID set. |
| */ |
| md = dm_table_get_md(rs->ti->table); |
| disk = dm_disk(md); |
| sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor); |
| dm_put(md); |
| sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs), |
| 0, 0, NULL); |
| if (!sc->kc.cache) |
| return -ENOMEM; |
| |
| /* Create memory cache client context for RAID stripe cache. */ |
| sc->mem_cache_client = |
| dm_mem_cache_client_create(stripes, rs->set.raid_devs, |
| chunk_pages(rs->set.io_size)); |
| if (IS_ERR(sc->mem_cache_client)) |
| return PTR_ERR(sc->mem_cache_client); |
| |
| /* Create memory cache client context for RAID recovery stripe(s). */ |
| rstripes = rec->recovery_stripes; |
| rec->mem_cache_client = |
| dm_mem_cache_client_create(rstripes, rs->set.raid_devs, |
| chunk_pages(rec->io_size)); |
| if (IS_ERR(rec->mem_cache_client)) |
| return PTR_ERR(rec->mem_cache_client); |
| |
| /* Create dm-io client context for IO stripes. */ |
| sc->dm_io_client = |
| dm_io_client_create((stripes > 32 ? 32 : stripes) * |
| rs->set.raid_devs * |
| chunk_pages(rs->set.io_size)); |
| if (IS_ERR(sc->dm_io_client)) |
| return PTR_ERR(sc->dm_io_client); |
| |
	/* FIXME: intermingled with stripe cache initialization. */
| /* Create dm-io client context for recovery stripes. */ |
| rec->dm_io_client = |
| dm_io_client_create(rstripes * rs->set.raid_devs * |
| chunk_pages(rec->io_size)); |
| if (IS_ERR(rec->dm_io_client)) |
| return PTR_ERR(rec->dm_io_client); |
| |
| /* Allocate stripes for set recovery. */ |
| while (rstripes--) { |
| stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP); |
| if (!stripe) |
| return -ENOMEM; |
| |
| stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL); |
| if (!stripe->recover) { |
| stripe_free(stripe, rec->mem_cache_client); |
| return -ENOMEM; |
| } |
| |
| SetStripeRecover(stripe); |
| stripe->io.size = rec->io_size; |
| list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes); |
| /* Don't add recovery stripes to LRU list! */ |
| } |
| |
| /* |
	 * Allocate the stripe objects from the
| * cache and add them to the LRU list. |
| */ |
| r = sc_grow(sc, stripes, SC_KEEP); |
| if (!r) |
| atomic_set(&sc->stripes_last, stripes); |
| |
| return r; |
| } |
| |
| /* Destroy the stripe cache. */ |
| static void sc_exit(struct stripe_cache *sc) |
| { |
| struct raid_set *rs = RS(sc); |
| |
| if (sc->kc.cache) { |
| stripe_recover_free(rs); |
| BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes))); |
| kmem_cache_destroy(sc->kc.cache); |
| sc->kc.cache = NULL; |
| |
| if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client)) |
| dm_mem_cache_client_destroy(sc->mem_cache_client); |
| |
| if (sc->dm_io_client && !IS_ERR(sc->dm_io_client)) |
| dm_io_client_destroy(sc->dm_io_client); |
| |
| hash_exit(&sc->hash); |
| } |
| } |
| |
| /* |
| * Calculate RAID address |
| * |
 * Delivers a tuple with the index of the data disk holding the chunk
 * in the set, the parity disk's index and the start of the stripe
| * within the address space of the set (used as the stripe cache hash key). |
| */ |
| /* thx MD. */ |
| static struct raid_address *raid_address(struct raid_set *rs, sector_t sector, |
| struct raid_address *addr) |
| { |
| sector_t stripe, tmp; |
| |
| /* |
| * chunk_number = sector / chunk_size |
| * stripe_number = chunk_number / data_devs |
| * di = stripe % data_devs; |
| */ |
| stripe = sector >> rs->set.chunk_shift; |
| addr->di = sector_div(stripe, rs->set.data_devs); |
| |
| switch (rs->set.raid_type->level) { |
| case raid4: |
| addr->pi = rs->set.pi; |
| goto check_shift_di; |
| case raid5: |
| tmp = stripe; |
| addr->pi = sector_div(tmp, rs->set.raid_devs); |
| |
| switch (rs->set.raid_type->algorithm) { |
| case left_asym: /* Left asymmetric. */ |
| addr->pi = rs->set.data_devs - addr->pi; |
| case right_asym: /* Right asymmetric. */ |
| check_shift_di: |
| if (addr->di >= addr->pi) |
| addr->di++; |
| break; |
| case left_sym: /* Left symmetric. */ |
| addr->pi = rs->set.data_devs - addr->pi; |
| case right_sym: /* Right symmetric. */ |
| addr->di = (addr->pi + addr->di + 1) % |
| rs->set.raid_devs; |
| break; |
| case none: /* Ain't happen: RAID4 algorithm placeholder. */ |
| BUG(); |
| } |
| } |
| |
| /* |
	 * Start offset of the stripe's chunk on any single device of the RAID
| * set, adjusted in case io size differs from chunk size. |
| */ |
| addr->key = (stripe << rs->set.chunk_shift) + |
| (sector & rs->set.io_inv_mask); |
| return addr; |
| } |
| |
| /* |
| * Copy data across between stripe pages and bio vectors. |
| * |
| * Pay attention to data alignment in stripe and bio pages. |
| */ |
| static void bio_copy_page_list(int rw, struct stripe *stripe, |
| struct page_list *pl, struct bio *bio) |
| { |
| unsigned i, page_offset; |
| void *page_addr; |
| struct raid_set *rs = RS(stripe->sc); |
| struct bio_vec *bv; |
| |
| /* Get start page in page list for this sector. */ |
| i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE; |
| pl = pl_elem(pl, i); |
| BUG_ON(!pl); |
| BUG_ON(!pl->page); |
| |
| page_addr = page_address(pl->page); |
| page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1)); |
| |
| /* Walk all segments and copy data across between bio_vecs and pages. */ |
| bio_for_each_segment(bv, bio, i) { |
| int len = bv->bv_len, size; |
| unsigned bio_offset = 0; |
| void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0); |
| redo: |
| size = (page_offset + len > PAGE_SIZE) ? |
| PAGE_SIZE - page_offset : len; |
| |
| if (rw == READ) |
| memcpy(bio_addr + bio_offset, |
| page_addr + page_offset, size); |
| else |
| memcpy(page_addr + page_offset, |
| bio_addr + bio_offset, size); |
| |
| page_offset += size; |
| if (page_offset == PAGE_SIZE) { |
| /* |
| * We reached the end of the chunk page -> |
| * need to refer to the next one to copy more data. |
| */ |
| len -= size; |
| if (len) { |
| /* Get next page. */ |
| pl = pl->next; |
| BUG_ON(!pl); |
| BUG_ON(!pl->page); |
| page_addr = page_address(pl->page); |
| page_offset = 0; |
| bio_offset += size; |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT); |
| goto redo; |
| } |
| } |
| |
| __bio_kunmap_atomic(bio_addr, KM_USER0); |
| } |
| } |
| |
| /* |
| * Xor optimization macros. |
| */ |
| /* Xor data pointer declaration and initialization macros. */ |
| #define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1] |
| #define DECLARE_3 DECLARE_2, *d2 = data[2] |
| #define DECLARE_4 DECLARE_3, *d3 = data[3] |
| #define DECLARE_5 DECLARE_4, *d4 = data[4] |
| #define DECLARE_6 DECLARE_5, *d5 = data[5] |
| #define DECLARE_7 DECLARE_6, *d6 = data[6] |
| #define DECLARE_8 DECLARE_7, *d7 = data[7] |
| |
/* Xor unroll macros. */
| #define D2(n) d0[n] = d0[n] ^ d1[n] |
| #define D3(n) D2(n) ^ d2[n] |
| #define D4(n) D3(n) ^ d3[n] |
| #define D5(n) D4(n) ^ d4[n] |
| #define D6(n) D5(n) ^ d5[n] |
| #define D7(n) D6(n) ^ d6[n] |
| #define D8(n) D7(n) ^ d7[n] |
| |
| #define X_2(macro, offset) macro(offset); macro(offset + 1); |
| #define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2); |
| #define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4); |
| #define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8); |
| #define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16); |
| #define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32); |
| |
| /* Define a _xor_#chunks_#xors_per_run() function. */ |
| #define _XOR(chunks, xors_per_run) \ |
| static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \ |
| { \ |
| unsigned end = XOR_SIZE / sizeof(data[0]), i; \ |
| DECLARE_ ## chunks; \ |
| \ |
| for (i = 0; i < end; i += xors_per_run) { \ |
| X_ ## xors_per_run(D ## chunks, i); \ |
| } \ |
| } |
| |
| /* Define xor functions for 2 - 8 chunks and xors per run. */ |
| #define MAKE_XOR_PER_RUN(xors_per_run) \ |
| _XOR(2, xors_per_run); _XOR(3, xors_per_run); \ |
| _XOR(4, xors_per_run); _XOR(5, xors_per_run); \ |
| _XOR(6, xors_per_run); _XOR(7, xors_per_run); \ |
| _XOR(8, xors_per_run); |
| |
| MAKE_XOR_PER_RUN(8) /* Define _xor_*_8() functions. */ |
| MAKE_XOR_PER_RUN(16) /* Define _xor_*_16() functions. */ |
| MAKE_XOR_PER_RUN(32) /* Define _xor_*_32() functions. */ |
| MAKE_XOR_PER_RUN(64) /* Define _xor_*_64() functions. */ |
| |
| #define MAKE_XOR(xors_per_run) \ |
| struct { \ |
| void (*f)(unsigned long **); \ |
| } static xor_funcs ## xors_per_run[] = { \ |
| { NULL }, /* NULL pointers to optimize indexing in xor(). */ \ |
| { NULL }, \ |
| { _xor2_ ## xors_per_run }, \ |
| { _xor3_ ## xors_per_run }, \ |
| { _xor4_ ## xors_per_run }, \ |
| { _xor5_ ## xors_per_run }, \ |
| { _xor6_ ## xors_per_run }, \ |
| { _xor7_ ## xors_per_run }, \ |
| { _xor8_ ## xors_per_run }, \ |
| }; \ |
| \ |
| static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \ |
| { \ |
| /* Call respective function for amount of chunks. */ \ |
| xor_funcs ## xors_per_run[n].f(data); \ |
| } |
| |
| /* Define xor_8() - xor_64 functions. */ |
| MAKE_XOR(8) |
| MAKE_XOR(16) |
| MAKE_XOR(32) |
| MAKE_XOR(64) |
| |
| /* Maximum number of chunks, which can be xor'ed in one go. */ |
| #define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1) |
| |
| static void xor_blocks_wrapper(unsigned n, unsigned long **data) |
| { |
| BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1); |
| xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1); |
| } |
| |
| struct xor_func { |
| xor_function_t f; |
| const char *name; |
| } static xor_funcs[] = { |
| { xor_8, "xor_8" }, |
| { xor_16, "xor_16" }, |
| { xor_32, "xor_32" }, |
| { xor_64, "xor_64" }, |
| { xor_blocks_wrapper, "xor_blocks" }, |
| }; |
| |
| /* |
 * Check if a chunk has to be xored in/out:
| * |
| * o if writes are queued |
| * o if writes are merged |
| * o if stripe is to be reconstructed |
| * o if recovery stripe |
| */ |
| static inline int chunk_must_xor(struct stripe_chunk *chunk) |
| { |
| if (ChunkUptodate(chunk)) { |
| BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) && |
| !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))); |
| |
| if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) || |
| !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) |
| return 1; |
| |
| if (StripeReconstruct(chunk->stripe) || |
| StripeRecover(chunk->stripe)) |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* |
 * Calculate parity.
| * |
| * This indexes into the chunks of a stripe and their pages. |
| * |
| * All chunks will be xored into the indexed (@pi) |
| * chunk in maximum groups of xor.chunks. |
| * |
| */ |
| static void xor(struct stripe *stripe, unsigned pi, unsigned sector) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned max_chunks = rs->xor.chunks, n = 1, |
| o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */ |
| p = rs->set.raid_devs; |
| unsigned long **d = rs->data; |
| xor_function_t xor_f = rs->xor.f->f; |
| |
| BUG_ON(sector > stripe->io.size); |
| |
| /* Address of parity page to xor into. */ |
| d[0] = page_address(pl_elem(PL(stripe, pi), o)->page); |
| |
| while (p--) { |
| /* Preset pointers to data pages. */ |
| if (p != pi && chunk_must_xor(CHUNK(stripe, p))) |
| d[n++] = page_address(pl_elem(PL(stripe, p), o)->page); |
| |
| /* If max chunks -> xor. */ |
| if (n == max_chunks) { |
| xor_f(n, d); |
| n = 1; |
| } |
| } |
| |
| /* If chunks -> xor. */ |
| if (n > 1) |
| xor_f(n, d); |
| } |
| |
| /* Common xor loop through all stripe page lists. */ |
| static void common_xor(struct stripe *stripe, sector_t count, |
| unsigned off, unsigned pi) |
| { |
| unsigned sector; |
| |
| BUG_ON(!count); |
| for (sector = off; sector < count; sector += SECTORS_PER_PAGE) |
| xor(stripe, pi, sector); |
| |
| /* Set parity page uptodate and clean. */ |
| chunk_set(CHUNK(stripe, pi), CLEAN); |
| atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */ |
| } |
| |
| /* |
| * Calculate parity sectors on intact stripes. |
| * |
 * Need to calculate the raid address for a recovery stripe, because its
 * chunk size differs from and is typically larger than the io chunk size.
| */ |
| static void parity_xor(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size, |
| xor_size = chunk_size > io_size ? io_size : chunk_size; |
| sector_t off; |
| |
| /* This can be the recover stripe with a larger io size. */ |
| for (off = 0; off < io_size; off += xor_size) { |
| /* |
| * Recover stripe is likely bigger than regular io |
| * ones and has no precalculated parity disk index -> |
| * need to calculate RAID address. |
| */ |
| if (unlikely(StripeRecover(stripe))) { |
| struct raid_address addr; |
| |
| raid_address(rs, |
| (stripe->key + off) * rs->set.data_devs, |
| &addr); |
| stripe->idx.parity = addr.pi; |
| stripe_zero_pl_part(stripe, addr.pi, off, xor_size); |
| } |
| |
| common_xor(stripe, xor_size, off, stripe->idx.parity); |
| chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY); |
| } |
| } |
| |
| /* Reconstruct missing chunk. */ |
| static void stripe_reconstruct(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| int p = rs->set.raid_devs, pr = stripe->idx.recover; |
| |
| BUG_ON(pr < 0); |
| |
| /* Check if all but the chunk to be reconstructed are uptodate. */ |
| while (p--) |
| BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p))); |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI : |
| S_RECONSTRUCT_DEV)); |
| /* Zero chunk to be reconstructed. */ |
| stripe_zero_chunk(stripe, pr); |
| common_xor(stripe, stripe->io.size, 0, pr); |
| stripe->idx.recover = -1; |
| } |
| |
| /* |
| * Recovery io throttling |
| */ |
| /* Conditionally reset io counters. */ |
| static int recover_io_reset(struct raid_set *rs) |
| { |
| unsigned long j = jiffies; |
| |
| /* Pay attention to jiffies overflows. */ |
| if (j > rs->recover.last_jiffies + HZ / 20 || |
| j < rs->recover.last_jiffies) { |
| atomic_set(rs->recover.io_count + IO_WORK, 0); |
| atomic_set(rs->recover.io_count + IO_RECOVER, 0); |
| rs->recover.last_jiffies = j; |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| /* Count ios. */ |
| static void recover_io_count(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| |
| recover_io_reset(rs); |
| atomic_inc(rs->recover.io_count + |
| (StripeRecover(stripe) ? IO_RECOVER : IO_WORK)); |
| } |
| |
| /* Try getting a stripe either from the hash or from the LRU list. */ |
| static struct stripe *stripe_find(struct raid_set *rs, |
| struct raid_address *addr) |
| { |
| int r; |
| struct stripe_cache *sc = &rs->sc; |
| struct stripe *stripe; |
| |
| /* Try stripe from hash. */ |
| stripe = stripe_lookup(sc, addr->key); |
| if (stripe) { |
| r = stripe_get(stripe); |
| if (r) |
| goto get_lock_failed; |
| |
| atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */ |
| } else { |
| /* Not in hash -> try to get an LRU stripe. */ |
| stripe = stripe_lru_pop(sc); |
| if (stripe) { |
| /* |
| * An LRU stripe must not be referenced |
| * and must never have ios pending! |
| */ |
| BUG_ON(stripe_ref(stripe)); |
| BUG_ON(stripe_io_ref(stripe)); |
| |
| /* Remove from hash before reuse, in case it is still on it. */ |
| stripe_hash_del(stripe); |
| |
| /* Invalidate before reinserting with changed key. */ |
| stripe_invalidate(stripe); |
| |
| stripe->key = addr->key; |
| stripe->region = dm_rh_sector_to_region(rs->recover.rh, |
| addr->key); |
| stripe->idx.parity = addr->pi; |
| r = stripe_get(stripe); |
| if (r) |
| goto get_lock_failed; |
| |
| /* Insert stripe into the stripe hash. */ |
| stripe_insert(&sc->hash, stripe); |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_INSCACHE); |
| } |
| } |
| |
| return stripe; |
| |
| get_lock_failed: |
| stripe_put(stripe); |
| return NULL; |
| } |
| |
| /* |
| * Process end io |
| * |
| * This can't be done in interrupt context, so it is done here in the daemon. |
| */ |
| /* End io all bios on a bio list. */ |
| static void bio_list_endio(struct stripe *stripe, struct bio_list *bl, |
| int p, int error) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| struct bio *bio; |
| struct page_list *pl = PL(stripe, p); |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| |
| /* Update region counters. */ |
| while ((bio = bio_list_pop(bl))) { |
| if (bio_data_dir(bio) == WRITE) |
| /* Drop io pending count for any writes. */ |
| dm_rh_dec(rs->recover.rh, stripe->region); |
| else if (!error) |
| /* Copy data across. */ |
| bio_copy_page_list(READ, stripe, pl, bio); |
| |
| bio_endio(bio, error); |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + (bio_data_dir(bio) == READ ? |
| S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE)); |
| |
| chunk_put(chunk); |
| stripe_put(stripe); |
| io_put(rs); /* Wake any suspend waiters on last bio. */ |
| } |
| } |
| |
| /* |
| * End io all reads/writes on a stripe, copying |
| * read data across from stripe to bios and |
| * decrementing region counters for writes. |
| * |
| * Processing of ios depending on state: |
| * o no chunk error -> endio ok |
| * o degraded: |
| * - chunk error and read -> ignore; bio will be requeued |
| * - chunk error and write -> endio ok |
| * o dead (more than parity_devs failed) and chunk error -> endio failed |
| */ |
| static void stripe_endio(int rw, struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned p = rs->set.raid_devs; |
| int write = (rw != READ); |
| |
| while (p--) { |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| struct bio_list *bl; |
| |
| BUG_ON(ChunkLocked(chunk)); |
| |
| bl = BL_CHUNK(chunk, rw); |
| if (bio_list_empty(bl)) |
| continue; |
| |
| if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) { |
| /* RAID set dead. */ |
| if (unlikely(RSDead(rs))) |
| bio_list_endio(stripe, bl, p, -EIO); |
| /* RAID set degraded. */ |
| else if (write) |
| bio_list_endio(stripe, bl, p, 0); |
| } else { |
| BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk)); |
| bio_list_endio(stripe, bl, p, 0); |
| } |
| } |
| } |
| |
| /* Fail all ios hanging off all bio lists of a stripe. */ |
| static void stripe_fail_io(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned p = rs->set.raid_devs; |
| |
| while (p--) { |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| int i = ARRAY_SIZE(chunk->bl); |
| |
| /* Fail all bios on all bio lists of the stripe. */ |
| while (i--) { |
| struct bio_list *bl = chunk->bl + i; |
| |
| if (!bio_list_empty(bl)) |
| bio_list_endio(stripe, bl, p, -EIO); |
| } |
| } |
| |
| /* Put stripe on LRU list. */ |
| BUG_ON(stripe_io_ref(stripe)); |
| BUG_ON(stripe_ref(stripe)); |
| } |
| |
| /* Unlock all required chunks. */ |
| static void stripe_chunks_unlock(struct stripe *stripe) |
| { |
| unsigned p = RS(stripe->sc)->set.raid_devs; |
| struct stripe_chunk *chunk; |
| |
| while (p--) { |
| chunk = CHUNK(stripe, p); |
| |
| if (TestClearChunkUnlock(chunk)) |
| ClearChunkLocked(chunk); |
| } |
| } |
| |
| /* |
| * Queue reads and writes to a stripe by hanging |
| * their bios off the stripe's chunk read/write lists. |
| */ |
| static int stripe_queue_bio(struct raid_set *rs, struct bio *bio, |
| struct bio_list *reject) |
| { |
| struct raid_address addr; |
| struct stripe *stripe; |
| |
| stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr)); |
| if (stripe) { |
| int r = 0, rw = bio_data_dir(bio); |
| |
| /* Distinguish reads and writes. */ |
| bio_list_add(BL(stripe, addr.di, rw), bio); |
| |
| if (rw == READ) |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_BIOS_ADDED_READ); |
| else { |
| /* Increment pending write count on region. */ |
| dm_rh_inc(rs->recover.rh, stripe->region); |
| r = 1; |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_BIOS_ADDED_WRITE); |
| } |
| |
| /* |
| * Put on io (flush) list in case of |
| * initial bio queued to chunk. |
| */ |
| if (chunk_get(CHUNK(stripe, addr.di)) == 1) |
| stripe_flush_add(stripe); |
| |
| return r; |
| } |
| |
| /* Got no stripe from cache or failed to lock it -> reject bio. */ |
| bio_list_add(reject, bio); |
| atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */ |
| return 0; |
| } |
| |
| /* |
| * Handle all stripes by handing them to the daemon, because we can't |
| * map their chunk pages to copy the data in interrupt context. |
| * |
| * We don't want to handle them here either, while interrupts are disabled. |
| */ |
| |
| /* Read/write endio function for dm-io (interrupt context). */ |
| static void endio(unsigned long error, void *context) |
| { |
| struct stripe_chunk *chunk = context; |
| |
| if (unlikely(error)) { |
| chunk_set(chunk, ERROR); |
| /* REMOVEME: statistics. */ |
| atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR); |
| } else |
| chunk_set(chunk, CLEAN); |
| |
| /* |
| * For recovery stripes, the locked flag has to be reset |
| * here, because those aren't processed in do_endios(). |
| */ |
| if (unlikely(StripeRecover(chunk->stripe))) |
| ClearChunkLocked(chunk); |
| else |
| SetChunkUnlock(chunk); |
| |
| /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */ |
| stripe_put_references(chunk->stripe); |
| } |
| |
| /* Read/Write a chunk asynchronously. */ |
| static void stripe_chunk_rw(struct stripe *stripe, unsigned p) |
| { |
| struct stripe_cache *sc = stripe->sc; |
| struct raid_set *rs = RS(sc); |
| struct dm_mem_cache_object *obj = stripe->obj + p; |
| struct page_list *pl = obj->pl; |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| struct raid_dev *dev = rs->dev + p; |
| struct dm_io_region io = { |
| .bdev = dev->dev->bdev, |
| .sector = stripe->key, |
| .count = stripe->io.size, |
| }; |
| struct dm_io_request control = { |
| .bi_rw = ChunkDirty(chunk) ? WRITE : READ, |
| .mem = { |
| .type = DM_IO_PAGE_LIST, |
| .ptr.pl = pl, |
| .offset = 0, |
| }, |
| .notify = { |
| .fn = endio, |
| .context = chunk, |
| }, |
| .client = StripeRecover(stripe) ? rs->recover.dm_io_client : |
| sc->dm_io_client, |
| }; |
| |
| BUG_ON(ChunkLocked(chunk)); |
| BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk)); |
| BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk)); |
| |
| /* |
| * Don't rw past the end of the device, which can happen because |
| * typically sectors_per_dev isn't divisible by io_size. |
| */ |
| if (unlikely(io.sector + io.count > rs->set.sectors_per_dev)) |
| io.count = rs->set.sectors_per_dev - io.sector; |
| |
| BUG_ON(!io.count); |
| io.sector += dev->start; /* Add <offset>. */ |
| if (RSRecover(rs)) |
| recover_io_count(stripe); /* Recovery io accounting. */ |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE : |
| S_DM_IO_READ)); |
| SetChunkLocked(chunk); |
| SetDevIoQueued(dev); |
| BUG_ON(dm_io(&control, 1, &io, NULL)); |
| } |
| |
| /* |
| * Write dirty or read not uptodate page lists of a stripe. |
| */ |
| static int stripe_chunks_rw(struct stripe *stripe) |
| { |
| int r; |
| struct raid_set *rs = RS(stripe->sc); |
| |
| /* |
| * Increment the pending count on the stripe |
| * first, so that we don't race in endio(). |
| * |
| * An inc (IO) is needed for any chunk unless !ChunkIo(chunk): |
| * |
| * o not uptodate |
| * o dirtied by writes merged |
| * o dirtied by parity calculations |
| */ |
| r = for_each_io_dev(stripe, stripe_get_references); |
| if (r) { |
| /* Io needed: chunks are either not uptodate or dirty. */ |
| int max; /* REMOVEME: */ |
| struct stripe_cache *sc = &rs->sc; |
| |
| /* Submit actual io. */ |
| for_each_io_dev(stripe, stripe_chunk_rw); |
| |
| /* REMOVEME: statistics */ |
| max = sc_active(sc); |
| if (atomic_read(&sc->active_stripes_max) < max) |
| atomic_set(&sc->active_stripes_max, max); |
| |
| atomic_inc(rs->stats + S_FLUSHS); |
| /* END REMOVEME: statistics */ |
| } |
| |
| return r; |
| } |
| |
| /* Merge in all writes hence dirtying respective chunks. */ |
| static void stripe_merge_writes(struct stripe *stripe) |
| { |
| unsigned p = RS(stripe->sc)->set.raid_devs; |
| |
| while (p--) { |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED); |
| |
| if (!bio_list_empty(write)) { |
| struct bio *bio; |
| struct page_list *pl = stripe->obj[p].pl; |
| |
| /* |
| * We can play with the lists without holding a lock, |
| * because it is just us accessing them anyway. |
| */ |
| bio_list_for_each(bio, write) |
| bio_copy_page_list(WRITE, stripe, pl, bio); |
| |
| bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write); |
| bio_list_init(write); |
| chunk_set(chunk, DIRTY); |
| } |
| } |
| } |
| |
| /* Queue all writes to get merged. */ |
| static int stripe_queue_writes(struct stripe *stripe) |
| { |
| int r = 0; |
| unsigned p = RS(stripe->sc)->set.raid_devs; |
| |
| while (p--) { |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| struct bio_list *write = BL_CHUNK(chunk, WRITE); |
| |
| if (!bio_list_empty(write)) { |
| bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write); |
| bio_list_init(write); |
| SetChunkIo(chunk); |
| r = 1; |
| } |
| } |
| |
| return r; |
| } |
| |
| |
| /* Check if a chunk gets completely overwritten. */ |
| static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p) |
| { |
| unsigned sectors = 0; |
| struct bio *bio; |
| struct bio_list *bl = BL(stripe, p, WRITE_QUEUED); |
| |
| bio_list_for_each(bio, bl) |
| sectors += bio_sectors(bio); |
| |
| BUG_ON(sectors > RS(stripe->sc)->set.io_size); |
| return sectors == RS(stripe->sc)->set.io_size; |
| } |
| |
| /* |
| * Avoid io on broken/reconstructed drive in order to |
| * reconstruct data on endio. |
| * |
| * (*1*) We set StripeReconstruct() in here, so that _do_endios() |
| * will trigger a reconstruct call before resetting it. |
| */ |
| static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr) |
| { |
| struct stripe_chunk *chunk = CHUNK(stripe, pr); |
| |
| /* |
| * Allow io on all chunks but the indexed one, because the set |
| * is either degraded or the chunk is being resynchronized; it |
| * will be reconstructed later. |
| */ |
| /* Includes ClearChunkIo(), ClearChunkUptodate(). */ |
| stripe_chunk_invalidate(chunk); |
| stripe->idx.recover = pr; |
| SetStripeReconstruct(stripe); |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); |
| return -EPERM; |
| } |
| |
| /* Chunk locked/uptodate and device failed tests. */ |
| static struct stripe_chunk * |
| stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| struct stripe_chunk *chunk = CHUNK(stripe, p); |
| |
| /* Can't access active chunks. */ |
| if (ChunkLocked(chunk)) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_CHUNK_LOCKED); |
| return NULL; |
| } |
| |
| /* Can't access broken device. */ |
| if (ChunkError(chunk) || DevFailed(rs->dev + p)) |
| return NULL; |
| |
| /* Can access uptodate chunks. */ |
| if (ChunkUptodate(chunk)) { |
| (*chunks_uptodate)++; |
| return NULL; |
| } |
| |
| return chunk; |
| } |
| |
| /* |
| * Degraded/reconstruction mode. |
| * |
| * Check stripe state to figure which chunks don't need IO. |
| * |
| * Returns 0 for fully operational, -EBUSY while a reconstruct is |
| * still pending and -EPERM for degraded/resynchronizing. |
| */ |
| static int stripe_check_reconstruct(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| |
| if (RSDead(rs)) { |
| ClearStripeReconstruct(stripe); |
| ClearStripeReconstructed(stripe); |
| stripe_allow_io(stripe); |
| return 0; |
| } |
| |
| /* Avoid further reconstruction setting, when already set. */ |
| if (StripeReconstruct(stripe)) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_RECONSTRUCT_SET); |
| return -EBUSY; |
| } |
| |
| /* Initially allow io on all chunks. */ |
| stripe_allow_io(stripe); |
| |
| /* Return if stripe is already reconstructed. */ |
| if (StripeReconstructed(stripe)) { |
| atomic_inc(rs->stats + S_RECONSTRUCTED); |
| return 0; |
| } |
| |
| /* |
| * Degraded/reconstruction mode (device failed) -> |
| * avoid io on the failed device. |
| */ |
| if (unlikely(RSDegraded(rs))) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_DEGRADED); |
| /* Allow IO on all devices but the dead one. */ |
| BUG_ON(rs->set.ei < 0); |
| return stripe_chunk_set_io_flags(stripe, rs->set.ei); |
| } else { |
| int sync, pi = dev_for_parity(stripe, &sync); |
| |
| /* |
| * Reconstruction mode (ie. a particular (replaced) device or |
| * some (rotating) parity chunk is being resynchronized) -> |
| * o make sure all needed chunks are read in |
| * o writes are allowed to go through |
| */ |
| if (!sync) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_NOSYNC); |
| /* Allow IO on all devs but the one to reconstruct. */ |
| return stripe_chunk_set_io_flags(stripe, pi); |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Check if a stripe is ready to merge writes, |
| * i.e. if all chunks are present to allow bios to be merged in. |
| * |
| * We prohibit io on: |
| * |
| * o chunks without bios |
| * o chunks which get completely written over |
| */ |
| static int stripe_merge_possible(struct stripe *stripe, int nosync) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned chunks_overwrite = 0, chunks_prohibited = 0, |
| chunks_uptodate = 0, p = rs->set.raid_devs; |
| |
| /* Walk all chunks. */ |
| while (p--) { |
| struct stripe_chunk *chunk; |
| |
| /* Prohibit io on broken devices. */ |
| if (DevFailed(rs->dev + p)) { |
| chunk = CHUNK(stripe, p); |
| goto prohibit_io; |
| } |
| |
| /* We can't optimize any further if no chunk. */ |
| chunk = stripe_chunk_check(stripe, p, &chunks_uptodate); |
| if (!chunk || nosync) |
| continue; |
| |
| /* |
| * We have a chunk, which is not uptodate. |
| * |
| * If this is not parity and we don't have |
| * reads queued, we can optimize further. |
| */ |
| if (p != stripe->idx.parity && |
| bio_list_empty(BL_CHUNK(chunk, READ)) && |
| bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) { |
| if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED))) |
| goto prohibit_io; |
| else if (RSCheckOverwrite(rs) && |
| stripe_check_chunk_overwrite(stripe, p)) |
| /* Completely overwritten chunk. */ |
| chunks_overwrite++; |
| } |
| |
| /* Allow io for chunks with bios and overwritten ones. */ |
| SetChunkIo(chunk); |
| continue; |
| |
| prohibit_io: |
| /* No io for broken devices or for chunks w/o bios. */ |
| ClearChunkIo(chunk); |
| chunks_prohibited++; |
| /* REMOVEME: statistics. */ |
| atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); |
| } |
| |
| /* All data chunks will get written over. */ |
| if (chunks_overwrite == rs->set.data_devs) |
| atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/ |
| else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) { |
| /* We don't have enough chunks to merge. */ |
| atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/ |
| return -EPERM; |
| } |
| |
| /* |
| * If we have all chunks up to date or overwrite them, we |
| * just zero the parity chunk and let stripe_rw() recreate it. |
| */ |
| if (chunks_uptodate == rs->set.raid_devs || |
| chunks_overwrite == rs->set.data_devs) { |
| stripe_zero_chunk(stripe, stripe->idx.parity); |
| BUG_ON(StripeReconstruct(stripe)); |
| SetStripeReconstruct(stripe); /* Enforce xor in caller. */ |
| } else { |
| /* |
| * With fewer chunks, we xor parity out. |
| * |
| * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(), |
| * so that only chunks with queued or merged writes |
| * are being xored. |
| */ |
| parity_xor(stripe); |
| } |
| |
| /* |
| * We do have enough chunks to merge. |
| * All chunks are uptodate or get written over. |
| */ |
| atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */ |
| return 0; |
| } |
| |
| /* |
| * Avoid reading chunks in case we're fully operational. |
| * |
| * We prohibit io on any chunks without bios but the parity chunk. |
| */ |
| static void stripe_avoid_reads(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| unsigned dummy = 0, p = rs->set.raid_devs; |
| |
| /* Walk all chunks. */ |
| while (p--) { |
| struct stripe_chunk *chunk = |
| stripe_chunk_check(stripe, p, &dummy); |
| |
| if (!chunk) |
| continue; |
| |
| /* If parity or any bios pending -> allow io. */ |
| if (chunk_ref(chunk) || p == stripe->idx.parity) |
| SetChunkIo(chunk); |
| else { |
| ClearChunkIo(chunk); |
| /* REMOVEME: statistics. */ |
| atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO); |
| } |
| } |
| } |
| |
| /* |
| * Read/write a stripe. |
| * |
| * All stripe read/write activity goes through this function |
| * unless recovery, which has to call stripe_chunk_rw() directly. |
| * |
| * Make sure we don't try already merged stripes in order |
| * to avoid data corruption. |
| * |
| * Check the state of the RAID set and if degraded (or |
| * resynchronizing for reads), read in all other chunks but |
| * the one on the dead/resynchronizing device in order to be |
| * able to reconstruct the missing one in _do_endios(). |
| * |
| * Can be called on active stripes in order |
| * to dispatch new io on inactive chunks. |
| * |
| * States to cover: |
| * o stripe to read and/or write |
| * o stripe with error to reconstruct |
| */ |
| static void stripe_rw(struct stripe *stripe) |
| { |
| int nosync, r; |
| struct raid_set *rs = RS(stripe->sc); |
| |
| /* |
| * Check if a chunk needs to be reconstructed |
| * because of a degraded set or a region out of sync. |
| */ |
| nosync = stripe_check_reconstruct(stripe); |
| switch (nosync) { |
| case -EBUSY: |
| return; /* Wait for stripe reconstruction to finish. */ |
| case -EPERM: |
| goto io; |
| } |
| |
| /* |
| * If we don't have merged writes pending, we can schedule |
| * queued writes to be merged next without corrupting data. |
| */ |
| if (!StripeMerged(stripe)) { |
| r = stripe_queue_writes(stripe); |
| if (r) |
| /* Writes got queued -> flag RBW. */ |
| SetStripeRBW(stripe); |
| } |
| |
| /* |
| * Merge all writes hanging off uptodate/overwritten |
| * chunks of the stripe. |
| */ |
| if (StripeRBW(stripe)) { |
| r = stripe_merge_possible(stripe, nosync); |
| if (!r) { /* Merge possible. */ |
| struct stripe_chunk *chunk; |
| |
| /* |
| * I rely on valid parity in order |
| * to xor a fraction of chunks out |
| * of parity and back in. |
| */ |
| stripe_merge_writes(stripe); /* Merge writes in. */ |
| parity_xor(stripe); /* Update parity. */ |
| ClearStripeReconstruct(stripe); /* Reset xor enforce. */ |
| SetStripeMerged(stripe); /* Writes merged. */ |
| ClearStripeRBW(stripe); /* Disable RBW. */ |
| |
| /* |
| * REMOVEME: sanity check on parity chunk |
| * states after writes got merged. |
| */ |
| chunk = CHUNK(stripe, stripe->idx.parity); |
| BUG_ON(ChunkLocked(chunk)); |
| BUG_ON(!ChunkUptodate(chunk)); |
| BUG_ON(!ChunkDirty(chunk)); |
| BUG_ON(!ChunkIo(chunk)); |
| } |
| } else if (!nosync && !StripeMerged(stripe)) |
| /* Read avoidance if not degraded/resynchronizing/merged. */ |
| stripe_avoid_reads(stripe); |
| |
| io: |
| /* Now submit any reads/writes for non-uptodate or dirty chunks. */ |
| r = stripe_chunks_rw(stripe); |
| if (!r) { |
| /* |
| * No io was submitted, because chunk io is |
| * prohibited or chunks are locked/devices failed |
| * -> push to the end io list for processing. |
| */ |
| stripe_endio_push(stripe); |
| atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */ |
| } |
| } |
| |
| /* |
| * Recovery functions |
| */ |
| /* Read a stripe off a raid set for recovery. */ |
| static int stripe_recover_read(struct stripe *stripe, int pi) |
| { |
| BUG_ON(stripe_io_ref(stripe)); |
| |
| /* Invalidate all chunks so that they get read in. */ |
| stripe_chunks_invalidate(stripe); |
| stripe_allow_io(stripe); /* Allow io on all recovery chunks. */ |
| |
| /* |
| * If we are reconstructing a particular device, we can avoid |
| * reading the respective chunk in, because we're going to |
| * reconstruct it anyway. |
| * |
| * We can't do that for resynchronization of rotating parity, |
| * because the recovery stripe chunk size is typically larger |
| * than the set's chunk size. |
| */ |
| if (pi > -1) |
| ClearChunkIo(CHUNK(stripe, pi)); |
| |
| return stripe_chunks_rw(stripe); |
| } |
| |
| /* Write a stripe to a raid set for recovery. */ |
| static int stripe_recover_write(struct stripe *stripe, int pi) |
| { |
| BUG_ON(stripe_io_ref(stripe)); |
| |
| /* |
| * If this is a reconstruct of a particular device, then |
| * reconstruct the respective chunk, else create parity chunk. |
| */ |
| if (pi > -1) { |
| stripe_zero_chunk(stripe, pi); |
| common_xor(stripe, stripe->io.size, 0, pi); |
| chunk_set(CHUNK(stripe, pi), DIRTY); |
| } else |
| parity_xor(stripe); |
| |
| return stripe_chunks_rw(stripe); |
| } |
| |
| /* Read/write a recovery stripe. */ |
| static int stripe_recover_rw(struct stripe *stripe) |
| { |
| int r = 0, sync = 0; |
| |
| /* Read/write flip-flop. */ |
| if (TestClearStripeRBW(stripe)) { |
| SetStripeMerged(stripe); |
| stripe->key = stripe->recover->pos; |
| r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync)); |
| BUG_ON(!r); |
| } else if (TestClearStripeMerged(stripe)) { |
| r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync)); |
| BUG_ON(!r); |
| } |
| |
| BUG_ON(sync); |
| return r; |
| } |
| |
| /* Recovery bandwidth available? */ |
| static int recover_bandwidth(struct raid_set *rs) |
| { |
| int r, work; |
| |
| /* On reset or when bios delayed -> allow recovery. */ |
| r = recover_io_reset(rs); |
| if (r || RSBandwidth(rs)) |
| goto out; |
| |
| work = atomic_read(rs->recover.io_count + IO_WORK); |
| if (work) { |
| /* Account for the larger recovery stripe size. */ |
| int recover = atomic_read(rs->recover.io_count + IO_RECOVER) * |
| rs->recover.io_size / rs->set.io_size; |
| |
| /* |
| * Don't use more than given bandwidth |
| * of the work io for recovery. |
| */ |
| if (recover > work / rs->recover.bandwidth_work) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_NO_BANDWIDTH); |
| return 0; |
| } |
| } |
| |
| out: |
| atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */ |
| return 1; |
| } |
| |
| /* Try to get a region to recover. */ |
| static int stripe_recover_get_region(struct stripe *stripe) |
| { |
| struct raid_set *rs = RS(stripe->sc); |
| struct recover *rec = &rs->recover; |
| struct recover_addr *addr = stripe->recover; |
| struct dm_dirty_log *dl = rec->dl; |
| struct dm_rh_client *rh = rec->rh; |
| |
| BUG_ON(!dl); |
| BUG_ON(!rh); |
| |
| /* Return that we already have a region, so it gets finished first during suspension. */ |
| if (addr->reg) |
| return 1; |
| |
| if (RSSuspend(rs)) |
| return -EPERM; |
| |
| if (dl->type->get_sync_count(dl) >= rec->nr_regions) |
| return -ENOENT; |
| |
| /* If we don't have enough bandwidth, don't proceed with recovery. */ |
| if (!recover_bandwidth(rs)) |
| return -EAGAIN; |
| |
| /* Start quiescing a region. */ |
| dm_rh_recovery_prepare(rh); |
| addr->reg = dm_rh_recovery_start(rh); |
| if (!addr->reg) |
| return -EAGAIN; |
| |
| addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg)); |
| addr->end = addr->pos + dm_rh_get_region_size(rh); |
| |
| /* |
| * Take one global io reference out for the |
| * whole region, which is going to be released |
| * when the region is completely done. |
| */ |
| io_get(rs); |
| return 0; |
| } |
| |
| /* Update region hash state. */ |
| enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 }; |
| static void recover_rh_update(struct stripe *stripe, enum recover_type success) |
| { |
| struct recover_addr *addr = stripe->recover; |
| struct raid_set *rs = RS(stripe->sc); |
| struct recover *rec = &rs->recover; |
| |
| if (!addr->reg) { |
| DMERR("%s- Called w/o region", __func__); |
| return; |
| } |
| |
| dm_rh_recovery_end(addr->reg, success); |
| if (success) |
| rec->nr_regions_recovered++; |
| |
| addr->reg = NULL; |
| |
| /* |
| * Completely done with this region -> |
| * release the 1st io reference. |
| */ |
| io_put(rs); |
| } |
| |
| /* Set start of recovery state. */ |
| static void set_start_recovery(struct raid_set *rs) |
| { |
| /* Initialize recovery. */ |
| rs->recover.start_jiffies = jiffies; |
| rs->recover.end_jiffies = 0; |
| } |
| |
| /* Set end of recovery state. */ |
| static void set_end_recovery(struct raid_set *rs) |
| { |
| ClearRSRecover(rs); |
| rs->set.dev_to_init = -1; |
| |
| /* Check for jiffies overrun. */ |
| rs->recover.end_jiffies = jiffies; |
| if (rs->recover.end_jiffies < rs->recover.start_jiffies) |
| rs->recover.end_jiffies = ~0; |
| } |
| |
| /* Handle recovery on one recovery stripe. */ |
| static int _do_recovery(struct stripe *stripe) |
| { |
| int r; |
| struct raid_set *rs = RS(stripe->sc); |
| struct recover_addr *addr = stripe->recover; |
| |
| /* If recovery is active -> return. */ |
| if (stripe_io_ref(stripe)) |
| return 1; |
| |
| /* IO error is fatal for recovery -> stop it. */ |
| if (unlikely(StripeError(stripe))) |
| goto err; |
| |
| /* Recovery end required. */ |
| if (!RSRecover(rs)) |
| goto err; |
| |
| /* Get a region to recover. */ |
| r = stripe_recover_get_region(stripe); |
| switch (r) { |
| case 0: /* Got a new region: flag initial read before write. */ |
| SetStripeRBW(stripe); |
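| /* Fall through. */ |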
| case 1: /* Have a region in the works. */ |
| break; |
| case -EAGAIN: |
| /* No bandwidth/quiesced region yet, try later. */ |
| if (!io_ref(rs)) |
| wake_do_raid_delayed(rs, HZ / 4); |
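| /* Fall through. */ |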
| case -EPERM: |
| /* Suspend. */ |
| return 1; |
| case -ENOENT: /* No more regions to recover. */ |
| schedule_work(&rs->io.ws_do_table_event); |
| return 0; |
| default: |
| BUG(); |
| } |
| |
| /* Read/write a recover stripe. */ |
| r = stripe_recover_rw(stripe); |
| if (r) |
| /* IO initiated. */ |
| return 1; |
| |
| /* Read and write finished -> update recovery position within region. */ |
| addr->pos += stripe->io.size; |
| |
| /* If we're at end of region, update region hash. */ |
| if (addr->pos >= addr->end || |
| addr->pos >= rs->set.sectors_per_dev) |
| recover_rh_update(stripe, REC_SUCCESS); |
| else |
| /* Prepare to read next region segment. */ |
| SetStripeRBW(stripe); |
| |
| /* Schedule myself for another round... */ |
| wake_do_raid(rs); |
| return 1; |
| |
| err: |
| /* FIXME: rather try recovering other regions on error? */ |
| rs_check_degrade(stripe); |
| recover_rh_update(stripe, REC_FAILURE); |
| |
| /* Check state of partially recovered array. */ |
| if (RSDegraded(rs) && !RSDead(rs) && |
| rs->set.dev_to_init != -1 && |
| rs->set.ei != rs->set.dev_to_init) |
| /* Broken drive != drive to recover -> FATAL. */ |
| SetRSDead(rs); |
| |
| if (StripeError(stripe)) { |
| char buf[BDEVNAME_SIZE]; |
| |
| DMERR("stopping recovery due to " |
| "ERROR on /dev/%s, stripe at offset %llu", |
| bdevname(rs->dev[rs->set.ei].dev->bdev, buf), |
| (unsigned long long) stripe->key); |
| |
| } |
| |
| /* Make sure all quiesced regions get released. */ |
| while (addr->reg) { |
| dm_rh_recovery_end(addr->reg, -EIO); |
| addr->reg = dm_rh_recovery_start(rs->recover.rh); |
| } |
| |
| return 0; |
| } |
| |
| /* Called by main io daemon to recover regions. */ |
| static void do_recovery(struct raid_set *rs) |
| { |
| if (RSRecover(rs)) { |
| int r = 0; |
| struct stripe *stripe; |
| |
| list_for_each_entry(stripe, &rs->recover.stripes, |
| lists[LIST_RECOVER]) |
| r += _do_recovery(stripe); |
| |
| if (!r) { |
| set_end_recovery(rs); |
| stripe_recover_free(rs); |
| } |
| } |
| } |
| |
| /* |
| * END recovery functions |
| */ |
| |
| /* End io process all stripes handed in by endio() callback. */ |
| static void _do_endios(struct raid_set *rs, struct stripe *stripe, |
| struct list_head *flush_list) |
| { |
| /* First unlock all required chunks. */ |
| stripe_chunks_unlock(stripe); |
| |
| /* |
| * If an io error on a stripe occurred, degrade the RAID set |
| * and try to endio as many bios as possible. If any bios can't |
| * be endio processed, requeue the stripe (stripe_ref() != 0). |
| */ |
| if (TestClearStripeError(stripe)) { |
| /* |
| * FIXME: if read, rewrite the failed chunk after reconstruction |
| * in order to trigger disk bad sector relocation. |
| */ |
| rs_check_degrade(stripe); /* Resets ChunkError(). */ |
| ClearStripeReconstruct(stripe); |
| ClearStripeReconstructed(stripe); |
| } |
| |
| /* Got to reconstruct a missing chunk. */ |
| if (StripeReconstruct(stripe)) { |
| /* |
| * (*2*) We use StripeReconstruct() to allow for |
| * all chunks to be xored into the reconstructed |
| * one (see chunk_must_xor()). |
| */ |
| stripe_reconstruct(stripe); |
| |
| /* |
| * (*3*) Now we reset StripeReconstruct() and flag |
| * StripeReconstructed() to show to stripe_rw(), |
| * that we have reconstructed a missing chunk. |
| */ |
| ClearStripeReconstruct(stripe); |
| SetStripeReconstructed(stripe); |
| |
| /* FIXME: reschedule to be written in case of read. */ |
| // if (!StripeRBW(stripe)) { |
| // chunk_set(CHUNK(stripe, pr), DIRTY); |
| // stripe_chunks_rw(stripe); |
| // } |
| } |
| |
| /* |
| * Now that we eventually got a complete stripe, we |
| * can process the rest of the end ios on reads. |
| */ |
| stripe_endio(READ, stripe); |
| |
| /* End io all merged writes. */ |
| if (TestClearStripeMerged(stripe)) |
| stripe_endio(WRITE_MERGED, stripe); |
| |
| /* If RAID set is dead -> fail any ios to dead drives. */ |
| if (RSDead(rs)) { |
| DMERR_LIMIT("RAID set dead: failing ios to dead devices"); |
| stripe_fail_io(stripe); |
| } |
| |
| /* |
| * We still have stripe references, |
| * because of reads before writes or IO errors -> |
| * got to put on the flush list for processing. |
| */ |
| if (stripe_ref(stripe)) { |
| BUG_ON(!list_empty(stripe->lists + LIST_LRU)); |
| list_add_tail(stripe->lists + LIST_FLUSH, flush_list); |
| atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */ |
| } else |
| stripe_lru_add(stripe); |
| } |
| |
| /* Pop any endio stripes off of the endio list and belabour them. */ |
| static void do_endios(struct raid_set *rs) |
| { |
| struct stripe_cache *sc = &rs->sc; |
| struct stripe *stripe; |
| /* IO flush list for sorted requeued stripes. */ |
| struct list_head flush_list; |
| |
| INIT_LIST_HEAD(&flush_list); |
| |
| while ((stripe = stripe_endio_pop(sc))) { |
| /* Avoid endio on stripes with newly io'ed chunks. */ |
| if (!stripe_io_ref(stripe)) |
| _do_endios(rs, stripe, &flush_list); |
| } |
| |
| /* |
| * Insert any requeued stripes in the proper |
| * order at the beginning of the io (flush) list. |
| */ |
| list_splice(&flush_list, sc->lists + LIST_FLUSH); |
| } |
| |
| /* Flush any stripes on the io list. */ |
| static void do_flush(struct raid_set *rs) |
| { |
| struct stripe *stripe; |
| |
| while ((stripe = stripe_io_pop(&rs->sc))) |
| stripe_rw(stripe); /* Read/write stripe. */ |
| } |
| |
| /* Stripe cache resizing. */ |
| static void do_sc_resize(struct raid_set *rs) |
| { |
| unsigned set = atomic_read(&rs->sc.stripes_to_set); |
| |
| if (set) { |
| unsigned cur = atomic_read(&rs->sc.stripes); |
| int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) : |
| sc_shrink(&rs->sc, cur - set); |
| |
| /* Flag end of resizing if ok. */ |
| if (!r) |
| atomic_set(&rs->sc.stripes_to_set, 0); |
| } |
| } |
| |
| /* |
| * Process all ios |
| * |
| * We do different things with the io depending |
| * on the state of the region that it is in: |
| * |
| * o reads: hang off stripe cache or postpone if full |
| * |
| * o writes: |
| * |
| * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set. |
| * In case stripe cache is full or busy, postpone the io. |
| * |
| * RECOVERING: delay the io until recovery of the region completes. |
| * |
| */ |
| static void do_ios(struct raid_set *rs, struct bio_list *ios) |
| { |
| int r; |
| unsigned flush = 0, delay = 0; |
| sector_t sector; |
| struct dm_rh_client *rh = rs->recover.rh; |
| struct bio *bio; |
| struct bio_list reject; |
| |
| bio_list_init(&reject); |
| |
| /* |
| * Classify each io: |
| * o delay writes to recovering regions (let reads go through) |
| * o queue io to all other regions |
| */ |
| while ((bio = bio_list_pop(ios))) { |
| /* |
| * In case we get a barrier bio, push it back onto |
| * the input queue unless all work queues are empty |
| * and the stripe cache is inactive. |
| */ |
| if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + S_BARRIER); |
| if (delay || |
| !list_empty(rs->sc.lists + LIST_FLUSH) || |
| !bio_list_empty(&reject) || |
| sc_active(&rs->sc)) { |
| bio_list_push(ios, bio); |
| break; |
| } |
| } |
| |
| /* Check for recovering regions. */ |
| sector = _sector(rs, bio); |
| r = region_state(rs, sector, DM_RH_RECOVERING); |
| if (unlikely(r && bio_data_dir(bio) == WRITE)) { |
| delay++; |
| /* Wait writing to recovering regions. */ |
| dm_rh_delay_by_region(rh, bio, |
| dm_rh_sector_to_region(rh, |
| sector)); |
| /* REMOVEME: statistics.*/ |
| atomic_inc(rs->stats + S_DELAYED_BIOS); |
| atomic_inc(rs->stats + S_SUM_DELAYED_BIOS); |
| |
| /* Force bandwidth tests in recovery. */ |
| SetRSBandwidth(rs); |
| } else { |
| /* |
| * Process ios to non-recovering regions by queueing |
| * them to stripes (does dm_rh_inc() for writes). |
| */ |
| flush += stripe_queue_bio(rs, bio, &reject); |
| } |
| } |
| |
| if (flush) { |
| /* FIXME: better error handling. */ |
| r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */ |
| if (r) |
| DMERR_LIMIT("dirty log flush"); |
| } |
| |
| /* Merge any rejected bios back to the head of the input list. */ |
| bio_list_merge_head(ios, &reject); |
| } |
| |
| /* Unplug: let any queued io roll on the set's devices. */ |
| static void do_unplug(struct raid_set *rs) |
| { |
| struct raid_dev *dev = rs->dev + rs->set.raid_devs; |
| |
| while (dev-- > rs->dev) { |
| /* Only call any device unplug function, if io got queued. */ |
| if (TestClearDevIoQueued(dev)) |
| blk_unplug(bdev_get_queue(dev->dev->bdev)); |
| } |
| } |
| |
| /* Send an event in case we're getting too busy. */ |
| static void do_busy_event(struct raid_set *rs) |
| { |
| if (sc_busy(rs)) { |
| if (!TestSetRSScBusy(rs)) |
| schedule_work(&rs->io.ws_do_table_event); |
| } |
| |
| ClearRSScBusy(rs); |
| } |
| |
| /* Throw an event. */ |
| static void do_table_event(struct work_struct *ws) |
| { |
| struct raid_set *rs = container_of(ws, struct raid_set, |
| io.ws_do_table_event); |
| dm_table_event(rs->ti->table); |
| } |
| |
| |
| /*----------------------------------------------------------------- |
| * RAID daemon |
| *---------------------------------------------------------------*/ |
| /* |
| * o belabour all end ios |
| * o update the region hash states |
| * o optionally resize the stripe cache |
| * o optionally do recovery |
| * o unplug any component raid devices with queued bios |
| * o grab the input queue |
| * o work on all requeued or new ios and perform stripe cache flushes |
| * o unplug any component raid devices with queued bios |
| * o check if the stripe cache got too busy and throw an event if so |
| */ |
| static void do_raid(struct work_struct *ws) |
| { |
| struct raid_set *rs = container_of(ws, struct raid_set, |
| io.dws_do_raid.work); |
| struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in; |
| |
| /* |
| * We always need to end io, so that ios can get errored in |
| * case the set failed and the region counters get decremented |
| * before we update region hash states and go any further. |
| */ |
| do_endios(rs); |
| dm_rh_update_states(rs->recover.rh, 1); |
| |
| /* |
| * Now that we've end io'd, which may have put stripes on the LRU list |
| * to allow for shrinking, we resize the stripe cache if requested. |
| */ |
| do_sc_resize(rs); |
| |
| /* Try to recover regions. */ |
| do_recovery(rs); |
| do_unplug(rs); /* Unplug the sets device queues. */ |
| |
| /* Quickly grab all new ios queued and add them to the work list. */ |
| mutex_lock(&rs->io.in_lock); |
| bio_list_merge(ios, ios_in); |
| bio_list_init(ios_in); |
| mutex_unlock(&rs->io.in_lock); |
| |
| if (!bio_list_empty(ios)) |
| do_ios(rs, ios); /* Got ios to work into the cache. */ |
| |
| do_flush(rs); /* Flush any stripes on io list. */ |
| do_unplug(rs); /* Unplug the sets device queues. */ |
| do_busy_event(rs); /* Check if we got too busy. */ |
| } |
| |
| /* |
| * Callback for region hash to dispatch |
| * delayed bios queued to recovered regions |
| * (gets called via dm_rh_update_states()). |
| */ |
| static void dispatch_delayed_bios(void *context, struct bio_list *bl) |
| { |
| struct raid_set *rs = context; |
| struct bio *bio; |
| |
| /* REMOVEME: statistics; decrement pending delayed bios counter. */ |
| bio_list_for_each(bio, bl) |
| atomic_dec(rs->stats + S_DELAYED_BIOS); |
| |
| /* Merge region hash private list to work list. */ |
| bio_list_merge_head(&rs->io.work, bl); |
| bio_list_init(bl); |
| ClearRSBandwidth(rs); |
| } |
| |
| /************************************************************* |
| * Constructor helpers |
| *************************************************************/ |
| /* Calculate MB/sec. */ |
| static unsigned mbpers(struct raid_set *rs, unsigned speed) |
| { |
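| /* speed is full recovery-stripe xors per jiffy; scale by data devices, stripe io size and HZ, then convert to MB/s. */ |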
| return to_bytes(speed * rs->set.data_devs * |
| rs->recover.io_size * HZ >> 10) >> 10; |
| } |
| |
| /* |
| * Discover fastest xor algorithm and # of chunks combination. |
| */ |
| /* Calculate speed for algorithm and # of chunks. */ |
| static unsigned xor_speed(struct stripe *stripe) |
| { |
| unsigned r = 0; |
| unsigned long j; |
| |
| /* Wait for next tick. */ |
| for (j = jiffies; j == jiffies; ) |
| ; |
| |
| /* Do xors for a full tick. */ |
| for (j = jiffies; j == jiffies; ) { |
| mb(); |
| common_xor(stripe, stripe->io.size, 0, 0); |
| mb(); |
| r++; |
| } |
| |
| return r; |
| } |
| |
| /* Optimize xor algorithm for this RAID set. */ |
| static unsigned xor_optimize(struct raid_set *rs) |
| { |
| unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0; |
| struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL; |
| struct stripe *stripe; |
| |
| BUG_ON(list_empty(&rs->recover.stripes)); |
| stripe = list_first_entry(&rs->recover.stripes, struct stripe, |
| lists[LIST_RECOVER]); |
| |
| /* Must set uptodate so that xor() will belabour chunks. */ |
| while (p--) |
| SetChunkUptodate(CHUNK(stripe, p)); |
| |
| /* Try all xor functions. */ |
| while (f-- > xor_funcs) { |
| unsigned speed; |
| |
| /* Set actual xor function for common_xor(). */ |
| rs->xor.f = f; |
| rs->xor.chunks = (f->f == xor_blocks_wrapper ? |
| (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1; |
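| /* Preset one above the limit; the post-decrement in the loop below makes the first tested value the limit itself. */ |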
| |
| while (rs->xor.chunks-- > 2) { |
| speed = xor_speed(stripe); |
| if (speed > speed_max) { |
| speed_max = speed; |
| chunks_max = rs->xor.chunks; |
| f_max = f; |
| } |
| } |
| } |
| |
| /* Memorize optimum parameters. */ |
| rs->xor.f = f_max; |
| rs->xor.chunks = chunks_max; |
| return speed_max; |
| } |
| |
| /* |
| * Allocate a RAID context (a RAID set) |
| */ |
| /* Structure for variable RAID parameters. */ |
| struct variable_parms { |
| int bandwidth; |
| int bandwidth_parm; |
| int chunk_size; |
| int chunk_size_parm; |
| int io_size; |
| int io_size_parm; |
| int stripes; |
| int stripes_parm; |
| int recover_io_size; |
| int recover_io_size_parm; |
| int raid_parms; |
| int recovery; |
| int recovery_stripes; |
| int recovery_stripes_parm; |
| }; |
| |
| static struct raid_set * |
| context_alloc(struct raid_type *raid_type, struct variable_parms *p, |
| unsigned raid_devs, sector_t sectors_per_dev, |
| struct dm_target *ti, unsigned dl_parms, char **argv) |
| { |
| int r; |
| size_t len; |
| sector_t region_size, ti_len; |
| struct raid_set *rs = NULL; |
| struct dm_dirty_log *dl; |
| struct recover *rec; |
| |
| /* |
| * Create the dirty log |
| * |
| * We need to change length for the dirty log constructor, |
| * because we want the number of regions to be derived |
| * from the single device size, so that we can keep the region |
| * size = 2^^n independent of the number of devices. |
| */ |
| ti_len = ti->len; |
| ti->len = sectors_per_dev; |
| dl = dm_dirty_log_create(argv[0], ti, dl_parms, argv + 2); |
| ti->len = ti_len; |
| if (!dl) |
| goto bad_dirty_log; |
| |
| /* Chunk size *must* not be larger than the region size. */ |
| region_size = dl->type->get_region_size(dl); |
| if (p->chunk_size > region_size) |
| goto bad_chunk_size; |
| |
| /* Recover io size *must* not be larger than the region size either. */ |
| if (p->recover_io_size > region_size) |
| goto bad_recover_io_size; |
| |
| /* Size and allocate the RAID set structure. */ |
| len = sizeof(*rs->data) + sizeof(*rs->dev); |
| if (dm_array_too_big(sizeof(*rs), len, raid_devs)) |
| goto bad_array; |
| |
| len = sizeof(*rs) + raid_devs * len; |
| rs = kzalloc(len, GFP_KERNEL); |
| if (!rs) |
| goto bad_alloc; |
| |
| rec = &rs->recover; |
| atomic_set(&rs->io.in_process, 0); |
| atomic_set(&rs->io.in_process_max, 0); |
| rec->io_size = p->recover_io_size; |
| |
| /* Pointer to data array. */ |
| rs->data = (unsigned long **) |
| ((void *) rs->dev + raid_devs * sizeof(*rs->dev)); |
| rec->dl = dl; |
| rs->set.raid_devs = raid_devs; |
| rs->set.data_devs = raid_devs - raid_type->parity_devs; |
| rs->set.raid_type = raid_type; |
| |
| rs->set.raid_parms = p->raid_parms; |
| rs->set.chunk_size_parm = p->chunk_size_parm; |
| rs->set.io_size_parm = p->io_size_parm; |
| rs->sc.stripes_parm = p->stripes_parm; |
| rec->io_size_parm = p->recover_io_size_parm; |
| rec->bandwidth_parm = p->bandwidth_parm; |
| rec->recovery = p->recovery; |
| rec->recovery_stripes = p->recovery_stripes; |
| |
| /* |
| * Set chunk and io size and respective shifts |
| * (used to avoid divisions) |
| */ |
| rs->set.chunk_size = p->chunk_size; |
| rs->set.chunk_shift = ffs(p->chunk_size) - 1; |
| |
| rs->set.io_size = p->io_size; |
| rs->set.io_mask = p->io_size - 1; |
| /* Mask to adjust address key in case io_size != chunk_size. */ |
| rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask; |
| |
| rs->set.sectors_per_dev = sectors_per_dev; |
| |
| rs->set.ei = -1; /* Indicate no failed device. */ |
| atomic_set(&rs->set.failed_devs, 0); |
| |
| rs->ti = ti; |
| |
| atomic_set(rec->io_count + IO_WORK, 0); |
| atomic_set(rec->io_count + IO_RECOVER, 0); |
| |
| /* Initialize io lock and queues. */ |
| mutex_init(&rs->io.in_lock); |
| bio_list_init(&rs->io.in); |
| bio_list_init(&rs->io.work); |
| |
| init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */ |
| |
| rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size); |
| rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios, |
| wake_dummy, wake_do_raid, 0, p->recovery_stripes, |
| dl, region_size, rec->nr_regions); |
| if (IS_ERR(rec->rh)) |
| goto bad_rh; |
| |
| /* Initialize stripe cache. */ |
| r = sc_init(rs, p->stripes); |
| if (r) |
| goto bad_sc; |
| |
| /* REMOVEME: statistics. */ |
| stats_reset(rs); |
| ClearRSDevelStats(rs); /* Disable development statistics. */ |
| return rs; |
| |
| bad_dirty_log: |
| TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM)); |
| |
| bad_chunk_size: |
| dm_dirty_log_destroy(dl); |
| TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL)); |
| |
| bad_recover_io_size: |
| dm_dirty_log_destroy(dl); |
| TI_ERR_RET("Recover stripe io size larger than region size", |
| ERR_PTR(-EINVAL)); |
| |
| bad_array: |
| dm_dirty_log_destroy(dl); |
| TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL)); |
| |
| bad_alloc: |
| dm_dirty_log_destroy(dl); |
| TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM)); |
| |
| bad_rh: |
| dm_dirty_log_destroy(dl); |
| ti->error = DM_MSG_PREFIX "Error creating dirty region hash"; |
| goto free_rs; |
| |
| bad_sc: |
| dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */ |
| sc_exit(&rs->sc); |
| ti->error = DM_MSG_PREFIX "Error creating stripe cache"; |
| free_rs: |
| kfree(rs); |
| return ERR_PTR(-ENOMEM); |
| } |
| |
| /* Free a RAID context (a RAID set). */ |
| static void context_free(struct raid_set *rs, unsigned p) |
| { |
| while (p--) |
| dm_put_device(rs->ti, rs->dev[p].dev); |
| |
| sc_exit(&rs->sc); |
| dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */ |
| kfree(rs); |
| } |
| |
| /* Create work queue and initialize delayed work. */ |
| static int rs_workqueue_init(struct raid_set *rs) |
| { |
| struct dm_target *ti = rs->ti; |
| |
| rs->io.wq = create_singlethread_workqueue(DAEMON); |
| if (!rs->io.wq) |
| TI_ERR_RET("failed to create " DAEMON, -ENOMEM); |
| |
| INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid); |
| INIT_WORK(&rs->io.ws_do_table_event, do_table_event); |
| return 0; |
| } |
| |
| /* Return pointer to raid_type structure for raid name. */ |
| static struct raid_type *get_raid_type(char *name) |
| { |
| struct raid_type *r = ARRAY_END(raid_types); |
| |
| while (r-- > raid_types) { |
| if (!strcmp(r->name, name)) |
| return r; |
| } |
| |
| return NULL; |
| } |
| |
| /* FIXME: factor out to dm core. */ |
| static int multiple(sector_t a, sector_t b, sector_t *n) |
| { |
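| /* Return true when a is an exact multiple of b; store the quotient in *n. */ |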
| sector_t r = a; |
| |
| sector_div(r, b); |
| *n = r; |
| return a == r * b; |
| } |
| |
| /* Log RAID set information to kernel log. */ |
| static void rs_log(struct raid_set *rs, unsigned speed) |
| { |
| unsigned p; |
| char buf[BDEVNAME_SIZE]; |
| |
| for (p = 0; p < rs->set.raid_devs; p++) |
| DMINFO("/dev/%s is raid disk %u%s", |
| bdevname(rs->dev[p].dev->bdev, buf), p, |
| (p == rs->set.pi) ? " (parity)" : ""); |
| |
| DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n" |
| "algorithm \"%s\", %u chunks with %uMB/s\n" |
| "%s set with net %u/%u devices", |
| rs->set.chunk_size, rs->set.io_size, rs->recover.io_size, |
| atomic_read(&rs->sc.stripes), |
| rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed), |
| rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs); |
| } |
| |
| /* Get all devices and offsets. */ |
| static int dev_parms(struct raid_set *rs, char **argv, int *p) |
| { |
| struct dm_target *ti = rs->ti; |
| |
| for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) { |
| int r; |
| unsigned long long tmp; |
| struct raid_dev *dev = rs->dev + *p; |
| |
| /* Get offset and device. */ |
| if (sscanf(argv[1], "%llu", &tmp) != 1 || |
| tmp > rs->set.sectors_per_dev) |
| TI_ERR("Invalid RAID device offset parameter"); |
| |
| dev->start = tmp; |
| r = dm_get_device(ti, *argv, dev->start, |
| rs->set.sectors_per_dev, |
| dm_table_get_mode(ti->table), &dev->dev); |
| if (r) |
| TI_ERR_RET("RAID device lookup failure", r); |
| |
| r = raid_dev_lookup(rs, dev); |
| if (r != -ENODEV && r < *p) { |
| (*p)++; /* Ensure dm_put_device() on actual device. */ |
| TI_ERR_RET("Duplicate RAID device", -ENXIO); |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* Set recovery bandwidth. */ |
| static void |
| recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth) |
| { |
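| /* bandwidth_work is the work-to-recovery io ratio enforced in recover_bandwidth(). */ |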
| rs->recover.bandwidth = bandwidth; |
| rs->recover.bandwidth_work = 100 / bandwidth; |
| } |
| |
| /* Handle variable number of RAID parameters. */ |
| static int get_raid_variable_parms(struct dm_target *ti, char **argv, |
| struct variable_parms *vp) |
| { |
| int p, value; |
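| /* |
| * Constraint table for the variable RAID parameters; var2/var3 are |
| * secondary targets that also receive the parsed value unless it is |
| * -1 (e.g. the chunk_size entry presets io_size as well). |
| */ |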
| struct { |
| int action; /* -1: skip, 0: no power-of-2 check, 1: power-of-2 check */ |
| char *errmsg; |
| int min, max; |
| int *var, *var2, *var3; |
| } argctr[] = { |
| { 1, |
| "Invalid chunk size; must be -1 or 2^^n and <= 16384", |
| IO_SIZE_MIN, CHUNK_SIZE_MAX, |
| &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size }, |
| { 0, |
| "Invalid number of stripes: must be -1 or >= 8 and <= 16384", |
| STRIPES_MIN, STRIPES_MAX, |
| &vp->stripes_parm, &vp->stripes, NULL }, |
| { 1, |
| "Invalid io size; must -1 or >= 8, 2^^n and less equal " |
| "min(BIO_MAX_SECTORS/2, chunk size)", |
| IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */ |
| &vp->io_size_parm, &vp->io_size, NULL }, |
| { 1, |
| "Invalid recovery io size; must be -1 or " |
| "2^^n and less equal BIO_MAX_SECTORS/2", |
| RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2, |
| &vp->recover_io_size_parm, &vp->recover_io_size, NULL }, |
| { 0, |
| "Invalid recovery bandwidth percentage; " |
| "must be -1 or > 0 and <= 100", |
| BANDWIDTH_MIN, BANDWIDTH_MAX, |
| &vp->bandwidth_parm, &vp->bandwidth, NULL }, |
| /* Handle sync argument separately in loop. */ |
| { -1, |
| "Invalid recovery switch; must be \"sync\" or \"nosync\"" }, |
| { 0, |
| "Invalid number of recovery stripes;" |
| "must be -1, > 0 and <= 16384", |
| RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX, |
| &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL }, |
| }, *varp; |
| |
| /* Fetch # of variable raid parameters. */ |
| if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 || |
| !range_ok(vp->raid_parms, 0, 7)) |
| TI_ERR("Bad variable raid parameters number"); |
| |
| /* Preset variable RAID parameters. */ |
| vp->chunk_size = CHUNK_SIZE_DEFAULT; |
| vp->io_size = IO_SIZE_DEFAULT; |
| vp->stripes = STRIPES_DEFAULT; |
| vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT; |
| vp->bandwidth = BANDWIDTH_DEFAULT; |
| vp->recovery = 1; |
| vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT; |
| |
| /* Walk the array of argument constraints for all given ones. */ |
| for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) { |
| BUG_ON(varp >= ARRAY_END(argctr)); |
| |
| /* Special case for "[no]sync" string argument. */ |
| if (varp->action < 0) { |
| if (!strcmp(*argv, "sync")) |
| ; |
| else if (!strcmp(*argv, "nosync")) |
| vp->recovery = 0; |
| else |
| TI_ERR(varp->errmsg); |
| |
| argv++; |
| continue; |
| } |
| |
| /* |
| * Special case for io_size depending |
| * on previously set chunk size. |
| */ |
| if (p == 2) |
| varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size); |
| |
| if (sscanf(*(argv++), "%d", &value) != 1 || |
| (value != -1 && |
| ((varp->action && !POWER_OF_2(value)) || |
| !range_ok(value, varp->min, varp->max)))) |
| TI_ERR(varp->errmsg); |
| |
| *varp->var = value; |
| if (value != -1) { |
| if (varp->var2) |
| *varp->var2 = value; |
| if (varp->var3) |
| *varp->var3 = value; |
| } |
| } |
| |
| return 0; |
| } |
| |
| /* Parse optional locking parameters. */ |
| static int get_raid_locking_parms(struct dm_target *ti, char **argv, |
| int *locking_parms, |
| struct dm_raid45_locking_type **locking_type) |
| { |
| if (!strnicmp(argv[0], "locking", strlen(argv[0]))) { |
| char *lckstr = argv[1]; |
| size_t lcksz = strlen(lckstr); |
| |
| if (!strnicmp(lckstr, "none", lcksz)) { |
| *locking_type = &locking_none; |
| *locking_parms = 2; |
| } else if (!strnicmp(lckstr, "cluster", lcksz)) { |
| DMERR("locking type \"%s\" not yet implemented", |
| lckstr); |
| return -EINVAL; |
| } else { |
| DMERR("unknown locking type \"%s\"", lckstr); |
| return -EINVAL; |
| } |
| } |
| |
| *locking_parms = 0; |
| *locking_type = &locking_none; |
| return 0; |
| } |
| |
| /* Set backing device read ahead properties of RAID set. */ |
| static void rs_set_read_ahead(struct raid_set *rs, |
| unsigned sectors, unsigned stripes) |
| { |
| unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE); |
| struct mapped_device *md = dm_table_get_md(rs->ti->table); |
| struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info; |
| |
| /* Set read-ahead for the RAID set and the component devices. */ |
| if (ra_pages) { |
| unsigned p = rs->set.raid_devs; |
| |
| bdi->ra_pages = stripes * ra_pages * rs->set.data_devs; |
| |
| while (p--) { |
| struct request_queue *q = |
| bdev_get_queue(rs->dev[p].dev->bdev); |
| |
| q->backing_dev_info.ra_pages = ra_pages; |
| } |
| } |
| |
| dm_put(md); |
| } |
| |
| /* Set congested function. */ |
| static void rs_set_congested_fn(struct raid_set *rs) |
| { |
| struct mapped_device *md = dm_table_get_md(rs->ti->table); |
| struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info; |
| |
| /* Set congested function and data. */ |
| bdi->congested_fn = rs_congested; |
| bdi->congested_data = rs; |
| dm_put(md); |
| } |
| |
| /* |
| * Construct a RAID4/5 mapping: |
| * |
| * log_type #log_params <log_params> \ |
| * raid_type [#parity_dev] #raid_variable_params <raid_params> \ |
| * [locking "none"/"cluster"] |
| * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,} |
| * |
| * log_type = "core"/"disk", |
| * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only) |
| * log_params = [dirty_log_path] region_size [[no]sync]) |
| * |
| * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs" |
| * |
| * #parity_dev = N if raid_type = "raid4" |
| * o N = -1: pick default = last device |
| * o N >= 0 and < #raid_devs: parity device index |
| * |
| * #raid_variable_params = 0-7; raid_params (-1 = default): |
| * [chunk_size [#stripes [io_size [recover_io_size \ |
| * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]] |
| * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8 |
| * and <= CHUNK_SIZE_MAX) |
| * o #stripes is number of stripes allocated to stripe cache |
| * (must be > 1 and < STRIPES_MAX) |
| * o io_size (io unit size per device in sectors; must be 2^^n and > 8) |
| * o recover_io_size (io unit size per device for recovery in sectors; |
| *    must be 2^^n, > SECTORS_PER_PAGE and <= region_size) |
| * o %recovery_bandwidth is the maximum percentage of io bandwidth spent |
| *    on recovery while application io is in flight (1-100%) |
| * o recovery switch = [sync|nosync] |
| * o #recovery_stripes is the number of recovery stripes used for |
| * parallel recovery of the RAID set |
| * If raid_variable_params = 0, defaults will be used. |
| * Any raid_variable_param can be set to -1 to apply its default. |
| * |
| * #raid_devs = N (N >= 3) |
| * |
| * #dev_to_initialize = N |
| * -1: initialize parity on all devices |
| * >= 0 and < #raid_devs: initialize the device with that index; used to |
| * force reconstruction of a failed device's content after replacement |
| * |
| * <dev_path> = device_path (e.g. /dev/sdd1) |
| * <offset> = begin at offset on <dev_path> |
| * |
| */ |
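| /* |
| * Illustrative example of a matching table line (the device paths, offsets |
| * and the 2097152-sector length are hypothetical, not taken from this code): |
| * |
| * 0 2097152 raid45 core 2 8192 nosync raid5_ls 0 3 -1 \ |
| * /dev/sdb 0 /dev/sdc 0 /dev/sdd 0 |
| * |
| * i.e. a "core" dirty log with 2 parameters (region_size 8192, nosync), |
| * raid5_ls with 0 variable parameters (all defaults apply), no locking |
| * keyword, 3 raid devices and no single device selected to initialize (-1). |
| */ |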
| #define MIN_PARMS 13 |
| static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) |
| { |
| int dev_to_init, dl_parms, i, locking_parms, |
| parity_parm, pi = -1, r, raid_devs; |
| unsigned speed; |
| sector_t tmp, sectors_per_dev; |
| struct dm_raid45_locking_type *locking; |
| struct raid_set *rs; |
| struct raid_type *raid_type; |
| struct variable_parms parms; |
| |
| /* Ensure minimum number of parameters. */ |
| if (argc < MIN_PARMS) |
| TI_ERR("Not enough parameters"); |
| |
| /* Fetch # of dirty log parameters. */ |
| if (sscanf(argv[1], "%d", &dl_parms) != 1 || |
| !range_ok(dl_parms, 1, 4711)) /* ;-) */ |
| TI_ERR("Bad dirty log parameters number"); |
| |
| /* Check raid_type. */ |
| raid_type = get_raid_type(argv[dl_parms + 2]); |
| if (!raid_type) |
| TI_ERR("Bad raid type"); |
| |
| /* In case of RAID4, parity drive is selectable. */ |
| parity_parm = !!(raid_type->level == raid4); |
| |
| /* Handle variable number of RAID parameters. */ |
| r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3, |
| &parms); |
| if (r) |
| return r; |
| |
| /* Handle any locking parameters. */ |
| r = get_raid_locking_parms(ti, |
| argv + dl_parms + parity_parm + |
| parms.raid_parms + 4, |
| &locking_parms, &locking); |
| if (r) |
| return r; |
| |
| /* # of raid devices. */ |
| i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4; |
| if (sscanf(argv[i], "%d", &raid_devs) != 1 || |
| raid_devs < raid_type->minimal_devs) |
| TI_ERR("Invalid number of raid devices"); |
| |
| /* In case of RAID4, check parity drive index is in limits. */ |
| if (raid_type->level == raid4) { |
| /* Fetch index of parity device. */ |
| if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 || |
| (pi != -1 && !range_ok(pi, 0, raid_devs - 1))) |
| TI_ERR("Invalid RAID4 parity device index"); |
| } |
| |
| /* |
| * Index of device to initialize starts at 0 |
| * |
| * o -1 -> don't initialize a selected device; |
| * initialize parity conforming to algorithm |
| * o 0..raid_devs-1 -> initialize respective device |
| * (used for reconstruction of a replaced device) |
| */ |
| if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms + |
| locking_parms + 5], "%d", &dev_to_init) != 1 || |
| !range_ok(dev_to_init, -1, raid_devs - 1)) |
| TI_ERR("Invalid number for raid device to initialize"); |
| |
| /* Check # of raid device arguments. */ |
| if (argc - dl_parms - parity_parm - parms.raid_parms - |
| locking_parms - 6 != 2 * raid_devs) |
| TI_ERR("Wrong number of raid device/offset arguments"); |
| |
| /* |
| * Check that the table length is divisible |
| * without remainder by (raid_devs - parity_devs). |
| */ |
| if (!multiple(ti->len, raid_devs - raid_type->parity_devs, |
| &sectors_per_dev)) |
| TI_ERR("Target length not divisible by number of data devices"); |
| |
| /* |
| * Check that the per-device size is divisible |
| * without remainder by the chunk size. |
| */ |
| if (!multiple(sectors_per_dev, parms.chunk_size, &tmp)) |
| TI_ERR("Device length not divisible by chunk_size"); |
| |
| /**************************************************************** |
| * Now that we checked the constructor arguments -> |
| * let's allocate the RAID set |
| ****************************************************************/ |
| rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev, |
| ti, dl_parms, argv); |
| if (IS_ERR(rs)) |
| return PTR_ERR(rs); |
| |
| |
| rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init; |
| rs->set.pi = rs->set.pi_parm = pi; |
| |
| /* Set RAID4 parity drive index. */ |
| if (raid_type->level == raid4) |
| rs->set.pi = (pi == -1) ? rs->set.data_devs : pi; |
| |
| recover_set_bandwidth(rs, parms.bandwidth); |
| |
| /* Use locking type to lock stripe access. */ |
| rs->locking = locking; |
| |
| /* Get the device/offset tuples. */ |
| argv += dl_parms + parity_parm + parms.raid_parms + locking_parms + 6; |
| r = dev_parms(rs, argv, &i); |
| if (r) |
| goto err; |
| |
| /* Set backing device information (e.g. read-ahead). */ |
| rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */); |
| rs_set_congested_fn(rs); /* Set congested function. */ |
| SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */ |
| speed = xor_optimize(rs); /* Select best xor algorithm. */ |
| |
| /* Set for recovery of any nosync regions. */ |
| if (parms.recovery) |
| SetRSRecover(rs); |
| else { |
| /* |
| * Need to free recovery stripe(s) here in case |
| * of nosync, because xor_optimize uses one. |
| */ |
| set_start_recovery(rs); |
| set_end_recovery(rs); |
| stripe_recover_free(rs); |
| } |
| |
| /* |
| * Make sure that dm core hands down io of at most |
| * io_size length and respects those io boundaries. |
| */ |
| ti->split_io = rs->set.io_size; |
| ti->private = rs; |
| |
| /* Initialize work queue to handle this RAID set's io. */ |
| r = rs_workqueue_init(rs); |
| if (r) |
| goto err; |
| |
| rs_log(rs, speed); /* Log information about RAID set. */ |
| return 0; |
| |
| err: |
| context_free(rs, i); |
| return r; |
| } |
| |
| /* |
| * Destruct a raid mapping |
| */ |
| static void raid_dtr(struct dm_target *ti) |
| { |
| struct raid_set *rs = ti->private; |
| |
| destroy_workqueue(rs->io.wq); |
| context_free(rs, rs->set.raid_devs); |
| } |
| |
| /* Raid mapping function. */ |
| static int raid_map(struct dm_target *ti, struct bio *bio, |
| union map_info *map_context) |
| { |
| /* I don't want to waste stripe cache capacity. */ |
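| /* Readahead bios are advisory, so they may simply be failed here. */ |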
| if (bio_rw(bio) == READA) |
| return -EIO; |
| else { |
| struct raid_set *rs = ti->private; |
| |
| /* |
| * Take an io reference which device suspension/destruction |
| * waits on to drop to zero. |
| */ |
| io_get(rs); |
| bio->bi_sector -= ti->begin; /* Remap sector. */ |
| |
| /* Queue io to RAID set. */ |
| mutex_lock(&rs->io.in_lock); |
| bio_list_add(&rs->io.in, bio); |
| mutex_unlock(&rs->io.in_lock); |
| |
| /* Wake daemon to process input list. */ |
| wake_do_raid(rs); |
| |
| /* REMOVEME: statistics. */ |
| atomic_inc(rs->stats + (bio_data_dir(bio) == READ ? |
| S_BIOS_READ : S_BIOS_WRITE)); |
| return DM_MAPIO_SUBMITTED; /* Handle later. */ |
| } |
| } |
| |
| /* Device suspend. */ |
| static void raid_presuspend(struct dm_target *ti) |
| { |
| struct raid_set *rs = ti->private; |
| struct dm_dirty_log *dl = rs->recover.dl; |
| |
| SetRSSuspend(rs); |
| |
| if (RSRecover(rs)) |
| dm_rh_stop_recovery(rs->recover.rh); |
| |
| cancel_delayed_work(&rs->io.dws_do_raid); |
| flush_workqueue(rs->io.wq); |
| wait_ios(rs); /* Wait for completion of all ios being processed. */ |
| |
| if (dl->type->presuspend && dl->type->presuspend(dl)) |
| /* FIXME: need better error handling. */ |
| DMWARN("log presuspend failed"); |
| } |
| |
| static void raid_postsuspend(struct dm_target *ti) |
| { |
| struct raid_set *rs = ti->private; |
| struct dm_dirty_log *dl = rs->recover.dl; |
| |
| if (dl->type->postsuspend && dl->type->postsuspend(dl)) |
| /* FIXME: need better error handling. */ |
| DMWARN("log postsuspend failed"); |
| } |
| |
| /* Device resume. */ |
| static void raid_resume(struct dm_target *ti) |
| { |
| struct raid_set *rs = ti->private; |
| struct recover *rec = &rs->recover; |
| struct dm_dirty_log *dl = rec->dl; |
| |
| if (dl->type->resume && dl->type->resume(dl)) |
| /* Resume dirty log. */ |
| /* FIXME: need better error handling. */ |
| DMWARN("log resume failed"); |
| |
| rec->nr_regions_to_recover = |
| rec->nr_regions - dl->type->get_sync_count(dl); |
| |
| /* Restart any unfinished recovery. */ |
| if (RSRecover(rs)) { |
| set_start_recovery(rs); |
| dm_rh_start_recovery(rec->rh); |
| } |
| |
| ClearRSSuspend(rs); |
| wake_do_raid(rs); |
| } |
| |
| /* Return stripe cache size. */ |
| static unsigned sc_size(struct raid_set *rs) |
| { |
| return to_sector(atomic_read(&rs->sc.stripes) * |
| (sizeof(struct stripe) + |
| (sizeof(struct stripe_chunk) + |
| (sizeof(struct page_list) + |
| to_bytes(rs->set.io_size) * |
| rs->set.raid_devs)) + |
| (rs->recover.end_jiffies ? |
| 0 : rs->recover.recovery_stripes * |
| to_bytes(rs->set.raid_devs * rs->recover.io_size)))); |
| } |
| |
| /* REMOVEME: status output for development. */ |
| static void raid_devel_stats(struct dm_target *ti, char *result, |
| unsigned *size, unsigned maxlen) |
| { |
| unsigned sz = *size; |
| unsigned long j; |
| char buf[BDEVNAME_SIZE], *p; |
| struct stats_map *sm; |
| struct raid_set *rs = ti->private; |
| struct recover *rec = &rs->recover; |
| struct timespec ts; |
| |
| DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks); |
| DMEMIT("act_ios=%d ", io_ref(rs)); |
| DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max)); |
| DMEMIT("act_stripes=%d ", sc_active(&rs->sc)); |
| DMEMIT("act_stripes_max=%d\n", |
| atomic_read(&rs->sc.active_stripes_max)); |
| |
| for (sm = stats_map; sm < ARRAY_END(stats_map); sm++) |
| DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type)); |
| |
| DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off"); |
| DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size, |
| atomic_read(&rs->sc.stripes), rs->set.io_size, |
| rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets, |
| sc_size(rs)); |
| |
| j = (rec->end_jiffies ? rec->end_jiffies : jiffies) - |
| rec->start_jiffies; |
| jiffies_to_timespec(j, &ts); |
| sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec); |
| p = strchr(buf, '.'); |
| p[3] = 0; |
| |
| DMEMIT("rg=%llu/%llu/%llu/%u %s\n", |
| (unsigned long long) rec->nr_regions_recovered, |
| (unsigned long long) rec->nr_regions_to_recover, |
| (unsigned long long) rec->nr_regions, rec->bandwidth, buf); |
| |
| *size = sz; |
| } |
| |
| static int raid_status(struct dm_target *ti, status_type_t type, |
| char *result, unsigned maxlen) |
| { |
| unsigned p, sz = 0; |
| char buf[BDEVNAME_SIZE]; |
| struct raid_set *rs = ti->private; |
| int raid_parms[] = { |
| rs->set.chunk_size_parm, |
| rs->sc.stripes_parm, |
| rs->set.io_size_parm, |
| rs->recover.io_size_parm, |
| rs->recover.bandwidth_parm, |
| -2, |
| rs->recover.recovery_stripes, |
| }; |
| |
| switch (type) { |
| case STATUSTYPE_INFO: |
| /* REMOVEME: statistics. */ |
| if (RSDevelStats(rs)) |
| raid_devel_stats(ti, result, &sz, maxlen); |
| |
| DMEMIT("%u ", rs->set.raid_devs); |
| |
| for (p = 0; p < rs->set.raid_devs; p++) |
| DMEMIT("%s ", |
| format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev)); |
| |
| DMEMIT("1 "); |
| for (p = 0; p < rs->set.raid_devs; p++) { |
| DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D'); |
| |
| if (p == rs->set.pi) |
| DMEMIT("p"); |
| |
| if (rs->set.dev_to_init == p) |
| DMEMIT("i"); |
| } |
| |
| break; |
| case STATUSTYPE_TABLE: |
| sz = rs->recover.dl->type->status(rs->recover.dl, type, |
| result, maxlen); |
| DMEMIT("%s %u ", rs->set.raid_type->name, |
| rs->set.raid_parms); |
| |
| for (p = 0; p < rs->set.raid_parms; p++) { |
| if (raid_parms[p] > -2) |
| DMEMIT("%d ", raid_parms[p]); |
| else |
| DMEMIT("%s ", rs->recover.recovery ? |
| "sync" : "nosync"); |
| } |
| |
| DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init); |
| |
| for (p = 0; p < rs->set.raid_devs; p++) |
| DMEMIT("%s %llu ", |
| format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev), |
| (unsigned long long) rs->dev[p].start); |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Message interface |
| */ |
| enum raid_msg_actions { |
| act_bw, /* Recovery bandwidth switch. */ |
| act_dev, /* Device failure switch. */ |
| act_overwrite, /* Stripe overwrite check. */ |
| act_stats, /* Development statistics switch. */ |
| act_sc, /* Stripe cache switch. */ |
| |
| act_on, /* Set entity on. */ |
| act_off, /* Set entity off. */ |
| act_reset, /* Reset entity. */ |
| |
| act_set = act_on, /* Set # absolute. */ |
| act_grow = act_off, /* Grow # by an amount. */ |
| act_shrink = act_reset, /* Shrink # by an amount. */ |
| }; |
| |
| /* Turn a delta into an absolute value. */ |
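| /* E.g. with a current value of 80: "set 40" -> 40, "grow 40" -> 120, "shrink 40" -> 40. */ |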
| static int _absolute(unsigned long action, int act, int r) |
| { |
| /* Make delta absolute. */ |
| if (test_bit(act_set, &action)) |
| ; |
| else if (test_bit(act_grow, &action)) |
| r += act; |
| else if (test_bit(act_shrink, &action)) |
| r = act - r; |
| else |
| r = -EINVAL; |
| |
| return r; |
| } |
| |
| /* Change recovery io bandwidth. */ |
| static int bandwidth_change(struct dm_msg *msg, void *context) |
| { |
| struct raid_set *rs = context; |
| int act = rs->recover.bandwidth; |
| int bandwidth = DM_MSG_INT_ARG(msg); |
| |
| if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { |
| /* Make delta bandwidth absolute. */ |
| bandwidth = _absolute(msg->action, act, bandwidth); |
| |
| /* Check range. */ |
| if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) { |
| recover_set_bandwidth(rs, bandwidth); |
| return 0; |
| } |
| } |
| |
| set_bit(dm_msg_ret_arg, &msg->ret); |
| set_bit(dm_msg_ret_inval, &msg->ret); |
| return -EINVAL; |
| } |
| |
| /* Set/reset development feature flags. */ |
| static int devel_flags(struct dm_msg *msg, void *context) |
| { |
| struct raid_set *rs = context; |
| |
| if (test_bit(act_on, &msg->action)) |
| return test_and_set_bit(msg->spec->parm, |
| &rs->io.flags) ? -EPERM : 0; |
| else if (test_bit(act_off, &msg->action)) |
| return test_and_clear_bit(msg->spec->parm, |
| &rs->io.flags) ? 0 : -EPERM; |
| else if (test_bit(act_reset, &msg->action)) { |
| if (test_bit(act_stats, &msg->action)) { |
| stats_reset(rs); |
| goto on; |
| } else if (test_bit(act_overwrite, &msg->action)) { |
| on: |
| set_bit(msg->spec->parm, &rs->io.flags); |
| return 0; |
| } |
| } |
| |
| return -EINVAL; |
| } |
| |
| /* Resize the stripe cache. */ |
| static int sc_resize(struct dm_msg *msg, void *context) |
| { |
| int act, stripes; |
| struct raid_set *rs = context; |
| |
| /* Deny the request in case the daemon is still resizing. */ |
| if (atomic_read(&rs->sc.stripes_to_set)) |
| return -EPERM; |
| |
| stripes = DM_MSG_INT_ARG(msg); |
| if (stripes > 0) { |
| act = atomic_read(&rs->sc.stripes); |
| |
| /* Make delta stripes absolute. */ |
| stripes = _absolute(msg->action, act, stripes); |
| |
| /* |
| * Check range and that the # of stripes changes. |
| * We leave the resizing to the worker. |
| */ |
| if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) && |
| stripes != atomic_read(&rs->sc.stripes)) { |
| atomic_set(&rs->sc.stripes_to_set, stripes); |
| wake_do_raid(rs); |
| return 0; |
| } |
| } |
| |
| set_bit(dm_msg_ret_arg, &msg->ret); |
| set_bit(dm_msg_ret_inval, &msg->ret); |
| return -EINVAL; |
| } |
| |
| /* Parse the RAID message action. */ |
| /* |
| * 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50' |
| * 'o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of' |
| * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of' |
| * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024' |
| * |
| */ |
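| /* |
| * For example (the mapped device name "r45" is hypothetical): |
| * |
| * dmsetup message r45 0 bandwidth set 25 |
| * dmsetup message r45 0 statistics reset |
| * dmsetup message r45 0 stripecache grow 256 |
| */ |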
| static int raid_message(struct dm_target *ti, unsigned argc, char **argv) |
| { |
| /* Variables to store the parsed parameters in. */ |
| static int i[2]; |
| static unsigned long *i_arg[] = { |
| (unsigned long *) i + 0, |
| (unsigned long *) i + 1, |
| }; |
| |
| /* Declare all message option strings. */ |
| static char *str_sgs[] = { "set", "grow", "shrink" }; |
| static char *str_oor[] = { "on", "off", "reset" }; |
| |
| /* Declare all actions. */ |
| static unsigned long act_sgs[] = { act_set, act_grow, act_shrink }; |
| static unsigned long act_oor[] = { act_on, act_off, act_reset }; |
| |
| /* Bandwidth option. */ |
| static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs }; |
| static struct dm_message_argument bw_args = { |
| 1, i_arg, { dm_msg_int_t } |
| }; |
| |
| static struct dm_message_argument null_args = { |
| 0, NULL, { dm_msg_int_t } |
| }; |
| |
| /* Overwrite and statistics option. */ |
| static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor }; |
| |
| /* Stripecache option. */ |
| static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs }; |
| |
| /* Declare messages. */ |
| static struct dm_msg_spec specs[] = { |
| { "bandwidth", act_bw, &bw_opt, &bw_args, |
| 0, bandwidth_change }, |
| { "overwrite", act_overwrite, &ovr_stats_opt, &null_args, |
| RS_CHECK_OVERWRITE, devel_flags }, |
| { "statistics", act_stats, &ovr_stats_opt, &null_args, |
| RS_DEVEL_STATS, devel_flags }, |
| { "stripecache", act_sc, &stripe_opt, &bw_args, |
| 0, sc_resize }, |
| }; |
| |
| /* The message for the parser. */ |
| struct dm_msg msg = { |
| .num_specs = ARRAY_SIZE(specs), |
| .specs = specs, |
| }; |
| |
| return dm_message_parse(TARGET, &msg, ti->private, argc, argv); |
| } |
| /* |
| * END message interface |
| */ |
| |
| static struct target_type raid_target = { |
| .name = "raid45", |
| .version = {1, 0, 0}, |
| .module = THIS_MODULE, |
| .ctr = raid_ctr, |
| .dtr = raid_dtr, |
| .map = raid_map, |
| .presuspend = raid_presuspend, |
| .postsuspend = raid_postsuspend, |
| .resume = raid_resume, |
| .status = raid_status, |
| .message = raid_message, |
| }; |
| |
| static void init_exit(const char *bad_msg, const char *good_msg, int r) |
| { |
| if (r) |
| DMERR("Failed to %sregister target [%d]", bad_msg, r); |
| else |
| DMINFO("%s %s", good_msg, version); |
| } |
| |
| static int __init dm_raid_init(void) |
| { |
| int r = dm_register_target(&raid_target); |
| |
| init_exit("", "initialized", r); |
| return r; |
| } |
| |
| static void __exit dm_raid_exit(void) |
| { |
| dm_unregister_target(&raid_target); |
| init_exit("un", "exit", 0); |
| } |
| |
| /* Module hooks. */ |
| module_init(dm_raid_init); |
| module_exit(dm_raid_exit); |
| |
| MODULE_DESCRIPTION(DM_NAME " raid4/5 target"); |
| MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>"); |
| MODULE_LICENSE("GPL"); |
| MODULE_ALIAS("dm-raid4"); |
| MODULE_ALIAS("dm-raid5"); |