/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_SLAB_DEPOT_H
#define VDO_SLAB_DEPOT_H

#include <linux/atomic.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "data-vio.h"
#include "encodings.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It
 * has a single array of slabs in order to eliminate the need for additional math to compute
 * which physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
 * code to omit more fine-grained locking for the various slab structures. Each physical zone
 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
 * structure as well.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as allocations
 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
 * from the same admin thread as the original load operation.
 */
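
/*
 * As a minimal illustration of the single-array design (a sketch only, not the real lookup code,
 * which also validates that the PBN refers to a data block): because the slab size is a power of
 * two, the slab and physical zone for a PBN reduce to simple arithmetic:
 *
 *	slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
 *	slab = depot->slabs[slab_number];
 *	zone_number = slab->allocator->zone_number;
 */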

enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,
};

/*
 * Represents the possible status of a block.
 */
enum reference_status {
	RS_FREE, /* this block is free */
	RS_SINGLE, /* this block is singly-referenced */
	RS_SHARED, /* this block is shared */
	RS_PROVISIONAL /* this block is provisionally allocated */
};
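
/*
 * An illustrative mapping from a raw counter value to a reference_status. This is only a sketch,
 * not the helper the reference counting code actually uses; it assumes the conventional encoding
 * in which 0 means free, 1 means singly referenced, counts up to the 254 maximum mean shared, and
 * the one remaining counter value (assumed here to be 255) marks a provisional allocation.
 */
static inline enum reference_status example_reference_status(vdo_refcount_t count)
{
	if (count == 0)
		return RS_FREE;

	if (count == 1)
		return RS_SINGLE;

	if (count == 255)
		return RS_PROVISIONAL;

	return RS_SHARED;
}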

struct vdo_slab;

struct journal_lock {
	u16 count;
	sequence_number_t recovery_start;
};

struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The slab to which this journal belongs */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/* The tail sequence number that was last summarized in the slab summary */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on-disk block */
	struct journal_lock *locks;
};
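
/*
 * An illustrative sketch (not necessarily the helpers the slab journal code provides) of how the
 * threshold fields above are compared against the journal's current length, i.e. the number of
 * blocks in the half-open interval [head, tail).
 */
static inline block_count_t example_slab_journal_length(const struct slab_journal *journal)
{
	return journal->tail - journal->head;
}

/* Under the same assumption, new entries must wait once the blocking threshold is reached. */
static inline bool example_slab_journal_must_block(const struct slab_journal *journal)
{
	return example_slab_journal_length(journal) >= journal->blocking_threshold;
}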

/*
 * Reference_block structure
 *
 * Blocks are used as a proxy, permitting saves of partial refcounts.
 */
struct reference_block {
	/* This block waits on the slab's reference counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};

/* The search_cursor represents the saved position of a free block search. */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};

enum slab_rebuild_status {
	VDO_SLAB_REBUILT,
	VDO_SLAB_REPLAYING,
	VDO_SLAB_REQUIRES_SCRUBBING,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	VDO_SLAB_REBUILDING,
};

/*
 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks to
 * metadata storage for the slab's reference counts and slab journal.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks have
 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
 * (254), the reference count is stored in counters[], indexed by the block's offset within the
 * slab.
 */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* reference count block array */
	struct reference_block *reference_blocks;
};
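
/*
 * An illustrative sketch (not part of the VDO API) of how the counter for a physical block is
 * located: the PBN is converted to an index within the slab by subtracting the slab's start, and
 * that index selects the counter (and, by dividing by the number of counters packed into each
 * reference_block, the owning reference_block; that step is not shown here).
 */
static inline vdo_refcount_t example_reference_count(const struct vdo_slab *slab,
						     physical_block_number_t pbn)
{
	slab_block_number index = pbn - slab->start;

	return slab->counters[index];
}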

enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};

struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};
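
/*
 * Because the scrubber's slab_count is written on its physical zone thread but read from other
 * threads, cross-thread readers are expected to use READ_ONCE()-style accesses. A minimal sketch
 * (not necessarily the accessor the scrubber code provides):
 */
static inline slab_count_t example_scrubber_slab_count(const struct slab_scrubber *scrubber)
{
	return READ_ONCE(scrubber->slab_count);
}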

/* A sub-structure for applying actions in parallel to all of an allocator's slabs. */
struct slab_actor {
	/* The number of slabs performing a slab action */
	slab_count_t slab_action_count;
	/* The method to call when a slab action has been completed by all slabs */
	vdo_action_fn callback;
};

/* A slab_iterator is a structure for iterating over a set of slabs. */
struct slab_iterator {
	struct vdo_slab **slabs;
	struct vdo_slab *next;
	slab_count_t end;
	slab_count_t stride;
};

/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in order
 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which is
 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
 * space will be used on restart), and the location of the tail block of the slab's journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
 * complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume version
 * number.
 */
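
/*
 * For illustration (a sketch of the idea, not the exact encoding code): the free block count of a
 * slab is stored in the summary as a small fullness hint rather than as a full count, roughly
 *
 *	hint = free_blocks >> depot->hint_shift;
 *
 * with the shift chosen so that the hint fits in 7 bits, and with a nonzero free count never
 * rounding down to a hint of zero.
 */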

/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	slab_count_t slab_number;
	bool is_clean;
	u8 emptiness;
};

struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Ring of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Ring of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};

/*
 * The statistics for all of a depot's slab summary zones. These fields are all mutated only by
 * their physical zone threads, but are read by other threads when gathering statistics for the
 * entire depot.
 */
struct atomic_slab_summary_statistics {
	/* Number of blocks written */
	atomic64_t blocks_written;
};

struct block_allocator {
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by other
	 * threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space pressure,
	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
	 * are forced to commit their blocks early. This list is kept in order, with the tail
	 * containing the slab journal holding the most recent recovery journal lock.
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The portion of the slab summary managed by this allocator */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};

enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};

struct slab_depot {
	zone_count_t zone_count;
	zone_count_t old_zone_count;
	struct vdo *vdo;
	struct slab_config slab_config;
	struct action_manager *action_manager;

	physical_block_number_t first_block;
	physical_block_number_t last_block;
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal locks */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for completing the resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};

struct reference_updater;

bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);

static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
	return container_of(completion, struct block_allocator, completion);
}

int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

void vdo_dump_block_allocator(const struct block_allocator *allocator);

int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

void vdo_free_slab_depot(struct slab_depot *depot);

struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

void vdo_update_slab_depot_size(struct slab_depot *depot);

int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_abandon_new_slabs(struct slab_depot *depot);

void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

void vdo_dump_slab_depot(const struct slab_depot *depot);

#endif /* VDO_SLAB_DEPOT_H */