/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_BLOCK_MAP_H
#define VDO_BLOCK_MAP_H

#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "encodings.h"
#include "int-map.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * The block map is responsible for tracking all the logical to physical mappings of a VDO. It
 * consists of a collection of 60 radix trees gradually allocated as logical addresses are used.
 * Each tree is assigned to a logical zone such that it is easy to compute which zone must handle
 * each logical address. Each logical zone also has a dedicated portion of the leaf page cache.
 *
 * Each logical zone has a single dedicated queue and thread for performing all updates to the
 * radix trees assigned to that zone. The concurrency guarantees of this single-threaded model
 * allow the code to omit more fine-grained locking for the block map structures.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as reading and
 * updating mappings, must be performed on the appropriate logical zone thread. Save operations
 * must be launched from the same admin thread as the original load operation.
 */

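/*
 * A minimal sketch of the zone routing described above, assuming tree roots
 * are distributed across logical zones round-robin (the variable names here
 * are illustrative; see vdo_compute_logical_zone(), declared below, for the
 * actual logic):
 *
 *   page_number_t page = lbn / VDO_BLOCK_MAP_ENTRIES_PER_PAGE;
 *   block_count_t root = page % map->root_count;
 *   zone_count_t zone = root % map->zone_count;
 */
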
enum {
	BLOCK_MAP_VIO_POOL_SIZE = 64,
};

/*
 * Generation counter for page references.
 */
typedef u32 vdo_page_generation;

extern const struct block_map_entry UNMAPPED_BLOCK_MAP_ENTRY;

/* The VDO Page Cache abstraction. */
struct vdo_page_cache {
	/* the VDO which owns this cache */
	struct vdo *vdo;
	/* number of pages in cache */
	page_count_t page_count;
	/* number of pages to write in the current batch */
	page_count_t pages_in_batch;
	/* Whether the VDO is doing a read-only rebuild */
	bool rebuilding;

	/* array of page information entries */
	struct page_info *infos;
	/* raw memory for pages */
	char *pages;
	/* cache last found page info */
	struct page_info *last_found;
	/* map of page number to info */
	struct int_map *page_map;
	/* main LRU list (all infos) */
	struct list_head lru_list;
	/* free page list (oldest first) */
	struct list_head free_list;
	/* outgoing page list */
	struct list_head outgoing_list;
	/* number of read I/O operations pending */
	page_count_t outstanding_reads;
	/* number of write I/O operations pending */
	page_count_t outstanding_writes;
	/* number of pages covered by the current flush */
	page_count_t pages_in_flush;
	/* number of pages waiting to be included in the next flush */
	page_count_t pages_to_flush;
	/* number of discards in progress */
	unsigned int discard_count;
	/* how many page completions are waiting for a free page */
	unsigned int waiter_count;
	/* queue of waiters who want a free page */
	struct vdo_wait_queue free_waiters;
	/*
	 * Statistics are only updated on the logical zone thread, but are accessed from other
	 * threads.
	 */
	struct block_map_statistics stats;
	/* counter for pressure reports */
	u32 pressure_report;
	/* the block map zone to which this cache belongs */
	struct block_map_zone *zone;
};

/*
 * The state of a page buffer. If the page buffer is free, no particular page is bound to it;
 * otherwise the page buffer is bound to a particular page whose absolute pbn is in the pbn field.
 * If the page is resident or dirty, the page data is stable and may be accessed. Otherwise, the
 * page is in flight (incoming or outgoing) and its data should not be accessed.
 *
 * @note Update the static data in get_page_state_name() if you change this enumeration.
 */
enum vdo_page_buffer_state {
	/* this page buffer is not being used */
	PS_FREE,
	/* this page is being read from store */
	PS_INCOMING,
	/* attempt to load this page failed */
	PS_FAILED,
	/* this page is valid and un-modified */
	PS_RESIDENT,
	/* this page is valid and modified */
	PS_DIRTY,
	/* this page is being written and should not be used */
	PS_OUTGOING,
	/* not a state */
	PAGE_STATE_COUNT,
} __packed;
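
/*
 * Typical buffer state transitions, inferred from the states above (a
 * summary, not an exhaustive diagram): a load takes a buffer from PS_FREE
 * through PS_INCOMING to PS_RESIDENT, or to PS_FAILED on error; modifying a
 * resident page makes it PS_DIRTY, and writing a dirty page takes it through
 * PS_OUTGOING back to PS_RESIDENT.
 */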

/*
 * The write status of a page.
 */
enum vdo_page_write_status {
	WRITE_STATUS_NORMAL,
	WRITE_STATUS_DISCARD,
	WRITE_STATUS_DEFERRED,
} __packed;

/* Per-page-slot information. */
struct page_info {
	/* The preallocated vio used to read and write this page */
	struct vio *vio;
	/* back-link for references */
	struct vdo_page_cache *cache;
	/* the pbn of the page */
	physical_block_number_t pbn;
	/* page is busy (temporarily locked) */
	u16 busy;
	/* the write status of the page */
	enum vdo_page_write_status write_status;
	/* page state */
	enum vdo_page_buffer_state state;
	/* queue of completions awaiting this item */
	struct vdo_wait_queue waiting;
	/* state linked list entry */
	struct list_head state_entry;
	/* LRU entry */
	struct list_head lru_entry;
	/*
	 * The earliest recovery journal block containing uncommitted updates to the block map page
	 * associated with this page_info. A reference (lock) is held on that block to prevent it
	 * from being reaped. When this value changes, the reference on the old value must be
	 * released and a reference on the new value must be acquired.
	 */
	sequence_number_t recovery_lock;
};

/*
 * A completion awaiting a specific page. Also a live reference into the page once completed, until
 * freed.
 */
struct vdo_page_completion {
	/* The generic completion */
	struct vdo_completion completion;
	/* The cache involved */
	struct vdo_page_cache *cache;
	/* The waiter for the pending list */
	struct vdo_waiter waiter;
	/* The absolute physical block number of the page on disk */
	physical_block_number_t pbn;
	/* Whether the page may be modified */
	bool writable;
	/* Whether the page is available */
	bool ready;
	/* The info structure for the page, only valid when ready */
	struct page_info *info;
};

struct forest;

struct tree_page {
	struct vdo_waiter waiter;

	/* Dirty list entry */
	struct list_head entry;

	/* If dirty, the tree zone flush generation in which it was last dirtied. */
	u8 generation;

	/* Whether this page is an interior tree page being written out. */
	bool writing;

	/* If writing, the tree zone flush generation of the copy being written. */
	u8 writing_generation;

	/*
	 * Sequence number of the earliest recovery journal block containing uncommitted updates to
	 * this page.
	 */
	sequence_number_t recovery_lock;

	/* The value of recovery_lock when this page last started writing */
	sequence_number_t writing_recovery_lock;

	char page_buffer[VDO_BLOCK_SIZE];
};

enum block_map_page_type {
	VDO_TREE_PAGE,
	VDO_CACHE_PAGE,
};

typedef struct list_head dirty_era_t[2];

struct dirty_lists {
	/* The number of periods after which an element will be expired */
	block_count_t maximum_age;
	/* The oldest period which has unexpired elements */
	sequence_number_t oldest_period;
	/* One more than the current period */
	sequence_number_t next_period;
	/* The offset in the array of lists of the oldest period */
	block_count_t offset;
	/* Expired pages */
	dirty_era_t expired;
	/* The lists of dirty pages */
	dirty_era_t eras[];
};
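
/*
 * A minimal sketch, under assumptions, of how advancing the period expires
 * dirty pages: each period owns one dirty_era_t slot in eras[], and once a
 * period is more than maximum_age behind, its lists are spliced onto the
 * expired lists. The helper below is hypothetical, not part of the VDO API:
 *
 *   static void example_advance_period(struct dirty_lists *lists,
 *                                      sequence_number_t new_period)
 *   {
 *           while (lists->next_period <= new_period) {
 *                   if (lists->next_period - lists->oldest_period ==
 *                       lists->maximum_age) {
 *                           // Expire the oldest period's lists.
 *                           list_splice_tail_init(&lists->eras[lists->offset][VDO_TREE_PAGE],
 *                                                 &lists->expired[VDO_TREE_PAGE]);
 *                           list_splice_tail_init(&lists->eras[lists->offset][VDO_CACHE_PAGE],
 *                                                 &lists->expired[VDO_CACHE_PAGE]);
 *                           lists->oldest_period++;
 *                           lists->offset = (lists->offset + 1) % lists->maximum_age;
 *                   }
 *                   lists->next_period++;
 *           }
 *   }
 */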

struct block_map_zone {
	zone_count_t zone_number;
	thread_id_t thread_id;
	struct admin_state state;
	struct block_map *block_map;
	/* Dirty pages, by era */
	struct dirty_lists *dirty_lists;
	struct vdo_page_cache page_cache;
	data_vio_count_t active_lookups;
	struct int_map *loading_pages;
	struct vio_pool *vio_pool;
	/* The tree page which has issued or will be issuing a flush */
	struct tree_page *flusher;
	struct vdo_wait_queue flush_waiters;
	/* The generation after the most recent flush */
	u8 generation;
	u8 oldest_generation;
	/* The counts of dirty pages in each generation */
	u32 dirty_page_counts[256];
};
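
/*
 * The flush generation counters above are 8-bit and wrap around, which is
 * why dirty_page_counts has exactly 256 slots (one per possible generation).
 * Comparisons between generations therefore have to be wrap-safe; a sketch
 * of one such comparison, assuming all live generations lie within 256 of
 * each other (illustrative, not the actual implementation):
 *
 *   // True if generation a is no older than generation b.
 *   static bool example_not_older(struct block_map_zone *zone, u8 a, u8 b)
 *   {
 *           return (u8)(zone->generation - a) <= (u8)(zone->generation - b);
 *   }
 */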

struct block_map {
	struct vdo *vdo;
	struct action_manager *action_manager;
	/* The absolute PBN of the first root of the tree part of the block map */
	physical_block_number_t root_origin;
	block_count_t root_count;

	/* The era point we are currently distributing to the zones */
	sequence_number_t current_era_point;
	/* The next era point */
	sequence_number_t pending_era_point;

	/* The number of entries in the block map */
	block_count_t entry_count;
	nonce_t nonce;
	struct recovery_journal *journal;

	/* The trees for finding block map pages */
	struct forest *forest;
	/* The expanded trees awaiting growth */
	struct forest *next_forest;
	/* The number of entries after growth */
	block_count_t next_entry_count;

	zone_count_t zone_count;
	struct block_map_zone zones[];
};

/**
 * typedef vdo_entry_callback_fn - A function to be called for each allocated PBN when traversing
 *                                 the forest.
 * @pbn: A PBN of a tree node.
 * @completion: The parent completion of the traversal.
 *
 * Return: VDO_SUCCESS or an error.
 */
typedef int (*vdo_entry_callback_fn)(physical_block_number_t pbn,
				     struct vdo_completion *completion);

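/*
 * A hypothetical callback (not part of the VDO API) matching
 * vdo_entry_callback_fn, which simply counts the allocated tree page PBNs it
 * is handed; the use of completion->parent as caller bookkeeping is an
 * assumption of this sketch:
 *
 *   static int count_tree_page(physical_block_number_t pbn,
 *                              struct vdo_completion *completion)
 *   {
 *           u64 *counter = completion->parent;
 *
 *           (*counter)++;
 *           return VDO_SUCCESS;
 *   }
 *
 * Such a callback would be passed to vdo_traverse_forest(), declared below.
 */
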
static inline struct vdo_page_completion *as_vdo_page_completion(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_PAGE_COMPLETION);
	return container_of(completion, struct vdo_page_completion, completion);
}

void vdo_release_page_completion(struct vdo_completion *completion);

void vdo_get_page(struct vdo_page_completion *page_completion,
		  struct block_map_zone *zone, physical_block_number_t pbn,
		  bool writable, void *parent, vdo_action_fn callback,
		  vdo_action_fn error_handler, bool requeue);

void vdo_request_page_write(struct vdo_completion *completion);

int __must_check vdo_get_cached_page(struct vdo_completion *completion,
				     struct block_map_page **page_ptr);

int __must_check vdo_invalidate_page_cache(struct vdo_page_cache *cache);

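/*
 * A minimal usage sketch for the page cache API above (the names vpc, zone,
 * pbn, parent, and handle_error are hypothetical). The caller, on the
 * appropriate logical zone thread, launches vdo_get_page() and the callback
 * later uses and releases the page; callers needing the enclosing
 * vdo_page_completion can recover it with as_vdo_page_completion():
 *
 *   static void handle_page(struct vdo_completion *completion)
 *   {
 *           struct block_map_page *page;
 *
 *           if (vdo_get_cached_page(completion, &page) == VDO_SUCCESS) {
 *                   // ... read or update mappings through page ...
 *           }
 *
 *           vdo_release_page_completion(completion);
 *   }
 *
 *   vdo_get_page(&vpc, zone, pbn, true, parent, handle_page, handle_error, false);
 */
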
static inline struct block_map_page * __must_check
vdo_as_block_map_page(struct tree_page *tree_page)
{
	return (struct block_map_page *) tree_page->page_buffer;
}

bool vdo_copy_valid_page(char *buffer, nonce_t nonce,
			 physical_block_number_t pbn,
			 struct block_map_page *page);

void vdo_find_block_map_slot(struct data_vio *data_vio);

physical_block_number_t vdo_find_block_map_page_pbn(struct block_map *map,
						    page_number_t page_number);

void vdo_write_tree_page(struct tree_page *page, struct block_map_zone *zone);

void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback,
			 struct vdo_completion *completion);

int __must_check vdo_decode_block_map(struct block_map_state_2_0 state,
				      block_count_t logical_blocks, struct vdo *vdo,
				      struct recovery_journal *journal, nonce_t nonce,
				      page_count_t cache_size, block_count_t maximum_age,
				      struct block_map **map_ptr);

void vdo_drain_block_map(struct block_map *map, const struct admin_state_code *operation,
			 struct vdo_completion *parent);

void vdo_resume_block_map(struct block_map *map, struct vdo_completion *parent);

int __must_check vdo_prepare_to_grow_block_map(struct block_map *map,
					       block_count_t new_logical_blocks);

void vdo_grow_block_map(struct block_map *map, struct vdo_completion *parent);

void vdo_abandon_block_map_growth(struct block_map *map);

void vdo_free_block_map(struct block_map *map);

struct block_map_state_2_0 __must_check vdo_record_block_map(const struct block_map *map);

void vdo_initialize_block_map_from_journal(struct block_map *map,
					   struct recovery_journal *journal);

zone_count_t vdo_compute_logical_zone(struct data_vio *data_vio);

void vdo_advance_block_map_era(struct block_map *map,
			       sequence_number_t recovery_block_number);

void vdo_update_block_map_page(struct block_map_page *page, struct data_vio *data_vio,
			       physical_block_number_t pbn,
			       enum block_mapping_state mapping_state,
			       sequence_number_t *recovery_lock);

void vdo_get_mapped_block(struct data_vio *data_vio);

void vdo_put_mapped_block(struct data_vio *data_vio);

struct block_map_statistics __must_check vdo_get_block_map_statistics(struct block_map *map);

/**
 * vdo_convert_maximum_age() - Convert the maximum age to reflect the new recovery journal format.
 * @age: The configured maximum age.
 *
 * Return: The converted age.
 *
 * In the old recovery journal format, each journal block held 311 entries, and every write bio
 * made two entries. The old maximum age was half the usable journal length. In the new format,
 * each block holds only 217 entries, but each bio only makes one entry. We convert the configured
 * age so that the number of writes in a block map era is the same in the old and new formats. This
 * keeps the bound on the amount of work required to recover the block map from the recovery
 * journal the same across the format change. It also keeps the amortization of block map page
 * writes to write bios the same.
 */
static inline block_count_t vdo_convert_maximum_age(block_count_t age)
{
	return DIV_ROUND_UP(age * RECOVERY_JOURNAL_1_ENTRIES_PER_BLOCK,
			    2 * RECOVERY_JOURNAL_ENTRIES_PER_BLOCK);
}
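
/*
 * Worked example, using the entry counts from the comment above (the sample
 * age of 60 is arbitrary): 60 old-format blocks cover 60 * 311 / 2 = 9330
 * write bios, so the converted age is DIV_ROUND_UP(60 * 311, 2 * 217) =
 * DIV_ROUND_UP(18660, 434) = 43 new-format blocks (43 * 217 = 9331 entries).
 */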

#endif /* VDO_BLOCK_MAP_H */