2 files changed, 2656 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c
new file mode 100644
index 000000000000..af3136f83e4f
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.c
@@ -0,0 +1,2404 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "slab-depot.h"
+
+#include <linux/atomic.h>
+#include <linux/bio.h>
+#include <linux/log2.h>
+#include <linux/min_heap.h>
+#include <linux/minmax.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+
+#include "action-manager.h"
+#include "admin-state.h"
+#include "completion.h"
+#include "constants.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "io-submitter.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "repair.h"
+#include "status-codes.h"
+#include "types.h"
+#include "vdo.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+static const u64 BYTES_PER_WORD = sizeof(u64);
+static const bool NORMAL_OPERATION = true;
+
+/**
+ * get_lock() - Get the lock object for a slab journal block by sequence number.
+ * @journal: vdo_slab journal to retrieve from.
+ * @sequence_number: Sequence number of the block.
+ *
+ * Return: The lock object for the given sequence number.
+ */
+static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
+							  sequence_number_t sequence_number)
+{
+	return &journal->locks[sequence_number % journal->size];
+}
+
+static bool is_slab_open(struct vdo_slab *slab)
+{
+	return (!vdo_is_state_quiescing(&slab->state) &&
+		!vdo_is_state_quiescent(&slab->state));
+}
+
+/**
+ * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
+ * @journal: The journal to check.
+ *
+ * Return: true if there are no entry waiters, or if the slab is unrecovered.
+ */
+static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
+{
+	return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
+		vdo_has_waiters(&journal->entry_waiters));
+}
+
+/**
+ * is_reaping() - Check whether a reap is currently in progress.
+ * @journal: The journal which may be reaping.
+ *
+ * Return: true if the journal is reaping.
+ */
+static inline bool __must_check is_reaping(struct slab_journal *journal)
+{
+	return (journal->head != journal->unreapable);
+}
+
+/**
+ * initialize_tail_block() - Initialize tail block as a new block.
+ * @journal: The journal whose tail block is being initialized.
+ */
+static void initialize_tail_block(struct slab_journal *journal)
+{
+	struct slab_journal_block_header *header = &journal->tail_header;
+
+	header->sequence_number = journal->tail;
+	header->entry_count = 0;
+	header->has_block_map_increments = false;
+}
+
+/**
+ * initialize_journal_state() - Set all journal fields appropriately to start journaling.
+ * @journal: The journal to be reset, based on its tail sequence number.
+ */
+static void initialize_journal_state(struct slab_journal *journal)
+{
+	journal->unreapable = journal->head;
+	journal->reap_lock = get_lock(journal, journal->unreapable);
+	journal->next_commit = journal->tail;
+	journal->summarized = journal->last_summarized = journal->tail;
+	initialize_tail_block(journal);
+}
+
+/**
+ * block_is_full() - Check whether a journal block is full.
+ * @journal: The slab journal for the block.
+ *
+ * Return: true if the tail block is full.
+ */
+static bool __must_check block_is_full(struct slab_journal *journal)
+{
+	journal_entry_count_t count = journal->tail_header.entry_count;
+
+	return (journal->tail_header.has_block_map_increments ?
+		(journal->full_entries_per_block == count) :
+		(journal->entries_per_block == count));
+}
+
+static void add_entries(struct slab_journal *journal);
+static void update_tail_block_location(struct slab_journal *journal);
+static void release_journal_locks(struct waiter *waiter, void *context);
+
+/**
+ * is_slab_journal_blank() - Check whether a slab's journal is blank.
+ *
+ * A slab journal is blank if it has never had any entries recorded in it.
+ *
+ * Return: true if the slab's journal has never been modified.
+ */
+static bool is_slab_journal_blank(const struct vdo_slab *slab)
+{
+	return ((slab->journal.tail == 1) &&
+		(slab->journal.tail_header.entry_count == 0));
+}
+
+/**
+ * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
+ *                             order.
+ * @journal: The journal to be marked dirty.
+ * @lock: The recovery journal lock held by the slab journal.
+ */
+static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
+{
+	struct slab_journal *dirty_journal;
+	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
+
+	ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
+
+	journal->recovery_lock = lock;
+	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
+		if (dirty_journal->recovery_lock <= journal->recovery_lock)
+			break;
+	}
+
+	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
+}
+
+static void mark_slab_journal_clean(struct slab_journal *journal)
+{
+	journal->recovery_lock = 0;
+	list_del_init(&journal->dirty_entry);
+}
+
+static void check_if_slab_drained(struct vdo_slab *slab)
+{
+	bool read_only;
+	struct slab_journal *journal = &slab->journal;
+	const struct admin_state_code *code;
+
+	if (!vdo_is_state_draining(&slab->state) ||
+	    must_make_entries_to_flush(journal) ||
+	    is_reaping(journal) ||
+	    journal->waiting_to_commit ||
+	    !list_empty(&journal->uncommitted_blocks) ||
+	    journal->updating_slab_summary ||
+	    (slab->active_count > 0))
+		return;
+
+	/* When not suspending or recovering, the slab must be clean. */
+	code = vdo_get_admin_state_code(&slab->state);
+	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
+	if (!read_only &&
+	    vdo_has_waiters(&slab->dirty_blocks) &&
+	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
+	    (code != VDO_ADMIN_STATE_RECOVERING))
+		return;
+
+	vdo_finish_draining_with_result(&slab->state,
+					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
+}
+
+/**
+ * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
+ *                    complete.
+ * @journal: The journal to be reaped.
+ */
+static void finish_reaping(struct slab_journal *journal)
+{
+	journal->head = journal->unreapable;
+	add_entries(journal);
+	check_if_slab_drained(journal->slab);
+}
+
+static void reap_slab_journal(struct slab_journal *journal);
+
+/**
+ * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
+ *                      reaping again in case we deferred reaping due to an outstanding vio.
+ * @completion: The flush vio.
+ */
+static void complete_reaping(struct vdo_completion *completion)
+{
+	struct slab_journal *journal = completion->parent;
+
+	return_vio_to_pool(journal->slab->allocator->vio_pool,
+			   vio_as_pooled_vio(as_vio(uds_forget(completion))));
+	finish_reaping(journal);
+	reap_slab_journal(journal);
+}
+
+/**
+ * handle_flush_error() - Handle an error flushing the lower layer.
+ * @completion: The flush vio.
+ */
+static void handle_flush_error(struct vdo_completion *completion)
+{
+	vio_record_metadata_io_error(as_vio(completion));
+	vdo_enter_read_only_mode(completion->vdo, completion->result);
+	complete_reaping(completion);
+}
+
+static void flush_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_reaping,
+			      journal->slab->allocator->thread_id);
+}
+
+/**
+ * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
+ *                       prior to reaping.
+ * @waiter: The journal as a flush waiter.
+ * @context: The newly acquired flush vio.
+ */
+static void flush_for_reaping(struct waiter *waiter, void *context)
+{
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, flush_waiter);
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+
+	vio->completion.parent = journal;
+	submit_flush_vio(vio, flush_endio, handle_flush_error);
+}
+
+/**
+ * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
+ * @journal: The slab journal.
+ */
+static void reap_slab_journal(struct slab_journal *journal)
+{
+	bool reaped = false;
+
+	if (is_reaping(journal)) {
+		/* We already have a reap in progress so wait for it to finish. */
+		return;
+	}
+
+	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
+	    !vdo_is_state_normal(&journal->slab->state) ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
+		/*
+		 * We must not reap in the first two cases, and there's no point in read-only mode.
+		 */
+		return;
+	}
+
+	/*
+	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
+	 * block is referenced or reap reaches the most recently written block, referenced by the
+	 * slab summary, which has the sequence number just before the tail.
+	 */
+	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
+		reaped = true;
+		journal->unreapable++;
+		journal->reap_lock++;
+		if (journal->reap_lock == &journal->locks[journal->size])
+			journal->reap_lock = &journal->locks[0];
+	}
+
+	if (!reaped)
+		return;
+
+	/*
+	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
+	 * of whether a user flush has been received or not. In the absence of the flush, the
+	 * reference block write which released the locks allowing the slab journal to reap may not
+	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
+	 * journal block writes can be issued while previous slab summary updates have not yet been
+	 * made. Even though those slab journal block writes will be ignored if the slab summary
+	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
+	 * resulting in a loss of reference count updates (VDO-2912).
+	 */
+	journal->flush_waiter.callback = flush_for_reaping;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
+			      &journal->flush_waiter);
+}
+
+/**
+ * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
+ * @journal: The slab journal.
+ * @sequence_number: The journal sequence number of the referenced block.
+ * @adjustment: Amount to adjust the reference counter.
+ *
+ * Note that when the adjustment is negative, the slab journal will be reaped.
+ */
+static void adjust_slab_journal_block_reference(struct slab_journal *journal,
+						sequence_number_t sequence_number,
+						int adjustment)
+{
+	struct journal_lock *lock;
+
+	if (sequence_number == 0)
+		return;
+
+	if (journal->slab->status == VDO_SLAB_REPLAYING) {
+		/* Locks should not be used during offline replay. */
+		return;
+	}
+
+	ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
+	lock = get_lock(journal, sequence_number);
+	if (adjustment < 0) {
+		ASSERT_LOG_ONLY((-adjustment <= lock->count),
+				"adjustment %d of lock count %u for slab journal block %llu must not underflow",
+				adjustment, lock->count,
+				(unsigned long long) sequence_number);
+	}
+
+	lock->count += adjustment;
+	if (lock->count == 0)
+		reap_slab_journal(journal);
+}
+
+/**
+ * release_journal_locks() - Callback invoked after a slab summary update completes.
+ * @waiter: The slab summary waiter that has just been notified.
+ * @context: The result code of the update.
+ *
+ * Registered in the constructor on behalf of update_tail_block_location().
+ *
+ * Implements waiter_callback_fn.
+ */
+static void release_journal_locks(struct waiter *waiter, void *context)
+{
+	sequence_number_t first, i;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, slab_summary_waiter);
+	int result = *((int *) context);
+
+	if (result != VDO_SUCCESS) {
+		if (result != VDO_READ_ONLY) {
+			/*
+			 * Don't bother logging what might be lots of errors if we are already in
+			 * read-only mode.
+			 */
+			uds_log_error_strerror(result, "failed slab summary update %llu",
+					       (unsigned long long) journal->summarized);
+		}
+
+		journal->updating_slab_summary = false;
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
+		journal->partial_write_in_progress = false;
+		add_entries(journal);
+	}
+
+	first = journal->last_summarized;
+	journal->last_summarized = journal->summarized;
+	for (i = journal->summarized - 1; i >= first; i--) {
+		/*
+		 * Release the lock the summarized block held on the recovery journal. (During
+		 * replay, recovery_start will always be 0.)
+		 */
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+			struct journal_lock *lock = get_lock(journal, i);
+
+			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
+								     lock->recovery_start,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		/*
+		 * Release our own lock against reaping for blocks that are committed. (This
+		 * function will not change locks during replay.)
+		 */
+		adjust_slab_journal_block_reference(journal, i, -1);
+	}
+
+	journal->updating_slab_summary = false;
+
+	reap_slab_journal(journal);
+
+	/* Check if the slab summary needs to be updated again. */
+	update_tail_block_location(journal);
+}
+
+/**
+ * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
+ * @journal: The slab journal that is updating its tail block location.
+ */
+static void update_tail_block_location(struct slab_journal *journal)
+{
+	block_count_t free_block_count;
+	struct vdo_slab *slab = journal->slab;
+
+	if (journal->updating_slab_summary ||
+	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    (journal->last_summarized >= journal->next_commit)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	if (slab->status != VDO_SLAB_REBUILT) {
+		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;
+
+		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
+	} else {
+		free_block_count = slab->free_blocks;
+	}
+
+	journal->summarized = journal->next_commit;
+	journal->updating_slab_summary = true;
+
+	/*
+	 * Update slab summary as dirty.
+	 * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this
+	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
+	 * loaded when the journal head has reaped past sequence number 1.
+	 */
+	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
+				  journal->summarized % journal->size,
+				  (journal->head > 1), false, free_block_count);
+}
+
+/**
+ * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
+ */
+static void reopen_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	sequence_number_t block;
+
+	ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
+			"vdo_slab journal's active block empty before reopening");
+	journal->head = journal->tail;
+	initialize_journal_state(journal);
+
+	/* Ensure no locks are spuriously held on an empty journal. */
+	for (block = 1; block <= journal->size; block++) {
+		ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
+				"Scrubbed journal's block %llu is not locked",
+				(unsigned long long) block);
+	}
+
+	add_entries(journal);
+}
+
+static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
+{
+	const struct packed_slab_journal_block *block =
+		(const struct packed_slab_journal_block *) vio->vio.data;
+
+	return __le64_to_cpu(block->header.sequence_number);
+}
+
+/**
+ * complete_write() - Handle post-commit processing.
+ * @completion: The write vio as a completion.
+ *
+ * This is the callback registered by write_slab_journal_block().
+ */
+static void complete_write(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
+	struct slab_journal *journal = completion->parent;
+	sequence_number_t committed = get_committing_sequence_number(pooled);
+
+	list_del_init(&pooled->list_entry);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, uds_forget(pooled));
+
+	if (result != VDO_SUCCESS) {
+		vio_record_metadata_io_error(as_vio(completion));
+		uds_log_error_strerror(result, "cannot write slab journal block %llu",
+				       (unsigned long long) committed);
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		check_if_slab_drained(journal->slab);
+		return;
+	}
+
+	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);
+
+	if (list_empty(&journal->uncommitted_blocks)) {
+		/* If no blocks are outstanding, then the commit point is at the tail. */
+		journal->next_commit = journal->tail;
+	} else {
+		/* The commit point is always the beginning of the oldest incomplete block. */
+		pooled = container_of(journal->uncommitted_blocks.next,
+				      struct pooled_vio, list_entry);
+		journal->next_commit = get_committing_sequence_number(pooled);
+	}
+
+	update_tail_block_location(journal);
+}
+
+static void write_slab_journal_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
+}
+
+/**
+ * write_slab_journal_block() - Write a slab journal block.
+ * @waiter: The vio pool waiter which was just notified.
+ * @context: The vio pool entry for the write.
+ *
+ * Callback from acquire_vio_from_pool() registered in commit_tail().
+ */
+static void write_slab_journal_block(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, resource_waiter);
+	struct slab_journal_block_header *header = &journal->tail_header;
+	int unused_entries = journal->entries_per_block - header->entry_count;
+	physical_block_number_t block_number;
+	const struct admin_state_code *operation;
+
+	header->head = journal->head;
+	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
+	vdo_pack_slab_journal_block_header(header, &journal->block->header);
+
+	/* Copy the tail block into the vio. */
+	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
+
+	ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
+	if (unused_entries > 0) {
+		/*
+		 * Release the per-entry locks for any unused entries in the block we are about to
+		 * write.
+		 */
+		adjust_slab_journal_block_reference(journal, header->sequence_number,
+						    -unused_entries);
+		journal->partial_write_in_progress = !block_is_full(journal);
+	}
+
+	block_number = journal->slab->journal_origin +
+		(header->sequence_number % journal->size);
+	vio->completion.parent = journal;
+
+	/*
+	 * This block won't be read in recovery until the slab summary is updated to refer to it.
+	 * The slab summary update does a flush which is sufficient to protect us from VDO-2331.
+	 */
+	submit_metadata_vio(uds_forget(vio), block_number, write_slab_journal_endio,
+			    complete_write, REQ_OP_WRITE);
+
+	/* Since the write is submitted, the tail block structure can be reused. */
+	journal->tail++;
+	initialize_tail_block(journal);
+	journal->waiting_to_commit = false;
+
+	operation = vdo_get_admin_state_code(&journal->slab->state);
+	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
+		vdo_finish_operation(&journal->slab->state,
+				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
+				      VDO_READ_ONLY : VDO_SUCCESS));
+		return;
+	}
+
+	add_entries(journal);
+}
+
+/**
+ * commit_tail() - Commit the tail block of the slab journal.
+ * @journal: The journal whose tail block should be committed.
+ */
+static void commit_tail(struct slab_journal *journal)
+{
+	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) {
+		/*
+		 * There are no entries at the moment, but there are some waiters, so defer
+		 * initiating the flush until those entries are ready to write.
+		 */
+		return;
+	}
+
+	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
+	    journal->waiting_to_commit ||
+	    (journal->tail_header.entry_count == 0)) {
+		/*
+		 * There is nothing to do since the tail block is empty, or writing, or the journal
+		 * is in read-only mode.
+		 */
+		return;
+	}
+
+	/*
+	 * Since we are about to commit the tail block, this journal no longer needs to be on the
+	 * ring of journals which the recovery journal might ask to commit.
+	 */
+	mark_slab_journal_clean(journal);
+
+	journal->waiting_to_commit = true;
+
+	journal->resource_waiter.callback = write_slab_journal_block;
+	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
+			      &journal->resource_waiter);
+}
+
+/**
+ * encode_slab_journal_entry() - Encode a slab journal entry.
+ * @tail_header: The unpacked header for the block.
+ * @payload: The journal block payload to hold the entry.
+ * @sbn: The slab block number of the entry to encode.
+ * @operation: The type of the entry.
+ * @increment: True if this is an increment.
+ *
+ * Exposed for unit tests.
+ */
+static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
+				      slab_journal_payload *payload,
+				      slab_block_number sbn,
+				      enum journal_operation operation,
+				      bool increment)
+{
+	journal_entry_count_t entry_number = tail_header->entry_count++;
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		if (!tail_header->has_block_map_increments) {
+			memset(payload->full_entries.entry_types, 0,
+			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
+			tail_header->has_block_map_increments = true;
+		}
+
+		payload->full_entries.entry_types[entry_number / 8] |=
+			((u8)1 << (entry_number % 8));
+	}
+
+	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
+}
+
+/**
+ * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
+ *                          increment and a decrement to a single point which refers to one or the
+ *                          other.
+ * @recovery_point: The journal point to convert.
+ * @increment: Whether the current entry is an increment.
+ *
+ * Return: The expanded journal point
+ *
+ * Because each data_vio has but a single recovery journal point, but may need to make both
+ * increment and decrement entries in the same slab journal. In order to distinguish the two
+ * entries, the entry count of the expanded journal point is twice the actual recovery journal
+ * entry count for increments, and one more than that for decrements.
+ */
+static struct journal_point expand_journal_point(struct journal_point recovery_point,
+						 bool increment)
+{
+	recovery_point.entry_count *= 2;
+	if (!increment)
+		recovery_point.entry_count++;
+
+	return recovery_point;
+}
+
+/**
+ * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
+ *               block becomes full.
+ * @journal: The slab journal to append to.
+ * @pbn: The pbn being adjusted.
+ * @operation: The type of entry to make.
+ * @increment: True if this is an increment.
+ * @recovery_point: The expanded recovery point.
+ *
+ * This function is synchronous.
+ */
+static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
+		      enum journal_operation operation, bool increment,
+		      struct journal_point recovery_point)
+{
+	struct packed_slab_journal_block *block = journal->block;
+	int result;
+
+	result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
+						 &recovery_point),
+			"recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
+			(unsigned long long) recovery_point.sequence_number,
+			recovery_point.entry_count,
+			(unsigned long long) journal->tail_header.recovery_point.sequence_number,
+			journal->tail_header.recovery_point.entry_count);
+	if (result != VDO_SUCCESS) {
+		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
+		return;
+	}
+
+	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
+		result = ASSERT((journal->tail_header.entry_count <
+				 journal->full_entries_per_block),
+				"block has room for full entries");
+		if (result != VDO_SUCCESS) {
+			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
+						 result);
+			return;
+		}
+	}
+
+	encode_slab_journal_entry(&journal->tail_header, &block->payload,
+				  pbn - journal->slab->start, operation, increment);
+	journal->tail_header.recovery_point = recovery_point;
+	if (block_is_full(journal))
+		commit_tail(journal);
+}
+
+static inline block_count_t journal_length(const struct slab_journal *journal)
+{
+	return journal->tail - journal->head;
+}
+
+/**
+ * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
+ * @slab: The slab to play into.
+ * @pbn: The PBN for the entry.
+ * @operation: The type of entry to add.
+ * @increment: True if this entry is an increment.
+ * @recovery_point: The recovery journal point corresponding to this entry.
+ * @parent: The completion to notify when there is space to add the entry if the entry could not be
+ *          added immediately.
+ *
+ * Return: true if the entry was added immediately.
+ */
+bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
+				  enum journal_operation operation, bool increment,
+				  struct journal_point *recovery_point,
+				  struct vdo_completion *parent)
+{
+	struct slab_journal *journal = &slab->journal;
+	struct slab_journal_block_header *header = &journal->tail_header;
+	struct journal_point expanded = expand_journal_point(*recovery_point, increment);
+
+	/* Only accept entries after the current recovery point. */
+	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
+		return true;
+
+	if ((header->entry_count >= journal->full_entries_per_block) &&
+	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
+		/*
+		 * The tail block does not have room for the entry we are attempting to add so
+		 * commit the tail block now.
+		 */
+		commit_tail(journal);
+	}
+
+	if (journal->waiting_to_commit) {
+		vdo_start_operation_with_waiter(&journal->slab->state,
+						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
+						parent, NULL);
+		return false;
+	}
+
+	if (journal_length(journal) >= journal->size) {
+		/*
+		 * We must have reaped the current head before the crash, since the blocked
+		 * threshold keeps us from having more entries than fit in a slab journal; hence we
+		 * can just advance the head (and unreapable block), as needed.
+		 */
+		journal->head++;
+		journal->unreapable++;
+	}
+
+	if (journal->slab->status == VDO_SLAB_REBUILT)
+		journal->slab->status = VDO_SLAB_REPLAYING;
+
+	add_entry(journal, pbn, operation, increment, expanded);
+	return true;
+}
+
+/**
+ * requires_reaping() - Check whether the journal must be reaped before adding new entries.
+ * @journal: The journal to check.
+ *
+ * Return: true if the journal must be reaped.
+ */
+static bool requires_reaping(const struct slab_journal *journal)
+{
+	return (journal_length(journal) >= journal->blocking_threshold);
+}
+
+/** finish_summary_update() - A waiter callback that resets the writing state of a slab. */
+static void finish_summary_update(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
+	int result = *((int *) context);
+
+	slab->active_count--;
+
+	if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
+		uds_log_error_strerror(result, "failed to update slab summary");
+		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	}
+
+	check_if_slab_drained(slab);
+}
+
+static void write_reference_block(struct waiter *waiter, void *context);
+
+/**
+ * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
+ *                                  a VIO for it from the pool.
+ * @waiter: The waiter of the block which is starting to write.
+ * @context: The parent slab of the block.
+ *
+ * This can be asynchronous since the writer will have to wait if all VIOs in the pool are
+ * currently in use.
+ */
+static void launch_reference_block_write(struct waiter *waiter, void *context)
+{
+	struct vdo_slab *slab = context;
+
+	if (vdo_is_read_only(slab->allocator->depot->vdo))
+		return;
+
+	slab->active_count++;
+	container_of(waiter, struct reference_block, waiter)->is_writing = true;
+	waiter->callback = write_reference_block;
+	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+}
+
+static void save_dirty_reference_blocks(struct vdo_slab *slab)
+{
+	vdo_notify_all_waiters(&slab->dirty_blocks, launch_reference_block_write, slab);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * finish_reference_block_write() - After a reference block has written, clean it, release its
+ *                                  locks, and return its VIO to the pool.
+ * @completion: The VIO that just finished writing.
+ */
+static void finish_reference_block_write(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+	tail_block_offset_t offset;
+
+	slab->active_count--;
+
+	/* Release the slab journal lock. */
+	adjust_slab_journal_block_reference(&slab->journal,
+					    block->slab_journal_lock_to_release, -1);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+
+	/*
+	 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
+	 * us to be dirtied again, but we don't want to double enqueue.
+	 */
+	block->is_writing = false;
+
+	if (vdo_is_read_only(completion->vdo)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	/* Re-queue the block if it was re-dirtied while it was writing. */
+	if (block->is_dirty) {
+		vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+		if (vdo_is_state_draining(&slab->state)) {
+			/* We must be saving, and this block will otherwise not be relaunched. */
+			save_dirty_reference_blocks(slab);
+		}
+
+		return;
+	}
+
+	/*
+	 * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
+	 * and no summary update in progress.
+	 */
+	if ((slab->active_count > 0) || vdo_has_waiters(&slab->dirty_blocks)) {
+		check_if_slab_drained(slab);
+		return;
+	}
+
+	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	slab->active_count++;
+	slab->summary_waiter.callback = finish_summary_update;
+	update_slab_summary_entry(slab, &slab->summary_waiter, offset,
+				  true, true, slab->free_blocks);
+}
+
+/**
+ * get_reference_counters_for_block() - Find the reference counters for a given block.
+ * @block: The reference_block in question.
+ *
+ * Return: A pointer to the reference counters for this block.
+ */
+static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
+{
+	size_t block_index = block - block->slab->reference_blocks;
+
+	return &block->slab->counters[block_index * COUNTS_PER_BLOCK];
+}
+
+/**
+ * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
+ * @block: The block to copy.
+ * @buffer: The char buffer to fill with the packed block.
+ */
+static void pack_reference_block(struct reference_block *block, void *buffer)
+{
+	struct packed_reference_block *packed = buffer;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	sector_count_t i;
+	struct packed_journal_point commit_point;
+
+	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		packed->sectors[i].commit_point = commit_point;
+		memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR),
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+	}
+}
+
+static void write_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+	thread_id_t thread_id = block->slab->allocator->thread_id;
+
+	continue_vio_after_io(vio, finish_reference_block_write, thread_id);
+}
+
+/**
+ * handle_io_error() - Handle an I/O error reading or writing a reference count block.
+ * @completion: The VIO doing the I/O as a completion.
+ */
+static void handle_io_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct vio *vio = as_vio(completion);
+	struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	slab->active_count--;
+	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
+	check_if_slab_drained(slab);
+}
+
+/**
+ * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy
+ *                           its counters and associated data into the VIO, and launch the write.
+ * @waiter: The waiter of the dirty block.
+ * @context: The VIO returned by the pool.
+ */
+static void write_reference_block(struct waiter *waiter, void *context)
+{
+	size_t block_offset;
+	physical_block_number_t pbn;
+	struct pooled_vio *pooled = context;
+	struct vdo_completion *completion = &pooled->vio.completion;
+	struct reference_block *block = container_of(waiter, struct reference_block,
+						     waiter);
+
+	pack_reference_block(block, pooled->vio.data);
+	block_offset = (block - block->slab->reference_blocks);
+	pbn = (block->slab->ref_counts_origin + block_offset);
+	block->slab_journal_lock_to_release = block->slab_journal_lock;
+	completion->parent = block;
+
+	/*
+	 * Mark the block as clean, since we won't be committing any updates that happen after this
+	 * moment. As long as VIO order is preserved, two VIOs updating this block at once will not
+	 * cause complications.
+	 */
+	block->is_dirty = false;
+
+	/*
+	 * Flush before writing to ensure that the recovery journal and slab journal entries which
+	 * cover this reference update are stable (VDO-2331).
+	 */
+	WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
+		   block->slab->allocator->ref_counts_statistics.blocks_written + 1);
+
+	completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
+	submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio,
+			    handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH);
+}
+
+static void reclaim_journal_space(struct slab_journal *journal)
+{
+	block_count_t length = journal_length(journal);
+	struct vdo_slab *slab = journal->slab;
+	block_count_t write_count = vdo_count_waiters(&slab->dirty_blocks);
+	block_count_t written;
+
+	if ((length < journal->flushing_threshold) || (write_count == 0))
+		return;
+
+	/* The slab journal is over the first threshold, schedule some reference block writes. */
+	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
+	if (length < journal->flushing_deadline) {
+		/* Schedule more writes the closer to the deadline we get. */
+		write_count /= journal->flushing_deadline - length + 1;
+		write_count = max_t(block_count_t, write_count, 1);
+	}
+
+	for (written = 0; written < write_count; written++) {
+		vdo_notify_next_waiter(&slab->dirty_blocks,
+				       launch_reference_block_write, slab);
+	}
+}
+
+/**
+ * reference_count_to_status() - Convert a reference count to a reference status.
+ * @count: The count to convert.
+ *
+ * Return: The appropriate reference status.
+ */
+static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
+{
+	if (count == EMPTY_REFERENCE_COUNT)
+		return RS_FREE;
+	else if (count == 1)
+		return RS_SINGLE;
+	else if (count == PROVISIONAL_REFERENCE_COUNT)
+		return RS_PROVISIONAL;
+	else
+		return RS_SHARED;
+}
+
+/**
+ * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
+ *                 if it wasn't already dirty.
+ * @block: The reference block to mark as dirty.
+ */
+static void dirty_block(struct reference_block *block)
+{
+	if (block->is_dirty)
+		return;
+
+	block->is_dirty = true;
+	if (!block->is_writing)
+		vdo_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
+}
+
+/**
+ * get_reference_block() - Get the reference block that covers the given block index.
+ */
+static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
+								 slab_block_number index)
+{
+	return &slab->reference_blocks[index / COUNTS_PER_BLOCK];
+}
+
+/**
+ * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
+ *                                block number.
+ * @slab: The slab.
+ * @physical_block_number: The physical block number.
+ * @slab_block_number_ptr: A pointer to the slab block number.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
+						   physical_block_number_t pbn,
+						   slab_block_number *slab_block_number_ptr)
+{
+	u64 slab_block_number;
+
+	if (pbn < slab->start)
+		return VDO_OUT_OF_RANGE;
+
+	slab_block_number = pbn - slab->start;
+	if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks)
+		return VDO_OUT_OF_RANGE;
+
+	*slab_block_number_ptr = slab_block_number;
+	return VDO_SUCCESS;
+}
+
+/**
+ * get_reference_counter() - Get the reference counter that covers the given physical block number.
+ * @slab: The slab to query.
+ * @pbn: The physical block number.
+ * @counter_ptr: A pointer to the reference counter.
+ */
+static int __must_check get_reference_counter(struct vdo_slab *slab,
+					      physical_block_number_t pbn,
+					      vdo_refcount_t **counter_ptr)
+{
+	slab_block_number index;
+	int result = slab_block_number_from_pbn(slab, pbn, &index);
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*counter_ptr = &slab->counters[index];
+
+	return VDO_SUCCESS;
+}
+
+static unsigned int calculate_slab_priority(struct vdo_slab *slab)
+{
+	block_count_t free_blocks = slab->free_blocks;
+	unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority;
+	unsigned int priority;
+
+	/*
+	 * Wholly full slabs must be the only ones with lowest priority, 0.
+	 *
+	 * Slabs that have never been opened (empty, newly initialized, and never been written to)
+	 * have lower priority than previously opened slabs that have a significant number of free
+	 * blocks. This ranking causes VDO to avoid writing physical blocks for the first time
+	 * unless there are very few free blocks that have been previously written to.
+	 *
+	 * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO
+	 * a better client of any underlying storage that is thinly-provisioned (though discarding
+	 * would be better).
+	 *
+	 * For all other slabs, the priority is derived from the logarithm of the number of free
+	 * blocks. Slabs with the same order of magnitude of free blocks have the same priority.
+	 * With 2^23 blocks, the priority will range from 1 to 25. The reserved
+	 * unopened_slab_priority divides the range and is skipped by the logarithmic mapping.
+	 */
+
+	if (free_blocks == 0)
+		return 0;
+
+	if (is_slab_journal_blank(slab))
+		return unopened_slab_priority;
+
+	priority = (1 + ilog2(free_blocks));
+	return ((priority < unopened_slab_priority) ? priority : priority + 1);
+}
+
+/*
+ * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab
+ * so slabs with lots of free blocks with be opened for allocation before slabs that have few free
+ * blocks.
+ */
+static void prioritize_slab(struct vdo_slab *slab)
+{
+	ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
+			"a slab must not already be on a ring when prioritizing");
+	slab->priority = calculate_slab_priority(slab);
+	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
+				   slab->priority, &slab->allocq_entry);
+}
+
+/**
+ * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
+ * @increment: should be true if the free block count went up.
+ */
+static void adjust_free_block_count(struct vdo_slab *slab, bool increment)
+{
+	struct block_allocator *allocator = slab->allocator;
+
+	WRITE_ONCE(allocator->allocated_blocks,
+		   allocator->allocated_blocks + (increment ? -1 : 1));
+
+	/* The open slab doesn't need to be reprioritized until it is closed. */
+	if (slab == allocator->open_slab)
+		return;
+
+	/* Don't bother adjusting the priority table if unneeded. */
+	if (slab->priority == calculate_slab_priority(slab))
+		return;
+
+	/*
+	 * Reprioritize the slab to reflect the new free block count by removing it from the table
+	 * and re-enqueuing it with the new priority.
+	 */
+	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
+	prioritize_slab(slab);
+}
+
+/**
+ * increment_for_data() - Increment the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
+			      bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		*counter_ptr = 1;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+
+		break;
+
+	case RS_PROVISIONAL:
+		*counter_ptr = 1;
+		break;
+
+	default:
+		/* Single or shared */
+		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing a block already having 254 references (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+		}
+		(*counter_ptr)++;
+	}
+
+	if (lock != NULL)
+		vdo_unassign_pbn_lock_provisional_reference(lock);
+	return VDO_SUCCESS;
+}
+
+/**
+ * decrement_for_data() - Decrement the reference count for a data block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the data block before this decrement.
+ * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
+ * @lock: The pbn_lock associated with the block being decremented (may be NULL).
+ * @counter_ptr: A pointer to the count for the data block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
+			      slab_block_number block_number,
+			      enum reference_status old_status,
+			      struct reference_updater *updater,
+			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+					      "Decrementing free block at offset %u in slab %u",
+					      block_number, slab->slab_number);
+
+	case RS_PROVISIONAL:
+	case RS_SINGLE:
+		if (updater->zpbn.zone != NULL) {
+			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
+									       updater->zpbn.pbn);
+
+			if (lock != NULL) {
+				/*
+				 * There is a read lock on this block, so the block must not become
+				 * unreferenced.
+				 */
+				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
+				vdo_assign_pbn_lock_provisional_reference(lock);
+				break;
+			}
+		}
+
+		*counter_ptr = EMPTY_REFERENCE_COUNT;
+		block->allocated_count--;
+		slab->free_blocks++;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, true);
+
+		break;
+
+	default:
+		/* Shared */
+		(*counter_ptr)--;
+	}
+
+	return VDO_SUCCESS;
+}
+
+/**
+ * increment_for_block_map() - Increment the reference count for a block map page.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @old_status: The reference status of the block before this increment.
+ * @lock: The pbn_lock associated with this increment (may be NULL).
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @counter_ptr: A pointer to the count for the block (in, out).
+ * @adjust_block_count: Whether to update the allocator's free block count.
+ *
+ * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map
+ * blocks never dedupe they should never be adjusted from any other state. The adjustment always
+ * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map
+ * blocks.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
+				   slab_block_number block_number,
+				   enum reference_status old_status,
+				   struct pbn_lock *lock, bool normal_operation,
+				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
+{
+	switch (old_status) {
+	case RS_FREE:
+		if (normal_operation) {
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Incrementing unallocated block map block (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+		}
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		block->allocated_count++;
+		slab->free_blocks--;
+		if (adjust_block_count)
+			adjust_free_block_count(slab, false);
+
+		return VDO_SUCCESS;
+
+	case RS_PROVISIONAL:
+		if (!normal_operation)
+			return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+						      "Block map block had provisional reference during replay (slab %u, offset %u)",
+						      slab->slab_number, block_number);
+
+		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
+		if (lock != NULL)
+			vdo_unassign_pbn_lock_provisional_reference(lock);
+		return VDO_SUCCESS;
+
+	default:
+		return uds_log_error_strerror(VDO_REF_COUNT_INVALID,
+					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
+					      *counter_ptr, slab->slab_number,
+					      block_number);
+	}
+}
+
+static bool __must_check is_valid_journal_point(const struct journal_point *point)
+{
+	return ((point != NULL) && (point->sequence_number > 0));
+}
+
+/**
+ * update_reference_count() - Update the reference count of a block.
+ * @slab: The slab which owns the block.
+ * @block: The reference block which contains the block being updated.
+ * @block_number: The block to update.
+ * @slab_journal_point: The slab journal point at which this update is journaled.
+ * @updater: The reference updater.
+ * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
+ * @adjust_block_count: Whether to update the slab's free block count.
+ * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
+ *                             of a provisional reference.
+ *
+ * Return: VDO_SUCCESS or an error.
+ */
+static int update_reference_count(struct vdo_slab *slab, struct reference_block *block,
+				  slab_block_number block_number,
+				  const struct journal_point *slab_journal_point,
+				  struct reference_updater *updater,
+				  bool normal_operation, bool adjust_block_count,
+				  bool *provisional_decrement_ptr)
+{
+	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
+	enum reference_status old_status = reference_count_to_status(*counter_ptr);
+	int result;
+
+	if (!updater->increment) {
+		result = decrement_for_data(slab, block, block_number, old_status,
+					    updater, counter_ptr, adjust_block_count);
+		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
+			if (provisional_decrement_ptr != NULL)
+				*provisional_decrement_ptr = true;
+			return VDO_SUCCESS;
+		}
+	} else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
+		result = increment_for_data(slab, block, block_number, old_status,
+					    updater->lock, counter_ptr, adjust_block_count);
+	} else {
+		result = increment_for_block_map(slab, block, block_number, old_status,
+						 updater->lock, normal_operation,
+						 counter_ptr, adjust_block_count);
+	}
+
+	if (result != VDO_SUCCESS)
+		return result;
+
+	if (is_valid_journal_point(slab_journal_point))
+		slab->slab_journal_point = *slab_journal_point;
+
+	return VDO_SUCCESS;
+}
+
+static int __must_check adjust_reference_count(struct vdo_slab *slab,
+					       struct reference_updater *updater,
+					       const struct journal_point *slab_journal_point)
+{
+	slab_block_number block_number;
+	int result;
+	struct reference_block *block;
+	bool provisional_decrement = false;
+
+	if (!is_slab_open(slab))
+		return VDO_INVALID_ADMIN_STATE;
+
+	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	block = get_reference_block(slab, block_number);
+	result = update_reference_count(slab, block, block_number, slab_journal_point,
+					updater, NORMAL_OPERATION, true,
+					&provisional_decrement);
+	if ((result != VDO_SUCCESS) || provisional_decrement)
+		return result;
+
+	if (block->is_dirty && (block->slab_journal_lock > 0)) {
+		sequence_number_t entry_lock = slab_journal_point->sequence_number;
+		/*
+		 * This block is already dirty and a slab journal entry has been made for it since
+		 * the last time it was clean. We must release the per-entry slab journal lock for
+		 * the entry associated with the update we are now doing.
+		 */
+		result = ASSERT(is_valid_journal_point(slab_journal_point),
+				"Reference count adjustments need slab journal points.");
+		if (result != VDO_SUCCESS)
+			return result;
+
+		adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1);
+		return VDO_SUCCESS;
+	}
+
+	/*
+	 * This may be the first time we are applying an update for which there is a slab journal
+	 * entry to this block since the block was cleaned. Therefore, we convert the per-entry
+	 * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
+	 */
+	if (is_valid_journal_point(slab_journal_point))
+		block->slab_journal_lock = slab_journal_point->sequence_number;
+	else
+		block->slab_journal_lock = 0;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * add_entry_from_waiter() - Add an entry to the slab journal.
+ * @waiter: The vio which should make an entry now.
+ * @context: The slab journal to make an entry in.
+ *
+ * This callback is invoked by add_entries() once it has determined that we are ready to make
+ * another entry in the slab journal. Implements waiter_callback_fn.
+ */
+static void add_entry_from_waiter(struct waiter *waiter, void *context)
+{
+	int result;
+	struct reference_updater *updater =
+		container_of(waiter, struct reference_updater, waiter);
+	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
+	struct slab_journal *journal = context;
+	struct slab_journal_block_header *header = &journal->tail_header;
+	struct journal_point slab_journal_point = {
+		.sequence_number = header->sequence_number,
+		.entry_count = header->entry_count,
+	};
+	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;
+
+	if (header->entry_count == 0) {
+		/*
+		 * This is the first entry in the current tail block, so get a lock on the recovery
+		 * journal which we will hold until this tail block is committed.
+		 */
+		get_lock(journal, header->sequence_number)->recovery_start = recovery_block;
+		if (journal->recovery_journal != NULL) {
+			zone_count_t zone_number = journal->slab->allocator->zone_number;
+
+			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
+								     recovery_block,
+								     VDO_ZONE_TYPE_PHYSICAL,
+								     zone_number);
+		}
+
+		mark_slab_journal_dirty(journal, recovery_block);
+		reclaim_journal_space(journal);
+	}
+
+	add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment,
+		  expand_journal_point(data_vio->recovery_journal_point,
+				       updater->increment));
+
+	if (journal->slab->status != VDO_SLAB_REBUILT) {
+		/*
+		 * If the slab is unrecovered, scrubbing will take care of the count since the
+		 * update is now recorded in the journal.
+		 */
+		adjust_slab_journal_block_reference(journal,
+						    slab_journal_point.sequence_number, -1);
+		result = VDO_SUCCESS;
+	} else {
+		/* Now that an entry has been made in the slab journal, update the counter. */
+		result = adjust_reference_count(journal->slab, updater,
+						&slab_journal_point);
+	}
+
+	if (updater->increment)
+		continue_data_vio_with_error(data_vio, result);
+	else
+		vdo_continue_completion(&data_vio->decrement_completion, result);
+}
+
+/**
+ * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map
+ *                                         increment.
+ * @journal: The journal.
+ *
+ * Return: true if the first entry waiter's operation is a block map increment.
+ */
+static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
+{
+	struct waiter *waiter = vdo_get_first_waiter(&journal->entry_waiters);
+	struct reference_updater *updater = container_of(waiter,
+							 struct reference_updater,
+							 waiter);
+
+	return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING);
+}
+
+/**
+ * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
+ * @journal: The journal to which entries may be added.
+ *
+ * By processing the queue in order, we ensure that slab journal entries are made in the same order
+ * as recovery journal entries for the same increment or decrement.
+ */
+static void add_entries(struct slab_journal *journal)
+{
+	if (journal->adding_entries) {
+		/* Protect against re-entrancy. */
+		return;
+	}
+
+	journal->adding_entries = true;
+	while (vdo_has_waiters(&journal->entry_waiters)) {
+		struct slab_journal_block_header *header = &journal->tail_header;
+
+		if (journal->partial_write_in_progress ||
+		    (journal->slab->status == VDO_SLAB_REBUILDING)) {
+			/*
+			 * Don't add entries while rebuilding or while a partial write is
+			 * outstanding (VDO-2399).
+			 */
+			break;
+		}
+
+		if (journal->waiting_to_commit) {
+			/*
+			 * If we are waiting for resources to write the tail block, and the tail
+			 * block is full, we can't make another entry.
+			 */
+			WRITE_ONCE(journal->events->tail_busy_count,
+				   journal->events->tail_busy_count + 1);
+			break;
+		} else if (is_next_entry_a_block_map_increment(journal) &&
+			   (header->entry_count >= journal->full_entries_per_block)) {
+			/*
+			 * The tail block does not have room for a block map increment, so commit
+			 * it now.
+			 */
+			commit_tail(journal);
+			if (journal->waiting_to_commit) {
+				WRITE_ONCE(journal->events->tail_busy_count,
+					   journal->events->tail_busy_count + 1);
+				break;
+			}
+		}
+
+		/* If the slab is over the blocking threshold, make the vio wait. */
+		if (requires_reaping(journal)) {
+			WRITE_ONCE(journal->events->blocked_count,
+				   journal->events->blocked_count + 1);
+			save_dirty_reference_blocks(journal->slab);
+			break;
+		}
+
+		if (header->entry_count == 0) {
+			struct journal_lock *lock =
+				get_lock(journal, header->sequence_number);
+
+			/*
+			 * Check if the on disk slab journal is full. Because of the blocking and
+			 * scrubbing thresholds, this should never happen.
+			 */
+			if (lock->count > 0) {
+				ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
+						"New block has locks, but journal is not full");
+
+				/*
+				 * The blocking threshold must let the journal fill up if the new
+				 * block has locks; if the blocking threshold is smaller than the
+				 * journal size, the new block cannot possibly have locks already.
+				 */
+				ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
+						"New block can have locks already iff blocking threshold is at the end of the journal");
+
+				WRITE_ONCE(journal->events->disk_full_count,
+					   journal->events->disk_full_count + 1);
+				save_dirty_reference_blocks(journal->slab);
+				break;
+			}
+
+			/*
+			 * Don't allow the new block to be reaped until all of the reference count
+			 * blocks are written and the journal block has been fully committed as
+			 * well.
+			 */
+			lock->count = journal->entries_per_block + 1;
+
+			if (header->sequence_number == 1) {
+				struct vdo_slab *slab = journal->slab;
+				block_count_t i;
+
+				/*
+				 * This is the first entry in this slab journal, ever. Dirty all of
+				 * the reference count blocks. Each will acquire a lock on the tail
+				 * block so that the journal won't be reaped until the reference
+				 * counts are initialized. The lock acquisition must be done by the
+				 * ref_counts since here we don't know how many reference blocks
+				 * the ref_counts has.
+				 */
+				for (i = 0; i < slab->reference_block_count; i++) {
+					slab->reference_blocks[i].slab_journal_lock = 1;
+					dirty_block(&slab->reference_blocks[i]);
+				}
+
+				adjust_slab_journal_block_reference(journal, 1,
+								    slab->reference_block_count);
+			}
+		}
+
+		vdo_notify_next_waiter(&journal->entry_waiters,
+				       add_entry_from_waiter, journal);
+	}
+
+	journal->adding_entries = false;
+
+	/* If there are no waiters, and we are flushing or saving, commit the tail block. */
+	if (vdo_is_state_draining(&journal->slab->state) &&
+	    !vdo_is_state_suspending(&journal->slab->state) &&
+	    !vdo_has_waiters(&journal->entry_waiters))
+		commit_tail(journal);
+}
+
+/**
+ * reset_search_cursor() - Reset the free block search back to the first reference counter in the
+ *                         first reference block of a slab.
+ */
+static void reset_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+
+	cursor->block = cursor->first_block;
+	cursor->index = 0;
+	/* Unit tests have slabs with only one reference block (and it's a runt). */
+	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
+}
+
+/**
+ * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
+ *                           a slab,
+ *
+ * Wraps around to the first reference block if the current block is the last reference block.
+ *
+ * Return: true unless the cursor was at the last reference block.
+ */
+static bool advance_search_cursor(struct vdo_slab *slab)
+{
+	struct search_cursor *cursor = &slab->search_cursor;
+
+	/*
+	 * If we just finished searching the last reference block, then wrap back around to the
+	 * start of the array.
+	 */
+	if (cursor->block == cursor->last_block) {
+		reset_search_cursor(slab);
+		return false;
+	}
+
+	/* We're not already at the end, so advance to cursor to the next block. */
+	cursor->block++;
+	cursor->index = cursor->end_index;
+
+	if (cursor->block == cursor->last_block) {
+		/* The last reference block will usually be a runt. */
+		cursor->end_index = slab->block_count;
+	} else {
+		cursor->end_index += COUNTS_PER_BLOCK;
+	}
+
+	return true;
+}
+
+/**
+ * replay_reference_count_change() - Replay the reference count adjustment from a slab journal
+ *                                   entry into the reference count for a block.
+ * @slab: The slab.
+ * @entry_point: The slab journal point for the entry.
+ * @entry: The slab journal entry being replayed.
+ *
+ * The adjustment will be ignored if it was already recorded in the reference count.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int replay_reference_count_change(struct vdo_slab *slab,
+					 const struct journal_point *entry_point,
+					 struct slab_journal_entry entry)
+{
+	int result;
+	struct reference_block *block = get_reference_block(slab, entry.sbn);
+	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
+	struct reference_updater updater = {
+		.operation = entry.operation,
+		.increment = entry.increment,
+	};
+
+	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) {
+		/* This entry is already reflected in the existing counts, so do nothing. */
+		return VDO_SUCCESS;
+	}
+
+	/* This entry is not yet counted in the reference counts. */
+	result = update_reference_count(slab, block, entry.sbn, entry_point,
+					&updater, !NORMAL_OPERATION, false, NULL);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	dirty_block(block);
+	return VDO_SUCCESS;
+}
+
+/**
+ * find_zero_byte_in_word() - Find the array index of the first zero byte in word-sized range of
+ *                            reference counters.
+ * @word_ptr: A pointer to the eight counter bytes to check.
+ * @start_index: The array index corresponding to word_ptr[0].
+ * @fail_index: The array index to return if no zero byte is found.
+ *
+ * The search does no bounds checking; the function relies on the array being sufficiently padded.
+ *
+ * Return: The array index of the first zero byte in the word, or the value passed as fail_index if
+ *         no zero byte was found.
+ */
+static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
+						       slab_block_number start_index,
+						       slab_block_number fail_index)
+{
+	u64 word = get_unaligned_le64(word_ptr);
+
+	/* This looks like a loop, but GCC will unroll the eight iterations for us. */
+	unsigned int offset;
+
+	for (offset = 0; offset < BYTES_PER_WORD; offset++) {
+		/* Assumes little-endian byte order, which we have on X86. */
+		if ((word & 0xFF) == 0)
+			return (start_index + offset);
+		word >>= 8;
+	}
+
+	return fail_index;
+}
+
+/**
+ * vdo_find_free_block() - Find the first block with a reference count of zero in the specified
+ *                         range of reference counter indexes.
+ * @slab: The slab counters to scan.
+ * @index_ptr: A pointer to hold the array index of the free block.
+ *
+ * Exposed for unit testing.
+ *
+ * Return: true if a free block was found in the specified range.
+ */
+static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
+{
+	slab_block_number zero_index;
+	slab_block_number next_index = slab->search_cursor.index;
+	slab_block_number end_index = slab->search_cursor.end_index;
+	u8 *next_counter = &slab->counters[next_index];
+	u8 *end_counter = &slab->counters[end_index];
+
+	/*
+	 * Search every byte of the first unaligned word. (Array is padded so reading past end is
+	 * safe.)
+	 */
+	zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
+	if (zero_index < end_index) {
+		*index_ptr = zero_index;
+		return true;
+	}
+
+	/*
+	 * On architectures where unaligned word access is expensive, this would be a good place to
+	 * advance to an alignment boundary.
+	 */
+	next_index += BYTES_PER_WORD;
+	next_counter += BYTES_PER_WORD;
+
+	/*
+	 * Now we're word-aligned; check an word at a time until we find a word containing a zero.
+	 * (Array is padded so reading past end is safe.)
+	 */
+	while (next_counter < end_counter) {
+		/*
+		 * The following code is currently an exact copy of the code preceding the loop,
+		 * but if you try to merge them by using a do loop, it runs slower because a jump
+		 * instruction gets added at the start of the iteration.
+		 */
+		zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
+		if (zero_index < end_index) {
+			*index_ptr = zero_index;
+			return true;
+		}
+
+		next_index += BYTES_PER_WORD;
+		next_counter += BYTES_PER_WORD;
+	}
+
+	return false;
+}
+
+/**
+ * search_current_reference_block() - Search the reference block currently saved in the search
+ *                                    cursor for a reference count of zero, starting at the saved
+ *                                    counter index.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool search_current_reference_block(const struct vdo_slab *slab,
+					   slab_block_number *free_index_ptr)
+{
+	/* Don't bother searching if the current block is known to be full. */
+	return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) &&
+		find_free_block(slab, free_index_ptr));
+}
+
+/**
+ * search_reference_blocks() - Search each reference block for a reference count of zero.
+ * @slab: The slab to search.
+ * @free_index_ptr: A pointer to receive the array index of the zero reference count.
+ *
+ * Searches each reference block for a reference count of zero, starting at the reference block and
+ * counter index saved in the search cursor and searching up to the end of the last reference
+ * block. The search does not wrap.
+ *
+ * Return: true if an unreferenced counter was found.
+ */
+static bool search_reference_blocks(struct vdo_slab *slab,
+				    slab_block_number *free_index_ptr)
+{
+	/* Start searching at the saved search position in the current block. */
+	if (search_current_reference_block(slab, free_index_ptr))
+		return true;
+
+	/* Search each reference block up to the end of the slab. */
+	while (advance_search_cursor(slab)) {
+		if (search_current_reference_block(slab, free_index_ptr))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
+ */
+static void make_provisional_reference(struct vdo_slab *slab,
+				       slab_block_number block_number)
+{
+	struct reference_block *block = get_reference_block(slab, block_number);
+
+	/*
+	 * Make the initial transition from an unreferenced block to a
+	 * provisionally allocated block.
+	 */
+	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;
+
+	/* Account for the allocation. */
+	block->allocated_count++;
+	slab->free_blocks--;
+}
+
+/**
+ * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
+ */
+static void dirty_all_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t i;
+
+	for (i = 0; i < slab->reference_block_count; i++)
+		dirty_block(&slab->reference_blocks[i]);
+}
+
+/**
+ * clear_provisional_references() - Clear the provisional reference counts from a reference block.
+ * @block: The block to clear.
+ */
+static void clear_provisional_references(struct reference_block *block)
+{
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+	block_count_t j;
+
+	for (j = 0; j < COUNTS_PER_BLOCK; j++) {
+		if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
+			counters[j] = EMPTY_REFERENCE_COUNT;
+			block->allocated_count--;
+		}
+	}
+}
+
+static inline bool journal_points_equal(struct journal_point first,
+					struct journal_point second)
+{
+	return ((first.sequence_number == second.sequence_number) &&
+		(first.entry_count == second.entry_count));
+}
+
+/**
+ * unpack_reference_block() - Unpack reference counts blocks into the internal memory structure.
+ * @packed: The written reference block to be unpacked.
+ * @block: The internal reference block to be loaded.
+ */
+static void unpack_reference_block(struct packed_reference_block *packed,
+				   struct reference_block *block)
+{
+	block_count_t index;
+	sector_count_t i;
+	struct vdo_slab *slab = block->slab;
+	vdo_refcount_t *counters = get_reference_counters_for_block(block);
+
+	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
+		struct packed_reference_sector *sector = &packed->sectors[i];
+
+		vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]);
+		memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
+		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
+		/* The slab_journal_point must be the latest point found in any sector. */
+		if (vdo_before_journal_point(&slab->slab_journal_point,
+					     &block->commit_points[i]))
+			slab->slab_journal_point = block->commit_points[i];
+
+		if ((i > 0) &&
+		    !journal_points_equal(block->commit_points[0],
+					  block->commit_points[i])) {
+			size_t block_index = block - block->slab->reference_blocks;
+
+			uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
+					i, block_index, block->slab->slab_number);
+		}
+	}
+
+	block->allocated_count = 0;
+	for (index = 0; index < COUNTS_PER_BLOCK; index++) {
+		if (counters[index] != EMPTY_REFERENCE_COUNT)
+			block->allocated_count++;
+	}
+}
+
+/**
+ * finish_reference_block_load() - After a reference block has been read, unpack it.
+ * @completion: The VIO that just finished reading.
+ */
+static void finish_reference_block_load(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
+	struct reference_block *block = completion->parent;
+	struct vdo_slab *slab = block->slab;
+
+	unpack_reference_block((struct packed_reference_block *) vio->data, block);
+	return_vio_to_pool(slab->allocator->vio_pool, pooled);
+	slab->active_count--;
+	clear_provisional_references(block);
+
+	slab->free_blocks -= block->allocated_count;
+	check_if_slab_drained(slab);
+}
+
+static void load_reference_block_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct reference_block *block = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_reference_block_load,
+			      block->slab->allocator->thread_id);
+}
+
+/**
+ * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
+ *                          block.
+ * @waiter: The waiter of the block to load.
+ * @context: The VIO returned by the pool.
+ */
+static void load_reference_block(struct waiter *waiter, void *context)
+{
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	struct reference_block *block =
+		container_of(waiter, struct reference_block, waiter);
+	size_t block_offset = (block - block->slab->reference_blocks);
+
+	vio->completion.parent = block;
+	submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
+			    load_reference_block_endio, handle_io_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a
+ *                           pre-allocated reference counter.
+ */
+static void load_reference_blocks(struct vdo_slab *slab)
+{
+	block_count_t i;
+
+	slab->free_blocks = slab->block_count;
+	slab->active_count = slab->reference_block_count;
+	for (i = 0; i < slab->reference_block_count; i++) {
+		struct waiter *waiter = &slab->reference_blocks[i].waiter;
+
+		waiter->callback = load_reference_block;
+		acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
+	}
+}
+
+/**
+ * drain_slab() - Drain all reference count I/O.
+ *
+ * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the
+ * reference blocks may be loaded from disk or dirty reference blocks may be written out.
+ */
+static void drain_slab(struct vdo_slab *slab)
+{
+	bool save;
+	bool load;
+	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);
+
+	if (state == VDO_ADMIN_STATE_SUSPENDING)
+		return;
+
+	if ((state != VDO_ADMIN_STATE_REBUILDING) &&
+	    (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
+		commit_tail(&slab->journal);
+
+	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
+		return;
+
+	save = false;
+	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
+	if (state == VDO_ADMIN_STATE_SCRUBBING) {
+		if (load) {
+			load_reference_blocks(slab);
+			return;
+		}
+	} else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
+		if (!load) {
+			/* These reference counts were never written, so mark them all dirty. */
+			dirty_all_reference_blocks(slab);
+		}
+		save = true;
+	} else if (state == VDO_ADMIN_STATE_REBUILDING) {
+		/*
+		 * Write out the counters if the slab has written them before, or it has any
+		 * non-zero reference counts, or there are any slab journal blocks.
+		 */
+		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;
+
+		if (load || (slab->free_blocks != data_blocks) ||
+		    !is_slab_journal_blank(slab)) {
+			dirty_all_reference_blocks(slab);
+			save = true;
+		}
+	} else if (state == VDO_ADMIN_STATE_SAVING) {
+		save = (slab->status == VDO_SLAB_REBUILT);
+	} else {
+		vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
+		return;
+	}
+
+	if (save)
+		save_dirty_reference_blocks(slab);
+}
+
+static int allocate_slab_counters(struct vdo_slab *slab)
+{
+	int result;
+	size_t index, bytes;
+
+	result = ASSERT(slab->reference_blocks == NULL,
+			"vdo_slab %u doesn't allocate refcounts twice",
+			slab->slab_number);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_allocate(slab->reference_block_count, struct reference_block,
+			      __func__, &slab->reference_blocks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	/*
+	 * Allocate such that the runt slab has a full-length memory array, plus a little padding
+	 * so we can word-search even at the very end.
+	 */
+	bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
+	result = uds_allocate(bytes, vdo_refcount_t, "ref counts array",
+			      &slab->counters);
+	if (result != UDS_SUCCESS) {
+		uds_free(uds_forget(slab->reference_blocks));
+		return result;
+	}
+
+	slab->search_cursor.first_block = slab->reference_blocks;
+	slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1];
+	reset_search_cursor(slab);
+
+	for (index = 0; index < slab->reference_block_count; index++) {
+		slab->reference_blocks[index] = (struct reference_block) {
+			.slab = slab,
+		};
+	}
+
+	return VDO_SUCCESS;
+}
+
+static int allocate_counters_if_clean(struct vdo_slab *slab)
+{
+	if (vdo_is_state_clean_load(&slab->state))
+		return allocate_slab_counters(slab);
+
+	return VDO_SUCCESS;
+}
+
+static void finish_loading_journal(struct vdo_completion *completion)
+{
+	struct vio *vio = as_vio(completion);
+	struct slab_journal *journal = completion->parent;
+	struct vdo_slab *slab = journal->slab;
+	struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data;
+	struct slab_journal_block_header header;
+
+	vdo_unpack_slab_journal_block_header(&block->header, &header);
+
+	/* FIXME: should it be an error if the following conditional fails? */
+	if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
+	    (header.nonce == slab->allocator->nonce)) {
+		journal->tail = header.sequence_number + 1;
+
+		/*
+		 * If the slab is clean, this implies the slab journal is empty, so advance the
+		 * head appropriately.
+		 */
+		journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
+				 header.head : journal->tail);
+		journal->tail_header = header;
+		initialize_journal_state(journal);
+	}
+
+	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
+}
+
+static void read_slab_journal_tail_endio(struct bio *bio)
+{
+	struct vio *vio = bio->bi_private;
+	struct slab_journal *journal = vio->completion.parent;
+
+	continue_vio_after_io(vio, finish_loading_journal,
+			      journal->slab->allocator->thread_id);
+}
+
+static void handle_load_error(struct vdo_completion *completion)
+{
+	int result = completion->result;
+	struct slab_journal *journal = completion->parent;
+	struct vio *vio = as_vio(completion);
+
+	vio_record_metadata_io_error(vio);
+	return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
+	vdo_finish_loading_with_result(&journal->slab->state, result);
+}
+
+/**
+ * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
+ *                            pool.
+ * @waiter: The vio pool waiter which has just been notified.
+ * @context: The vio pool entry given to the waiter.
+ *
+ * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
+ */
+static void read_slab_journal_tail(struct waiter *waiter, void *context)
+{
+	struct slab_journal *journal =
+		container_of(waiter, struct slab_journal, resource_waiter);
+	struct vdo_slab *slab = journal->slab;
+	struct pooled_vio *pooled = context;
+	struct vio *vio = &pooled->vio;
+	tail_block_offset_t last_commit_point =
+		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+
+	/*
+	 * Slab summary keeps the commit point offset, so the tail block is the block before that.
+	 * Calculation supports small journals in unit tests.
+	 */
+	tail_block_offset_t tail_block = ((last_commit_point == 0) ?
+					  (tail_block_offset_t)(journal->size - 1) :
+					  (last_commit_point - 1));
+
+	vio->completion.parent = journal;
+	vio->completion.callback_thread_id = slab->allocator->thread_id;
+	submit_metadata_vio(vio, slab->journal_origin + tail_block,
+			    read_slab_journal_tail_endio, handle_load_error,
+			    REQ_OP_READ);
+}
+
+/**
+ * load_slab_journal() - Load a slab's journal by reading the journal's tail.
+ */
+static void load_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	tail_block_offset_t last_commit_point;
+
+	last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
+	if ((last_commit_point == 0) &&
+	    !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
+		/*
+		 * This slab claims that it has a tail block at (journal->size - 1), but a head of
+		 * 1. This is impossible, due to the scrubbing threshold, on a real system, so
+		 * don't bother reading the (bogus) data off disk.
+		 */
+		ASSERT_LOG_ONLY(((journal->size < 16) ||
+				 (journal->scrubbing_threshold < (journal->size - 1))),
+				"Scrubbing threshold protects against reads of unwritten slab journal blocks");
+		vdo_finish_loading_with_result(&slab->state,
+					       allocate_counters_if_clean(slab));
+		return;
+	}
+
+	journal->resource_waiter.callback = read_slab_journal_tail;
+	acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
+}
+
+static void free_slab(struct vdo_slab *slab)
+{
+	if (slab == NULL)
+		return;
+
+	list_del(&slab->allocq_entry);
+	uds_free(uds_forget(slab->journal.block));
+	uds_free(uds_forget(slab->journal.locks));
+	uds_free(uds_forget(slab->counters));
+	uds_free(uds_forget(slab->reference_blocks));
+	uds_free(slab);
+}
+
+static int initialize_slab_journal(struct vdo_slab *slab)
+{
+	struct slab_journal *journal = &slab->journal;
+	const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
+	int result;
+
+	result = uds_allocate(slab_config->slab_journal_blocks, struct journal_lock,
+			      __func__, &journal->locks);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	result = uds_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block",
+			      (char **) &journal->block);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	journal->slab = slab;
+	journal->size = slab_config->slab_journal_blocks;
+	journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
+	journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
+	journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
+	journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
+	journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
+	journal->events = &slab->allocator->slab_journal_statistics;
+	journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
+	journal->tail = 1;
+	journal->head = 1;
+
+	journal->flushing_deadline = journal->flushing_threshold;
+	/*
+	 * Set there to be some time between the deadline and the blocking threshold, so that
+	 * hopefully all are done before blocking.
+	 */
+	if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
+		journal->flushing_deadline = journal->blocking_threshold - 5;
+
+	journal->slab_summary_waiter.callback = release_journal_locks;
+
+	INIT_LIST_HEAD(&journal->dirty_entry);
+	INIT_LIST_HEAD(&journal->uncommitted_blocks);
+
+	journal->tail_header.nonce = slab->allocator->nonce;
+	journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
+	initialize_journal_state(journal);
+	return VDO_SUCCESS;
+}
+
+/**
+ * make_slab() - Construct a new, empty slab.
+ * @slab_origin: The physical block number within the block allocator partition of the first block
+ *               in the slab.
+ * @allocator: The block allocator to which the slab belongs.
+ * @slab_number: The slab number of the slab.
+ * @is_new: true if this slab is being allocated as part of a resize.
+ * @slab_ptr: A pointer to receive the new slab.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+static int __must_check make_slab(physical_block_number_t slab_origin,
+				  struct block_allocator *allocator,
+				  slab_count_t slab_number, bool is_new,
+				  struct vdo_slab **slab_ptr)
+{
+	const struct slab_config *slab_config = &allocator->depot->slab_config;
+	struct vdo_slab *slab;
+	int result;
+
+	result = uds_allocate(1, struct vdo_slab, __func__, &slab);
+	if (result != VDO_SUCCESS)
+		return result;
+
+	*slab = (struct vdo_slab) {
+		.allocator = allocator,
+		.start = slab_origin,
+		.end = slab_origin + slab_config->slab_blocks,
+		.slab_number = slab_number,
+		.ref_counts_origin = slab_origin + slab_config->data_blocks,
+		.journal_origin =
+			vdo_get_slab_journal_start_block(slab_config, slab_origin),
+		.block_count = slab_config->data_blocks,
+		.free_blocks = slab_config->data_blocks,
+		.reference_block_count =
+			vdo_get_saved_reference_count_size(slab_config->data_blocks),
+	};
+	INIT_LIST_HEAD(&slab->allocq_entry);
+
+	result = initialize_slab_journal(slab);
+	if (result != VDO_SUCCESS) {
+		free_slab(slab);
+		return result;
+	}
+
+	if (is_new) {
+		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
+		result = allocate_slab_counters(slab);
+		if (result != VDO_SUCCESS) {
+			free_slab(slab);
+			return result;
+		}
+	} else {
+		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
+	}
+
+	*slab_ptr = slab;
+	return VDO_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h
new file mode 100644
index 000000000000..98a2c4135e05
--- /dev/null
+++ b/drivers/md/dm-vdo/slab-depot.h
@@ -0,0 +1,252 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_SLAB_DEPOT_H
+#define VDO_SLAB_DEPOT_H
+
+#include <linux/atomic.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/list.h>
+
+#include "numeric.h"
+
+#include "admin-state.h"
+#include "completion.h"
+#include "data-vio.h"
+#include "encodings.h"
+#include "physical-zone.h"
+#include "priority-table.h"
+#include "recovery-journal.h"
+#include "statistics.h"
+#include "types.h"
+#include "vio.h"
+#include "wait-queue.h"
+
+/*
+ * Represents the possible status of a block.
+ */
+enum reference_status {
+	RS_FREE, /* this block is free */
+	RS_SINGLE, /* this block is singly-referenced */
+	RS_SHARED, /* this block is shared */
+	RS_PROVISIONAL /* this block is provisionally allocated */
+};
+
+struct vdo_slab;
+
+struct journal_lock {
+	u16 count;
+	sequence_number_t recovery_start;
+};
+
+struct slab_journal {
+	/* A waiter object for getting a VIO pool entry */
+	struct waiter resource_waiter;
+	/* A waiter object for updating the slab summary */
+	struct waiter slab_summary_waiter;
+	/* A waiter object for getting a vio with which to flush */
+	struct waiter flush_waiter;
+	/* The queue of VIOs waiting to make an entry */
+	struct wait_queue entry_waiters;
+	/* The parent slab reference of this journal */
+	struct vdo_slab *slab;
+
+	/* Whether a tail block commit is pending */
+	bool waiting_to_commit;
+	/* Whether the journal is updating the slab summary */
+	bool updating_slab_summary;
+	/* Whether the journal is adding entries from the entry_waiters queue */
+	bool adding_entries;
+	/* Whether a partial write is in progress */
+	bool partial_write_in_progress;
+
+	/* The oldest block in the journal on disk */
+	sequence_number_t head;
+	/* The oldest block in the journal which may not be reaped */
+	sequence_number_t unreapable;
+	/* The end of the half-open interval of the active journal */
+	sequence_number_t tail;
+	/* The next journal block to be committed */
+	sequence_number_t next_commit;
+	/* The tail sequence number that is written in the slab summary */
+	sequence_number_t summarized;
+	/* The tail sequence number that was last summarized in slab summary */
+	sequence_number_t last_summarized;
+
+	/* The sequence number of the recovery journal lock */
+	sequence_number_t recovery_lock;
+
+	/*
+	 * The number of entries which fit in a single block. Can't use the constant because unit
+	 * tests change this number.
+	 */
+	journal_entry_count_t entries_per_block;
+	/*
+	 * The number of full entries which fit in a single block. Can't use the constant because
+	 * unit tests change this number.
+	 */
+	journal_entry_count_t full_entries_per_block;
+
+	/* The recovery journal of the VDO (slab journal holds locks on it) */
+	struct recovery_journal *recovery_journal;
+
+	/* The statistics shared by all slab journals in our physical zone */
+	struct slab_journal_statistics *events;
+	/* A list of the VIO pool entries for outstanding journal block writes */
+	struct list_head uncommitted_blocks;
+
+	/*
+	 * The current tail block header state. This will be packed into the block just before it
+	 * is written.
+	 */
+	struct slab_journal_block_header tail_header;
+	/* A pointer to a block-sized buffer holding the packed block data */
+	struct packed_slab_journal_block *block;
+
+	/* The number of blocks in the on-disk journal */
+	block_count_t size;
+	/* The number of blocks at which to start pushing reference blocks */
+	block_count_t flushing_threshold;
+	/* The number of blocks at which all reference blocks should be writing */
+	block_count_t flushing_deadline;
+	/* The number of blocks at which to wait for reference blocks to write */
+	block_count_t blocking_threshold;
+	/* The number of blocks at which to scrub the slab before coming online */
+	block_count_t scrubbing_threshold;
+
+	/* This list entry is for block_allocator to keep a queue of dirty journals */
+	struct list_head dirty_entry;
+
+	/* The lock for the oldest unreaped block of the journal */
+	struct journal_lock *reap_lock;
+	/* The locks for each on disk block */
+	struct journal_lock *locks;
+};
+
+/*
+ * Reference_block structure
+ *
+ * Blocks are used as a proxy, permitting saves of partial refcounts.
+ */
+struct reference_block {
+	/* This block waits on the ref_counts to tell it to write */
+	struct waiter waiter;
+	/* The slab to which this reference_block belongs */
+	struct vdo_slab *slab;
+	/* The number of references in this block that represent allocations */
+	block_size_t allocated_count;
+	/* The slab journal block on which this block must hold a lock */
+	sequence_number_t slab_journal_lock;
+	/* The slab journal block which should be released when this block is committed */
+	sequence_number_t slab_journal_lock_to_release;
+	/* The point up to which each sector is accurate on disk */
+	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
+	/* Whether this block has been modified since it was written to disk */
+	bool is_dirty;
+	/* Whether this block is currently writing */
+	bool is_writing;
+};
+
+/* The search_cursor represents the saved position of a free block search. */
+struct search_cursor {
+	/* The reference block containing the current search index */
+	struct reference_block *block;
+	/* The position at which to start searching for the next free counter */
+	slab_block_number index;
+	/* The position just past the last valid counter in the current block */
+	slab_block_number end_index;
+
+	/* A pointer to the first reference block in the slab */
+	struct reference_block *first_block;
+	/* A pointer to the last reference block in the slab */
+	struct reference_block *last_block;
+};
+
+enum slab_rebuild_status {
+	VDO_SLAB_REBUILT,
+	VDO_SLAB_REPLAYING,
+	VDO_SLAB_REQUIRES_SCRUBBING,
+	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
+	VDO_SLAB_REBUILDING,
+};
+
+/*
+ * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
+ * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
+ * metadata storage for the reference counts and slab journal for the slab.
+ *
+ * A reference count is maintained for each physical block number. The vast majority of blocks have
+ * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
+ * (254) the reference count is stored in counters[pbn].
+ */
+struct vdo_slab {
+	/* A list entry to queue this slab in a block_allocator list */
+	struct list_head allocq_entry;
+
+	/* The struct block_allocator that owns this slab */
+	struct block_allocator *allocator;
+
+	/* The journal for this slab */
+	struct slab_journal journal;
+
+	/* The slab number of this slab */
+	slab_count_t slab_number;
+	/* The offset in the allocator partition of the first block in this slab */
+	physical_block_number_t start;
+	/* The offset of the first block past the end of this slab */
+	physical_block_number_t end;
+	/* The starting translated PBN of the slab journal */
+	physical_block_number_t journal_origin;
+	/* The starting translated PBN of the reference counts */
+	physical_block_number_t ref_counts_origin;
+
+	/* The administrative state of the slab */
+	struct admin_state state;
+	/* The status of the slab */
+	enum slab_rebuild_status status;
+	/* Whether the slab was ever queued for scrubbing */
+	bool was_queued_for_scrubbing;
+
+	/* The priority at which this slab has been queued for allocation */
+	u8 priority;
+
+	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
+	/* The size of the counters array */
+	u32 block_count;
+	/* The number of free blocks */
+	u32 free_blocks;
+	/* The array of reference counts */
+	vdo_refcount_t *counters; /* use uds_allocate() to align data ptr */
+
+	/* The saved block pointer and array indexes for the free block search */
+	struct search_cursor search_cursor;
+
+	/* A list of the dirty blocks waiting to be written out */
+	struct wait_queue dirty_blocks;
+	/* The number of blocks which are currently writing */
+	size_t active_count;
+
+	/* A waiter object for updating the slab summary */
+	struct waiter summary_waiter;
+
+	/* The latest slab journal for which there has been a reference count update */
+	struct journal_point slab_journal_point;
+
+	/* The number of reference count blocks */
+	u32 reference_block_count;
+	/* reference count block array */
+	struct reference_block *reference_blocks;
+};
+
+bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
+					       physical_block_number_t pbn,
+					       enum journal_operation operation,
+					       bool increment,
+					       struct journal_point *recovery_point,
+					       struct vdo_completion *parent);
+
+void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);
+
+#endif /* VDO_SLAB_DEPOT_H */