File libtorrent-rasterbar_PR7013.patch of Package libtorrent-rasterbar

From db932d9fa4efaa4e3927015f78494d29ed9d6751 Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sun, 17 Jul 2022 18:15:49 -0700
Subject: [PATCH 1/7] add new multi-threaded disk I/O subsystem using preadv
 and pwritev

---
 CMakeLists.txt                                |    5 +
 ChangeLog                                     |    1 +
 Jamfile                                       |    3 +
 Makefile                                      |    7 +
 bindings/python/src/session.cpp               |    3 +
 examples/client_test.cpp                      |    7 +-
 include/libtorrent/aux_/debug_disk_thread.hpp |    5 +
 include/libtorrent/aux_/disk_buffer_pool.hpp  |    3 +
 include/libtorrent/aux_/disk_cache.hpp        |  380 ++++
 .../libtorrent/aux_/disk_completed_queue.hpp  |    1 +
 include/libtorrent/aux_/disk_job.hpp          |    8 +
 include/libtorrent/aux_/disk_job_pool.hpp     |    2 +
 include/libtorrent/aux_/pread_disk_job.hpp    |   27 +
 include/libtorrent/aux_/pread_storage.hpp     |  187 ++
 include/libtorrent/aux_/store_buffer.hpp      |    7 +
 include/libtorrent/aux_/unique_ptr.hpp        |    7 +
 include/libtorrent/config.hpp                 |   10 +
 include/libtorrent/libtorrent.hpp             |    1 +
 include/libtorrent/pread_disk_io.hpp          |   28 +
 src/disk_buffer_pool.cpp                      |   11 +-
 src/disk_cache.cpp                            |  669 +++++++
 src/disk_completed_queue.cpp                  |   27 +
 src/disk_job.cpp                              |    2 +
 src/disk_job_pool.cpp                         |    2 +
 src/mmap_disk_io.cpp                          |    1 +
 src/pread_disk_io.cpp                         | 1748 +++++++++++++++++
 src/pread_storage.cpp                         |  797 ++++++++
 src/session.cpp                               |    5 +-
 src/settings_pack.cpp                         |    2 +-
 src/torrent.cpp                               |    8 +-
 test/test_add_torrent.cpp                     |    3 +-
 test/test_copy_file.cpp                       |    1 +
 test/test_file.cpp                            |    2 +
 test/test_storage.cpp                         |   80 +-
 test/test_torrent_info.cpp                    |    3 +-
 test/web_seed_suite.cpp                       |    6 +-
 tools/disk_io_stress_test.cpp                 |    5 +-
 tools/parse_session_stats.py                  |    3 +-
 tools/run_benchmark.py                        |    9 +-
 39 files changed, 4050 insertions(+), 26 deletions(-)
 create mode 100644 include/libtorrent/aux_/disk_cache.hpp
 create mode 100644 include/libtorrent/aux_/pread_disk_job.hpp
 create mode 100644 include/libtorrent/aux_/pread_storage.hpp
 create mode 100644 include/libtorrent/pread_disk_io.hpp
 create mode 100644 src/disk_cache.cpp
 create mode 100644 src/pread_disk_io.cpp
 create mode 100644 src/pread_storage.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31768b82e43..5cb31258987 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,6 +166,7 @@ set(libtorrent_aux_include_files
 	disable_warnings_pop.hpp
 	disable_warnings_push.hpp
 	disk_buffer_pool.hpp
+	disk_cache.hpp
 	disk_completed_queue.hpp
 	mmap_disk_job.hpp
 	disk_job.hpp
@@ -226,6 +227,7 @@ set(libtorrent_aux_include_files
 	portmap.hpp
 	posix_part_file.hpp
 	posix_storage.hpp
+	pread_disk_job.hpp
 	proxy_base.hpp
 	proxy_settings.hpp
 	puff.hpp
@@ -327,6 +329,7 @@ set(sources
 	disabled_disk_io.cpp
 	disk_buffer_holder.cpp
 	disk_buffer_pool.cpp
+	disk_cache.cpp
 	disk_completed_queue.cpp
 	disk_io_thread_pool.cpp
 	disk_job_fence.cpp
@@ -384,6 +387,8 @@ set(sources
 	posix_disk_io.cpp
 	posix_part_file.cpp
 	posix_storage.cpp
+	pread_disk_io.cpp
+	pread_storage.cpp
 	proxy_base.cpp
 	proxy_settings.cpp
 	puff.cpp
diff --git a/ChangeLog b/ChangeLog
index 15b0177afaf..e17b67f74e5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
 2.1.0 not released
 
+	* add a multi-threaded, pread()-based disk I/O backend (pread_disk_io)
 	* try harder to bind TCP and UDP sockets to the same port
 	* made disk_interface's status_t type a flags type
 	* optimize resume data format to use less space
diff --git a/Jamfile b/Jamfile
index f8985d16dad..4a8cc207f33 100644
--- a/Jamfile
+++ b/Jamfile
@@ -796,6 +796,7 @@ SOURCES =
 	directory
 	disk_buffer_holder
 	disk_buffer_pool
+	disk_cache
 	disk_completed_queue
 	disk_io_thread_pool
 	disabled_disk_io
@@ -900,6 +901,8 @@ SOURCES =
 	mmap
 	mmap_disk_io
 	mmap_storage
+	pread_disk_io
+	pread_storage
 	posix_disk_io
 	posix_part_file
 	posix_storage
diff --git a/Makefile b/Makefile
index 624eafefaa1..9f9d169f54a 100644
--- a/Makefile
+++ b/Makefile
@@ -321,6 +321,7 @@ SOURCES = \
   disabled_disk_io.cpp            \
   disk_buffer_holder.cpp          \
   disk_buffer_pool.cpp            \
+  disk_cache.cpp                  \
   disk_completed_queue.cpp        \
   disk_io_thread_pool.cpp         \
   disk_job_fence.cpp              \
@@ -379,6 +380,8 @@ SOURCES = \
   posix_disk_io.cpp               \
   posix_part_file.cpp             \
   posix_storage.cpp               \
+  pread_disk_io.cpp               \
+  pread_storage.cpp               \
   proxy_base.cpp                  \
   proxy_settings.cpp              \
   puff.cpp                        \
@@ -495,6 +498,7 @@ HEADERS = \
   piece_block.hpp              \
   portmap.hpp                  \
   posix_disk_io.hpp            \
+  pread_disk_io.hpp            \
   read_resume_data.hpp         \
   session.hpp                  \
   session_handle.hpp           \
@@ -560,6 +564,7 @@ HEADERS = \
   aux_/disable_warnings_pop.hpp     \
   aux_/disable_warnings_push.hpp    \
   aux_/disk_buffer_pool.hpp         \
+  aux_/disk_cache.hpp               \
   aux_/disk_completed_queue.hpp     \
   aux_/disk_io_thread_pool.hpp      \
   aux_/disk_job_fence.hpp           \
@@ -626,6 +631,8 @@ HEADERS = \
   aux_/portmap.hpp                  \
   aux_/posix_part_file.hpp          \
   aux_/posix_storage.hpp            \
+  aux_/pread_disk_job.hpp           \
+  aux_/pread_storage.hpp            \
   aux_/proxy_base.hpp               \
   aux_/proxy_settings.hpp           \
   aux_/puff.hpp                     \
diff --git a/bindings/python/src/session.cpp b/bindings/python/src/session.cpp
index 46048393e2f..e979398a58c 100644
--- a/bindings/python/src/session.cpp
+++ b/bindings/python/src/session.cpp
@@ -30,6 +30,7 @@
 
 #include <libtorrent/mmap_disk_io.hpp>
 #include <libtorrent/posix_disk_io.hpp>
+#include <libtorrent/pread_disk_io.hpp>
 
 namespace boost
 {
@@ -882,6 +883,8 @@ namespace
 #endif
         if (disk_io == "posix_disk_io_constructor")
             s.disk_io_constructor = &lt::posix_disk_io_constructor;
+        else if (disk_io == "pread_disk_io_constructor")
+            s.disk_io_constructor = &lt::pread_disk_io_constructor;
         else
             s.disk_io_constructor = &lt::default_disk_io_constructor;
     }
diff --git a/examples/client_test.cpp b/examples/client_test.cpp
index f93d5f07542..ec548cab97a 100644
--- a/examples/client_test.cpp
+++ b/examples/client_test.cpp
@@ -56,6 +56,7 @@ see LICENSE file.
 
 #include "libtorrent/mmap_disk_io.hpp"
 #include "libtorrent/posix_disk_io.hpp"
+#include "libtorrent/pread_disk_io.hpp"
 #include "libtorrent/disabled_disk_io.hpp"
 
 #include "torrent_view.hpp"
@@ -1347,7 +1348,7 @@ CLIENT OPTIONS
   -O                    print session stats counters to the log
   -1                    exit on first torrent completing (useful for benchmarks)
   -i <disk-io>          specify which disk I/O back-end to use. One of:
-                        mmap, posix, disabled
+                        mmap, posix, pread, disabled
 )"
 #ifdef TORRENT_UTP_LOG_ENABLE
 R"(
@@ -1561,6 +1562,10 @@ int main(int argc, char* argv[])
 #endif
 				if (arg == "posix"_sv)
 					params.disk_io_constructor = lt::posix_disk_io_constructor;
+#if TORRENT_HAVE_PREAD || defined TORRENT_WINDOWS
+				else if (arg == "pread"_sv)
+					params.disk_io_constructor = lt::pread_disk_io_constructor;
+#endif
 				else if (arg == "disabled"_sv)
 					params.disk_io_constructor = lt::disabled_disk_io_constructor;
 				else
diff --git a/include/libtorrent/aux_/debug_disk_thread.hpp b/include/libtorrent/aux_/debug_disk_thread.hpp
index a1b7a5b3608..f0f9f2865ae 100644
--- a/include/libtorrent/aux_/debug_disk_thread.hpp
+++ b/include/libtorrent/aux_/debug_disk_thread.hpp
@@ -24,6 +24,7 @@ see LICENSE file.
 #include <string>
 #include <sstream>
 #include <unordered_map>
+#include <thread>
 
 #include "libtorrent/aux_/disk_job.hpp"
 #include "libtorrent/disk_interface.hpp"
@@ -93,6 +94,10 @@ inline std::string print_job(aux::disk_job const& j)
 				<< " buf-offset:" << j.buffer_offset << " size:" << j.buffer_size << " )";
 		}
 
+		void operator()(job::kick_hasher const& j) const {
+			m_ss << "kick-hasher( piece:" << j.piece << " )";
+		}
+
 	private:
 		std::stringstream& m_ss;
 	};
diff --git a/include/libtorrent/aux_/disk_buffer_pool.hpp b/include/libtorrent/aux_/disk_buffer_pool.hpp
index 99d39c44506..e07eca52a0d 100644
--- a/include/libtorrent/aux_/disk_buffer_pool.hpp
+++ b/include/libtorrent/aux_/disk_buffer_pool.hpp
@@ -21,6 +21,7 @@ see LICENSE file.
 #include <mutex>
 #include <functional>
 #include <memory>
+#include <optional>
 
 #include "libtorrent/io_context.hpp"
 #include "libtorrent/span.hpp"
@@ -54,6 +55,8 @@ namespace aux {
 			return m_in_use;
 		}
 
+		std::optional<int> flush_request() const;
+
 		void set_settings(settings_interface const& sett);
 
 	private:
diff --git a/include/libtorrent/aux_/disk_cache.hpp b/include/libtorrent/aux_/disk_cache.hpp
new file mode 100644
index 00000000000..ad5a2b5d095
--- /dev/null
+++ b/include/libtorrent/aux_/disk_cache.hpp
@@ -0,0 +1,380 @@
+/*
+
+Copyright (c) 2023, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#ifndef TORRENT_DISK_CACHE
+#define TORRENT_DISK_CACHE
+
+#include <unordered_map>
+#include <mutex>
+
+#include "libtorrent/storage_defs.hpp"
+#include "libtorrent/aux_/scope_end.hpp"
+#include "libtorrent/aux_/alloca.hpp"
+#include "libtorrent/aux_/invariant_check.hpp"
+#include "libtorrent/aux_/pread_disk_job.hpp"
+#include "libtorrent/aux_/pread_storage.hpp"
+#include "libtorrent/aux_/disk_io_thread_pool.hpp" // for jobqueue_t
+#include "libtorrent/aux_/unique_ptr.hpp"
+#include "libtorrent/disk_buffer_holder.hpp"
+#include "libtorrent/hasher.hpp"
+
+#include "libtorrent/aux_/disable_warnings_push.hpp"
+#include <boost/functional/hash.hpp>
+
+#define BOOST_BIND_NO_PLACEHOLDERS
+
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/hashed_index.hpp>
+#include <boost/multi_index/sequenced_index.hpp>
+#include <boost/multi_index/mem_fun.hpp>
+#include <boost/multi_index/member.hpp>
+
+#include "libtorrent/aux_/disable_warnings_pop.hpp"
+
+
+namespace libtorrent::aux {
+
+namespace mi = boost::multi_index;
+
+// uniquely identifies a torrent and piece
+struct piece_location
+{
+	piece_location(storage_index_t const t, piece_index_t const p)
+		: torrent(t), piece(p) {}
+	storage_index_t torrent;
+	piece_index_t piece;
+	bool operator==(piece_location const& rhs) const
+	{
+		return std::tie(torrent, piece)
+			== std::tie(rhs.torrent, rhs.piece);
+	}
+
+	bool operator<(piece_location const& rhs) const
+	{
+		return std::tie(torrent, piece)
+			< std::tie(rhs.torrent, rhs.piece);
+	}
+};
+
+inline size_t hash_value(piece_location const& l)
+{
+	std::size_t ret = 0;
+	boost::hash_combine(ret, std::hash<storage_index_t>{}(l.torrent));
+	boost::hash_combine(ret, std::hash<piece_index_t>{}(l.piece));
+	return ret;
+}
+
+struct cached_block_entry
+{
+	span<char const> buf() const;
+
+	// once the write job has been executed, and we've flushed the buffer, we
+	// move it into buf_holder, to keep the buffer alive until any hash job
+	// has completed as well. The underlying data can still be accessed
+	// through buf(), but ownership moves from the pread_disk_job object to
+	// this buf_holder.
+	// TODO: save space by just storing the buffer pointer here. The
+	// cached_piece_entry could hold the pointer to the buffer pool to be able
+	// to free these on destruction
+	// we would still need to save the *size* of the block, to support the
+	// shorter last block of a torrent
+	disk_buffer_holder buf_holder;
+	pread_disk_job* write_job = nullptr;
+
+	bool flushed_to_disk = false;
+
+	// TODO: only allocate this field for v2 torrents
+	sha256_hash block_hash;
+};
+
+struct cached_piece_entry
+{
+	cached_piece_entry(piece_location const& loc
+		, int const num_blocks
+		, int const piece_size_v2);
+
+	span<cached_block_entry> get_blocks() const;
+
+	piece_location piece;
+
+	// this is set to true when the piece has been populated with all blocks.
+	// it makes the piece prioritized for flushing to disk, and is cleared
+	// once all blocks have been flushed
+	bool ready_to_flush = false;
+
+	// when this is true, there is a thread currently hashing blocks and
+	// updating the hash context in "ph".
+	bool hashing = false;
+
+	// when a thread is writing this piece to disk, this is true. Only one
+	// thread at a time should be flushing a piece to disk.
+	bool flushing = false;
+
+	// this is set to true if the piece hash has been computed and returned
+	// to the bittorrent engine.
+	bool piece_hash_returned = false;
+
+	// v1_hashes is set if this piece belongs to a v1 (or hybrid) torrent and
+	// we need to compute the SHA-1 piece hash. v2_hashes is set if it
+	// belongs to a v2 (or hybrid) torrent, in which case the block_hash
+	// member of cached_block_entry is used and we need to compute the
+	// per-block hashes as well
+	bool v1_hashes = false;
+	bool v2_hashes = false;
+
+	// if this is a v2 torrent, this is the exact size of this piece. The
+	// end-piece of each file may be truncated for v2 torrents
+	int piece_size2;
+
+	int blocks_in_piece = 0;
+
+	// the number of blocks that have been hashed so far. Specifically for the
+	// v1 SHA1 hash of the piece, so all blocks are contiguous starting at block
+	// 0.
+	int hasher_cursor = 0;
+
+	// the number of contiguous blocks, starting at 0, that have been flushed to
+	// disk so far. This is used to determine how many blocks are left to flush
+	// from this piece without requiring read-back to hash them, by
+	// subtracting flushed_cursor from hasher_cursor.
+	int flushed_cursor = 0;
+
+	// the number of blocks that have a write job associated with them
+	int num_jobs = 0;
+
+	// returns the number of blocks in this piece that have been hashed and
+	// are ready to be flushed without requiring reading them back in the
+	// future.
+	int cheap_to_flush() const
+	{
+		return int(hasher_cursor) - int(flushed_cursor);
+	}
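+	// e.g. with hasher_cursor == 5 and flushed_cursor == 2, blocks 2..4
+	// have been hashed but not yet flushed: those 3 blocks can be written
+	// out without ever needing to be read back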
+
+	unique_ptr<cached_block_entry[]> blocks;
+
+	hasher ph;
+
+	// if there is a hash_job set on this piece, whenever we complete hashing
+	// the last block, we should post this
+	pread_disk_job* hash_job = nullptr;
+
+	// if the piece has been requested to be cleared, but it was locked
+	// (flushing) at the time. We hang this job here to complete it once the
+	// thread currently flushing is done with it
+	pread_disk_job* clear_piece = nullptr;
+};
+
+struct disk_cache
+{
+	using piece_container = mi::multi_index_container<
+		cached_piece_entry,
+		mi::indexed_by<
+		// look up ranges of pieces by (torrent, piece-index)
+		mi::ordered_unique<mi::member<cached_piece_entry, piece_location, &cached_piece_entry::piece>>,
+		// ordered by the number of contiguous blocks we can flush without
+		// read-back. large numbers are ordered first
+		mi::ordered_non_unique<mi::const_mem_fun<cached_piece_entry, int, &cached_piece_entry::cheap_to_flush>, std::greater<void>>,
+		// ordered by whether the piece is ready to be flushed or not
+		// true is ordered before false
+		mi::ordered_non_unique<mi::member<cached_piece_entry, bool, &cached_piece_entry::ready_to_flush>, std::greater<void>>,
+		// hash-table lookup of individual pieces. faster than index 0
+		mi::hashed_unique<mi::member<cached_piece_entry, piece_location, &cached_piece_entry::piece>>
+		>
+	>;
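+	// the four indices above are accessed as (illustrative):
+	//   m_pieces.get<0>()  // ordered by (torrent, piece): range lookups
+	//   m_pieces.get<1>()  // pieces with most cheap-to-flush blocks first
+	//   m_pieces.get<2>()  // pieces that are ready_to_flush first
+	//   m_pieces.get<3>()  // constant-time lookup of a single piece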
+
+	template <typename Fun>
+	bool get(piece_location const loc, int const block_idx, Fun f) const
+	{
+		std::unique_lock<std::mutex> l(m_mutex);
+
+		INVARIANT_CHECK;
+
+		auto& view = m_pieces.template get<0>();
+		auto i = view.find(loc);
+		if (i == view.end()) return false;
+
+		if (i->blocks[block_idx].buf().data())
+		{
+			// TODO: it would be nice if this could be called without holding
+			// the mutex. It would require being able to lock the piece
+			f(i->blocks[block_idx].buf());
+			return true;
+		}
+		return false;
+	}
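+
+	// usage sketch (illustrative; "dst" is a hypothetical caller buffer):
+	//   m_cache.get(loc, block_idx, [&](span<char const> b) {
+	//     std::memcpy(dst, b.data(), std::size_t(b.size())); });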
+
+	template <typename Fun>
+	sha256_hash hash2(piece_location const loc, int const block_idx, Fun f) const
+	{
+		std::unique_lock<std::mutex> l(m_mutex);
+
+		INVARIANT_CHECK;
+
+		auto& view = m_pieces.template get<0>();
+		auto i = view.find(loc);
+		if (i != view.end())
+		{
+			if (i->hashing)
+			{
+				// TODO: it would probably be more efficient to wait here.
+				// #error we should hang the hash job onto the piece. If there is a
+				// job already, form a queue
+				l.unlock();
+				return f();
+			}
+			auto const& cbe = i->blocks[block_idx];
+			// There's nothing stopping the hash threads from hashing the blocks in
+			// parallel. This should not depend on the hasher_cursor. That's a v1
+			// concept
+			if (i->hasher_cursor > block_idx)
+				return cbe.block_hash;
+			if (cbe.buf().data())
+			{
+				hasher256 h;
+				h.update(cbe.buf());
+				return h.final();
+			}
+		}
+		l.unlock();
+		return f();
+	}
+
+	// returns false if the piece is not in the cache
+	template <typename Fun>
+	bool hash_piece(piece_location const loc, Fun f)
+	{
+		std::unique_lock<std::mutex> l(m_mutex);
+
+		INVARIANT_CHECK;
+
+		auto& view = m_pieces.template get<0>();
+		auto piece_iter = view.find(loc);
+		if (piece_iter == view.end()) return false;
+
+		TORRENT_ALLOCA(blocks, char const*, piece_iter->blocks_in_piece);
+		TORRENT_ALLOCA(v2_hashes, sha256_hash, piece_iter->blocks_in_piece);
+
+		for (int i = 0; i < piece_iter->blocks_in_piece; ++i)
+		{
+			blocks[i] = piece_iter->blocks[i].buf().data();
+			v2_hashes[i] = piece_iter->blocks[i].block_hash;
+		}
+
+		view.modify(piece_iter, [](cached_piece_entry& e) { e.hashing = true; });
+		int const hasher_cursor = piece_iter->hasher_cursor;
+		l.unlock();
+
+		auto se = scope_end([&] {
+			l.lock();
+			view.modify(piece_iter, [&](cached_piece_entry& e) {
+				e.hashing = false;
+			});
+		});
+		f(const_cast<hasher&>(piece_iter->ph), hasher_cursor, blocks, v2_hashes);
+		return true;
+	}
+
+	// If the specified piece exists in the cache, and it's unlocked, clear all
+	// write jobs (return them in "aborted"). Returns true if the clear_piece
+	// job should be posted as complete. Returns false if the piece is locked by
+	// another thread, and the clear_piece job has been queued to be issued once
+	// the piece is unlocked.
+	bool try_clear_piece(piece_location const loc, pread_disk_job* j, jobqueue_t& aborted);
+
+	template <typename Fun>
+	int get2(piece_location const loc, int const block_idx, Fun f) const
+	{
+		std::unique_lock<std::mutex> l(m_mutex);
+
+		INVARIANT_CHECK;
+
+		auto& view = m_pieces.template get<0>();
+		auto i = view.find(loc);
+		if (i == view.end()) return 0;
+
+		char const* buf1 = i->blocks[block_idx].buf().data();
+		char const* buf2 = i->blocks[block_idx + 1].buf().data();
+
+		if (buf1 == nullptr && buf2 == nullptr)
+			return 0;
+
+		return f(buf1, buf2);
+	}
+
+	// returns true if this piece needs to have its hasher kicked
+	bool insert(piece_location const loc
+		, int const block_idx
+		, pread_disk_job* write_job);
+
+	enum hash_result: std::uint8_t
+	{
+		job_completed,
+		job_queued,
+		post_job,
+	};
+
+	// this call can have 3 outcomes:
+	// 1. the job is immediately satisfied and should be posted to the
+	//    completion queue
+	// 2. The piece is in the cache and currently hashing, but it's not done
+	//    yet. We hang the hash job on the piece itself so the hashing thread
+	//    can complete it when hashing finishes
+	// 3. The piece is not in the cache and should be posted to the disk thread
+	//    to read back the bytes.
+	hash_result try_hash_piece(piece_location const loc, pread_disk_job* hash_job);
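+	// caller dispatch sketch (illustrative; the helpers are hypothetical):
+	//   switch (m_cache.try_hash_piece(loc, j)) {
+	//     case disk_cache::job_completed: post_completion(j); break;
+	//     case disk_cache::job_queued: break; // hasher thread will post it
+	//     case disk_cache::post_job: queue_disk_job(j); break; // read back
+	//   }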
+
+	// this should be called from a hasher thread
+	void kick_hasher(piece_location const& loc, jobqueue_t& completed_jobs);
+
+	// this should be called by a disk thread
+	// the callback should return the number of blocks it successfully flushed
+	// to disk
+	void flush_to_disk(std::function<int(bitfield&, span<cached_block_entry const>, int)> f
+		, int const target_blocks
+		, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun);
+
+	void flush_storage(std::function<int(bitfield&, span<cached_block_entry const>, int)> f
+		, storage_index_t const storage
+		, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun);
+
+	std::size_t size() const;
+	std::size_t num_flushing() const;
+
+#if TORRENT_USE_INVARIANT_CHECKS
+	void check_invariant() const;
+#endif
+
+private:
+
+	// this requires the mutex to be locked
+	void clear_piece_impl(cached_piece_entry& cpe, jobqueue_t& aborted);
+
+	template <typename Iter, typename View>
+	Iter flush_piece_impl(View& view
+		, Iter piece_iter
+		, std::function<int(bitfield&, span<cached_block_entry const>, int)> const& f
+		, std::unique_lock<std::mutex>& l
+		, int const num_blocks
+		, span<cached_block_entry> const blocks
+		, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun);
+
+	mutable std::mutex m_mutex;
+	piece_container m_pieces;
+
+	// the number of *dirty* blocks in the cache. i.e. blocks that need to be
+	// flushed to disk. The cache may (briefly) hold more buffers than this
+	// while finishing hashing blocks.
+	int m_blocks = 0;
+
+	// the number of blocks currently being flushed by a disk thread.
+	// we use this to avoid over-shooting the flush target
+	int m_flushing_blocks = 0;
+};
+
+}
+
+#endif
+
diff --git a/include/libtorrent/aux_/disk_completed_queue.hpp b/include/libtorrent/aux_/disk_completed_queue.hpp
index 2a307fa6014..cf13c2138f6 100644
--- a/include/libtorrent/aux_/disk_completed_queue.hpp
+++ b/include/libtorrent/aux_/disk_completed_queue.hpp
@@ -26,6 +26,7 @@ struct disk_completed_queue
 	{}
 
 	void abort_job(io_context& ioc, aux::disk_job* j);
+	void abort_jobs(io_context& ioc, jobqueue_t jobs);
 	void append(io_context& ioc, jobqueue_t jobs);
 
 private:
diff --git a/include/libtorrent/aux_/disk_job.hpp b/include/libtorrent/aux_/disk_job.hpp
index 78197185556..dc8d793dd33 100644
--- a/include/libtorrent/aux_/disk_job.hpp
+++ b/include/libtorrent/aux_/disk_job.hpp
@@ -44,6 +44,7 @@ namespace libtorrent::aux {
 		, file_priority
 		, clear_piece
 		, partial_read
+		, kick_hasher
 		, num_job_ids
 	};
 
@@ -234,6 +235,12 @@ namespace job {
 		// the piece to clear
 		piece_index_t piece;
 	};
+
+	struct kick_hasher
+	{
+		// the piece whose hasher to kick
+		piece_index_t piece;
+	};
 }
 
 	// disk_job is a generic base class to disk io subsystem-specifit jobs (e.g.
@@ -285,6 +292,7 @@ namespace job {
 			, job::file_priority
 			, job::clear_piece
 			, job::partial_read
+			, job::kick_hasher
 		> action;
 
 		// the type of job this is
diff --git a/include/libtorrent/aux_/disk_job_pool.hpp b/include/libtorrent/aux_/disk_job_pool.hpp
index a9d108a60ef..b515b7cadd8 100644
--- a/include/libtorrent/aux_/disk_job_pool.hpp
+++ b/include/libtorrent/aux_/disk_job_pool.hpp
@@ -79,7 +79,9 @@ namespace aux {
 	};
 
 	struct mmap_disk_job;
+	struct pread_disk_job;
 	extern template struct disk_job_pool<aux::mmap_disk_job>;
+	extern template struct disk_job_pool<aux::pread_disk_job>;
 }
 }
 
diff --git a/include/libtorrent/aux_/pread_disk_job.hpp b/include/libtorrent/aux_/pread_disk_job.hpp
new file mode 100644
index 00000000000..fe9896b730b
--- /dev/null
+++ b/include/libtorrent/aux_/pread_disk_job.hpp
@@ -0,0 +1,27 @@
+/*
+
+Copyright (c) 2022, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#ifndef TORRENT_PREAD_DISK_JOB_HPP
+#define TORRENT_PREAD_DISK_JOB_HPP
+
+#include "libtorrent/aux_/disk_job.hpp"
+
+namespace libtorrent::aux {
+
+	struct pread_storage;
+
+	struct TORRENT_EXTRA_EXPORT pread_disk_job : disk_job
+	{
+		// the disk storage this job applies to (if applicable)
+		std::shared_ptr<pread_storage> storage;
+	};
+
+}
+
+#endif // TORRENT_PREAD_DISK_JOB_HPP
diff --git a/include/libtorrent/aux_/pread_storage.hpp b/include/libtorrent/aux_/pread_storage.hpp
new file mode 100644
index 00000000000..6c8f2b3ef01
--- /dev/null
+++ b/include/libtorrent/aux_/pread_storage.hpp
@@ -0,0 +1,187 @@
+/*
+
+Copyright (c) 2022, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#ifndef TORRENT_PREAD_STORAGE_HPP
+#define TORRENT_PREAD_STORAGE_HPP
+
+#include "libtorrent/config.hpp"
+
+#include <mutex>
+#include <memory>
+
+#include "libtorrent/fwd.hpp"
+#include "libtorrent/aux_/disk_job_fence.hpp"
+#include "libtorrent/storage_defs.hpp"
+#include "libtorrent/aux_/part_file.hpp"
+#include "libtorrent/aux_/stat_cache.hpp"
+#include "libtorrent/aux_/file_pool.hpp"
+#include "libtorrent/bitfield.hpp"
+#include "libtorrent/span.hpp"
+#include "libtorrent/aux_/vector.hpp"
+#include "libtorrent/aux_/open_mode.hpp" // for aux::open_mode_t
+#include "libtorrent/disk_interface.hpp" // for disk_job_flags_t
+
+namespace libtorrent::aux {
+
+	struct session_settings;
+	struct file_view;
+
+	struct TORRENT_EXTRA_EXPORT pread_storage
+		: std::enable_shared_from_this<pread_storage>
+		, aux::disk_job_fence
+	{
+		// constructs the pread_storage based on the given storage_params.
+		// ``file_pool`` is the cache of file handles that the storage will use.
+		// All files it opens will ask the file_pool to open them.
+		pread_storage(storage_params const& params, aux::file_pool&);
+
+		// hidden
+		~pread_storage();
+		pread_storage(pread_storage const&) = delete;
+		pread_storage& operator=(pread_storage const&) = delete;
+
+		void abort_jobs();
+
+		bool has_any_file(storage_error&);
+		void set_file_priority(settings_interface const&
+			, aux::vector<download_priority_t, file_index_t>& prio
+			, storage_error&);
+		void rename_file(file_index_t index, std::string const& new_filename
+			, storage_error&);
+		void release_files(storage_error&);
+		void delete_files(remove_flags_t options, storage_error&);
+		status_t initialize(settings_interface const&, storage_error&);
+		std::pair<status_t, std::string> move_storage(std::string save_path
+			, move_flags_t, storage_error&);
+		bool verify_resume_data(add_torrent_params const& rd
+			, aux::vector<std::string, file_index_t> const& links
+			, storage_error&);
+		bool tick();
+
+		int read(settings_interface const&, span<char> buffer
+			, piece_index_t piece, int offset, aux::open_mode_t mode
+			, disk_job_flags_t flags
+			, storage_error&);
+		int write(settings_interface const&, span<char> buffer
+			, piece_index_t piece, int offset, aux::open_mode_t mode
+			, disk_job_flags_t flags
+			, storage_error&);
+		int write(settings_interface const& sett
+			, span<span<char> const> buffers
+			, piece_index_t const piece, int offset
+			, open_mode_t const mode
+			, disk_job_flags_t const flags
+			, storage_error& error);
+		int hash(settings_interface const&, hasher& ph, std::ptrdiff_t len
+			, piece_index_t piece, int offset, aux::open_mode_t mode
+			, disk_job_flags_t flags, storage_error&);
+		int hash2(settings_interface const&, hasher256& ph, std::ptrdiff_t len
+			, piece_index_t piece, int offset, aux::open_mode_t mode
+			, disk_job_flags_t flags, storage_error&);
+
+		// if the files in this storage are mapped, returns the mapped
+		// file_storage, otherwise returns the original file_storage object.
+		file_storage const& files() const { return m_mapped_files ? *m_mapped_files : m_files; }
+
+		bool set_need_tick()
+		{
+			bool const prev = m_need_tick;
+			m_need_tick = true;
+			return prev;
+		}
+
+		void do_tick()
+		{
+			m_need_tick = false;
+			tick();
+		}
+
+		void set_owner(std::shared_ptr<void> const& tor) { m_torrent = tor; }
+
+		storage_index_t storage_index() const { return m_storage_index; }
+		void set_storage_index(storage_index_t st) { m_storage_index = st; }
+
+		bool v1() const { return m_v1; }
+		bool v2() const { return m_v2; }
+
+	private:
+
+		bool m_need_tick = false;
+		file_storage const& m_files;
+
+		// the reason for this to be a void pointer
+		// is to avoid creating a dependency on the
+		// torrent. This shared_ptr is here only
+		// to keep the torrent object alive until
+		// the storage destructs. This is because
+		// the file_storage object is owned by the torrent.
+		std::shared_ptr<void> m_torrent;
+
+		storage_index_t m_storage_index{0};
+
+		void need_partfile();
+
+		std::unique_ptr<file_storage> m_mapped_files;
+
+		// in order to avoid calling stat() on each file multiple times
+		// during startup, cache the results in here, and clear it all
+		// out once the torrent starts (to avoid getting stale results).
+		// each entry represents the size and timestamp of the file
+		mutable aux::stat_cache m_stat_cache;
+
+		// helper function to open a file in the file pool with the right mode
+		std::shared_ptr<aux::file_handle> open_file(settings_interface const&, file_index_t
+			, aux::open_mode_t, storage_error&) const;
+		std::shared_ptr<aux::file_handle> open_file_impl(settings_interface const&
+			, file_index_t, aux::open_mode_t, storage_error&) const;
+
+		bool use_partfile(file_index_t index) const;
+		void use_partfile(file_index_t index, bool b);
+
+		aux::vector<download_priority_t, file_index_t> m_file_priority;
+		std::string m_save_path;
+		std::string m_part_file_name;
+
+		// this is an array indexed by file-index. Each slot represents
+		// whether this file has the part-file enabled for it. This is used for
+		// backwards compatibility with pre-partfile versions of libtorrent. If
+		// this vector is empty, the default is that files *do* use the partfile.
+		// on startup, any 0-priority file that's found in its original location
+		// is expected to be an old-style (pre-partfile) torrent storage, and
+		// those files have their slot set to false in this vector.
+		// note that the vector is *sparse*: it's only allocated if a file has
+		// its entry set to false, and only for indices up to that entry.
+		aux::vector<bool, file_index_t> m_use_partfile;
+
+		// the file pool is a member of the disk_io_thread
+		// to make all storage instances share the pool
+		aux::file_pool& m_pool;
+
+		// used for skipped files
+		std::unique_ptr<part_file> m_part_file;
+
+		// this is a bitfield with one bit per file. A bit being set means
+		// we've written to that file previously. If we do write to a file
+		// whose bit is 0, we set the file size, to make the file allocated
+		// on disk (in full allocation mode) and just sparsely allocated in
+		// case of sparse allocation mode
+		mutable std::mutex m_file_created_mutex;
+		mutable typed_bitfield<file_index_t> m_file_created;
+
+		bool m_allocate_files;
+		// this is a v1 torrent
+		bool m_v1;
+		// this is a v2 torrent. If both v1 and v2 are set, it's a hybrid
+		// torrent
+		bool m_v2;
+	};
+
+}
+
+#endif // TORRENT_PREAD_STORAGE_HPP
diff --git a/include/libtorrent/aux_/store_buffer.hpp b/include/libtorrent/aux_/store_buffer.hpp
index 48a51448186..3f1cf0aa9dc 100644
--- a/include/libtorrent/aux_/store_buffer.hpp
+++ b/include/libtorrent/aux_/store_buffer.hpp
@@ -74,6 +74,13 @@ struct store_buffer
 		auto const it = m_store_buffer.find(loc);
 		if (it != m_store_buffer.end())
 		{
+			// TODO: it would be nice if this could be called without holding
+			// the mutex. It would require a reference counter on the store
+			// buffer entries, and that we potentially erase the entry after
+			// this call. It would also require the store buffer being able
+			// to take over ownership of the buffer when the owner erases it.
+			// Perhaps erase() could be made to take a buffer_holder, which is
+			// held onto if the refcount > 0
 			f(it->second);
 			return true;
 		}
diff --git a/include/libtorrent/aux_/unique_ptr.hpp b/include/libtorrent/aux_/unique_ptr.hpp
index cd490554e39..f892043bcba 100644
--- a/include/libtorrent/aux_/unique_ptr.hpp
+++ b/include/libtorrent/aux_/unique_ptr.hpp
@@ -31,6 +31,8 @@ namespace libtorrent { namespace aux {
 		unique_ptr() = default;
 		explicit unique_ptr(T* arr) : base(arr) {}
 
+		unique_ptr(base b): base(std::move(b)) {}
+
 		decltype(auto) operator[](IndexType idx) const
 		{
 			TORRENT_ASSERT(idx >= IndexType(0));
@@ -38,6 +40,11 @@ namespace libtorrent { namespace aux {
 		}
 	};
 
+	template <typename T, typename IndexType = std::ptrdiff_t>
+	unique_ptr<T, IndexType> make_unique(IndexType const num) {
+		static_assert(std::is_array_v<T>);
+		return unique_ptr<T, IndexType>(new std::remove_extent_t<T>[std::size_t(num)]);
+	}
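+
+	// usage sketch: aux::make_unique<cached_block_entry[], std::ptrdiff_t>(n)
+	// allocates n default-constructed elements, indexable by the strong
+	// index type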
 }}
 
 #endif
diff --git a/include/libtorrent/config.hpp b/include/libtorrent/config.hpp
index e106e6c3e77..270bfe3abf5 100644
--- a/include/libtorrent/config.hpp
+++ b/include/libtorrent/config.hpp
@@ -352,6 +352,11 @@ see LICENSE file.
 #define TORRENT_USE_IFCONF 1
 #define TORRENT_USE_GRTTABLE 1
 
+#ifndef TORRENT_HAVE_PREAD
+#define TORRENT_HAVE_PREAD 0
+#endif
+
+
 // ==== GNU/Hurd ===
 #elif defined __GNU__
 #define TORRENT_HURD
@@ -466,6 +471,11 @@ see LICENSE file.
 #define TORRENT_HAVE_MMAP 0
 #endif
 
+#ifndef TORRENT_HAVE_PREAD
+#define TORRENT_HAVE_PREAD 1
+#endif
+
+
 #ifndef TORRENT_HAVE_MAP_VIEW_OF_FILE
 #define TORRENT_HAVE_MAP_VIEW_OF_FILE 0
 #endif
diff --git a/include/libtorrent/libtorrent.hpp b/include/libtorrent/libtorrent.hpp
index 1d6a027b67a..f1b4f66dfa3 100644
--- a/include/libtorrent/libtorrent.hpp
+++ b/include/libtorrent/libtorrent.hpp
@@ -83,6 +83,7 @@
 #include "libtorrent/piece_block.hpp"
 #include "libtorrent/portmap.hpp"
 #include "libtorrent/posix_disk_io.hpp"
+#include "libtorrent/pread_disk_io.hpp"
 #include "libtorrent/random.hpp"
 #include "libtorrent/read_resume_data.hpp"
 #include "libtorrent/session.hpp"
diff --git a/include/libtorrent/pread_disk_io.hpp b/include/libtorrent/pread_disk_io.hpp
new file mode 100644
index 00000000000..b6ef36772c5
--- /dev/null
+++ b/include/libtorrent/pread_disk_io.hpp
@@ -0,0 +1,28 @@
+/*
+
+Copyright (c) 2022, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#ifndef TORRENT_PREAD_DISK_IO_HPP
+#define TORRENT_PREAD_DISK_IO_HPP
+
+#include "libtorrent/config.hpp"
+#include "libtorrent/disk_interface.hpp"
+#include "libtorrent/io_context.hpp"
+
+namespace libtorrent {
+
+	struct counters;
+	struct settings_interface;
+
+	// constructs a multi-threaded file disk I/O using pread()/pwrite()
+	TORRENT_EXPORT std::unique_ptr<disk_interface> pread_disk_io_constructor(
+		io_context& ios, settings_interface const&, counters& cnt);
+
+}
+
+#endif // TORRENT_PREAD_DISK_IO_HPP
diff --git a/src/disk_buffer_pool.cpp b/src/disk_buffer_pool.cpp
index 7c7004471ad..ab7205e82a2 100644
--- a/src/disk_buffer_pool.cpp
+++ b/src/disk_buffer_pool.cpp
@@ -16,6 +16,7 @@ see LICENSE file.
 #include "libtorrent/io_context.hpp"
 #include "libtorrent/disk_observer.hpp"
 #include "libtorrent/disk_interface.hpp" // for default_block_size
+#include "libtorrent/aux_/debug_disk_thread.hpp"
 
 #include "libtorrent/aux_/disable_warnings_push.hpp"
 
@@ -178,7 +179,7 @@ namespace {
 
 		int const pool_size = std::max(1, sett.get_int(settings_pack::max_queued_disk_bytes) / default_block_size);
 		m_max_use = pool_size;
-		m_low_watermark = m_max_use / 2;
+		m_low_watermark = std::max(2, m_max_use - 32);
 		if (m_in_use >= m_max_use && !m_exceeded_max_size)
 		{
 			m_exceeded_max_size = true;
@@ -189,6 +190,14 @@ namespace {
 #endif
 	}
 
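+	// returns how many in-use blocks we are above the low watermark, i.e.
+	// roughly how many blocks the disk threads should flush, or nullopt if
+	// we're below the watermark and no flushing is needed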
+	std::optional<int> disk_buffer_pool::flush_request() const
+	{
+		std::unique_lock<std::mutex> l(m_pool_mutex);
+		if (m_in_use >= m_low_watermark)
+			return m_in_use - m_low_watermark;
+		return std::nullopt;
+	}
+
 	void disk_buffer_pool::remove_buffer_in_use(char* buf)
 	{
 		TORRENT_UNUSED(buf);
diff --git a/src/disk_cache.cpp b/src/disk_cache.cpp
new file mode 100644
index 00000000000..63bd6192084
--- /dev/null
+++ b/src/disk_cache.cpp
@@ -0,0 +1,669 @@
+/*
+
+Copyright (c) 2023, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#include "libtorrent/aux_/disk_cache.hpp"
+
+namespace libtorrent::aux {
+
+namespace mi = boost::multi_index;
+
+namespace {
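+// compares a piece_location against a bare storage_index_t, so that
+// equal_range() on the (torrent, piece) ordered index can find all cached
+// pieces belonging to one storage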
+struct compare_storage
+{
+	bool operator()(piece_location const& lhs, storage_index_t const rhs) const
+	{
+		return lhs.torrent < rhs;
+	}
+
+	bool operator()(storage_index_t const lhs, piece_location const& rhs) const
+	{
+		return lhs < rhs.torrent;
+	}
+};
+
+bool have_buffers(span<const cached_block_entry> blocks)
+{
+	for (auto const& b : blocks)
+		if (b.buf().data() == nullptr) return false;
+	return true;
+}
+
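+// a piece is "ready to flush" when every block either has a pending write
+// job or has already been flushed to disk, and at least one write job
+// remains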
+bool compute_ready_to_flush(span<const cached_block_entry> blocks)
+{
+	bool has_job = false;
+	for (auto const& b : blocks)
+	{
+		has_job |= bool(b.write_job);
+		if (!b.write_job && !b.flushed_to_disk) return false;
+	}
+	return has_job;
+}
+
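+// the flushed cursor is the number of contiguous blocks, starting at
+// block 0, that have been flushed to disk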
+int compute_flushed_cursor(span<const cached_block_entry> blocks)
+{
+	int ret = 0;
+	for (auto const& b : blocks)
+	{
+		if (!b.flushed_to_disk) return ret;
+		++ret;
+	}
+	return ret;
+}
+
+#if TORRENT_USE_ASSERTS
+int count_jobs(span<const cached_block_entry> blocks)
+{
+	return static_cast<int>(std::count_if(blocks.begin(), blocks.end()
+		, [](cached_block_entry const& b) { return b.write_job; }));
+}
+#endif
+
+}
+
+span<char const> cached_block_entry::buf() const {
+	if (buf_holder)
+		return {buf_holder.data(), buf_holder.size()};
+
+	if (write_job != nullptr)
+	{
+		TORRENT_ASSERT(write_job->get_type() == aux::job_action_t::write);
+		auto const& job = std::get<job::write>(write_job->action);
+		return {job.buf.data(), job.buffer_size};
+	}
+	return {nullptr, 0};
+}
+
+cached_piece_entry::cached_piece_entry(piece_location const& loc, int const num_blocks, int const piece_size_v2)
+	: piece(loc)
+	, piece_size2(piece_size_v2)
+	, blocks_in_piece(num_blocks)
+	, blocks(aux::make_unique<cached_block_entry[], std::ptrdiff_t>(num_blocks))
+	, ph(hasher())
+{}
+
+span<cached_block_entry> cached_piece_entry::get_blocks() const
+{
+	return {blocks.get(), blocks_in_piece};
+}
+
+// If the specified piece exists in the cache, and it's unlocked, clear all
+// write jobs (return them in "aborted"). Returns true if the clear_piece
+// job should be posted as complete. Returns false if the piece is locked by
+// another thread, and the clear_piece job has been queued to be issued once
+// the piece is unlocked.
+bool disk_cache::try_clear_piece(piece_location const loc, pread_disk_job* j, jobqueue_t& aborted)
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
+	auto& view = m_pieces.template get<0>();
+	auto i = view.find(loc);
+	if (i == view.end()) return true;
+	if (i->flushing)
+	{
+		// postpone the clearing until we're done flushing
+		view.modify(i, [&](cached_piece_entry& e) { e.clear_piece = j; });
+		return false;
+	}
+
+	// we clear a piece after it fails the hash check. It doesn't make sense
+	// to be hashing still
+	TORRENT_ASSERT(!i->hashing);
+	if (i->hashing)
+	{
+		// postpone the clearing until we're done hashing
+		view.modify(i, [&](cached_piece_entry& e) { e.clear_piece = j; });
+		return false;
+	}
+
+	view.modify(i, [&](cached_piece_entry& e) {
+		clear_piece_impl(e, aborted);
+	});
+	return true;
+}
+
+// returns true if this piece needs to have its hasher kicked
+bool disk_cache::insert(piece_location const loc
+	, int const block_idx
+	, pread_disk_job* write_job)
+{
+	TORRENT_ASSERT(write_job != nullptr);
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
+	auto& view = m_pieces.template get<0>();
+	auto i = view.find(loc);
+	if (i == view.end())
+	{
+		pread_storage* storage = write_job->storage.get();
+		file_storage const& fs = storage->files();
+		int const blocks_in_piece = (storage->files().piece_size(loc.piece) + default_block_size - 1) / default_block_size;
+		int const piece_size2 = fs.piece_size2(loc.piece);
+		cached_piece_entry pe(loc, blocks_in_piece, piece_size2);
+		pe.v1_hashes = storage->v1();
+		pe.v2_hashes = storage->v2();
+		i = m_pieces.insert(std::move(pe)).first;
+	}
+
+	cached_block_entry& blk = i->blocks[block_idx];
+	TORRENT_ASSERT(!blk.buf_holder);
+	TORRENT_ASSERT(blk.write_job == nullptr);
+	TORRENT_ASSERT(blk.flushed_to_disk == false);
+	TORRENT_ASSERT(block_idx >= i->flushed_cursor);
+	TORRENT_ASSERT(block_idx >= i->hasher_cursor);
+
+	TORRENT_ASSERT(write_job->get_type() == aux::job_action_t::write);
+	blk.write_job = write_job;
+	++m_blocks;
+
+	bool const ready_to_flush = compute_ready_to_flush(i->get_blocks());
+	view.modify(i, [&](cached_piece_entry& e) {
+		e.ready_to_flush = ready_to_flush;
+		++e.num_jobs;
+	});
+
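+	// kick the hasher when the first block arrives (the v1 piece hash must
+	// be computed sequentially from block 0), or once the piece has all its
+	// blocks and is ready to flush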
+	return block_idx == 0 || ready_to_flush;
+}
+
+// this call can have 3 outcomes:
+// 1. the job is immediately satisfied and should be posted to the
+//    completion queue
+// 2. The piece is in the cache and currently hashing, but it's not done
+//    yet. We hang the hash job on the piece itself so the hashing thread
+//    can complete it when hashing finishes
+// 3. The piece is not in the cache and should be posted to the disk thread
+//    to read back the bytes.
+disk_cache::hash_result disk_cache::try_hash_piece(piece_location const loc, pread_disk_job* hash_job)
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
+	auto& view = m_pieces.template get<0>();
+	auto i = view.find(loc);
+	if (i == view.end()) return hash_result::post_job;
+
+	// we should only ask for the hash once
+	TORRENT_ASSERT(!i->piece_hash_returned);
+
+	if (!i->hashing && i->hasher_cursor == i->blocks_in_piece)
+	{
+		view.modify(i, [&](cached_piece_entry& e) {
+			e.piece_hash_returned = true;
+
+			job::hash& job = std::get<aux::job::hash>(hash_job->action);
+			job.piece_hash = e.ph.final();
+			if (!job.block_hashes.empty())
+			{
+				TORRENT_ASSERT(i->v2_hashes);
+				for (int idx = 0; idx < e.blocks_in_piece; ++idx)
+					job.block_hashes[idx] = e.blocks[idx].block_hash;
+			}
+		});
+		return hash_result::job_completed;
+	}
+
+	if (i->hashing
+		&& i->hasher_cursor < i->blocks_in_piece
+		&& have_buffers(i->get_blocks().subspan(i->hasher_cursor))
+		)
+	{
+		// We're not done hashing yet, let the hashing thread post the
+		// completion once it's done
+
+		// We don't expect to ever have simultaneous async_hash() requests
+		// for the same piece
+		TORRENT_ASSERT(i->hash_job == nullptr);
+		view.modify(i, [&](cached_piece_entry& e) { e.hash_job = hash_job; });
+		return hash_result::job_queued;
+	}
+
+	return hash_result::post_job;
+}
+
+// this should be called from a hasher thread
+void disk_cache::kick_hasher(piece_location const& loc, jobqueue_t& completed_jobs)
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
+	auto& view = m_pieces.template get<0>();
+	auto piece_iter = view.find(loc);
+	if (piece_iter == view.end())
+		return;
+
+	// some other thread beat us to it
+	if (piece_iter->hashing)
+		return;
+
+	TORRENT_ALLOCA(blocks_storage, span<char const>, piece_iter->blocks_in_piece);
+	int cursor = piece_iter->hasher_cursor;
+keep_going:
+	int block_idx = 0;
+	int end = cursor;
+	while (end < piece_iter->blocks_in_piece && piece_iter->blocks[end].buf().data())
+	{
+		blocks_storage[block_idx] = piece_iter->blocks[end].buf();
+		++block_idx;
+		++end;
+	}
+	auto const blocks = blocks_storage.first(block_idx);
+
+	hasher& ctx = const_cast<hasher&>(piece_iter->ph);
+
+	view.modify(piece_iter, [](cached_piece_entry& e) { e.hashing = true; });
+
+	bool const need_v1 = piece_iter->v1_hashes;
+	bool const need_v2 = piece_iter->v2_hashes;
+
+	l.unlock();
+
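+	// for v2 torrents the last piece of a file may be truncated, so
+	// piece_size2 can be smaller than blocks_in_piece * default_block_size.
+	// bytes_left tracks how much payload remains for the per-block v2 hashes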
+	int bytes_left = piece_iter->piece_size2 - (cursor * default_block_size);
+	for (auto& buf: blocks)
+	{
+		cached_block_entry& cbe = piece_iter->blocks[cursor];
+
+		if (need_v1)
+			ctx.update(buf);
+
+		if (need_v2 && bytes_left > 0)
+		{
+			int const this_block_size = std::min(bytes_left, default_block_size);
+			cbe.block_hash = hasher256(buf.first(this_block_size)).final();
+			bytes_left -= default_block_size;
+		}
+
+		++cursor;
+	}
+
+	l.lock();
+	for (auto& cbe : piece_iter->get_blocks().subspan(piece_iter->hasher_cursor, block_idx))
+	{
+		// TODO: free these in bulk, acquiring the mutex just once, and
+		// free them after releasing the cache mutex, l
+		if (cbe.buf_holder)
+			cbe.buf_holder.reset();
+	}
+
+	view.modify(piece_iter, [&](cached_piece_entry& e) {
+		e.hasher_cursor = cursor;
+		e.hashing = false;
+	});
+
+	if (cursor != piece_iter->blocks_in_piece)
+	{
+		// if some other thread added the next block, keep going
+		if (piece_iter->blocks[cursor].buf().data())
+			goto keep_going;
+	}
+
+	if (!piece_iter->hash_job) return;
+
+	// there's a hash job hung on this piece, post it now
+	pread_disk_job* j = nullptr;
+	span<cached_block_entry> const cached_blocks = piece_iter->get_blocks();
+	view.modify(piece_iter, [&cached_blocks, &j](cached_piece_entry& e) {
+		j = std::exchange(e.hash_job, nullptr);
+		e.ready_to_flush = compute_ready_to_flush(cached_blocks);
+	});
+	// we've hashed all blocks, and there's a hash job associated with
+	// this piece, post it.
+	sha1_hash const piece_hash = ctx.final();
+
+	job::hash& job = std::get<job::hash>(j->action);
+	job.piece_hash = piece_hash;
+	if (!job.block_hashes.empty())
+	{
+		TORRENT_ASSERT(need_v2);
+		int const to_copy = std::min(
+			piece_iter->blocks_in_piece,
+			int(job.block_hashes.size()));
+		for (int i = 0; i < to_copy; ++i)
+			job.block_hashes[i] = piece_iter->blocks[i].block_hash;
+	}
+	completed_jobs.push_back(j);
+}
+
+template <typename Iter, typename View>
+Iter disk_cache::flush_piece_impl(View& view
+	, Iter piece_iter
+	, std::function<int(bitfield&, span<cached_block_entry const>, int)> const& f
+	, std::unique_lock<std::mutex>& l
+	, int const num_blocks
+	, span<cached_block_entry> const blocks
+	, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun)
+{
+	view.modify(piece_iter, [](cached_piece_entry& e) { TORRENT_ASSERT(!e.flushing); e.flushing = true; });
+	m_flushing_blocks += num_blocks;
+	TORRENT_ASSERT(num_blocks > 0);
+
+	int const hash_cursor = piece_iter->hasher_cursor;
+
+	// we have to release the lock while flushing, but since we set the
+	// "flushing" member to true, this piece is pinned to the cache
+	l.unlock();
+
+	int count = 0;
+	bitfield flushed_blocks;
+	{
+		auto se = scope_end([&] {
+			l.lock();
+			view.modify(piece_iter, [](cached_piece_entry& e) {
+				TORRENT_ASSERT(e.flushing);
+				e.flushing = false;
+			});
+			TORRENT_ASSERT(m_flushing_blocks >= num_blocks);
+			m_flushing_blocks -= num_blocks;
+		});
+		flushed_blocks.resize(int(blocks.size()));
+		flushed_blocks.clear_all();
+		count = f(flushed_blocks, blocks, hash_cursor);
+	}
+
+	// now that we hold the mutex again, we can update the entries for
+	// all the blocks that were flushed
+	int jobs = 0;
+	for (int i = 0; i < blocks.size(); ++i)
+	{
+		if (!flushed_blocks.get_bit(i)) continue;
+		cached_block_entry& blk = blocks[i];
+
+		auto* j = blk.write_job;
+		TORRENT_ASSERT(j);
+		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
+		blk.buf_holder = std::move(std::get<aux::job::write>(j->action).buf);
+		blk.flushed_to_disk = true;
+		TORRENT_ASSERT(blk.buf_holder);
+		// TODO: free these in bulk at the end, or something
+		if (i < hash_cursor)
+			blk.buf_holder.reset();
+
+		blk.write_job = nullptr;
+		++jobs;
+	}
+	auto next_iter = std::next(piece_iter);
+	view.modify(piece_iter, [&blocks, jobs](cached_piece_entry& e) {
+		e.flushed_cursor = compute_flushed_cursor(blocks);
+		e.ready_to_flush = compute_ready_to_flush(blocks);
+		TORRENT_ASSERT(e.num_jobs >= jobs);
+		e.num_jobs -= jobs;
+	});
+	TORRENT_ASSERT(count <= blocks.size());
+	TORRENT_ASSERT(m_blocks >= count);
+	m_blocks -= count;
+	if (piece_iter->clear_piece)
+	{
+		jobqueue_t aborted;
+		pread_disk_job* clear_piece = nullptr;
+		view.modify(piece_iter, [&](cached_piece_entry& e) {
+			clear_piece_impl(e, aborted);
+			clear_piece = std::exchange(e.clear_piece, nullptr);
+		});
+		clear_piece_fun(std::move(aborted), clear_piece);
+	}
+
+	return next_iter;
+}
+
+// this should be called by a disk thread
+// the callback should return the number of blocks it successfully flushed
+// to disk
+void disk_cache::flush_to_disk(
+	std::function<int(bitfield&, span<cached_block_entry const>, int)> f
+	, int const target_blocks
+	, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun)
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
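+	// flushing happens in up to three passes, over three different indices:
+	// 1. pieces that are fully downloaded and hashed (ready_to_flush)
+	// 2. pieces with the most contiguous blocks that are cheap to flush
+	//    (hashed but not yet written out)
+	// 3. any remaining dirty blocks, even if they will require read-back
+	//    later to compute the piece hash
+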
+	// first we look for pieces that are ready to be flushed; those take
+	// priority
+	auto& view = m_pieces.template get<2>();
+	for (auto piece_iter = view.begin(); piece_iter != view.end();)
+	{
+		// We want to flush all pieces that are ready to flush regardless of
+		// the flush target. There's not much value in keeping them in RAM
+		// when we've completely downloaded the piece and hashed it, so we
+		// don't check the flush target in this loop
+
+		if (piece_iter->flushing)
+		{
+			++piece_iter;
+			continue;
+		}
+
+		if (!piece_iter->ready_to_flush)
+			break;
+
+		int const num_blocks = piece_iter->blocks_in_piece;
+		TORRENT_ASSERT(num_blocks >= 0);
+		if (num_blocks == 0)
+		{
+			++piece_iter;
+			continue;
+		}
+		span<cached_block_entry> const blocks = piece_iter->get_blocks();
+
+		auto const next_iter = flush_piece_impl(view, piece_iter, f, l
+			, num_blocks, blocks, clear_piece_fun);
+
+		if (piece_iter->piece_hash_returned)
+		{
+			TORRENT_ASSERT(!piece_iter->flushing);
+			TORRENT_ASSERT(!piece_iter->hashing);
+			view.erase(piece_iter);
+		}
+		piece_iter = next_iter;
+	}
+
+	// if we get here, we have to "force flush" some blocks even though we
+	// don't have all the blocks yet. Start by flushing pieces that have the
+	// most contiguous blocks to flush:
+	auto& view2 = m_pieces.template get<1>();
+	for (auto piece_iter = view2.begin(); piece_iter != view2.end();)
+	{
+		// We avoid flushing if other threads have already initiated a
+		// sufficient amount of flushing
+		if (m_blocks - m_flushing_blocks <= target_blocks)
+			return;
+
+		if (piece_iter->flushing)
+		{
+			++piece_iter;
+			continue;
+		}
+
+		int const num_blocks = piece_iter->hasher_cursor - piece_iter->flushed_cursor;
+		TORRENT_ASSERT(num_blocks >= 0);
+
+		// the pieces are ordered by the number of blocks that are cheap to
+		// flush (i.e. won't require read-back later). if we encounter a 0,
+		// all the remaining ones will also be zero
+		if (num_blocks <= 0) break;
+		span<cached_block_entry> const blocks = piece_iter->get_blocks().subspan(piece_iter->flushed_cursor);
+
+		piece_iter = flush_piece_impl(view2, piece_iter, f, l
+			, num_blocks, blocks, clear_piece_fun);
+	}
+
+	// we may still need to flush blocks at this point, even though flushing
+	// them now will require reading them back later to compute the piece hash
+	auto& view3 = m_pieces.template get<0>();
+	for (auto piece_iter = view3.begin(); piece_iter != view3.end();)
+	{
+		// We avoid flushing if other threads have already initiated a
+		// sufficient amount of flushing
+		if (m_blocks - m_flushing_blocks <= target_blocks)
+			return;
+
+		if (piece_iter->flushing)
+		{
+			++piece_iter;
+			continue;
+		}
+
+		int const num_blocks = piece_iter->num_jobs;
+		TORRENT_ASSERT(count_jobs(piece_iter->get_blocks()) == num_blocks);
+		if (num_blocks == 0)
+		{
+			++piece_iter;
+			continue;
+		}
+
+		span<cached_block_entry> const blocks = piece_iter->get_blocks();
+
+		piece_iter = flush_piece_impl(view3, piece_iter, f, l
+			, num_blocks, blocks, clear_piece_fun);
+	}
+}
+
+void disk_cache::flush_storage(std::function<int(bitfield&, span<cached_block_entry const>, int)> f
+	, storage_index_t const storage
+	, std::function<void(jobqueue_t, pread_disk_job*)> clear_piece_fun)
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+
+	INVARIANT_CHECK;
+
+	auto& range_view = m_pieces.template get<0>();
+	auto& view = m_pieces.template get<3>();
+	auto const [begin, end] = range_view.equal_range(storage, compare_storage());
+
+	std::vector<piece_index_t> pieces;
+	for (auto i = begin; i != end; ++i)
+		pieces.push_back(i->piece.piece);
+
+	bitfield flushed_blocks;
+
+	for (auto piece : pieces)
+	{
+		auto piece_iter = view.find(piece_location{storage, piece});
+		if (piece_iter == view.end())
+			continue;
+
+		// There's a risk that some other thread is flushing this piece, but
+		// won't force-flush it completely. In that case parts of the piece
+		// may not be flushed
+		// TODO: maybe we should track these pieces and synchronize with
+		// them later. maybe wait for them to be flushed or hang our job on
+		// them, but that would really only work if there's only one piece
+		// left
+		if (piece_iter->flushing)
+			continue;
+
+		int const num_blocks = piece_iter->num_jobs;
+		TORRENT_ASSERT(count_jobs(piece_iter->get_blocks()) == num_blocks);
+		if (num_blocks == 0) continue;
+		span<cached_block_entry> const blocks = piece_iter->get_blocks();
+
+		flush_piece_impl(view, piece_iter, f, l
+			, num_blocks, blocks, clear_piece_fun);
+
+		TORRENT_ASSERT(!piece_iter->flushing);
+		TORRENT_ASSERT(!piece_iter->hashing);
+		piece_iter = view.erase(piece_iter);
+	}
+}
+
+std::size_t disk_cache::size() const
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+	INVARIANT_CHECK;
+	return static_cast<std::size_t>(m_blocks);
+}
+
+std::size_t disk_cache::num_flushing() const
+{
+	std::unique_lock<std::mutex> l(m_mutex);
+	INVARIANT_CHECK;
+	return static_cast<std::size_t>(m_flushing_blocks);
+}
+
+#if TORRENT_USE_INVARIANT_CHECKS
+void disk_cache::check_invariant() const
+{
+	// mutex must be held by caller
+	int dirty_blocks = 0;
+	int flushing_blocks = 0;
+
+	auto& view = m_pieces.template get<2>();
+	for (auto const& piece_entry : view)
+	{
+		int const num_blocks = piece_entry.blocks_in_piece;
+
+		if (piece_entry.flushing)
+			flushing_blocks += num_blocks;
+
+		span<cached_block_entry> const blocks = piece_entry.get_blocks();
+
+		TORRENT_ASSERT(piece_entry.flushed_cursor <= num_blocks);
+		TORRENT_ASSERT(piece_entry.hasher_cursor <= num_blocks);
+
+		int idx = 0;
+		for (auto& be : blocks)
+		{
+			if (be.write_job) ++dirty_blocks;
+			// a block holds either a write job or buffer, never both
+			TORRENT_ASSERT(!(bool(be.write_job) && bool(be.buf_holder)));
+			if (be.write_job)
+				TORRENT_ASSERT(be.write_job->get_type() == aux::job_action_t::write);
+
+			if (idx < piece_entry.flushed_cursor)
+				TORRENT_ASSERT(be.write_job == nullptr);
+			else if (idx == piece_entry.flushed_cursor)
+				TORRENT_ASSERT(!be.buf_holder);
+
+//			if (idx < piece_entry.hasher_cursor)
+//				TORRENT_ASSERT(!be.buf_holder);
+
+			if (piece_entry.ready_to_flush)
+				TORRENT_ASSERT(be.write_job != nullptr || be.flushed_to_disk);
+			++idx;
+		}
+	}
+	// if one or more blocks are being flushed, we cannot know how many blocks
+	// are in flight. We just know the limit
+	TORRENT_ASSERT(dirty_blocks == m_blocks);
+	TORRENT_ASSERT(m_flushing_blocks <= flushing_blocks);
+}
+#endif
+
+// this requires the mutex to be locked
+void disk_cache::clear_piece_impl(cached_piece_entry& cpe, jobqueue_t& aborted)
+{
+	TORRENT_ASSERT(!cpe.flushing);
+	TORRENT_ASSERT(!cpe.hashing);
+	int jobs = 0;
+	for (int idx = 0; idx < cpe.blocks_in_piece; ++idx)
+	{
+		auto& cbe = cpe.blocks[idx];
+		if (cbe.write_job)
+		{
+			aborted.push_back(cbe.write_job);
+			cbe.write_job = nullptr;
+			cbe.flushed_to_disk = false;
+			++jobs;
+			--m_blocks;
+		}
+		cbe.buf_holder.reset();
+	}
+	cpe.ready_to_flush = false;
+	cpe.piece_hash_returned = false;
+	cpe.hasher_cursor = 0;
+	cpe.flushed_cursor = 0;
+	TORRENT_ASSERT(cpe.num_jobs >= jobs);
+	cpe.num_jobs -= jobs;
+	cpe.ph = hasher{};
+}
+
+}
diff --git a/src/disk_completed_queue.cpp b/src/disk_completed_queue.cpp
index 18429b1cb16..552a3415f55 100644
--- a/src/disk_completed_queue.cpp
+++ b/src/disk_completed_queue.cpp
@@ -42,6 +42,33 @@ void disk_completed_queue::abort_job(io_context& ioc, aux::disk_job* j)
 	}
 }
 
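+// fail every job in "jobs" with operation_aborted and post them to the
+// completion handlers in a single batch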
+void disk_completed_queue::abort_jobs(io_context& ioc, jobqueue_t jobs)
+{
+	if (jobs.empty()) return;
+
+	for (auto i = jobs.iterate(); i.get(); i.next())
+	{
+		auto* j = i.get();
+		j->ret = disk_status::fatal_disk_error;
+		j->error = storage_error(boost::asio::error::operation_aborted);
+		j->flags |= aux::disk_job::aborted;
+#if TORRENT_USE_ASSERTS
+		TORRENT_ASSERT(j->job_posted == false);
+		j->job_posted = true;
+#endif
+	}
+	std::lock_guard<std::mutex> l(m_completed_jobs_mutex);
+	m_completed_jobs.append(std::move(jobs));
+
+	if (!m_job_completions_in_flight)
+	{
+		DLOG("posting job handlers (%d)\n", m_completed_jobs.size());
+
+		post(ioc, [this] { this->call_job_handlers(); });
+		m_job_completions_in_flight = true;
+	}
+}
+
 void disk_completed_queue::append(io_context& ioc, jobqueue_t jobs)
 {
 	std::lock_guard<std::mutex> l(m_completed_jobs_mutex);
diff --git a/src/disk_job.cpp b/src/disk_job.cpp
index 6d2a01d25ac..7cfedf1596f 100644
--- a/src/disk_job.cpp
+++ b/src/disk_job.cpp
@@ -97,6 +97,8 @@ namespace {
 			j.handler(std::move(j.buf), m_job.error);
 		}
 
+		void operator()(job::kick_hasher&) const {}
+
 	private:
 		disk_job& m_job;
 	};
diff --git a/src/disk_job_pool.cpp b/src/disk_job_pool.cpp
index 11f0571dbae..e08abb0cb73 100644
--- a/src/disk_job_pool.cpp
+++ b/src/disk_job_pool.cpp
@@ -10,6 +10,7 @@ see LICENSE file.
 
 #include "libtorrent/aux_/disk_job_pool.hpp"
 #include "libtorrent/aux_/mmap_disk_job.hpp"
+#include "libtorrent/aux_/pread_disk_job.hpp"
 
 namespace libtorrent {
 namespace aux {
@@ -69,5 +70,6 @@ namespace aux {
 	}
 
 	template struct disk_job_pool<aux::mmap_disk_job>;
+	template struct disk_job_pool<aux::pread_disk_job>;
 }
 }
diff --git a/src/mmap_disk_io.cpp b/src/mmap_disk_io.cpp
index dea6bf67ae8..ba6cfef722b 100644
--- a/src/mmap_disk_io.cpp
+++ b/src/mmap_disk_io.cpp
@@ -136,6 +136,7 @@ struct TORRENT_EXTRA_EXPORT mmap_disk_io final
 	// this submits all queued up jobs to the thread
 	void submit_jobs() override;
 
+	status_t do_job(aux::job::kick_hasher&, aux::mmap_disk_job*) { return status_t{}; }
 	status_t do_job(aux::job::partial_read& a, aux::mmap_disk_job* j);
 	status_t do_job(aux::job::read& a, aux::mmap_disk_job* j);
 	status_t do_job(aux::job::write& a, aux::mmap_disk_job* j);
diff --git a/src/pread_disk_io.cpp b/src/pread_disk_io.cpp
new file mode 100644
index 00000000000..63d4c58f3dd
--- /dev/null
+++ b/src/pread_disk_io.cpp
@@ -0,0 +1,1748 @@
+/*
+
+Copyright (c) 2022, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#include "libtorrent/config.hpp"
+
+#include "libtorrent/aux_/pread_storage.hpp"
+#include "libtorrent/pread_disk_io.hpp"
+#include "libtorrent/disk_buffer_holder.hpp"
+#include "libtorrent/aux_/throw.hpp"
+#include "libtorrent/error_code.hpp"
+#include "libtorrent/error.hpp"
+#include "libtorrent/aux_/disk_buffer_pool.hpp"
+#include "libtorrent/aux_/pread_disk_job.hpp"
+#include "libtorrent/performance_counters.hpp"
+#include "libtorrent/aux_/debug.hpp"
+#include "libtorrent/units.hpp"
+#include "libtorrent/hasher.hpp"
+#include "libtorrent/aux_/platform_util.hpp" // for set_thread_name
+#include "libtorrent/aux_/disk_job_pool.hpp"
+#include "libtorrent/aux_/disk_io_thread_pool.hpp"
+#include "libtorrent/aux_/disk_cache.hpp"
+#include "libtorrent/aux_/time.hpp"
+#include "libtorrent/add_torrent_params.hpp"
+#include "libtorrent/aux_/numeric_cast.hpp"
+#include "libtorrent/settings_pack.hpp"
+#include "libtorrent/aux_/storage_array.hpp"
+#include "libtorrent/aux_/disk_completed_queue.hpp"
+#include "libtorrent/aux_/debug_disk_thread.hpp"
+
+#include <functional>
+
+namespace libtorrent {
+namespace {
+
+aux::open_mode_t file_mode_for_job(aux::pread_disk_job* j)
+{
+	aux::open_mode_t ret = aux::open_mode::read_only;
+	if (!(j->flags & disk_interface::sequential_access)) ret |= aux::open_mode::random_access;
+	return ret;
+}
+
+#if TORRENT_USE_ASSERTS
+bool valid_flags(disk_job_flags_t const flags)
+{
+	return (flags & ~(disk_interface::force_copy
+			| disk_interface::sequential_access
+			| disk_interface::volatile_read
+			| disk_interface::v1_hash
+			| disk_interface::flush_piece))
+		== disk_job_flags_t{};
+}
+#endif
+
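+// runs the disk operation f and stores its status in the job, translating
+// any exception into a fatal_disk_error with a matching error code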
+template <typename Fun>
+void translate_error(aux::disk_job* j, Fun f)
+{
+	try
+	{
+		j->ret = f();
+	}
+	catch (boost::system::system_error const& err)
+	{
+		j->ret = disk_status::fatal_disk_error;
+		j->error.ec = err.code();
+		j->error.operation = operation_t::exception;
+	}
+	catch (std::bad_alloc const&)
+	{
+		j->ret = disk_status::fatal_disk_error;
+		j->error.ec = errors::no_memory;
+		j->error.operation = operation_t::exception;
+	}
+	catch (std::exception const&)
+	{
+		j->ret = disk_status::fatal_disk_error;
+		j->error.ec = boost::asio::error::fault;
+		j->error.operation = operation_t::exception;
+	}
+}
+
+} // anonymous namespace
+
+// this is a singleton consisting of the disk I/O thread pools and the
+// queues of disk I/O jobs
+struct TORRENT_EXTRA_EXPORT pread_disk_io final
+	: disk_interface
+{
+	pread_disk_io(io_context& ios, settings_interface const&, counters& cnt);
+#if TORRENT_USE_ASSERTS
+	~pread_disk_io() override;
+#endif
+
+	void settings_updated() override;
+	storage_holder new_torrent(storage_params const& params
+		, std::shared_ptr<void> const& owner) override;
+	void remove_torrent(storage_index_t) override;
+
+	void abort(bool wait) override;
+
+	void async_read(storage_index_t storage, peer_request const& r
+		, std::function<void(disk_buffer_holder, storage_error const&)> handler
+		, disk_job_flags_t flags = {}) override;
+	bool async_write(storage_index_t storage, peer_request const& r
+		, char const* buf, std::shared_ptr<disk_observer> o
+		, std::function<void(storage_error const&)> handler
+		, disk_job_flags_t flags = {}) override;
+	void async_hash(storage_index_t storage, piece_index_t piece, span<sha256_hash> v2
+		, disk_job_flags_t flags
+		, std::function<void(piece_index_t, sha1_hash const&, storage_error const&)> handler) override;
+	void async_hash2(storage_index_t storage, piece_index_t piece, int offset, disk_job_flags_t flags
+		, std::function<void(piece_index_t, sha256_hash const&, storage_error const&)> handler) override;
+	void async_move_storage(storage_index_t storage, std::string p, move_flags_t flags
+		, std::function<void(status_t, std::string const&, storage_error const&)> handler) override;
+	void async_release_files(storage_index_t storage
+		, std::function<void()> handler = std::function<void()>()) override;
+	void async_delete_files(storage_index_t storage, remove_flags_t options
+		, std::function<void(storage_error const&)> handler) override;
+	void async_check_files(storage_index_t storage
+		, add_torrent_params const* resume_data
+		, aux::vector<std::string, file_index_t> links
+		, std::function<void(status_t, storage_error const&)> handler) override;
+	void async_rename_file(storage_index_t storage, file_index_t index, std::string name
+		, std::function<void(std::string const&, file_index_t, storage_error const&)> handler) override;
+	void async_stop_torrent(storage_index_t storage
+		, std::function<void()> handler) override;
+	void async_set_file_priority(storage_index_t storage
+		, aux::vector<download_priority_t, file_index_t> prio
+		, std::function<void(storage_error const&
+			, aux::vector<download_priority_t, file_index_t>)> handler) override;
+
+	void async_clear_piece(storage_index_t storage, piece_index_t index
+		, std::function<void(piece_index_t)> handler) override;
+
+	void update_stats_counters(counters& c) const override;
+
+	std::vector<open_file_state> get_status(storage_index_t) const override;
+
+	// this submits all queued up jobs to the thread
+	void submit_jobs() override;
+
+	status_t do_job(aux::job::partial_read& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::read& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::write& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::hash& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::hash2& a, aux::pread_disk_job* j);
+
+	status_t do_job(aux::job::move_storage& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::release_files& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::delete_files& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::check_fastresume& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::rename_file& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::stop_torrent& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::file_priority& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::clear_piece& a, aux::pread_disk_job* j);
+	status_t do_job(aux::job::kick_hasher& a, aux::pread_disk_job* j);
+
+private:
+
+	void thread_fun(aux::disk_io_thread_pool& pool
+		, executor_work_guard<io_context::executor_type> work);
+
+	void add_completed_jobs(jobqueue_t jobs);
+	void add_completed_jobs_impl(jobqueue_t jobs, jobqueue_t& completed);
+
+	void perform_job(aux::pread_disk_job* j, jobqueue_t& completed_jobs);
+
+	// this queues up another job to be submitted
+	void add_job(aux::pread_disk_job* j, bool user_add = true);
+	void add_fence_job(aux::pread_disk_job* j, bool user_add = true);
+
+	void execute_job(aux::pread_disk_job* j);
+	void immediate_execute();
+	void abort_jobs();
+	void abort_hash_jobs(storage_index_t storage);
+
+	void try_flush_cache(int target_cache_size
+		, std::unique_lock<std::mutex>& l);
+	void flush_storage(std::shared_ptr<aux::pread_storage> const& storage);
+
+	int flush_cache_blocks(bitfield& flushed, span<aux::cached_block_entry const> blocks
+		, int hash_cursor
+		, jobqueue_t& completed_jobs);
+	void clear_piece_jobs(jobqueue_t aborted, aux::pread_disk_job* clear);
+
+	// returns the maximum number of threads
+	// the actual number of running threads may be lower
+	int num_threads() const;
+	aux::disk_io_thread_pool& pool_for_job(aux::pread_disk_job* j);
+
+	// set to true once we start shutting down
+	std::atomic<bool> m_abort{false};
+
+	// this is a counter of how many threads are currently running.
+	// it's used to identify the last thread still running while
+	// shutting down. This last thread is responsible for cleanup
+	// must hold the job mutex to access
+	int m_num_running_threads = 0;
+
+	aux::disk_job_pool<aux::pread_disk_job> m_job_pool;
+
+	// std::mutex protecting the m_generic_threads and m_hash_threads job queues
+	mutable std::mutex m_job_mutex;
+
+	// when set, it means we're trying to flush the disk cache down to this size
+	// it's a signal to generic disk threads to start flushing. Once flushing
+	// starts, m_flush_target is cleared.
+	std::optional<int> m_flush_target = std::nullopt;
+
+	settings_interface const& m_settings;
+
+	// LRU cache of open files
+	aux::file_pool m_file_pool;
+
+	// disk cache
+	aux::disk_buffer_pool m_buffer_pool;
+
+	// performance counters, shared with the rest of the session
+	counters& m_stats_counters;
+
+	// this is the main thread io_context. Callbacks are
+	// posted on this in order to have them execute in
+	// the main thread.
+	io_context& m_ios;
+
+	aux::disk_completed_queue m_completed_jobs;
+
+	// storages that have had write activity recently and will get ticked
+	// soon, for deferred actions (say, flushing partfile metadata)
+	std::vector<std::pair<time_point, std::weak_ptr<aux::pread_storage>>> m_need_tick;
+	std::mutex m_need_tick_mutex;
+
+	aux::storage_array<aux::pread_storage> m_torrents;
+
+	std::atomic_flag m_jobs_aborted = ATOMIC_FLAG_INIT;
+
+	// every write job is inserted into this cache while it is in the job
+	// queue. It is removed once the write completes. This lets subsequent
+	// reads pull the buffers straight out of the cache instead of having to
+	// synchronize with the writing thread(s)
+	aux::disk_cache m_cache;
+
+	// most jobs are posted to m_generic_io_jobs
+	// but hash jobs are posted to m_hash_io_jobs if m_hash_threads
+	// has a non-zero maximum thread count
+	aux::disk_io_thread_pool m_generic_threads;
+	aux::disk_io_thread_pool m_hash_threads;
+};
+
+TORRENT_EXPORT std::unique_ptr<disk_interface> pread_disk_io_constructor(
+	io_context& ios, settings_interface const& sett, counters& cnt)
+{
+	return std::make_unique<pread_disk_io>(ios, sett, cnt);
+}
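+
+// a minimal sketch of selecting this backend at session construction time
+// (assuming the usual libtorrent-2.x session_params API):
+//
+//   lt::session_params params;
+//   params.disk_io_constructor = lt::pread_disk_io_constructor;
+//   lt::session ses(std::move(params));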
+
+// ------- pread_disk_io ------
+
+// for _1 and _2
+using namespace std::placeholders;
+
+pread_disk_io::pread_disk_io(io_context& ios, settings_interface const& sett, counters& cnt)
+	: m_settings(sett)
+	, m_file_pool(sett.get_int(settings_pack::file_pool_size))
+	, m_buffer_pool(ios)
+	, m_stats_counters(cnt)
+	, m_ios(ios)
+	, m_completed_jobs([&](aux::disk_job** j, int const n) {
+		m_job_pool.free_jobs(reinterpret_cast<aux::pread_disk_job**>(j), n);
+		}, cnt)
+	, m_generic_threads(std::bind(&pread_disk_io::thread_fun, this, _1, _2), ios)
+	, m_hash_threads(std::bind(&pread_disk_io::thread_fun, this, _1, _2), ios)
+{
+	settings_updated();
+}
+
+std::vector<open_file_state> pread_disk_io::get_status(storage_index_t const st) const
+{
+	return m_file_pool.get_status(st);
+}
+
+storage_holder pread_disk_io::new_torrent(storage_params const& params
+	, std::shared_ptr<void> const& owner)
+{
+	TORRENT_ASSERT(params.files.is_valid());
+
+	auto storage = std::make_shared<aux::pread_storage>(params, m_file_pool);
+	storage->set_owner(owner);
+	storage_index_t const idx = m_torrents.add(std::move(storage));
+	return storage_holder(idx, *this);
+}
+
+void pread_disk_io::remove_torrent(storage_index_t const idx)
+{
+	m_torrents.remove(idx);
+}
+
+#if TORRENT_USE_ASSERTS
+pread_disk_io::~pread_disk_io()
+{
+	DLOG("destructing pread_disk_io\n");
+
+	// abort should have been triggered
+	TORRENT_ASSERT(m_abort);
+
+	// there are not supposed to be any writes in-flight by now
+	TORRENT_ASSERT(m_cache.size() == 0);
+
+	// all torrents are supposed to have been removed by now
+	TORRENT_ASSERT(m_torrents.empty());
+}
+#endif
+
+void pread_disk_io::abort(bool const wait)
+{
+	DLOG("pread_disk_io::abort: (wait: %d)\n", int(wait));
+
+	// first make sure queued jobs have been submitted
+	// otherwise the queue may not get processed
+	submit_jobs();
+
+	// abuse the job mutex to make setting m_abort and checking the thread count atomic
+	// see also the comment in thread_fun
+	std::unique_lock<std::mutex> l(m_job_mutex);
+	if (m_abort.exchange(true)) return;
+	bool const no_threads = m_generic_threads.num_threads() == 0
+		&& m_hash_threads.num_threads() == 0;
+	// abort outstanding jobs belonging to this torrent
+
+	DLOG("aborting hash jobs\n");
+	m_hash_threads.visit_jobs([](aux::disk_job* j)
+	{
+		j->flags |= aux::disk_job::aborted;
+	});
+	l.unlock();
+
+	// if there are no disk threads, we can't wait for the jobs here, because
+	// we'd stall indefinitely
+	if (no_threads)
+	{
+		abort_jobs();
+	}
+
+	DLOG("aborting thread pools\n");
+	// even if there are no threads it doesn't hurt to abort the pools
+	// it prevents threads from being started after an abort which is a good
+	// defensive programming measure
+	m_generic_threads.abort(wait);
+	m_hash_threads.abort(wait);
+}
+
+void pread_disk_io::settings_updated()
+{
+	m_buffer_pool.set_settings(m_settings);
+	m_file_pool.resize(m_settings.get_int(settings_pack::file_pool_size));
+
+	int const num_threads = m_settings.get_int(settings_pack::aio_threads);
+	int const num_hash_threads = m_settings.get_int(settings_pack::hashing_threads);
+	DLOG("set max threads(%d, %d)\n", num_threads, num_hash_threads);
+
+	m_generic_threads.set_max_threads(num_threads);
+	m_hash_threads.set_max_threads(num_hash_threads);
+}
+
+void pread_disk_io::perform_job(aux::pread_disk_job* j, jobqueue_t& completed_jobs)
+{
+	TORRENT_ASSERT(j->next == nullptr);
+
+#if DEBUG_DISK_THREAD
+	{
+		std::unique_lock<std::mutex> l(m_job_mutex);
+
+		DLOG("perform_job job: %s outstanding: %d\n"
+			, print_job(*j).c_str()
+			, j->storage ? j->storage->num_outstanding_jobs() : -1);
+	}
+#endif
+
+	std::shared_ptr<aux::pread_storage> storage = j->storage;
+
+	m_stats_counters.inc_stats_counter(counters::num_running_disk_jobs, 1);
+
+	// call disk function
+	// TODO: in the future, propagate exceptions back to the handlers
+	translate_error(j, [&] {
+		return std::visit([this, j](auto& a) { return this->do_job(a, j); }, j->action);
+	});
+
+	// note that -2 errors are OK
+	TORRENT_ASSERT(j->ret != disk_status::fatal_disk_error
+		|| (j->error.ec && j->error.operation != operation_t::unknown));
+
+	m_stats_counters.inc_stats_counter(counters::num_running_disk_jobs, -1);
+
+	completed_jobs.push_back(j);
+}
+
+status_t pread_disk_io::do_job(aux::job::partial_read& a, aux::pread_disk_job* j)
+{
+	TORRENT_ASSERT(a.buf);
+	time_point const start_time = clock_type::now();
+
+	span<char> const b = {a.buf.data() + a.buffer_offset, a.buffer_size};
+
+	int const ret = j->storage->read(m_settings, b
+		, a.piece, a.offset, file_mode_for_job(j), j->flags, j->error);
+
+	TORRENT_ASSERT(ret >= 0 || j->error.ec);
+	TORRENT_UNUSED(ret);
+
+	if (!j->error.ec)
+	{
+		std::int64_t const read_time = total_microseconds(clock_type::now() - start_time);
+
+		m_stats_counters.inc_stats_counter(counters::num_read_back);
+		m_stats_counters.inc_stats_counter(counters::num_blocks_read);
+		m_stats_counters.inc_stats_counter(counters::num_read_ops);
+		m_stats_counters.inc_stats_counter(counters::disk_read_time, read_time);
+		m_stats_counters.inc_stats_counter(counters::disk_job_time, read_time);
+	}
+
+	TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+	return status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::read& a, aux::pread_disk_job* j)
+{
+	a.buf = disk_buffer_holder(m_buffer_pool, m_buffer_pool.allocate_buffer("send buffer"), default_block_size);
+	if (!a.buf)
+	{
+		j->error.ec = error::no_memory;
+		j->error.operation = operation_t::alloc_cache_piece;
+		return disk_status::fatal_disk_error;
+	}
+
+	time_point const start_time = clock_type::now();
+
+	aux::open_mode_t const file_mode = file_mode_for_job(j);
+	span<char> const b = {a.buf.data(), a.buffer_size};
+
+	int const ret = j->storage->read(m_settings, b
+		, a.piece, a.offset, file_mode, j->flags, j->error);
+
+	TORRENT_ASSERT(ret >= 0 || j->error.ec);
+	TORRENT_UNUSED(ret);
+
+	if (!j->error.ec)
+	{
+		std::int64_t const read_time = total_microseconds(clock_type::now() - start_time);
+
+		m_stats_counters.inc_stats_counter(counters::num_read_back);
+		m_stats_counters.inc_stats_counter(counters::num_blocks_read);
+		m_stats_counters.inc_stats_counter(counters::num_read_ops);
+		m_stats_counters.inc_stats_counter(counters::disk_read_time, read_time);
+		m_stats_counters.inc_stats_counter(counters::disk_job_time, read_time);
+	}
+	TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+	return status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::write&, aux::pread_disk_job*)
+{
+	TORRENT_ASSERT_FAIL();
+	return status_t{};
+}
+
+void pread_disk_io::async_read(storage_index_t storage, peer_request const& r
+	, std::function<void(disk_buffer_holder, storage_error const&)> handler
+	, disk_job_flags_t const flags)
+{
+	TORRENT_ASSERT(valid_flags(flags));
+	TORRENT_ASSERT(r.length <= default_block_size);
+	TORRENT_ASSERT(r.length > 0);
+	TORRENT_ASSERT(r.start >= 0);
+
+	storage_error ec;
+	if (r.length <= 0 || r.start < 0)
+	{
+		// this is an invalid read request.
+		ec.ec = errors::invalid_request;
+		ec.operation = operation_t::file_read;
+		handler(disk_buffer_holder{}, ec);
+		return;
+	}
+
+	// if r.start is not aligned to a block, compute the aligned offset,
+	// since that's how the disk_cache is indexed. block_offset is the
+	// aligned offset of the first block this read touches. If the request
+	// is aligned, it's the same as r.start
+	int const block_offset = r.start - (r.start % default_block_size);
+	int const block_idx = r.start / default_block_size;
+	// this is the offset into the block that we're reading from
+	int const read_offset = r.start - block_offset;
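+	// e.g. with the default 16 kiB (0x4000) block size, r.start == 20000
+	// gives block_offset == 16384, block_idx == 1 and read_offset == 3616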
+
+	DLOG("async_read piece: %d block: %d (read-offset: %d)\n", static_cast<int>(r.piece)
+		, block_offset / default_block_size, read_offset);
+
+	disk_buffer_holder buffer;
+
+	if (read_offset + r.length > default_block_size)
+	{
+		// This is an unaligned request spanning two blocks. Both blocks,
+		// just one of them, or neither may be in the cache.
+		// If neither is in the cache, we can just issue a normal
+		// read job for the unaligned request.
+
+		aux::piece_location const loc{storage, r.piece};
+		std::ptrdiff_t const len1 = default_block_size - read_offset;
+
+		TORRENT_ASSERT(r.length > len1);
+
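+		// the value returned from the lambda (and from get2()) is a bitmask:
+		// bit 1 is set if the first block was found in the cache, bit 0 if
+		// the second one was. 3 is also returned on allocation failure, in
+		// which case ec is set and the handler is invoked immediately below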
+		int const ret = m_cache.get2(loc, block_idx, [&](char const* buf1, char const* buf2)
+		{
+			buffer = disk_buffer_holder(m_buffer_pool
+				, m_buffer_pool.allocate_buffer("send buffer")
+				, r.length);
+			if (!buffer)
+			{
+				ec.ec = error::no_memory;
+				ec.operation = operation_t::alloc_cache_piece;
+				return 3;
+			}
+
+			if (buf1)
+				std::memcpy(buffer.data(), buf1 + read_offset, std::size_t(len1));
+			if (buf2)
+				std::memcpy(buffer.data() + len1, buf2, std::size_t(r.length - len1));
+			return (buf1 ? 2 : 0) | (buf2 ? 1 : 0);
+		});
+
+		if (ret == 3)
+		{
+			// both sides were found in the store buffer and the read request
+			// was satisfied immediately
+			handler(std::move(buffer), ec);
+			return;
+		}
+
+		if (ret != 0)
+		{
+			TORRENT_ASSERT(ret == 1 || ret == 2);
+			// only one side of the read request was found in the store
+			// buffer, and we need to issue a partial read for the remaining
+			// bytes
+			aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::partial_read>(
+				flags,
+				m_torrents[storage]->shared_from_this(),
+				std::move(handler),
+				std::move(buffer),
+				std::uint16_t((ret == 1) ? 0 : len1), // buffer_offset
+				std::uint16_t((ret == 1) ? len1 : r.length - len1), // buffer_size
+				r.piece,
+				(ret == 1) ? r.start : block_offset + default_block_size // offset
+			);
+
+			add_job(j);
+			return;
+		}
+
+		// if we couldn't find any block in the cache, fall through and post it
+		// as a normal read job
+	}
+	else
+	{
+		// this is an aligned read request for one block
+		if (m_cache.get({ storage, r.piece }, block_idx, [&](span<char const> buf)
+		{
+			TORRENT_ASSERT(buf.size() >= read_offset + r.length);
+			buffer = disk_buffer_holder(m_buffer_pool, m_buffer_pool.allocate_buffer("send buffer"), r.length);
+			if (!buffer)
+			{
+				ec.ec = error::no_memory;
+				ec.operation = operation_t::alloc_cache_piece;
+				return;
+			}
+
+			std::memcpy(buffer.data(), buf.data() + read_offset, std::size_t(r.length));
+		}))
+		{
+			handler(std::move(buffer), ec);
+			return;
+		}
+	}
+
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::read>(
+		flags,
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		disk_buffer_holder{},
+		std::uint16_t(r.length), // buffer_size
+		r.piece,
+		r.start // offset
+	);
+
+	add_job(j);
+}
+
+bool pread_disk_io::async_write(storage_index_t const storage, peer_request const& r
+	, char const* buf, std::shared_ptr<disk_observer> o
+	, std::function<void(storage_error const&)> handler
+	, disk_job_flags_t const flags)
+{
+	TORRENT_ASSERT(valid_flags(flags));
+	bool exceeded = false;
+	disk_buffer_holder buffer(m_buffer_pool, m_buffer_pool.allocate_buffer(
+		exceeded, o, "receive buffer"), r.length);
+	if (!buffer) aux::throw_ex<std::bad_alloc>();
+	std::memcpy(buffer.data(), buf, aux::numeric_cast<std::size_t>(r.length));
+
+	TORRENT_ASSERT(r.start % default_block_size == 0);
+	TORRENT_ASSERT(r.length <= default_block_size);
+
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::write>(
+		flags,
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		std::move(buffer),
+		r.piece,
+		r.start,
+		std::uint16_t(r.length)
+	);
+
+	DLOG("async_write: piece: %d offset: %d\n", int(r.piece), int(r.start));
+	bool const need_kick = m_cache.insert({j->storage->storage_index(), r.piece}, r.start / default_block_size, j);
+
+	if (need_kick)
+	{
+		// TODO: if the most recently added job to the hash thread pool is a
+		// kick-hasher job for the same piece, skip this
+		aux::pread_disk_job* khj = m_job_pool.allocate_job<aux::job::kick_hasher>(
+			flags,
+			m_torrents[storage]->shared_from_this(),
+			r.piece
+		);
+		add_job(khj);
+	}
+
+	std::unique_lock<std::mutex> l(m_job_mutex);
+	if (!m_flush_target)
+	{
+		// if the disk buffer wants to free up blocks, notify the thread
+		// pool that we may need to flush blocks
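+		// flush_request() returns the number of blocks the buffer pool
+		// wants freed; the flush target is the cache size remaining once
+		// that many blocks have been flushed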
+		auto req = m_buffer_pool.flush_request();
+		if (req)
+		{
+			m_flush_target = std::max(0, int(m_cache.size()) - *req);
+			DLOG("async_write: set flush_target: %d\n", *m_flush_target);
+			// wake up a thread
+			m_generic_threads.interrupt();
+		}
+	}
+
+	return exceeded;
+}
+
+void pread_disk_io::async_hash(storage_index_t const storage
+	, piece_index_t const piece, span<sha256_hash> const v2, disk_job_flags_t const flags
+	, std::function<void(piece_index_t, sha1_hash const&, storage_error const&)> handler)
+{
+	TORRENT_ASSERT(valid_flags(flags));
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::hash>(
+		flags,
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		piece,
+		v2,
+		sha1_hash{}
+	);
+
+	aux::disk_cache::hash_result const ret = m_cache.try_hash_piece({j->storage->storage_index(), piece}, j);
+
+	// if we have already computed the piece hash, just post the completion
+	// immediately
+	if (ret == aux::disk_cache::job_completed)
+	{
+		jobqueue_t jobs;
+		jobs.push_back(j);
+		add_completed_jobs(std::move(jobs));
+		return;
+	}
+
+	// In this case the job has been queued on the piece, and will be posted
+	// once the hashing completes
+	if (ret == aux::disk_cache::job_queued)
+		return;
+
+	add_job(j);
+}
+
+void pread_disk_io::async_hash2(storage_index_t const storage
+	, piece_index_t const piece, int const offset, disk_job_flags_t const flags
+	, std::function<void(piece_index_t, sha256_hash const&, storage_error const&)> handler)
+{
+	TORRENT_ASSERT(valid_flags(flags));
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::hash2>(
+		flags,
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		piece,
+		offset,
+		sha256_hash{}
+	);
+
+	// TODO: check the disk cache here
+
+	add_job(j);
+}
+
+void pread_disk_io::async_move_storage(storage_index_t const storage
+	, std::string p, move_flags_t const flags
+	, std::function<void(status_t, std::string const&, storage_error const&)> handler)
+{
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::move_storage>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		std::move(p), // path
+		flags
+	);
+
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_release_files(storage_index_t const storage
+	, std::function<void()> handler)
+{
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::release_files>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler)
+	);
+
+	add_fence_job(j);
+}
+
+void pread_disk_io::abort_hash_jobs(storage_index_t const storage)
+{
+	// abort outstanding hash jobs belonging to this torrent
+	std::unique_lock<std::mutex> l(m_job_mutex);
+
+	auto st = m_torrents[storage]->shared_from_this();
+	// hash jobs
+	m_hash_threads.visit_jobs([&](aux::disk_job* gj)
+	{
+		auto* j = static_cast<aux::pread_disk_job*>(gj);
+		if (j->storage != st) return;
+		// only cancel volatile-read jobs. This means only full checking
+		// jobs. These jobs are likely to have a pretty deep queue and
+		// really gain from being cancelled. They can also be restarted
+		// easily.
+		if (j->flags & disk_interface::volatile_read)
+			j->flags |= aux::disk_job::aborted;
+	});
+}
+
+void pread_disk_io::async_delete_files(storage_index_t const storage
+	, remove_flags_t const options
+	, std::function<void(storage_error const&)> handler)
+{
+	abort_hash_jobs(storage);
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::delete_files>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		options
+	);
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_check_files(storage_index_t const storage
+	, add_torrent_params const* resume_data
+	, aux::vector<std::string, file_index_t> links
+	, std::function<void(status_t, storage_error const&)> handler)
+{
+	aux::vector<std::string, file_index_t>* links_vector = nullptr;
+	if (!links.empty()) links_vector = new aux::vector<std::string, file_index_t>(std::move(links));
+
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::check_fastresume>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		links_vector,
+		resume_data
+	);
+
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_rename_file(storage_index_t const storage
+	, file_index_t const index, std::string name
+	, std::function<void(std::string const&, file_index_t, storage_error const&)> handler)
+{
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::rename_file>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		index,
+		std::move(name)
+	);
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_stop_torrent(storage_index_t const storage
+	, std::function<void()> handler)
+{
+	auto st = m_torrents[storage]->shared_from_this();
+	abort_hash_jobs(storage);
+
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::stop_torrent>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler)
+	);
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_set_file_priority(storage_index_t const storage
+	, aux::vector<download_priority_t, file_index_t> prios
+	, std::function<void(storage_error const&
+		, aux::vector<download_priority_t, file_index_t>)> handler)
+{
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::file_priority>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		std::move(prios)
+	);
+
+	add_fence_job(j);
+}
+
+void pread_disk_io::async_clear_piece(storage_index_t const storage
+	, piece_index_t const index, std::function<void(piece_index_t)> handler)
+{
+	aux::pread_disk_job* j = m_job_pool.allocate_job<aux::job::clear_piece>(
+		{},
+		m_torrents[storage]->shared_from_this(),
+		std::move(handler),
+		index
+	);
+
+	DLOG("async_clear_piece: piece: %d\n", int(index));
+	// regular jobs are not executed in-order.
+	// clear piece must wait for all write jobs issued to the piece to
+	// finish before it completes.
+	jobqueue_t aborted_jobs;
+	bool const immediate_completion = m_cache.try_clear_piece(
+		{j->storage->storage_index(), index}, j, aborted_jobs);
+
+	m_completed_jobs.abort_jobs(m_ios, std::move(aborted_jobs));
+	if (immediate_completion)
+	{
+		DLOG("immediate clear\n");
+		jobqueue_t jobs;
+		jobs.push_back(j);
+		add_completed_jobs(std::move(jobs));
+	}
+	else
+	{
+		DLOG("deferred clear\n");
+	}
+}
+
+status_t pread_disk_io::do_job(aux::job::hash& a, aux::pread_disk_job* j)
+{
+	// hash the piece, using any blocks already held in the cache and
+	// reading the remaining blocks from disk
+	bool const v1 = bool(j->flags & disk_interface::v1_hash);
+	bool const v2 = !a.block_hashes.empty();
+
+	int const piece_size = v1 ? j->storage->files().piece_size(a.piece) : 0;
+	int const piece_size2 = v2 ? j->storage->files().piece_size2(a.piece) : 0;
+	int const blocks_in_piece = v1 ? (piece_size + default_block_size - 1) / default_block_size : 0;
+	int const blocks_in_piece2 = v2 ? j->storage->files().blocks_in_piece2(a.piece) : 0;
+	aux::open_mode_t const file_mode = file_mode_for_job(j);
+
+	TORRENT_ASSERT(!v2 || int(a.block_hashes.size()) >= blocks_in_piece2);
+	TORRENT_ASSERT(v1 || v2);
+
+	int const blocks_to_read = std::max(blocks_in_piece, blocks_in_piece2);
+
+	// this creates a function object, ready to be passed to
+	// m_cache.hash_piece()
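+	// the callback is invoked with the piece's partial sha-1 hash state,
+	// the number of blocks hashed so far, pointers to the cached blocks
+	// (nullptr for blocks that have to be read from disk) and the v2 block
+	// hashes computed so far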
+	auto hash_partial_piece = [&] (lt::hasher& ph
+		, int const hasher_cursor
+		, span<char const*> const blocks
+		, span<sha256_hash> const v2_hashes)
+	{
+		time_point const start_time = clock_type::now();
+
+		if (v2 && hasher_cursor > 0)
+		{
+			for (int i = 0; i < hasher_cursor; ++i)
+			{
+				TORRENT_ASSERT(!v2_hashes[i].is_all_zeros());
+				a.block_hashes[i] = v2_hashes[i];
+			}
+		}
+
+		int offset = hasher_cursor * default_block_size;
+		int blocks_read_from_disk = 0;
+		for (int i = hasher_cursor; i < blocks_to_read; ++i)
+		{
+			bool const v2_block = i < blocks_in_piece2;
+
+			std::ptrdiff_t const len = v1 ? std::min(default_block_size, piece_size - offset) : 0;
+			std::ptrdiff_t const len2 = v2_block ? std::min(default_block_size, piece_size2 - offset) : 0;
+
+			hasher256 ph2;
+			char const* buf = blocks[i];
+			if (buf == nullptr)
+			{
+				DLOG("do_hash: reading (piece: %d block: %d)\n", int(a.piece), i);
+
+				j->error.ec.clear();
+
+				if (v1)
+				{
+					auto const flags = v2_block
+						? (j->flags & ~disk_interface::flush_piece)
+						: j->flags;
+
+					j->storage->hash(m_settings, ph, len, a.piece
+						, offset, file_mode, flags, j->error);
+				}
+				if (v2_block)
+				{
+					j->storage->hash2(m_settings, ph2, len2, a.piece, offset
+						, file_mode, j->flags, j->error);
+				}
+				if (j->error) break;
+				++blocks_read_from_disk;
+			}
+			else
+			{
+				if (v1)
+					ph.update({ buf, len });
+				if (v2_block)
+					ph2.update({buf, len2});
+			}
+			offset += default_block_size;
+
+			if (v2_block)
+				a.block_hashes[i] = ph2.final();
+		}
+
+		if (v1)
+			a.piece_hash = ph.final();
+
+		if (!j->error.ec)
+		{
+			std::int64_t const read_time = total_microseconds(clock_type::now() - start_time);
+
+			m_stats_counters.inc_stats_counter(counters::num_blocks_read, blocks_read_from_disk);
+			m_stats_counters.inc_stats_counter(counters::num_read_ops, blocks_read_from_disk);
+			m_stats_counters.inc_stats_counter(counters::disk_hash_time, read_time);
+			m_stats_counters.inc_stats_counter(counters::disk_job_time, read_time);
+		}
+	};
+
+	if (!m_cache.hash_piece({ j->storage->storage_index(), a.piece}
+		, hash_partial_piece))
+	{
+		// fall back to reading everything from disk
+
+		TORRENT_ALLOCA(blocks, char const*, blocks_to_read);
+		TORRENT_ALLOCA(v2_hashes, sha256_hash, blocks_in_piece2);
+		for (char const*& b : blocks) b = nullptr;
+		hasher ph;
+		hash_partial_piece(ph, 0, blocks, v2_hashes);
+	}
+	return j->error ? disk_status::fatal_disk_error : status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::hash2& a, aux::pread_disk_job* j)
+{
+	int const piece_size = j->storage->files().piece_size2(a.piece);
+	aux::open_mode_t const file_mode = file_mode_for_job(j);
+
+	DLOG("do_hash2: reading (piece: %d offset: %d)\n", int(a.piece), int(a.offset));
+
+	time_point const start_time = clock_type::now();
+
+	TORRENT_ASSERT(piece_size > a.offset);
+	std::ptrdiff_t const len = std::min(default_block_size, piece_size - a.offset);
+
+	int ret = 0;
+	a.piece_hash2 = m_cache.hash2({ j->storage->storage_index(), a.piece }
+		, a.offset / default_block_size
+		, [&] {
+		hasher256 h;
+		ret = j->storage->hash2(m_settings, h, len, a.piece, a.offset
+			, file_mode, j->flags, j->error);
+		return h.final();
+	});
+
+	if (!j->error.ec)
+	{
+		std::int64_t const read_time = total_microseconds(clock_type::now() - start_time);
+
+		m_stats_counters.inc_stats_counter(counters::num_blocks_read);
+		m_stats_counters.inc_stats_counter(counters::num_read_ops);
+		m_stats_counters.inc_stats_counter(counters::disk_hash_time, read_time);
+		m_stats_counters.inc_stats_counter(counters::disk_job_time, read_time);
+	}
+
+	return ret >= 0 ? status_t{} : disk_status::fatal_disk_error;
+}
+
+status_t pread_disk_io::do_job(aux::job::move_storage& a, aux::pread_disk_job* j)
+{
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+	flush_storage(j->storage);
+
+	// if files have to be closed, that's the storage's responsibility
+	auto const [ret, p] = j->storage->move_storage(std::move(a.path), a.move_flags, j->error);
+
+	a.path = std::move(p);
+	return ret;
+}
+
+status_t pread_disk_io::do_job(aux::job::release_files&, aux::pread_disk_job* j)
+{
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+	flush_storage(j->storage);
+	j->storage->release_files(j->error);
+	return j->error ? disk_status::fatal_disk_error : status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::delete_files& a, aux::pread_disk_job* j)
+{
+	TORRENT_ASSERT(a.flags);
+
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+
+	// TODO: maybe we don't need to write to files we're about to delete
+	flush_storage(j->storage);
+
+	j->storage->delete_files(a.flags, j->error);
+	return j->error ? disk_status::fatal_disk_error : status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::check_fastresume& a, aux::pread_disk_job* j)
+{
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+	flush_storage(j->storage);
+	add_torrent_params const* rd = a.resume_data;
+	add_torrent_params tmp;
+	if (rd == nullptr) rd = &tmp;
+
+	std::unique_ptr<aux::vector<std::string, file_index_t>> links(a.links);
+	// check if the fastresume data is up to date. If it is, use it;
+	// otherwise return disk_status::need_full_check so a full check
+	// will be run. If the links pointer is non-null, it has the same number
+	// of elements as there are files. Each element is either empty or contains
+	// the absolute path to a file identical to the corresponding file in this
+	// torrent. The storage must create hard links (or copy) those files. If
+	// any file does not exist or is inaccessible, the disk job must fail.
+
+	TORRENT_ASSERT(j->storage->files().piece_length() > 0);
+
+	// always initialize the storage
+	auto const ret_flag = j->storage->initialize(m_settings, j->error);
+	if (j->error) return disk_status::fatal_disk_error | ret_flag;
+
+	// we must call verify_resume() unconditionally of the setting below, in
+	// order to set up the links (if present)
+	bool const verify_success = j->storage->verify_resume_data(*rd
+		, links ? *links : aux::vector<std::string, file_index_t>(), j->error);
+
+	// j->error may have been set at this point, by verify_resume_data().
+	// it's important not to have it cleared out by subsequent calls, as
+	// long as they succeed.
+
+	if (m_settings.get_bool(settings_pack::no_recheck_incomplete_resume))
+		return status_t{} | ret_flag;
+
+	if (!aux::contains_resume_data(*rd))
+	{
+		// if we don't have any resume data, we still may need to trigger a
+		// full re-check, if there are *any* files.
+		storage_error ignore;
+		return ((j->storage->has_any_file(ignore))
+			? disk_status::need_full_check
+			: status_t{})
+			| ret_flag;
+	}
+
+	return (verify_success
+		? status_t{}
+		: disk_status::need_full_check)
+		| ret_flag;
+}
+
+status_t pread_disk_io::do_job(aux::job::rename_file& a, aux::pread_disk_job* j)
+{
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+
+	// if files need to be closed, that's the storage's responsibility
+	j->storage->rename_file(a.file_index, a.name, j->error);
+	return j->error ? disk_status::fatal_disk_error : status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::stop_torrent&, aux::pread_disk_job* j)
+{
+	// if this assert fails, something's wrong with the fence logic
+	TORRENT_ASSERT(j->storage->num_outstanding_jobs() == 1);
+	flush_storage(j->storage);
+	j->storage->release_files(j->error);
+	return j->error ? disk_status::fatal_disk_error : status_t{};
+}
+
+void pread_disk_io::update_stats_counters(counters& c) const
+{
+	// These are atomic_counts, so it's safe to access them from
+	// a different thread
+	std::unique_lock<std::mutex> jl(m_job_mutex);
+
+	c.set_value(counters::num_read_jobs, m_job_pool.read_jobs_in_use());
+	c.set_value(counters::num_write_jobs, m_job_pool.write_jobs_in_use());
+	c.set_value(counters::num_jobs, m_job_pool.jobs_in_use());
+	c.set_value(counters::queued_disk_jobs, m_generic_threads.queue_size()
+		+ m_hash_threads.queue_size());
+
+	jl.unlock();
+
+	// gauges
+	c.set_value(counters::disk_blocks_in_use, m_buffer_pool.in_use());
+}
+
+status_t pread_disk_io::do_job(aux::job::file_priority& a, aux::pread_disk_job* j)
+{
+	j->storage->set_file_priority(m_settings
+		, a.prio
+		, j->error);
+	return status_t{};
+}
+
+status_t pread_disk_io::do_job(aux::job::clear_piece&, aux::pread_disk_job*)
+{
+	TORRENT_ASSERT_FAIL();
+	return {};
+}
+
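+// kick_hasher jobs let the hash threads hash cached blocks incrementally,
+// as they arrive, rather than hashing the whole piece once it completes.
+// any hash jobs completed as a side effect are posted from here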
+status_t pread_disk_io::do_job(aux::job::kick_hasher& a, aux::pread_disk_job* j)
+{
+	jobqueue_t jobs;
+	m_cache.kick_hasher({j->storage->storage_index(), a.piece}, jobs);
+	add_completed_jobs(std::move(jobs));
+	return {};
+}
+
+void pread_disk_io::add_fence_job(aux::pread_disk_job* j, bool const user_add)
+{
+	// if this happens, it means we started to shut down
+	// the disk threads too early. We have to post all jobs
+	// before the disk threads are shut down
+	if (m_abort)
+	{
+		m_completed_jobs.abort_job(m_ios, j);
+		return;
+	}
+
+	DLOG("add_fence:job: %s (outstanding: %d)\n"
+		, print_job(*j).c_str()
+		, j->storage->num_outstanding_jobs());
+
+	TORRENT_ASSERT(j->storage);
+	m_stats_counters.inc_stats_counter(counters::num_fenced_read + static_cast<int>(j->get_type()));
+
+	int const ret = j->storage->raise_fence(j, m_stats_counters);
+	if (ret == aux::disk_job_fence::fence_post_fence)
+	{
+		std::unique_lock<std::mutex> l(m_job_mutex);
+		TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+		m_generic_threads.push_back(j);
+		l.unlock();
+	}
+
+	if (num_threads() == 0 && user_add)
+		immediate_execute();
+}
+
+void pread_disk_io::add_job(aux::pread_disk_job* j, bool const user_add)
+{
+	TORRENT_ASSERT(!j->storage || j->storage->files().is_valid());
+	TORRENT_ASSERT(j->next == nullptr);
+	// if this happens, it means we started to shut down
+	// the disk threads too early. We have to post all jobs
+	// before the disk threads are shut down
+	if (m_abort)
+	{
+		m_completed_jobs.abort_job(m_ios, j);
+		return;
+	}
+
+	TORRENT_ASSERT(!(j->flags & aux::disk_job::in_progress));
+
+	DLOG("add_job: %s (outstanding: %d)\n"
+		, print_job(*j).c_str()
+		, j->storage ? j->storage->num_outstanding_jobs() : 0);
+
+	// is the fence up for this storage?
+	// jobs that are instantaneous are not affected by the fence. If the
+	// fence is up, is_blocked() takes ownership of the job and queues it
+	// up. If the fence flag is set, this job just raised the fence on the
+	// storage and should be scheduled
+	if (j->storage && j->storage->is_blocked(j))
+	{
+		m_stats_counters.inc_stats_counter(counters::blocked_disk_jobs);
+		DLOG("blocked job: %s (torrent: %d total: %d)\n"
+			, print_job(*j).c_str(), j->storage ? j->storage->num_blocked() : 0
+			, int(m_stats_counters[counters::blocked_disk_jobs]));
+		return;
+	}
+
+	std::unique_lock<std::mutex> l(m_job_mutex);
+
+	TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+
+	auto& q = pool_for_job(j);
+	q.push_back(j);
+	l.unlock();
+	// if we literally have 0 disk threads, we have to execute the jobs
+	// immediately. If add_job() is called internally by pread_disk_io,
+	// we need to defer executing it; we only want the top-level call to
+	// loop over the job queue (via immediate_execute() below)
+	if (pool_for_job(j).max_threads() == 0 && user_add)
+		immediate_execute();
+}
+
+void pread_disk_io::immediate_execute()
+{
+	while (!m_generic_threads.empty())
+	{
+		auto* j = static_cast<aux::pread_disk_job*>(m_generic_threads.pop_front());
+		execute_job(j);
+	}
+}
+
+void pread_disk_io::submit_jobs()
+{
+	std::unique_lock<std::mutex> l(m_job_mutex);
+	m_generic_threads.submit_jobs();
+	m_hash_threads.submit_jobs();
+}
+
+void pread_disk_io::execute_job(aux::pread_disk_job* j)
+{
+	jobqueue_t completed_jobs;
+	if (j->flags & aux::disk_job::aborted)
+	{
+		j->ret = disk_status::fatal_disk_error;
+		j->error = storage_error(boost::asio::error::operation_aborted);
+		completed_jobs.push_back(j);
+		add_completed_jobs(std::move(completed_jobs));
+		return;
+	}
+
+	perform_job(j, completed_jobs);
+	if (!completed_jobs.empty())
+		add_completed_jobs(std::move(completed_jobs));
+}
+
+int pread_disk_io::flush_cache_blocks(bitfield& flushed
+	, span<aux::cached_block_entry const> blocks
+	, int const hash_cursor, jobqueue_t& completed_jobs)
+{
+	if (blocks.empty()) return 0;
+
+#if DEBUG_DISK_THREAD
+	{
+		auto piece = piece_index_t(-1);
+		std::string blocks_str;
+		blocks_str.reserve(blocks.size());
+		for (auto const& blk : blocks)
+		{
+			blocks_str += blk.write_job ? '*' : ' ';
+			if (blk.write_job)
+				piece = std::get<aux::job::write>(blk.write_job->action).piece;
+		}
+		// If this assert fires, it means we were asked to flush a piece
+		// that doesn't have any jobs to flush
+		TORRENT_ASSERT(piece != piece_index_t(-1));
+		DLOG("flush_cache_blocks: piece: %d hash_cursor: %d blocks: [%s]\n", int(piece), hash_cursor, blocks_str.c_str());
+	}
+#else
+	TORRENT_UNUSED(hash_cursor);
+#endif
+
+	// blocks may be sparse. We need to skip any block entry where write_job is null
+	m_stats_counters.inc_stats_counter(counters::num_running_disk_jobs, 1);
+	m_stats_counters.inc_stats_counter(counters::num_writing_threads, 1);
+	time_point const start_time = clock_type::now();
+
+	TORRENT_ALLOCA(iovec, span<char>, blocks.size());
+	bool failed = false;
+	int count = 0;
+	int start_idx = 0;
+	int idx = 0;
+
+	// the total number of blocks we ended up flushing to disk
+	int ret = 0;
+
+	// the piece offset of the start of the range of contiguous blocks we're
+	// currently assembling into iovec
+	int start_offset = 0;
+
+	// the offset of the end of the range of contiguous blocks we're currently
+	// assembling
+	int end_offset = 0;
+
+	aux::open_mode_t file_mode;
+	auto piece = piece_index_t(-1);
+	disk_job_flags_t flags;
+
+	std::shared_ptr<aux::pread_storage> storage;
+
+	storage_error error;
+	// TODO: refactor this loop into an iterator adapter that returns
+	// contiguous ranges of blocks. Then de-duplicate the write-to-disk logic
+	// into the loop
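+	// each contiguous run of blocks is written with a single vectored write.
+	// e.g. a blocks span of [j0, j1, nullptr, j3] results in two writes: one
+	// for blocks 0-1 and one for block 3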
+	TORRENT_ASSERT(blocks.size() > 0);
+	for (auto& be : blocks)
+	{
+		auto* j = be.write_job;
+
+		auto const job_offset = [&] {
+			if (j != nullptr)
+				return std::get<aux::job::write>(j->action).offset;
+			else
+				return 0;
+		}();
+
+		if (!storage && j) storage = j->storage;
+		if (count > 0 && (j == nullptr || job_offset > end_offset))
+		{
+			TORRENT_ASSERT(piece != piece_index_t(-1));
+			DLOG("write: blocks: %d (piece: %d)\n", count, int(piece));
+			storage->write(m_settings, iovec.first(count)
+				, piece, start_offset, file_mode, flags, error);
+
+			int i = start_idx;
+			for (aux::cached_block_entry const& blk : blocks.subspan(start_idx, count))
+			{
+				auto* j2 = blk.write_job;
+				TORRENT_ASSERT(j2);
+				TORRENT_ASSERT(j2->get_type() == aux::job_action_t::write);
+				j2->error = error;
+				flushed.set_bit(i);
+				completed_jobs.push_back(j2);
+				++i;
+			}
+
+			if (error)
+			{
+				// if there was a failure, fail the remaining jobs as well
+				for (int k = start_idx + count; k < blocks.size(); ++k)
+				{
+					auto* j2 = blocks[k].write_job;
+					if (j2 == nullptr) continue;
+					j2->error = error;
+					// TODO: should we free the job's buffer here?
+					completed_jobs.push_back(j2);
+				}
+				failed = true;
+				break;
+			}
+
+			ret += count;
+
+			start_offset = job_offset;
+			start_idx = idx;
+			count = 0;
+		}
+
+		if (j == nullptr)
+		{
+			++idx;
+			start_idx = idx;
+			continue;
+		}
+
+		TORRENT_ASSERT(j->storage == storage);
+		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
+		auto& a = std::get<aux::job::write>(j->action);
+
+		if (count == 0) start_offset = job_offset;
+		iovec[count] = span<char>{ a.buf.data(), a.buffer_size};
+		++count;
+		flags = j->flags;
+		piece = a.piece;
+		file_mode = file_mode_for_job(j);
+		end_offset = job_offset + a.buffer_size;
+		++idx;
+	}
+
+	if (count > 0)
+	{
+		DLOG("write: blocks: %d (piece: %d)\n", count, int(piece));
+		storage->write(m_settings, iovec.first(count)
+			, piece, start_offset, file_mode, flags, error);
+
+		int i = start_idx;
+		for (aux::cached_block_entry const& blk : blocks.subspan(start_idx, count))
+		{
+			auto* j = blk.write_job;
+			TORRENT_ASSERT(j);
+			TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
+			j->error = error;
+			flushed.set_bit(i);
+			completed_jobs.push_back(j);
+			++i;
+		}
+		// TODO: if we failed, post the remaining block's jobs as failures too
+		if (error) failed = true;
+		else ret += count;
+	}
+
+	if (!failed)
+	{
+		std::int64_t const write_time = total_microseconds(clock_type::now() - start_time);
+
+		m_stats_counters.inc_stats_counter(counters::num_blocks_written, blocks.size());
+		m_stats_counters.inc_stats_counter(counters::num_write_ops);
+		m_stats_counters.inc_stats_counter(counters::disk_write_time, write_time);
+		m_stats_counters.inc_stats_counter(counters::disk_job_time, write_time);
+	}
+
+	// TODO: put this in an RAII object
+	m_stats_counters.inc_stats_counter(counters::num_writing_threads, -1);
+	m_stats_counters.inc_stats_counter(counters::num_running_disk_jobs, -1);
+
+	return ret;
+}
+
+void pread_disk_io::clear_piece_jobs(jobqueue_t aborted, aux::pread_disk_job* clear)
+{
+	m_completed_jobs.abort_jobs(m_ios, std::move(aborted));
+	jobqueue_t jobs;
+	jobs.push_back(clear);
+	add_completed_jobs(std::move(jobs));
+}
+
+void pread_disk_io::try_flush_cache(int const target_cache_size
+	, std::unique_lock<std::mutex>& l)
+{
+	DLOG("flushing, cache target: %d (current size: %d currently flushing: %d)\n"
+		, target_cache_size, m_cache.size(), m_cache.num_flushing());
+	l.unlock();
+	jobqueue_t completed_jobs;
+	m_cache.flush_to_disk(
+		[&](bitfield& flushed, span<aux::cached_block_entry const> blocks, int const hash_cursor) {
+			return flush_cache_blocks(flushed, blocks, hash_cursor, completed_jobs);
+		}
+		, target_cache_size
+		, [&](jobqueue_t aborted, aux::pread_disk_job* clear) {
+			clear_piece_jobs(std::move(aborted), clear);
+		});
+	l.lock();
+	DLOG("flushed blocks (%d blocks left), return to disk loop\n", m_cache.size());
+	if (!completed_jobs.empty())
+		add_completed_jobs(std::move(completed_jobs));
+}
+
+void pread_disk_io::flush_storage(std::shared_ptr<aux::pread_storage> const& storage)
+{
+	storage_index_t const torrent = storage->storage_index();
+	DLOG("flush_storage (%d)\n", torrent);
+	jobqueue_t completed_jobs;
+	m_cache.flush_storage(
+		[&](bitfield& flushed, span<aux::cached_block_entry const> blocks, int const hash_cursor) {
+			return flush_cache_blocks(flushed, blocks, hash_cursor, completed_jobs);
+		}
+		, torrent
+		, [&](jobqueue_t aborted, aux::pread_disk_job* clear) {
+			clear_piece_jobs(std::move(aborted), clear);
+		});
+	DLOG("flush_storage - done (%d left)\n", m_cache.size());
+	if (!completed_jobs.empty())
+		add_completed_jobs(std::move(completed_jobs));
+}
+
+void pread_disk_io::thread_fun(aux::disk_io_thread_pool& pool
+	, executor_work_guard<io_context::executor_type> work)
+{
+	// work is used to keep the io_context alive
+	TORRENT_UNUSED(work);
+
+	ADD_OUTSTANDING_ASYNC("pread_disk_io::work");
+	std::thread::id const thread_id = std::this_thread::get_id();
+
+	aux::set_thread_name("Disk");
+
+	DLOG("started disk thread\n");
+
+	std::unique_lock<std::mutex> l(m_job_mutex);
+
+	++m_num_running_threads;
+	m_stats_counters.inc_stats_counter(counters::num_running_threads, 1);
+
+	// we call close_oldest_file on the file_pool regularly. This is the next
+	// time we should call it
+	time_point next_close_oldest_file = min_time();
+
+	for (;;)
+	{
+		auto const res = pool.wait_for_job(l);
+
+		// if we need to flush the cache, let one of the generic threads do
+		// that
+		if (m_flush_target && &pool == &m_generic_threads)
+		{
+			int const target_cache_size = *std::exchange(m_flush_target, std::nullopt);
+			DLOG("try_flush_cache(%d)\n", target_cache_size);
+			try_flush_cache(target_cache_size, l);
+			continue;
+		}
+
+		if (res == aux::wait_result::exit_thread)
+		{
+			DLOG("exit disk loop\n");
+			break;
+		}
+
+		if (res != aux::wait_result::new_job)
+		{
+			DLOG("continue disk loop\n");
+			continue;
+		}
+
+		auto* j = static_cast<aux::pread_disk_job*>(pool.pop_front());
+		l.unlock();
+
+		TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+
+		if (&pool == &m_generic_threads && thread_id == pool.first_thread_id())
+		{
+			time_point const now = aux::time_now();
+			{
+				std::unique_lock<std::mutex> l2(m_need_tick_mutex);
+				while (!m_need_tick.empty() && m_need_tick.front().first < now)
+				{
+					std::shared_ptr<aux::pread_storage> st = m_need_tick.front().second.lock();
+					m_need_tick.erase(m_need_tick.begin());
+					if (st)
+					{
+						l2.unlock();
+						st->tick();
+						l2.lock();
+					}
+				}
+			}
+
+			if (now > next_close_oldest_file)
+			{
+				seconds const interval(m_settings.get_int(settings_pack::close_file_interval));
+				if (interval <= seconds(0))
+				{
+					// check again in one minute, in case the setting changed
+					next_close_oldest_file = now + minutes(1);
+				}
+				else
+				{
+					next_close_oldest_file = now + interval;
+					m_file_pool.close_oldest();
+				}
+			}
+		}
+
+		execute_job(j);
+
+		l.lock();
+	}
+
+	// do cleanup in the last running thread
+	// if we're not aborting, that means we just configured the thread pool to
+	// not have any threads (i.e. perform all disk operations in the network
+	// thread). In this case, the cleanup will happen in abort().
+
+	int const threads_left = --m_num_running_threads;
+	if (threads_left > 0 || !m_abort)
+	{
+		DLOG("exiting disk thread. num_threads: %d aborting: %d\n"
+			, threads_left, int(m_abort));
+		m_stats_counters.inc_stats_counter(counters::num_running_threads, -1);
+		COMPLETE_ASYNC("pread_disk_io::work");
+		return;
+	}
+
+	DLOG("last thread alive. (left: %d) cleaning up. (generic-jobs: %d hash-jobs: %d)\n"
+		, threads_left
+		, m_generic_threads.queue_size()
+		, m_hash_threads.queue_size());
+
+	// flush everything before exiting this thread
+	try_flush_cache(0, l);
+
+	// it is important to hold the job mutex while calling try_thread_exit()
+	// and continue to hold it until checking m_abort above so that abort()
+	// doesn't inadvertently trigger the code below when it thinks there are no
+	// more disk I/O threads running
+	l.unlock();
+
+	// at this point, there are no queued jobs left. However, the main
+	// thread is still running and may still have peer_connections
+	// that haven't fully destructed yet, reclaiming their references
+	// to read blocks in the disk cache. We need to wait until all
+	// references are removed from other threads before we can go
+	// ahead with the cleanup.
+	// This is not supposed to happen because the disk thread is now scheduled
+	// for shut down after all peers have shut down (see
+	// session_impl::abort_stage2()).
+
+	DLOG("the last disk thread alive. cleaning up\n");
+
+	abort_jobs();
+
+	m_stats_counters.inc_stats_counter(counters::num_running_threads, -1);
+	COMPLETE_ASYNC("pread_disk_io::work");
+}
+
+void pread_disk_io::abort_jobs()
+{
+	DLOG("pread_disk_io::abort_jobs\n");
+
+	if (m_jobs_aborted.test_and_set()) return;
+
+	// close all files. This may take a long
+	// time on certain OSes (i.e. Mac OS)
+	// that's why it's important to do this in
+	// the disk thread in parallel with stopping
+	// trackers.
+	m_file_pool.release();
+}
+
+int pread_disk_io::num_threads() const
+{
+	return m_generic_threads.max_threads() + m_hash_threads.max_threads();
+}
+
+aux::disk_io_thread_pool& pread_disk_io::pool_for_job(aux::pread_disk_job* j)
+{
+	if (m_hash_threads.max_threads() > 0
+		&& (j->get_type() == aux::job_action_t::hash
+			|| j->get_type() == aux::job_action_t::hash2
+			|| j->get_type() == aux::job_action_t::kick_hasher))
+		return m_hash_threads;
+	else
+		return m_generic_threads;
+}
+
+void pread_disk_io::add_completed_jobs(jobqueue_t jobs)
+{
+	jobqueue_t completed = std::move(jobs);
+	do
+	{
+		// when a job completes, it's possible for it to cause
+		// a fence to be lowered, issuing the jobs queued up
+		// behind the fence
+		jobqueue_t new_jobs;
+		add_completed_jobs_impl(std::move(completed), new_jobs);
+		TORRENT_ASSERT(completed.empty());
+		completed = std::move(new_jobs);
+	} while (!completed.empty());
+}
+
+void pread_disk_io::add_completed_jobs_impl(jobqueue_t jobs, jobqueue_t& completed)
+{
+	jobqueue_t new_jobs;
+	int ret = 0;
+	for (auto i = jobs.iterate(); i.get(); i.next())
+	{
+		auto* j = static_cast<aux::pread_disk_job*>(i.get());
+
+		if (j->flags & aux::disk_job::fence)
+		{
+			m_stats_counters.inc_stats_counter(
+				counters::num_fenced_read + static_cast<int>(j->get_type()), -1);
+		}
+
+		if (j->flags & aux::disk_job::in_progress)
+		{
+			TORRENT_ASSERT(j->storage);
+			if (j->storage)
+				ret += j->storage->job_complete(j, new_jobs);
+		}
+
+		TORRENT_ASSERT(ret == new_jobs.size());
+		TORRENT_ASSERT(!(j->flags & aux::disk_job::in_progress));
+#if TORRENT_USE_ASSERTS
+		TORRENT_ASSERT(j->job_posted == false);
+		j->job_posted = true;
+#endif
+	}
+
+	if (ret)
+	{
+		DLOG("unblocked %d jobs (%d left)\n", ret
+			, int(m_stats_counters[counters::blocked_disk_jobs]) - ret);
+	}
+
+	m_stats_counters.inc_stats_counter(counters::blocked_disk_jobs, -ret);
+	TORRENT_ASSERT(int(m_stats_counters[counters::blocked_disk_jobs]) >= 0);
+
+	if (m_abort.load())
+	{
+		while (!new_jobs.empty())
+		{
+			auto* j = static_cast<aux::pread_disk_job*>(new_jobs.pop_front());
+			TORRENT_ASSERT((j->flags & aux::disk_job::in_progress) || !j->storage);
+			j->ret = disk_status::fatal_disk_error;
+			j->error = storage_error(boost::asio::error::operation_aborted);
+			completed.push_back(j);
+		}
+	}
+	else
+	{
+		if (!new_jobs.empty())
+		{
+			{
+				std::lock_guard<std::mutex> l(m_job_mutex);
+				m_generic_threads.append(std::move(new_jobs));
+			}
+
+			{
+				std::lock_guard<std::mutex> l(m_job_mutex);
+				m_generic_threads.submit_jobs();
+			}
+		}
+	}
+
+	m_completed_jobs.append(m_ios, std::move(jobs));
+}
+
+}
diff --git a/src/pread_storage.cpp b/src/pread_storage.cpp
new file mode 100644
index 00000000000..a5e11b99817
--- /dev/null
+++ b/src/pread_storage.cpp
@@ -0,0 +1,797 @@
+/*
+
+Copyright (c) 2022, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#include "libtorrent/config.hpp"
+#include "libtorrent/error_code.hpp"
+#include "libtorrent/aux_/storage_utils.hpp"
+#include "libtorrent/hasher.hpp"
+
+#include <ctime>
+#include <algorithm>
+#include <numeric>
+#include <set>
+#include <functional>
+#include <cstdio>
+
+#include "libtorrent/aux_/pread_storage.hpp"
+#include "libtorrent/aux_/torrent.hpp"
+#include "libtorrent/aux_/path.hpp"
+#include "libtorrent/aux_/invariant_check.hpp"
+#include "libtorrent/aux_/session_impl.hpp"
+#include "libtorrent/aux_/file_pool.hpp"
+#include "libtorrent/aux_/file.hpp" // for file_handle, pread_all, pwrite_all
+#include "libtorrent/disk_buffer_holder.hpp"
+#include "libtorrent/aux_/stat_cache.hpp"
+#include "libtorrent/hex.hpp" // to_hex
+
+#include <sys/types.h>
+
+#if (TORRENT_HAS_FADVISE && defined POSIX_FADV_DONTNEED)
+#include <fcntl.h>
+#endif
+
+#if defined TORRENT_LINUX && defined SYNC_FILE_RANGE_WRITE
+#include <fcntl.h> // for sync_file_range
+#elif defined TORRENT_WINDOWS
+#include "libtorrent/aux_/windows.hpp" // for FlushFileBuffers
+#elif defined TORRENT_BSD && ! defined __APPLE__
+#include <unistd.h> // for fsync_range
+#else
+#include <unistd.h> // for fsync
+#endif
+
+namespace libtorrent::aux {
+
+namespace {
+
+	// TODO: move this to aux_/file.hpp
+	void advise_dont_need(handle_type handle, std::int64_t offset, std::int64_t len)
+	{
+#if (TORRENT_HAS_FADVISE && defined POSIX_FADV_DONTNEED)
+		::posix_fadvise(handle, offset, len, POSIX_FADV_DONTNEED);
+#else
+		TORRENT_UNUSED(handle);
+		TORRENT_UNUSED(offset);
+		TORRENT_UNUSED(len);
+#endif
+	}
+
+	// TODO: move this to aux_/file.hpp
+	void sync_file(handle_type handle, std::int64_t offset, std::int64_t len)
+	{
+#if defined TORRENT_LINUX && defined SYNC_FILE_RANGE_WRITE
+		::sync_file_range(handle, offset, len, SYNC_FILE_RANGE_WRITE);
+#elif defined TORRENT_WINDOWS
+		::FlushFileBuffers(handle);
+		TORRENT_UNUSED(offset);
+		TORRENT_UNUSED(len);
+#elif defined TORRENT_BSD && ! defined __APPLE__
+		::fsync_range(handle, FFILESYNC, offset, len);
+#else
+		::fsync(handle);
+		TORRENT_UNUSED(offset);
+		TORRENT_UNUSED(len);
+#endif
+	}
+}
+
+	pread_storage::pread_storage(storage_params const& params
+		, file_pool& pool)
+		: m_files(params.files)
+		, m_file_priority(params.priorities)
+		, m_save_path(complete(params.path))
+		, m_part_file_name("." + to_hex(params.info_hash) + ".parts")
+		, m_pool(pool)
+		, m_allocate_files(params.mode == storage_mode_allocate)
+		, m_v1(params.v1)
+		, m_v2(params.v2)
+	{
+		// a torrent must be either v1 or v2 (or both)
+		TORRENT_ASSERT(m_v1 || m_v2);
+		if (params.mapped_files) m_mapped_files = std::make_unique<file_storage>(*params.mapped_files);
+
+		TORRENT_ASSERT(files().num_files() > 0);
+	}
+
+	pread_storage::~pread_storage()
+	{
+		error_code ec;
+		if (m_part_file) m_part_file->flush_metadata(ec);
+
+		// this may be called from a different
+		// thread than the disk thread
+		m_pool.release(storage_index());
+	}
+
+	void pread_storage::need_partfile()
+	{
+		if (m_part_file) return;
+
+		m_part_file = std::make_unique<part_file>(
+			m_save_path, m_part_file_name
+			, files().num_pieces(), files().piece_length());
+	}
+
+	void pread_storage::set_file_priority(settings_interface const& sett
+		, vector<download_priority_t, file_index_t>& prio
+		, storage_error& ec)
+	{
+		// extend our file priorities in case it's truncated
+		// the assumed priority for the new entries is 4 (the default)
+		if (prio.size() > m_file_priority.size())
+			m_file_priority.resize(prio.size(), default_priority);
+
+		file_storage const& fs = files();
+		for (file_index_t i(0); i < prio.end_index(); ++i)
+		{
+			// pad files always have priority 0.
+			if (fs.pad_file_at(i)) continue;
+
+			download_priority_t const old_prio = m_file_priority[i];
+			download_priority_t new_prio = prio[i];
+			if (old_prio == dont_download && new_prio != dont_download)
+			{
+				// move stuff out of the part file
+				auto f = open_file(sett, i, open_mode::write, ec);
+				if (ec)
+				{
+					prio = m_file_priority;
+					return;
+				}
+				TORRENT_ASSERT(f);
+
+				if (m_part_file && use_partfile(i))
+				{
+					try
+					{
+						m_part_file->export_file([&f](std::int64_t file_offset, span<char> buf)
+						{
+							do {
+								error_code err;
+								int const r = pwrite_all(f->fd(), buf, file_offset, err);
+								if (err)
+									throw_ex<lt::system_error>(err);
+								buf = buf.subspan(r);
+								file_offset += r;
+							} while (buf.size() > 0);
+						}, fs.file_offset(i), fs.file_size(i), ec.ec);
+						if (ec)
+						{
+							ec.file(i);
+							ec.operation = operation_t::partfile_write;
+							prio = m_file_priority;
+							return;
+						}
+					}
+					catch (lt::system_error const& err)
+					{
+						ec.file(i);
+						ec.operation = operation_t::partfile_write;
+						ec.ec = err.code();
+						return;
+					}
+				}
+			}
+			else if (old_prio != dont_download && new_prio == dont_download)
+			{
+				// move stuff into the part file.
+				// this is not implemented yet, so we just
+				// don't use a partfile for this file
+
+				std::string const fp = fs.file_path(i, m_save_path);
+				bool const file_exists = exists(fp, ec.ec);
+				if (ec.ec)
+				{
+					ec.file(i);
+					ec.operation = operation_t::file_stat;
+					prio = m_file_priority;
+					return;
+				}
+				use_partfile(i, !file_exists);
+			}
+			ec.ec.clear();
+			m_file_priority[i] = new_prio;
+
+			if (m_file_priority[i] == dont_download && use_partfile(i))
+			{
+				need_partfile();
+			}
+		}
+		if (m_part_file) m_part_file->flush_metadata(ec.ec);
+		if (ec)
+		{
+			ec.file(torrent_status::error_file_partfile);
+			ec.operation = operation_t::partfile_write;
+		}
+	}
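
The do/while loop inside the export_file callback is the standard short-write
pattern: a positional write may transfer fewer bytes than asked for, so the
caller advances the offset and retries with the remaining span until the
buffer is drained. The same pattern directly on top of pwrite(2), as a reduced
sketch (POSIX only, errors collapsed to a bool):

    #include <unistd.h>
    #include <cerrno>
    #include <cstddef>

    bool write_fully(int fd, char const* buf, std::size_t len, off_t offset)
    {
        while (len > 0)
        {
            ssize_t const r = ::pwrite(fd, buf, len, offset);
            if (r < 0)
            {
                if (errno == EINTR) continue; // interrupted; retry
                return false;
            }
            // advance past the bytes that made it to the file
            buf += r;
            len -= static_cast<std::size_t>(r);
            offset += r;
        }
        return true;
    }
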
+
+	bool pread_storage::use_partfile(file_index_t const index) const
+	{
+		TORRENT_ASSERT_VAL(index >= file_index_t{}, index);
+		if (index >= m_use_partfile.end_index()) return true;
+		return m_use_partfile[index];
+	}
+
+	void pread_storage::use_partfile(file_index_t const index, bool const b)
+	{
+		if (index >= m_use_partfile.end_index())
+		{
+			// no need to extend this array if we're just setting it to "true",
+			// that's the default already
+			if (b) return;
+			m_use_partfile.resize(static_cast<int>(index) + 1, true);
+		}
+		m_use_partfile[index] = b;
+	}
+
+	status_t pread_storage::initialize(settings_interface const& sett, storage_error& ec)
+	{
+		m_stat_cache.reserve(files().num_files());
+
+#ifdef TORRENT_WINDOWS
+		// don't do full file allocations on network drives
+		auto const file_name = convert_to_native_path_string(m_save_path);
+		int const drive_type = GetDriveTypeW(file_name.c_str());
+
+		if (drive_type == DRIVE_REMOTE)
+			m_allocate_files = false;
+#endif
+		{
+			std::unique_lock<std::mutex> l(m_file_created_mutex);
+			m_file_created.resize(files().num_files(), false);
+		}
+
+		file_storage const& fs = files();
+		status_t ret{};
+		// if some files have priority 0, we need to check if they exist on the
+		// filesystem, in which case we won't use a partfile for them.
+		// this is to be backwards compatible with previous versions of
+		// libtorrent, when part files were not supported.
+		for (file_index_t i(0); i < m_file_priority.end_index(); ++i)
+		{
+			if (m_file_priority[i] != dont_download || fs.pad_file_at(i))
+				continue;
+
+			error_code err;
+			auto const size = m_stat_cache.get_filesize(i, fs, m_save_path, err);
+			if (!err && size > 0)
+			{
+				use_partfile(i, false);
+				if (size > fs.file_size(i))
+					ret = ret | disk_status::oversized_file;
+			}
+			else
+			{
+				// we may have earlier determined we *can't* use a partfile for
+				// this file; we need to be able to change our mind in case the
+				// file disappeared
+				use_partfile(i, true);
+				need_partfile();
+			}
+		}
+
+		initialize_storage(fs, m_save_path, m_stat_cache, m_file_priority
+			, [&sett, this](file_index_t const file_index, storage_error& e)
+			{ open_file(sett, file_index, open_mode::write, e); }
+			, create_symlink
+			, [&ret](file_index_t, std::int64_t) { ret = ret | disk_status::oversized_file; }
+			, ec);
+
+		// close files that were opened in write mode
+		m_pool.release(storage_index());
+		return ret;
+	}
+
+	bool pread_storage::has_any_file(storage_error& ec)
+	{
+		m_stat_cache.reserve(files().num_files());
+
+		if (aux::has_any_file(files(), m_save_path, m_stat_cache, ec))
+			return true;
+
+		if (ec) return false;
+
+		file_status s;
+		stat_file(combine_path(m_save_path, m_part_file_name), &s, ec.ec);
+		if (!ec) return true;
+
+		// the part file not existing is expected
+		if (ec.ec == boost::system::errc::no_such_file_or_directory)
+			ec.ec.clear();
+
+		if (ec)
+		{
+			ec.file(torrent_status::error_file_partfile);
+			ec.operation = operation_t::file_stat;
+		}
+		return false;
+	}
+
+	void pread_storage::rename_file(file_index_t const index, std::string const& new_filename
+		, storage_error& ec)
+	{
+		if (index < file_index_t(0) || index >= files().end_file()) return;
+		std::string const old_name = files().file_path(index, m_save_path);
+		m_pool.release(storage_index(), index);
+
+		// if the old file doesn't exist, just succeed and change the filename
+		// that will be created. This shortcut is important because the
+		// destination directory may not exist yet, which would cause a failure
+		// even though we're not moving a file (yet). It's better for it to
+		// fail later when we try to write to the file the first time, because
+		// the user then will have had a chance to make the destination directory
+		// valid.
+		if (exists(old_name, ec.ec))
+		{
+			std::string new_path;
+			if (is_complete(new_filename)) new_path = new_filename;
+			else new_path = combine_path(m_save_path, new_filename);
+			std::string new_dir = parent_path(new_path);
+
+			// create any missing directories that the new filename
+			// lands in
+			create_directories(new_dir, ec.ec);
+			if (ec.ec)
+			{
+				ec.file(index);
+				ec.operation = operation_t::file_rename;
+				return;
+			}
+
+			rename(old_name, new_path, ec.ec);
+
+			// if old_name doesn't exist, that's not an error
+			// here. Once we start writing to the file, it will
+			// be written to the new filename
+			if (ec.ec == boost::system::errc::no_such_file_or_directory)
+				ec.ec.clear();
+
+			if (ec)
+			{
+				ec.ec.clear();
+				copy_file(old_name, new_path, ec);
+
+				if (ec)
+				{
+					ec.file(index);
+					return;
+				}
+
+				error_code ignore;
+				remove(old_name, ignore);
+			}
+		}
+		else if (ec.ec)
+		{
+			// if exists fails, report that error
+			ec.file(index);
+			ec.operation = operation_t::file_rename;
+			return;
+		}
+
+		// if old path doesn't exist, just rename the file
+		// in our file_storage, so that when it is created
+		// it will get the new name
+		if (!m_mapped_files)
+		{ m_mapped_files = std::make_unique<file_storage>(files()); }
+		m_mapped_files->rename_file(index, new_filename);
+	}
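
rename_file() first attempts an atomic rename and only then falls back to
copy-and-delete, which is what makes moves across filesystems (where rename(2)
fails with EXDEV) work. A sketch of the same fallback using std::filesystem
instead of libtorrent's path helpers (C++17, illustrative only):

    #include <filesystem>
    #include <system_error>
    namespace fs = std::filesystem;

    bool move_file(fs::path const& from, fs::path const& to)
    {
        std::error_code ec;
        fs::create_directories(to.parent_path(), ec);

        // fast path: atomic rename on the same filesystem
        fs::rename(from, to, ec);
        if (!ec) return true;

        // slow path: copy, then best-effort removal of the source
        ec.clear();
        fs::copy_file(from, to, fs::copy_options::overwrite_existing, ec);
        if (ec) return false;
        fs::remove(from, ec);
        return true;
    }
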
+
+	void pread_storage::release_files(storage_error&)
+	{
+		if (m_part_file)
+		{
+			error_code ignore;
+			m_part_file->flush_metadata(ignore);
+		}
+
+		// make sure we don't have the files open
+		m_pool.release(storage_index());
+
+		// make sure we can pick up new files added to the download directory when
+		// we start the torrent again
+		m_stat_cache.clear();
+	}
+
+	void pread_storage::delete_files(remove_flags_t const options, storage_error& ec)
+	{
+		// make sure we don't have the files open
+		m_pool.release(storage_index());
+
+		// if there's a part file open, make sure to destruct it to have it
+		// release the underlying part file. Otherwise we may not be able to
+		// delete it
+		if (m_part_file) m_part_file.reset();
+
+		aux::delete_files(files(), m_save_path, m_part_file_name, options, ec);
+	}
+
+	bool pread_storage::verify_resume_data(add_torrent_params const& rd
+		, aux::vector<std::string, file_index_t> const& links
+		, storage_error& ec)
+	{
+		return aux::verify_resume_data(rd, links, files()
+			, m_file_priority, m_stat_cache, m_save_path, ec);
+	}
+
+	std::pair<status_t, std::string> pread_storage::move_storage(std::string save_path
+		, move_flags_t const flags, storage_error& ec)
+	{
+		m_pool.release(storage_index());
+
+		status_t ret;
+		auto move_partfile = [&](std::string const& new_save_path, error_code& e)
+		{
+			if (!m_part_file) return;
+			m_part_file->move_partfile(new_save_path, e);
+		};
+		std::tie(ret, m_save_path) = aux::move_storage(files(), m_save_path, std::move(save_path)
+			, std::move(move_partfile), flags, ec);
+
+		// clear the stat cache in case the new location has new files
+		m_stat_cache.clear();
+
+		return { ret, m_save_path };
+	}
+
+	int pread_storage::read(settings_interface const& sett
+		, span<char> buffer
+		, piece_index_t const piece, int const offset
+		, open_mode_t const mode
+		, disk_job_flags_t const flags
+		, storage_error& error)
+	{
+#ifdef TORRENT_SIMULATE_SLOW_READ
+		std::this_thread::sleep_for(seconds(1));
+#endif
+		return readwrite(files(), buffer, piece, offset, error
+			, [this, mode, flags, &sett](file_index_t const file_index
+				, std::int64_t const file_offset
+				, span<char> buf, storage_error& ec)
+		{
+			// reading from a pad file yields zeroes
+			if (files().pad_file_at(file_index)) return read_zeroes(buf);
+
+			if (file_index < m_file_priority.end_index()
+				&& m_file_priority[file_index] == dont_download
+				&& use_partfile(file_index))
+			{
+				TORRENT_ASSERT(m_part_file);
+
+				error_code e;
+				peer_request map = files().map_file(file_index, file_offset, 0);
+				int const ret = m_part_file->read(buf, map.piece, map.start, e);
+
+				if (e)
+				{
+					ec.ec = e;
+					ec.file(file_index);
+					ec.operation = operation_t::partfile_read;
+					return -1;
+				}
+				return ret;
+			}
+
+			auto handle = open_file(sett, file_index, mode, ec);
+			if (ec) return -1;
+
+			// set this unconditionally in case the upper layer would like to treat
+			// short reads as errors
+			ec.operation = operation_t::file_read;
+
+			int const ret = pread_all(handle->fd(), buf, file_offset, ec.ec);
+			if (ec.ec) {
+				ec.file(file_index);
+				return ret;
+			}
+			if (flags & disk_interface::volatile_read)
+				advise_dont_need(handle->fd(), file_offset, buf.size());
+
+			return ret;
+		});
+	}
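
The readwrite() helper (from aux_/readwrite.hpp) is what turns the
(piece, offset, buffer) triple into one callback per file range: the global
byte position is piece * piece_length + offset, and the file containing it is
found from the cumulative file sizes. A toy version of that mapping over a
plain vector of sizes (file_storage::map_file() does the real work):

    #include <cstdint>
    #include <vector>

    struct file_slice { int file_index; std::int64_t file_offset; };

    // map a global torrent offset to the containing file and the
    // offset within that file
    file_slice map_offset(std::vector<std::int64_t> const& file_sizes
        , std::int64_t global)
    {
        int idx = 0;
        for (std::int64_t const size : file_sizes)
        {
            if (global < size) return {idx, global};
            global -= size;
            ++idx;
        }
        return {-1, 0}; // past the end of the torrent
    }

For example, with file sizes {100, 50}, a global offset of 120 lands in
file 1 at offset 20.
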
+
+	int pread_storage::write(settings_interface const& sett
+		, span<span<char> const> buffers
+		, piece_index_t const piece, int offset
+		, open_mode_t const mode
+		, disk_job_flags_t const flags
+		, storage_error& error)
+	{
+		for (auto const& buf : buffers)
+		{
+			write(sett, buf, piece, offset, mode, flags, error);
+			offset += int(buf.size());
+			if (error) return offset;
+		}
+		return offset;
+	}
+
+	int pread_storage::write(settings_interface const& sett
+		, span<char> buffer
+		, piece_index_t const piece, int const offset
+		, open_mode_t const mode
+		, disk_job_flags_t
+		, storage_error& error)
+	{
+		auto const write_mode = sett.get_int(settings_pack::disk_io_write_mode);
+		return readwrite(files(), buffer, piece, offset, error
+			, [this, mode, &sett, write_mode](file_index_t const file_index
+				, std::int64_t const file_offset
+				, span<char> buf, storage_error& ec)
+		{
+			// writing to a pad-file is a no-op
+			if (files().pad_file_at(file_index))
+				return int(buf.size());
+
+			if (file_index < m_file_priority.end_index()
+				&& m_file_priority[file_index] == dont_download
+				&& use_partfile(file_index))
+			{
+				TORRENT_ASSERT(m_part_file);
+
+				error_code e;
+				peer_request map = files().map_file(file_index
+					, file_offset, 0);
+				int const ret = m_part_file->write(buf, map.piece, map.start, e);
+
+				if (e)
+				{
+					ec.ec = e;
+					ec.file(file_index);
+					ec.operation = operation_t::partfile_write;
+					return -1;
+				}
+				return ret;
+			}
+
+			// invalidate our stat cache for this file, since
+			// we're writing to it
+			m_stat_cache.set_dirty(file_index);
+
+			auto handle = open_file(sett, file_index, open_mode::write | mode, ec);
+			if (ec) return -1;
+			TORRENT_ASSERT(handle);
+
+			// set this unconditionally in case the upper layer would like to treat
+			// short writes as errors
+			ec.operation = operation_t::file_write;
+
+			int const ret = pwrite_all(handle->fd(), buf, file_offset, ec.ec);
+			if (ec.ec)
+			{
+				ec.file(file_index);
+				ec.operation = operation_t::file_write;
+				return ret;
+			}
+			if (write_mode == settings_pack::write_through)
+				sync_file(handle->fd(), file_offset, buf.size());
+			return ret;
+		});
+	}
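
In the write_through branch above, every successful write is chased with
sync_file(), so job completion implies the data has at least been queued to
the device rather than parked in the page cache. On Linux that pairing looks
like the sketch below; note that SYNC_FILE_RANGE_WRITE only initiates
writeback of the dirty range, it does not wait for it to finish
(Linux-specific sketch, short writes not retried):

    #define _GNU_SOURCE
    #include <fcntl.h>  // sync_file_range
    #include <unistd.h> // pwrite
    #include <cstddef>

    bool write_through(int fd, char const* buf, std::size_t len, off_t offset)
    {
        if (::pwrite(fd, buf, len, offset) != static_cast<ssize_t>(len))
            return false;
        // kick off writeback for exactly the bytes just written
        return ::sync_file_range(fd, offset, len, SYNC_FILE_RANGE_WRITE) == 0;
    }
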
+
+	int pread_storage::hash(settings_interface const& sett
+		, hasher& ph, std::ptrdiff_t const len
+		, piece_index_t const piece, int const offset
+		, open_mode_t const mode
+		, disk_job_flags_t const flags
+		, storage_error& error)
+	{
+#ifdef TORRENT_SIMULATE_SLOW_READ
+		std::this_thread::sleep_for(seconds(1));
+#endif
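+		// note: this one-byte buffer is never dereferenced; readwrite()
+		// only uses the span's length to slice the request into file
+		// ranges, and the callback below reads into scratch_buffer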
+		char dummy = 0;
+
+		std::vector<char> scratch_buffer;
+
+		return readwrite(files(), {&dummy, len}, piece, offset, error
+			, [this, mode, flags, &ph, &sett, &scratch_buffer](
+				file_index_t const file_index
+				, std::int64_t const file_offset
+				, span<char> buf, storage_error& ec)
+		{
+			if (files().pad_file_at(file_index))
+				return hash_zeroes(ph, buf.size());
+
+			if (file_index < m_file_priority.end_index()
+				&& m_file_priority[file_index] == dont_download
+				&& use_partfile(file_index))
+			{
+				error_code e;
+				peer_request map = files().map_file(file_index, file_offset, 0);
+				int const ret = m_part_file->hash(ph, buf.size()
+					, map.piece, map.start, e);
+
+				if (e)
+				{
+					ec.ec = e;
+					ec.file(file_index);
+					ec.operation = operation_t::partfile_read;
+				}
+				return ret;
+			}
+
+			auto handle = open_file(sett, file_index, mode, ec);
+			if (ec) return -1;
+
+			scratch_buffer.resize(std::size_t(buf.size()));
+			int ret = pread_all(handle->fd(), scratch_buffer, file_offset, ec.ec);
+			if (ec.ec)
+			{
+				ec.file(file_index);
+				ec.operation = operation_t::file_read;
+				return ret;
+			}
+			if (ret >= 0)
+			{
+				ph.update(scratch_buffer);
+				if (flags & disk_interface::volatile_read)
+					advise_dont_need(handle->fd(), file_offset, buf.size());
+				if (flags & disk_interface::flush_piece)
+					sync_file(handle->fd(), file_offset, buf.size());
+			}
+
+			return ret;
+		});
+	}
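
Both hash() and hash2() drive the hasher incrementally: each file range is
read into a scratch buffer and fed to update(), and final() is only called
once the whole piece has been consumed. The same usage of the public hasher
API in isolation (the chunked data source here is made up):

    #include "libtorrent/hasher.hpp"
    #include "libtorrent/sha1_hash.hpp"
    #include <vector>

    namespace lt = libtorrent;

    // update() accumulates state chunk by chunk; final() yields the
    // SHA-1 digest of everything fed in so far
    lt::sha1_hash hash_chunks(std::vector<std::vector<char>> const& chunks)
    {
        lt::hasher ph;
        for (auto const& c : chunks)
            ph.update(c);
        return ph.final();
    }
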
+
+	int pread_storage::hash2(settings_interface const& sett
+		, hasher256& ph, std::ptrdiff_t const len
+		, piece_index_t const piece, int const offset
+		, open_mode_t const mode
+		, disk_job_flags_t const flags
+		, storage_error& error)
+	{
+		std::int64_t const start_offset = static_cast<int>(piece) * std::int64_t(files().piece_length()) + offset;
+		file_index_t const file_index = files().file_index_at_offset(start_offset);
+		std::int64_t const file_offset = start_offset - files().file_offset(file_index);
+		TORRENT_ASSERT(file_offset >= 0);
+		TORRENT_ASSERT(!files().pad_file_at(file_index));
+
+		if (file_index < m_file_priority.end_index()
+			&& m_file_priority[file_index] == dont_download
+			&& use_partfile(file_index))
+		{
+			error_code e;
+			peer_request map = files().map_file(file_index, file_offset, 0);
+			int const ret = m_part_file->hash2(ph, len, map.piece, map.start, e);
+
+			if (e)
+			{
+				error.ec = e;
+				error.file(file_index);
+				error.operation = operation_t::partfile_read;
+				return -1;
+			}
+			return ret;
+		}
+
+		auto handle = open_file(sett, file_index, mode, error);
+		if (error) return -1;
+
+		std::unique_ptr<char[]> scratch_buffer(new char[std::size_t(len)]);
+		span<char> b = {scratch_buffer.get(), len};
+		int const ret = pread_all(handle->fd(), b, file_offset, error.ec);
+		if (error.ec)
+		{
+			error.operation = operation_t::file_read;
+			error.file(file_index);
+			return ret;
+		}
+		ph.update(b);
+		if (flags & disk_interface::volatile_read)
+			advise_dont_need(handle->fd(), file_offset, len);
+		if (flags & disk_interface::flush_piece)
+			sync_file(handle->fd(), file_offset, len);
+
+		return static_cast<int>(len);
+	}
+
+	// a wrapper around open_file_impl that, if it fails, makes sure the
+	// directories have been created and retries
+	std::shared_ptr<file_handle> pread_storage::open_file(settings_interface const& sett
+		, file_index_t const file
+		, open_mode_t mode, storage_error& ec) const
+	{
+		if (mode & open_mode::write
+			&& !(mode & open_mode::truncate))
+		{
+			std::unique_lock<std::mutex> l(m_file_created_mutex);
+			if (m_file_created.size() != files().num_files())
+				m_file_created.resize(files().num_files(), false);
+
+			// if we haven't created this file already, make sure to truncate it to
+			// its final size
+			mode |= (m_file_created[file] == false) ? open_mode::truncate : open_mode::read_only;
+		}
+
+		if (files().file_flags(file) & file_storage::flag_executable)
+			mode |= open_mode::executable;
+
+		if (files().file_flags(file) & file_storage::flag_hidden)
+			mode |= open_mode::hidden;
+
+#ifdef _WIN32
+		if (sett.get_bool(settings_pack::enable_set_file_valid_data))
+		{
+			mode |= open_mode::allow_set_file_valid_data;
+		}
+#endif
+
+		auto h = open_file_impl(sett, file, mode, ec);
+		if (ec.ec)
+		{
+			ec.file(file);
+			return {};
+		}
+		TORRENT_ASSERT(h);
+
+		if (mode & open_mode::truncate)
+		{
+			// remember that we've truncated this file, so we don't have to do it
+			// again
+			std::unique_lock<std::mutex> l(m_file_created_mutex);
+			m_file_created.set_bit(file);
+		}
+
+		// the handle should be set here
+		TORRENT_ASSERT(static_cast<bool>(h));
+		return h;
+	}
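
open_file() amounts to a create-once policy: the first writable open of a
file adds open_mode::truncate so the file is created at its final size, and a
mutex-guarded bitfield records which files have already been through this.
The policy in isolation (reduced sketch; the patch sets the bit only after
the open succeeds, this version marks it eagerly):

    #include <mutex>
    #include <vector>
    #include <cstddef>

    struct create_once
    {
        // returns true exactly once per file index; the caller should
        // truncate the file to its final size on that first open
        bool should_truncate(std::size_t const file_index)
        {
            std::lock_guard<std::mutex> l(m_mutex);
            if (m_created.size() <= file_index)
                m_created.resize(file_index + 1, false);
            if (m_created[file_index]) return false;
            m_created[file_index] = true;
            return true;
        }

        std::mutex m_mutex;
        std::vector<bool> m_created;
    };
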
+
+	std::shared_ptr<file_handle> pread_storage::open_file_impl(settings_interface const& sett
+		, file_index_t file
+		, open_mode_t mode
+		, storage_error& ec) const
+	{
+		TORRENT_ASSERT(!files().pad_file_at(file));
+		if (!m_allocate_files) mode |= open_mode::sparse;
+
+		// files with priority 0 should always be sparse
+		if (m_file_priority.end_index() > file && m_file_priority[file] == dont_download)
+			mode |= open_mode::sparse;
+
+		if (sett.get_bool(settings_pack::no_atime_storage))
+		{
+			mode |= open_mode::no_atime;
+		}
+
+		// if we have a cache already, don't store the data twice by leaving it in the OS cache as well
+		auto const write_mode = sett.get_int(settings_pack::disk_io_write_mode);
+		if (write_mode == settings_pack::disable_os_cache
+			|| write_mode == settings_pack::write_through)
+		{
+			mode |= open_mode::no_cache;
+		}
+
+		try {
+			return m_pool.open_file(storage_index(), m_save_path, file
+				, files(), mode
+#if TORRENT_HAVE_MAP_VIEW_OF_FILE
+				, nullptr
+#endif
+				);
+		}
+		catch (storage_error const& se)
+		{
+			ec = se;
+			ec.file(file);
+			TORRENT_ASSERT(ec);
+			return {};
+		}
+	}
+
+	bool pread_storage::tick()
+	{
+		error_code ec;
+		if (m_part_file) m_part_file->flush_metadata(ec);
+
+		return false;
+	}
+} // namespace libtorrent::aux
diff --git a/src/session.cpp b/src/session.cpp
index 9714d039ea6..ed44ba97b2c 100644
--- a/src/session.cpp
+++ b/src/session.cpp
@@ -19,6 +19,7 @@ see LICENSE file.
 #include "libtorrent/disk_interface.hpp"
 #include "libtorrent/mmap_disk_io.hpp"
 #include "libtorrent/posix_disk_io.hpp"
+#include "libtorrent/pread_disk_io.hpp"
 #include "libtorrent/aux_/platform_util.hpp"
 
 namespace libtorrent {
@@ -491,7 +492,9 @@ namespace {
 	TORRENT_EXPORT std::unique_ptr<disk_interface> default_disk_io_constructor(
 		io_context& ios, settings_interface const& sett, counters& cnt)
 	{
-#if TORRENT_HAVE_MMAP || TORRENT_HAVE_MAP_VIEW_OF_FILE
+#if TORRENT_HAVE_PREAD || defined TORRENT_WINDOWS
+		return pread_disk_io_constructor(ios, sett, cnt);
+#elif TORRENT_HAVE_MMAP || TORRENT_HAVE_MAP_VIEW_OF_FILE
 		// TODO: In C++17. use if constexpr instead
 #include "libtorrent/aux_/disable_deprecation_warnings_push.hpp"
 		if (sizeof(void*) == 8)
diff --git a/src/settings_pack.cpp b/src/settings_pack.cpp
index dc622e5a5ca..459d8b4c843 100644
--- a/src/settings_pack.cpp
+++ b/src/settings_pack.cpp
@@ -244,7 +244,7 @@ namespace {
 		SET(initial_picker_threshold, 4, nullptr),
 		SET(allowed_fast_set_size, 5, nullptr),
 		SET(suggest_mode, settings_pack::no_piece_suggestions, nullptr),
-		SET(max_queued_disk_bytes, 1024 * 1024, nullptr),
+		SET(max_queued_disk_bytes, 50 * 1024 * 1024, nullptr),
 		SET(handshake_timeout, 10, nullptr),
 		SET(send_buffer_low_watermark, 10 * 1024, nullptr),
 		SET(send_buffer_watermark, 500 * 1024, nullptr),
diff --git a/src/torrent.cpp b/src/torrent.cpp
index f0422a7a0d3..70462988748 100644
--- a/src/torrent.cpp
+++ b/src/torrent.cpp
@@ -2545,9 +2545,7 @@ bool is_downloading_state(int const st)
 			// if the v1 hash failed the check, don't add the v2 hashes to the
 			// merkle tree. They are most likely invalid.
 			if (torrent_file().info_hashes().has_v2() && !bool(hash_passed[0] == false))
-			{
-				hash_passed[1] = on_blocks_hashed(piece, block_hashes);
-			}
+			{ hash_passed[1] = on_blocks_hashed(piece, block_hashes); }
 		}
 		else
 		{
@@ -6949,7 +6947,9 @@ namespace {
 				return result.valid;
 			}
 
-			if (m_picker && m_picker->is_downloading(p) && m_picker->is_piece_finished(p)
+			if (m_picker
+				&& m_picker->is_downloading(p)
+				&& !m_picker->has_piece_passed(p)
 				&& !m_picker->is_hashing(p))
 			{
 				piece_passed(p);
diff --git a/test/test_add_torrent.cpp b/test/test_add_torrent.cpp
index cdf64bd5c41..5b19994ae0a 100644
--- a/test/test_add_torrent.cpp
+++ b/test/test_add_torrent.cpp
@@ -9,6 +9,7 @@ see LICENSE file.
 
 #include "test.hpp"
 #include "setup_transfer.hpp" // for load_file
+#include "settings.hpp" // for settings()
 
 #include "libtorrent/flags.hpp"
 #include "libtorrent/alert_types.hpp"
@@ -77,7 +78,7 @@ lt::error_code test_add_torrent(std::string file, add_torrent_test_flag_t const
 		atp.ti.reset();
 	}
 
-	lt::session_params p;
+	lt::session_params p = settings();
 	p.settings.set_int(lt::settings_pack::alert_mask, lt::alert_category::error | lt::alert_category::status);
 	p.settings.set_str(lt::settings_pack::listen_interfaces, "127.0.0.1:6881");
 	lt::session ses(p);
diff --git a/test/test_copy_file.cpp b/test/test_copy_file.cpp
index 789fce4eb16..fe6815dba67 100644
--- a/test/test_copy_file.cpp
+++ b/test/test_copy_file.cpp
@@ -11,6 +11,7 @@ see LICENSE file.
 #include "libtorrent/error_code.hpp"
 #include "libtorrent/aux_/mmap.hpp"
 #include "libtorrent/aux_/open_mode.hpp"
+#include "libtorrent/aux_/storage_utils.hpp"
 #include "test.hpp"
 
 #include <fstream>
diff --git a/test/test_file.cpp b/test/test_file.cpp
index ce6c32b451a..bd06b52b011 100644
--- a/test/test_file.cpp
+++ b/test/test_file.cpp
@@ -18,6 +18,8 @@ see LICENSE file.
 #include "libtorrent/string_view.hpp"
 #include "libtorrent/aux_/file_view_pool.hpp"
 #include "libtorrent/aux_/numeric_cast.hpp"
+#include "libtorrent/aux_/storage_utils.hpp"
+#include "libtorrent/aux_/file_pool_impl.hpp"
 #include "test.hpp"
 #include "test_utils.hpp"
 #include <vector>
diff --git a/test/test_storage.cpp b/test/test_storage.cpp
index c51a81476b7..1594b098167 100644
--- a/test/test_storage.cpp
+++ b/test/test_storage.cpp
@@ -19,7 +19,9 @@ see LICENSE file.
 
 #include "libtorrent/aux_/mmap_storage.hpp"
 #include "libtorrent/aux_/posix_storage.hpp"
+#include "libtorrent/aux_/pread_storage.hpp"
 #include "libtorrent/aux_/file_view_pool.hpp"
+#include "libtorrent/aux_/file_pool.hpp"
 #include "libtorrent/hasher.hpp"
 #include "libtorrent/session.hpp"
 #include "libtorrent/session_params.hpp"
@@ -45,20 +47,13 @@ see LICENSE file.
 using namespace std::placeholders;
 using namespace lt;
 
-#if ! TORRENT_HAVE_MMAP && ! TORRENT_HAVE_MAP_VIEW_OF_FILE
-namespace libtorrent {
-namespace aux {
-	struct file_view_pool {};
-}
-}
-#endif
-
 namespace {
 
 #if TORRENT_HAVE_MMAP || TORRENT_HAVE_MAP_VIEW_OF_FILE
 using lt::aux::mmap_storage;
 #endif
 using lt::aux::posix_storage;
+using lt::aux::pread_storage;
 
 constexpr int piece_size = 16 * 1024 * 16;
 constexpr int half = piece_size / 2;
@@ -186,6 +181,12 @@ struct file_pool_type<posix_storage>
 	using type = int;
 };
 
+template <>
+struct file_pool_type<pread_storage>
+{
+	using type = aux::file_pool;
+};
+
 template <typename StorageType>
 std::shared_ptr<StorageType> make_storage(storage_params const& p
 	, typename file_pool_type<StorageType>::type& fp);
@@ -206,6 +207,13 @@ std::shared_ptr<posix_storage> make_storage(storage_params const& p
 	return std::make_shared<posix_storage>(p);
 }
 
+template <>
+std::shared_ptr<pread_storage> make_storage(storage_params const& p
+	, aux::file_pool& fp)
+{
+	return std::make_shared<pread_storage>(p, fp);
+}
+
 template <typename StorageType, typename FilePool>
 std::pair<std::shared_ptr<StorageType>, std::shared_ptr<torrent_info>>
 setup_torrent(
@@ -295,6 +303,33 @@ int read(std::shared_ptr<posix_storage> s
 
 void release_files(std::shared_ptr<posix_storage>, storage_error&) {}
 
+int write(std::shared_ptr<pread_storage> s
+	, aux::session_settings const& sett
+	, span<char> buf
+	, piece_index_t const piece
+	, int const offset
+	, aux::open_mode_t mode
+	, storage_error& error)
+{
+	return s->write(sett, buf, piece, offset, mode, disk_job_flags_t{}, error);
+}
+
+int read(std::shared_ptr<pread_storage> s
+	, aux::session_settings const& sett
+	, span<char> buf
+	, piece_index_t piece
+	, int offset
+	, aux::open_mode_t mode
+	, storage_error& ec)
+{
+	return s->read(sett, buf, piece, offset, mode, disk_job_flags_t{}, ec);
+}
+
+void release_files(std::shared_ptr<pread_storage> s, storage_error& ec)
+{
+	s->release_files(ec);
+}
+
 std::vector<char> new_piece(std::size_t const size)
 {
 	std::vector<char> ret(size);
@@ -732,6 +767,17 @@ TORRENT_TEST(remove_posix_disk_io)
 	test_remove<posix_storage>(current_working_directory());
 }
 
+TORRENT_TEST(rename_pread_disk_io)
+{
+	test_rename<pread_storage>(current_working_directory());
+}
+
+TORRENT_TEST(remove_pread_disk_io)
+{
+	test_remove<pread_storage>(current_working_directory());
+}
+
+
 void test_fastresume(bool const test_deprecated)
 {
 	std::string test_path = current_working_directory();
@@ -769,6 +815,7 @@ void test_fastresume(bool const test_deprecated)
 		{
 			print_alerts(ses, "ses");
 			s = h.status();
+			std::cout << "progress: " << s.progress << std::endl;
 			if (s.progress == 1.0f)
 			{
 				std::cout << "progress: 1.0f" << std::endl;
@@ -1079,6 +1126,7 @@ bool check_pattern(std::vector<char> const& buf, int counter)
 TORRENT_TEST(mmap_disk_io) { run_test<mmap_storage>(); }
 #endif
 TORRENT_TEST(posix_disk_io) { run_test<posix_storage>(); }
+TORRENT_TEST(pread_disk_io) { run_test<pread_storage>(); }
 
 namespace {
 
@@ -1433,6 +1481,22 @@ TORRENT_TEST(move_posix_storage_reset)
 	test_move_storage_reset<posix_storage>(move_flags_t::reset_save_path_unchecked);
 }
 
+TORRENT_TEST(move_pread_storage_to_self)
+{
+	test_move_storage_to_self<pread_storage>();
+}
+
+TORRENT_TEST(move_pread_storage_into_self)
+{
+	test_move_storage_into_self<pread_storage>();
+}
+
+TORRENT_TEST(move_pread_storage_reset)
+{
+	test_move_storage_reset<pread_storage>(move_flags_t::reset_save_path);
+	test_move_storage_reset<pread_storage>(move_flags_t::reset_save_path_unchecked);
+}
+
 TORRENT_TEST(storage_paths_string_pooling)
 {
 	file_storage file_storage;
diff --git a/test/test_torrent_info.cpp b/test/test_torrent_info.cpp
index 94496bcc36c..3ec0545a66d 100644
--- a/test/test_torrent_info.cpp
+++ b/test/test_torrent_info.cpp
@@ -14,7 +14,8 @@ see LICENSE file.
 #include "test.hpp"
 #include "setup_transfer.hpp" // for load_file
 #include "test_utils.hpp"
-#include "settings.hpp"
+#include "settings.hpp" // for settings()
+
 #include "libtorrent/file_storage.hpp"
 #include "libtorrent/load_torrent.hpp"
 #include "libtorrent/aux_/path.hpp"
diff --git a/test/web_seed_suite.cpp b/test/web_seed_suite.cpp
index de5ac01c9dc..e7a28325534 100644
--- a/test/web_seed_suite.cpp
+++ b/test/web_seed_suite.cpp
@@ -220,7 +220,10 @@ void test_transfer(lt::session& ses, lt::add_torrent_params p
 		{
 			bool const expect = !fs.pad_file_at(i);
 			std::string file_path = combine_path(save_path, fs.file_path(i));
-			std::printf("checking file: %s\n", file_path.c_str());
+			std::printf("checking file: %s (pad-file: %d size: %" PRId64 ")\n"
+				, file_path.c_str()
+				, !expect
+				, fs.file_size(i));
 			TEST_EQUAL(exists(file_path), expect);
 		}
 	}
@@ -338,7 +341,6 @@ int EXPORT run_http_suite(int proxy, char const* protocol
 
 		{
 			settings_pack pack = settings();
-			pack.set_int(settings_pack::max_queued_disk_bytes, 256 * 1024);
 			pack.set_str(settings_pack::listen_interfaces, test_listen_interface());
 			pack.set_int(settings_pack::max_retry_port_bind, 1000);
 			pack.set_bool(settings_pack::enable_lsd, false);
diff --git a/tools/disk_io_stress_test.cpp b/tools/disk_io_stress_test.cpp
index 14ef3bab141..dba004db19f 100644
--- a/tools/disk_io_stress_test.cpp
+++ b/tools/disk_io_stress_test.cpp
@@ -10,6 +10,7 @@ see LICENSE file.
 #include "libtorrent/session.hpp" // for default_disk_io_constructor
 #include "libtorrent/disabled_disk_io.hpp"
 #include "libtorrent/mmap_disk_io.hpp"
+#include "libtorrent/pread_disk_io.hpp"
 #include "libtorrent/posix_disk_io.hpp"
 
 #include "libtorrent/disk_interface.hpp"
@@ -167,6 +168,8 @@ int run_test(test_case const& t)
 	{
 		if (t.disk_backend  == "posix"_sv)
 			disk_io = lt::posix_disk_io_constructor(ioc, pack, cnt);
+		else if (t.disk_backend  == "pread"_sv)
+			disk_io = lt::pread_disk_io_constructor(ioc, pack, cnt);
 		else if (t.disk_backend  == "disabled"_sv)
 			disk_io = lt::disabled_disk_io_constructor(ioc, pack, cnt);
 		else
@@ -253,7 +256,7 @@ int run_test(test_case const& t)
 		{
 			if ((job_counter & 0x1fff) == 0)
 			{
-				printf("o: %d w: %d r: %d\r"
+				printf("o: %d w: %d r: %d  \r"
 					, outstanding
 					, int(blocks_to_write.size())
 					, int(blocks_to_read.size()));
diff --git a/tools/parse_session_stats.py b/tools/parse_session_stats.py
index 4ef185348f2..bf27d4133c3 100755
--- a/tools/parse_session_stats.py
+++ b/tools/parse_session_stats.py
@@ -130,7 +130,8 @@ def process_color(c, op):
 
 def plot_fun(script):
     try:
-        ret = os.system('gnuplot "%s" 2>/dev/null' % script)
+        print('gnuplot "%s"' % script)
+        ret = os.system('gnuplot "%s"' % script)
     except Exception as e:
         print('please install gnuplot: sudo apt install gnuplot')
         raise e
diff --git a/tools/run_benchmark.py b/tools/run_benchmark.py
index 0e2dcb42048..a05e50dfc1b 100755
--- a/tools/run_benchmark.py
+++ b/tools/run_benchmark.py
@@ -49,10 +49,11 @@ def main():
 
     rm_file_or_dir('t')
 
-    run_test('download-write-through', 'upload', ['-1', '--disk_io_write_mode=write_through', '-s', args.save_path], args.download_peers)
-    reset_download(args.save_path)
-    run_test('download-full-cache', 'upload', ['-1', '--disk_io_write_mode=enable_os_cache', '-s', args.save_path], args.download_peers)
-    run_test('upload', 'download', ['-G', '-e', '240', '-s', args.save_path], args.upload_peers)
+    for io_backend in ["mmap", "pread", "posix"]:
+        run_test(f'download-write-through-{io_backend}', 'upload', ['-i', io_backend, '-1', '--disk_io_write_mode=write_through', '-s', args.save_path], args.download_peers)
+        reset_download(args.save_path)
+        run_test(f'download-full-cache-{io_backend}', 'upload', ['-i', io_backend, '-1', '--disk_io_write_mode=enable_os_cache', '-s', args.save_path], args.download_peers)
+        run_test(f'upload-{io_backend}', 'download', ['-i', io_backend, '-G', '-e', '240', '-s', args.save_path], args.upload_peers)
 
 
 def run_test(name, test_cmd, client_arg, num_peers):

From ba4567a3ff25b99e96419e115911d98d0ac53d53 Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sat, 24 Feb 2024 19:33:43 +0100
Subject: [PATCH 2/7] fix clang-tidy warnings

---
 src/disk_cache.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/disk_cache.cpp b/src/disk_cache.cpp
index 63bd6192084..3bdde5b26f3 100644
--- a/src/disk_cache.cpp
+++ b/src/disk_cache.cpp
@@ -84,7 +84,6 @@ cached_piece_entry::cached_piece_entry(piece_location const& loc, int const num_
 	, piece_size2(piece_size_v2)
 	, blocks_in_piece(num_blocks)
 	, blocks(aux::make_unique<cached_block_entry[], std::ptrdiff_t>(num_blocks))
-	, ph(hasher())
 {}
 
 span<cached_block_entry> cached_piece_entry::get_blocks() const
@@ -199,7 +198,7 @@ disk_cache::hash_result disk_cache::try_hash_piece(piece_location const loc, pre
 		view.modify(i, [&](cached_piece_entry& e) {
 			e.piece_hash_returned = true;
 
-			job::hash& job = std::get<aux::job::hash>(hash_job->action);
+			auto& job = std::get<aux::job::hash>(hash_job->action);
 			job.piece_hash = e.ph.final();
 			if (!job.block_hashes.empty())
 			{
@@ -258,7 +257,7 @@ void disk_cache::kick_hasher(piece_location const& loc, jobqueue_t& completed_jo
 	}
 	auto const blocks = blocks_storage.first(block_idx);
 
-	hasher& ctx = const_cast<hasher&>(piece_iter->ph);
+	auto& ctx = const_cast<hasher&>(piece_iter->ph);
 
 	view.modify(piece_iter, [](cached_piece_entry& e) { e.hashing = true; });
 
@@ -319,7 +318,7 @@ void disk_cache::kick_hasher(piece_location const& loc, jobqueue_t& completed_jo
 	// this piece, post it.
 	sha1_hash const piece_hash = ctx.final();
 
-	job::hash& job = std::get<job::hash>(j->action);
+	auto& job = std::get<job::hash>(j->action);
 	job.piece_hash = piece_hash;
 	if (!job.block_hashes.empty())
 	{

From 9ad0cfd263d9a969cf26561f54b4a8aa6dd4e269 Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sun, 25 Feb 2024 01:53:32 +0100
Subject: [PATCH 3/7] fixup comments

---
 include/libtorrent/aux_/disk_cache.hpp | 8 --------
 src/pread_disk_io.cpp                  | 5 +++--
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/libtorrent/aux_/disk_cache.hpp b/include/libtorrent/aux_/disk_cache.hpp
index ad5a2b5d095..1255e506b78 100644
--- a/include/libtorrent/aux_/disk_cache.hpp
+++ b/include/libtorrent/aux_/disk_cache.hpp
@@ -316,14 +316,6 @@ struct disk_cache
 		post_job,
 	};
 
-	// this call can have 3 outcomes:
-	// 1. the job is immediately satisfied and should be posted to the
-	//    completion queue
-	// 2. The piece is in the cache and currently hashing, but it's not done
-	//    yet. We hang the hash job on the piece itself so the hashing thread
-	//    can complete it when hashing finishes
-	// 3. The piece is not in the cache and should be posted to the disk thread
-	//    to read back the bytes.
 	hash_result try_hash_piece(piece_location const loc, pread_disk_job* hash_job);
 
 	// this should be called from a hasher thread
diff --git a/src/pread_disk_io.cpp b/src/pread_disk_io.cpp
index 63d4c58f3dd..621c1e60e6f 100644
--- a/src/pread_disk_io.cpp
+++ b/src/pread_disk_io.cpp
@@ -708,8 +708,9 @@ void pread_disk_io::async_hash2(storage_index_t const storage
 		sha256_hash{}
 	);
 
-	// TODO: check the disk cache here
-
+	// In theory, we could check the cache for this block hash, but we
+	// only retain cached_piece_entries until the main piece hash has been
+	// returned, so individual block hashes may no longer be available
 	add_job(j);
 }
 

From a9149de627d34d53da982d29f7648390b963f90e Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sun, 25 Feb 2024 13:35:36 +0100
Subject: [PATCH 4/7] factor out common code from flush_cache_blocks

---
 include/libtorrent/aux_/disk_cache.hpp |  46 +++++++++
 src/pread_disk_io.cpp                  | 138 ++++++-------------------
 2 files changed, 78 insertions(+), 106 deletions(-)

diff --git a/include/libtorrent/aux_/disk_cache.hpp b/include/libtorrent/aux_/disk_cache.hpp
index 1255e506b78..e241f2a7aa1 100644
--- a/include/libtorrent/aux_/disk_cache.hpp
+++ b/include/libtorrent/aux_/disk_cache.hpp
@@ -167,6 +167,52 @@ struct cached_piece_entry
 	pread_disk_job* clear_piece = nullptr;
 };
 
+// TODO: add unit test for this function
+template <typename Fun>
+void visit_block_iovecs(span<aux::cached_block_entry const> blocks
+	, Fun const& f)
+{
+	TORRENT_ASSERT(blocks.size() > 0);
+	TORRENT_ALLOCA(iovec, span<char>, blocks.size());
+
+	int count = 0;
+
+	int start_idx = 0;
+	int idx = 0;
+
+	for (auto& be : blocks)
+	{
+		auto* j = be.write_job;
+		if (count > 0 && j == nullptr)
+		{
+			bool const interrupt = f(iovec.first(count), start_idx);
+			if (interrupt) return;
+
+			start_idx = idx;
+			count = 0;
+		}
+
+		if (j == nullptr)
+		{
+			++idx;
+			start_idx = idx;
+			continue;
+		}
+
+		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
+		auto& a = std::get<aux::job::write>(j->action);
+
+		iovec[count] = span<char>{ a.buf.data(), a.buffer_size};
+		++count;
+		++idx;
+	}
+
+	if (count > 0)
+	{
+		f(iovec.first(count), start_idx);
+	}
+}
+
 struct disk_cache
 {
 	using piece_container = mi::multi_index_container<
diff --git a/src/pread_disk_io.cpp b/src/pread_disk_io.cpp
index 621c1e60e6f..436b4626be7 100644
--- a/src/pread_disk_io.cpp
+++ b/src/pread_disk_io.cpp
@@ -1307,128 +1307,54 @@ int pread_disk_io::flush_cache_blocks(bitfield& flushed
 	m_stats_counters.inc_stats_counter(counters::num_writing_threads, 1);
 	time_point const start_time = clock_type::now();
 
-	TORRENT_ALLOCA(iovec, span<char>, blocks.size());
 	bool failed = false;
-	int count = 0;
-	int start_idx = 0;
-	int idx = 0;
 
 	// the total number of blocks we ended up flushing to disk
 	int ret = 0;
 
-	// the piece offset of the start of the range of contiguous blocks we're
-	// currently assembling into iovec
-	int start_offset = 0;
-
-	// the offset of the end of the range of contiguous blocks we're currently
-	// assembing
-	int end_offset = 0;
-
-	aux::open_mode_t file_mode;
-	auto piece = piece_index_t(-1);
-	disk_job_flags_t flags;
-
-	std::shared_ptr<aux::pread_storage> storage;
-
-	storage_error error;
-	// TODO: refactor this loop into an iterator adapter that returns
-	// contiguous ranges of blocks. Then de-duplicate the write-to-disk logic
-	// into the loop
-	TORRENT_ASSERT(blocks.size() > 0);
-	for (auto& be : blocks)
-	{
-		auto* j = be.write_job;
-
-		auto const job_offset = [&] {
-			if (j != nullptr)
-				return std::get<aux::job::write>(j->action).offset;
-			else
-				return 0;
-		}();
-
-		if (!storage && j) storage = j->storage;
-		if (count > 0 && (j == nullptr || job_offset > end_offset))
-		{
-			TORRENT_ASSERT(piece != piece_index_t(-1));
-			DLOG("write: blocks: %d (piece: %d)\n", count, int(piece));
-			storage->write(m_settings, iovec.first(count)
-				, piece, start_offset, file_mode, flags, error);
-
-			int i = start_idx;
-			for (aux::cached_block_entry const& blk : blocks.subspan(start_idx, count))
-			{
-				auto* j2 = blk.write_job;
-				TORRENT_ASSERT(j2);
-				TORRENT_ASSERT(j2->get_type() == aux::job_action_t::write);
-				j2->error = error;
-				flushed.set_bit(i);
-				completed_jobs.push_back(j2);
-				++i;
-			}
-
-			if (error) {
-				// if there was a failure, fail the remaining jobs as well
-				for (int k = start_idx + count; k < blocks.size(); ++k)
-				{
-					auto* j2 = be.write_job;
-					if (j2 == nullptr) continue;
-					j2->error = error;
-					// TODO: should we free the job's buffer here?
-					completed_jobs.push_back(j2);
-				}
-				failed = true;
-				break;
-			}
-
-			ret += count;
-
-			start_offset = job_offset;
-			start_idx = idx;
-			count = 0;
-		}
-
-		if (j == nullptr)
-		{
-			++idx;
-			start_idx = idx;
-			continue;
-		}
-
-		TORRENT_ASSERT(j->storage == storage);
+	visit_block_iovecs(blocks, [&] (span<span<char>> iovec, int const start_idx) {
+		auto* j = blocks[start_idx].write_job;
 		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
 		auto& a = std::get<aux::job::write>(j->action);
+		aux::open_mode_t const file_mode = file_mode_for_job(j);
+		aux::pread_storage* storage = j->storage.get();
 
-		if (count == 0) start_offset = job_offset;
-		iovec[count] = span<char>{ a.buf.data(), a.buffer_size};
-		++count;
-		flags = j->flags;
-		piece = a.piece;
-		file_mode = file_mode_for_job(j);
-		end_offset = job_offset + a.buffer_size;
-		++idx;
-	}
+		TORRENT_ASSERT(a.piece != piece_index_t(-1));
+		int const count = static_cast<int>(iovec.size());
+		DLOG("write: blocks: %d (piece: %d)\n", count, int(a.piece));
 
-	if (count > 0)
-	{
-		DLOG("write: blocks: %d (piece: %d)\n", count, int(piece));
-		storage->write(m_settings, iovec.first(count)
-			, piece, start_offset, file_mode, flags, error);
+		storage_error error;
+		storage->write(m_settings, iovec
+			, a.piece, a.offset, file_mode, j->flags, error);
 
 		int i = start_idx;
 		for (aux::cached_block_entry const& blk : blocks.subspan(start_idx, count))
 		{
-			auto* j = blk.write_job;
-			TORRENT_ASSERT(j);
-			TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
-			j->error = error;
+			auto* j2 = blk.write_job;
+			TORRENT_ASSERT(j2);
+			TORRENT_ASSERT(j2->get_type() == aux::job_action_t::write);
+			j2->error = error;
 			flushed.set_bit(i);
-			completed_jobs.push_back(j);
+			completed_jobs.push_back(j2);
 			++i;
 		}
-		// TODO: if we failed, post the remaining block's jobs as failures too
-		if (error) failed = true;
-		else ret += count;
-	}
+
+		ret += count;
+
+		if (error) {
+			// if there was a failure, fail the remaining jobs as well
+			for (aux::cached_block_entry const& blk : blocks.subspan(start_idx + count))
+			{
+				auto* j2 = blk.write_job;
+				if (j2 == nullptr) continue;
+				j2->error = error;
+				// TODO: should we free the job's buffer here?
+				completed_jobs.push_back(j2);
+			}
+			failed = true;
+		}
+		return failed;
+	});
 
 	if (!failed)
 	{

From b12ea03ca475d99201aeb58c1e1dad5e7dc66957 Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sun, 10 Mar 2024 14:21:58 +0100
Subject: [PATCH 5/7] factor out visit_block_iovecs into its own header file

---
 CMakeLists.txt                                |  1 +
 Makefile                                      |  1 +
 include/libtorrent/aux_/disk_cache.hpp        | 54 +++-------------
 include/libtorrent/aux_/pread_storage.hpp     |  4 +-
 .../libtorrent/aux_/visit_block_iovecs.hpp    | 62 +++++++++++++++++++
 src/disk_cache.cpp                            | 14 ++++-
 src/pread_disk_io.cpp                         |  3 +-
 src/pread_storage.cpp                         |  8 +--
 test/Jamfile                                  |  1 +
 9 files changed, 94 insertions(+), 54 deletions(-)
 create mode 100644 include/libtorrent/aux_/visit_block_iovecs.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5cb31258987..1f7cc231868 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,6 +167,7 @@ set(libtorrent_aux_include_files
 	disable_warnings_push.hpp
 	disk_buffer_pool.hpp
 	disk_cache.hpp
+	visit_block_iovecs.hpp
 	disk_completed_queue.hpp
 	mmap_disk_job.hpp
 	disk_job.hpp
diff --git a/Makefile b/Makefile
index 9f9d169f54a..8ed34d4c47d 100644
--- a/Makefile
+++ b/Makefile
@@ -565,6 +565,7 @@ HEADERS = \
   aux_/disable_warnings_push.hpp    \
   aux_/disk_buffer_pool.hpp         \
   aux_/disk_cache.hpp               \
+  aux_/visit_block_iovecs.hpp       \
   aux_/disk_completed_queue.hpp     \
   aux_/disk_io_thread_pool.hpp      \
   aux_/disk_job_fence.hpp           \
diff --git a/include/libtorrent/aux_/disk_cache.hpp b/include/libtorrent/aux_/disk_cache.hpp
index e241f2a7aa1..c23a57a7008 100644
--- a/include/libtorrent/aux_/disk_cache.hpp
+++ b/include/libtorrent/aux_/disk_cache.hpp
@@ -73,8 +73,16 @@ inline size_t hash_value(piece_location const& l)
 
 struct cached_block_entry
 {
+	// returns the buffer associated with this block. It either picks it from
+	// the write job that's hung on this block, or from the buffer in the block
+	// object, if it has been flushed to disk already.
+	// If there is no buffer, it returns an empty span.
 	span<char const> buf() const;
 
+	// returns the buffer associated with the write job hanging on this block.
+	// If there is no write job, it returns an empty span.
+	span<char const> write_buf() const;
+
 	// once the write job has been executed, and we've flushed the buffer, we
 	// move it into buf_holder, to keep the buffer alive until any hash job has
 	// completed as well. The underlying data can be accessed through buf, but
@@ -167,52 +175,6 @@ struct cached_piece_entry
 	pread_disk_job* clear_piece = nullptr;
 };
 
-// TODO: add unit test for this function
-template <typename Fun>
-void visit_block_iovecs(span<aux::cached_block_entry const> blocks
-	, Fun const& f)
-{
-	TORRENT_ASSERT(blocks.size() > 0);
-	TORRENT_ALLOCA(iovec, span<char>, blocks.size());
-
-	int count = 0;
-
-	int start_idx = 0;
-	int idx = 0;
-
-	for (auto& be : blocks)
-	{
-		auto* j = be.write_job;
-		if (count > 0 && j == nullptr)
-		{
-			bool const interrupt = f(iovec.first(count), start_idx);
-			if (interrupt) return;
-
-			start_idx = idx;
-			count = 0;
-		}
-
-		if (j == nullptr)
-		{
-			++idx;
-			start_idx = idx;
-			continue;
-		}
-
-		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
-		auto& a = std::get<aux::job::write>(j->action);
-
-		iovec[count] = span<char>{ a.buf.data(), a.buffer_size};
-		++count;
-		++idx;
-	}
-
-	if (count > 0)
-	{
-		f(iovec.first(count), start_idx);
-	}
-}
-
 struct disk_cache
 {
 	using piece_container = mi::multi_index_container<
diff --git a/include/libtorrent/aux_/pread_storage.hpp b/include/libtorrent/aux_/pread_storage.hpp
index 6c8f2b3ef01..18c187d5cd2 100644
--- a/include/libtorrent/aux_/pread_storage.hpp
+++ b/include/libtorrent/aux_/pread_storage.hpp
@@ -68,12 +68,12 @@ namespace libtorrent::aux {
 			, piece_index_t piece, int offset, aux::open_mode_t mode
 			, disk_job_flags_t flags
 			, storage_error&);
-		int write(settings_interface const&, span<char> buffer
+		int write(settings_interface const&, span<char const> buffer
 			, piece_index_t piece, int offset, aux::open_mode_t mode
 			, disk_job_flags_t flags
 			, storage_error&);
 		int write(settings_interface const& sett
-			, span<span<char> const> buffers
+			, span<span<char const> const> buffers
 			, piece_index_t const piece, int offset
 			, open_mode_t const mode
 			, disk_job_flags_t const flags
diff --git a/include/libtorrent/aux_/visit_block_iovecs.hpp b/include/libtorrent/aux_/visit_block_iovecs.hpp
new file mode 100644
index 00000000000..84f738d8843
--- /dev/null
+++ b/include/libtorrent/aux_/visit_block_iovecs.hpp
@@ -0,0 +1,62 @@
+/*
+
+Copyright (c) 2023, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#ifndef TORRENT_VISIT_BLOCK_IOVECS
+#define TORRENT_VISIT_BLOCK_IOVECS
+
+#include "libtorrent/span.hpp"
+
+namespace libtorrent::aux {
+
+// Fun is a function object that's called with f(span<span<char const>>, int)
+template <typename Fun, typename BlockEntry>
+void visit_block_iovecs(span<BlockEntry const> blocks
+	, Fun const& f)
+{
+	TORRENT_ASSERT(blocks.size() > 0);
+	TORRENT_ALLOCA(iovec, span<char const>, blocks.size());
+
+	int count = 0;
+
+	int start_idx = 0;
+	int idx = 0;
+
+	for (auto& be : blocks)
+	{
+		auto const buf = be.write_buf();
+		if (count > 0 && buf.empty())
+		{
+			bool const interrupt = f(iovec.first(count), start_idx);
+			if (interrupt) return;
+
+			start_idx = idx;
+			count = 0;
+		}
+
+		if (buf.empty())
+		{
+			++idx;
+			start_idx = idx;
+			continue;
+		}
+
+		iovec[count] = buf;
+		++count;
+		++idx;
+	}
+
+	if (count > 0)
+	{
+		f(iovec.first(count), start_idx);
+	}
+}
+
+}
+
+#endif
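
Patch 6 below adds a proper unit test; the minimal shape of a call is still
worth showing here. Any element type with a write_buf() member returning
span<char const> works, and the visitor receives each contiguous run of
non-empty buffers together with the index of the run's first block (toy
types, invented for illustration):

    #include "libtorrent/aux_/visit_block_iovecs.hpp"
    #include "libtorrent/span.hpp"
    #include <array>
    #include <cstdio>

    namespace lt = libtorrent;

    struct toy_entry
    {
        lt::span<char const> write_buf() const { return m_buf; }
        lt::span<char const> m_buf;
    };

    int main()
    {
        // blocks 0-1 form one run, block 2 breaks it (empty buffer),
        // block 3 starts a second run at index 3
        std::array<toy_entry, 4> const blocks{{
            {{"aa", 2}}, {{"bb", 2}}, {{nullptr, 0}}, {{"cc", 2}}}};

        lt::aux::visit_block_iovecs(lt::span<toy_entry const>(blocks)
            , [](lt::span<lt::span<char const>> iovec, int const start_idx)
        {
            std::printf("run of %d block(s) starting at block %d\n"
                , int(iovec.size()), start_idx);
            return false; // false = continue, true = stop early
        });
    }
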
diff --git a/src/disk_cache.cpp b/src/disk_cache.cpp
index 3bdde5b26f3..1342c126ba2 100644
--- a/src/disk_cache.cpp
+++ b/src/disk_cache.cpp
@@ -66,7 +66,8 @@ int count_jobs(span<const cached_block_entry> blocks)
 
 }
 
-span<char const> cached_block_entry::buf() const {
+span<char const> cached_block_entry::buf() const
+{
 	if (buf_holder)
 		return {buf_holder.data(), buf_holder.size()};
 
@@ -79,6 +80,17 @@ span<char const> cached_block_entry::buf() const {
 	return {nullptr, 0};
 }
 
+span<char const> cached_block_entry::write_buf() const
+{
+	if (write_job != nullptr)
+	{
+		TORRENT_ASSERT(write_job->get_type() == aux::job_action_t::write);
+		auto const& job = std::get<job::write>(write_job->action);
+		return {job.buf.data(), job.buffer_size};
+	}
+	return {nullptr, 0};
+}
+
 cached_piece_entry::cached_piece_entry(piece_location const& loc, int const num_blocks, int const piece_size_v2)
 	: piece(loc)
 	, piece_size2(piece_size_v2)
diff --git a/src/pread_disk_io.cpp b/src/pread_disk_io.cpp
index 436b4626be7..286334bf1c2 100644
--- a/src/pread_disk_io.cpp
+++ b/src/pread_disk_io.cpp
@@ -25,6 +25,7 @@ see LICENSE file.
 #include "libtorrent/aux_/disk_job_pool.hpp"
 #include "libtorrent/aux_/disk_io_thread_pool.hpp"
 #include "libtorrent/aux_/disk_cache.hpp"
+#include "libtorrent/aux_/visit_block_iovecs.hpp"
 #include "libtorrent/aux_/time.hpp"
 #include "libtorrent/add_torrent_params.hpp"
 #include "libtorrent/aux_/numeric_cast.hpp"
@@ -1312,7 +1313,7 @@ int pread_disk_io::flush_cache_blocks(bitfield& flushed
 	// the total number of blocks we ended up flushing to disk
 	int ret = 0;
 
-	visit_block_iovecs(blocks, [&] (span<span<char>> iovec, int const start_idx) {
+	visit_block_iovecs(blocks, [&] (span<span<char const>> iovec, int const start_idx) {
 		auto* j = blocks[start_idx].write_job;
 		TORRENT_ASSERT(j->get_type() == aux::job_action_t::write);
 		auto& a = std::get<aux::job::write>(j->action);
diff --git a/src/pread_storage.cpp b/src/pread_storage.cpp
index a5e11b99817..1427a5005bc 100644
--- a/src/pread_storage.cpp
+++ b/src/pread_storage.cpp
@@ -498,7 +498,7 @@ namespace {
 	}
 
 	int pread_storage::write(settings_interface const& sett
-		, span<span<char> const> buffers
+		, span<span<char const> const> buffers
 		, piece_index_t const piece, int offset
 		, open_mode_t const mode
 		, disk_job_flags_t const flags
@@ -514,17 +514,17 @@ namespace {
 	}
 
 	int pread_storage::write(settings_interface const& sett
-		, span<char> buffer
+		, span<char const> buffer
 		, piece_index_t const piece, int const offset
 		, open_mode_t const mode
 		, disk_job_flags_t
 		, storage_error& error)
 	{
 		auto const write_mode = sett.get_int(settings_pack::disk_io_write_mode);
-		return readwrite(files(), buffer, piece, offset, error
+		return readwrite(files(), reinterpret_cast<span<char>>(buffer), piece, offset, error
 			, [this, mode, &sett, write_mode](file_index_t const file_index
 				, std::int64_t const file_offset
-				, span<char> buf, storage_error& ec)
+				, span<char const> buf, storage_error& ec)
 		{
 			// writing to a pad-file is a no-op
 			if (files().pad_file_at(file_index))
diff --git a/test/Jamfile b/test/Jamfile
index 4f3b892dd76..c8a5a4b83c6 100644
--- a/test/Jamfile
+++ b/test/Jamfile
@@ -200,6 +200,7 @@ run test_remap_files.cpp ;
 run test_similar_torrent.cpp ;
 run test_truncate.cpp ;
 run test_copy_file.cpp ;
+run test_disk_cache.cpp ;
 
 # turn these tests into simulations
 run test_resume.cpp ;

From afe049ea33d5889b93335c7c57a325ae9529c5da Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Sun, 10 Mar 2024 16:03:19 +0100
Subject: [PATCH 6/7] add test for visit_block_iovecs

---
 .../libtorrent/aux_/visit_block_iovecs.hpp    |   2 +
 src/pread_storage.cpp                         |   7 +-
 test/test_disk_cache.cpp                      | 171 ++++++++++++++++++
 3 files changed, 177 insertions(+), 3 deletions(-)
 create mode 100644 test/test_disk_cache.cpp

diff --git a/include/libtorrent/aux_/visit_block_iovecs.hpp b/include/libtorrent/aux_/visit_block_iovecs.hpp
index 84f738d8843..fa6da043ead 100644
--- a/include/libtorrent/aux_/visit_block_iovecs.hpp
+++ b/include/libtorrent/aux_/visit_block_iovecs.hpp
@@ -11,10 +11,12 @@ see LICENSE file.
 #define TORRENT_VISIT_BLOCK_IOVECS
 
 #include "libtorrent/span.hpp"
+#include "libtorrent/aux_/alloca.hpp"
 
 namespace libtorrent::aux {
 
 // Fun is a function object that's called with f(span<span<char const>>, int)
+// and is expected to return a bool: true = interrupt the visit, false = continue
 template <typename Fun, typename BlockEntry>
 void visit_block_iovecs(span<BlockEntry const> blocks
 	, Fun const& f)
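
[Note: the added comment pins down the return-value contract. Consistent with
the tests introduced below, visit_block_iovecs batches each contiguous run of
blocks with a non-empty write_buf() into one iovec array, skips holes, and
stops as soon as the visitor returns true. A reference sketch of that
behaviour, using std types for brevity (BlockEntry::write_buf() is assumed to
return a string_view here):

    #include <string_view>
    #include <vector>

    // reference behaviour only; the real function uses lt::span and
    // stack allocation (aux_/alloca.hpp) instead of std::vector
    template <typename Fun, typename BlockEntry>
    void visit_block_iovecs_ref(std::vector<BlockEntry> const& blocks
        , Fun const& f)
    {
        std::vector<std::string_view> run; // current contiguous run
        int start_idx = 0;
        for (int i = 0; i < int(blocks.size()); ++i)
        {
            auto const buf = blocks[i].write_buf();
            if (buf.empty()) // a hole ends the current run
            {
                if (!run.empty() && f(run, start_idx)) return;
                run.clear();
            }
            else
            {
                if (run.empty()) start_idx = i; // a new run starts here
                run.push_back(buf);
            }
        }
        if (!run.empty()) f(run, start_idx); // flush the trailing run
    }

Run against the fixtures below, this yields the exact call sequences the
tests assert: for blocks a, b, hole, d, e the visitor sees ("ab", 0) and then
("de", 3).]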
diff --git a/src/pread_storage.cpp b/src/pread_storage.cpp
index 1427a5005bc..563c83b026e 100644
--- a/src/pread_storage.cpp
+++ b/src/pread_storage.cpp
@@ -28,6 +28,7 @@ see LICENSE file.
 #include "libtorrent/aux_/file.hpp" // for file_handle, pread_all, pwrite_all
 #include "libtorrent/disk_buffer_holder.hpp"
 #include "libtorrent/aux_/stat_cache.hpp"
+#include "libtorrent/aux_/readwrite.hpp"
 #include "libtorrent/hex.hpp" // to_hex
 
 #include <sys/types.h>
@@ -521,7 +522,7 @@ namespace {
 		, storage_error& error)
 	{
 		auto const write_mode = sett.get_int(settings_pack::disk_io_write_mode);
-		return readwrite(files(), reinterpret_cast<span<char>>(buffer), piece, offset, error
+		return readwrite(files(), buffer, piece, offset, error
 			, [this, mode, &sett, write_mode](file_index_t const file_index
 				, std::int64_t const file_offset
 				, span<char const> buf, storage_error& ec)
@@ -590,11 +591,11 @@ namespace {
 
 		std::vector<char> scratch_buffer;
 
-		return readwrite(files(), {&dummy, len}, piece, offset, error
+		return readwrite(files(), span<char const>{&dummy, len}, piece, offset, error
 			, [this, mode, flags, &ph, &sett, &scratch_buffer](
 				file_index_t const file_index
 				, std::int64_t const file_offset
-				, span<char> buf, storage_error& ec)
+				, span<char const> buf, storage_error& ec)
 		{
 			if (files().pad_file_at(file_index))
 				return hash_zeroes(ph, buf.size());
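
[Note: with aux_/readwrite.hpp included, both hunks above drop their casts:
readwrite() can now be instantiated for span<char const> as well as
span<char>. The enabling shape is a driver generic over element constness,
roughly like this simplified stand-in (not the library's actual readwrite()):

    #include <algorithm>
    #include <cstdint>

    // split a piece-relative range across equal-sized files and forward
    // each chunk to op(file_index, file_offset, ptr, len). Char may be
    // char (reads) or char const (writes, hashing).
    template <typename Char, typename Op>
    int readwrite_sketch(std::int64_t const file_size, Char* buf
        , std::int64_t len, std::int64_t pos, Op op)
    {
        int total = 0;
        while (len > 0)
        {
            std::int64_t const file_index = pos / file_size;
            std::int64_t const file_offset = pos % file_size;
            std::int64_t const chunk
                = std::min(len, file_size - file_offset);
            total += op(int(file_index), file_offset, buf, chunk);
            buf += chunk;
            pos += chunk;
            len -= chunk;
        }
        return total;
    }
]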
diff --git a/test/test_disk_cache.cpp b/test/test_disk_cache.cpp
new file mode 100644
index 00000000000..3a1d22c52e8
--- /dev/null
+++ b/test/test_disk_cache.cpp
@@ -0,0 +1,171 @@
+/*
+
+Copyright (c) 2024, Arvid Norberg
+All rights reserved.
+
+You may use, distribute and modify this code under the terms of the BSD license,
+see LICENSE file.
+*/
+
+#include "libtorrent/aux_/visit_block_iovecs.hpp"
+#include <array>
+#include "test.hpp"
+
+using lt::span;
+
+namespace {
+
+struct tbe
+{
+	span<char const> write_buf() const
+	{
+		return _buf;
+	}
+	span<char const> _buf;
+};
+
+template <size_t N>
+tbe b(char const (&literal)[N])
+{
+	auto buf = span<char const>{&literal[0], N - 1};
+	return tbe{buf};
+}
+
+std::string join(span<span<char const>> iovec)
+{
+	std::string ret;
+	for (span<char const> const& b : iovec)
+	{
+		ret.append(b.begin(), b.end());
+	}
+	return ret;
+}
+
+}
+
+TORRENT_TEST(visit_block_iovecs_full)
+{
+	std::array<tbe, 5> const blocks{b("a"), b("b"), b("c"), b("d"), b("e")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		TEST_EQUAL(cnt, 0);
+		TEST_EQUAL(start_idx, 0);
+		TEST_EQUAL(iovec.size(), 5);
+		TEST_EQUAL(join(iovec), "abcde");
+		++cnt;
+		return false;
+	});
+}
+
+TORRENT_TEST(visit_block_iovecs_one_hole)
+{
+	std::array<tbe, 5> const blocks{b("a"), b("b"), b(""), b("d"), b("e")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		switch (cnt) {
+			case 0:
+				TEST_EQUAL(start_idx, 0);
+				TEST_EQUAL(iovec.size(), 2);
+				TEST_EQUAL(join(iovec), "ab");
+				break;
+			case 1:
+				TEST_EQUAL(start_idx, 3);
+				TEST_EQUAL(iovec.size(), 2);
+				TEST_EQUAL(join(iovec), "de");
+				break;
+			default:
+				TORRENT_ASSERT_FAIL();
+		}
+		++cnt;
+		return false;
+	});
+}
+
+TORRENT_TEST(visit_block_iovecs_two_holes)
+{
+	std::array<tbe, 5> const blocks{b("a"), b(""), b("c"), b(""), b("e")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		switch (cnt) {
+			case 0:
+				TEST_EQUAL(start_idx, 0);
+				TEST_EQUAL(iovec.size(), 1);
+				TEST_EQUAL(join(iovec), "a");
+				break;
+			case 1:
+				TEST_EQUAL(start_idx, 2);
+				TEST_EQUAL(iovec.size(), 1);
+				TEST_EQUAL(join(iovec), "c");
+				break;
+			case 2:
+				TEST_EQUAL(start_idx, 4);
+				TEST_EQUAL(iovec.size(), 1);
+				TEST_EQUAL(join(iovec), "e");
+				break;
+			default:
+				TORRENT_ASSERT_FAIL();
+		}
+		++cnt;
+		return false;
+	});
+}
+
+
+TORRENT_TEST(visit_block_iovecs_interrupt)
+{
+	std::array<tbe, 3> const blocks{b("a"), b(""), b("c")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		switch (cnt) {
+			case 0:
+				TEST_EQUAL(start_idx, 0);
+				TEST_EQUAL(iovec.size(), 1);
+				TEST_EQUAL(join(iovec), "a");
+				break;
+			default:
+				TORRENT_ASSERT_FAIL();
+		}
+		++cnt;
+		return true;
+	});
+}
+
+TORRENT_TEST(visit_block_iovecs_leading_hole)
+{
+	std::array<tbe, 5> const blocks{b(""), b("a"), b("b"), b("c"), b("d")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		TEST_EQUAL(cnt, 0);
+		TEST_EQUAL(start_idx, 1);
+		TEST_EQUAL(iovec.size(), 4);
+		TEST_EQUAL(join(iovec), "abcd");
+		++cnt;
+		return false;
+	});
+}
+
+TORRENT_TEST(visit_block_iovecs_trailing_hole)
+{
+	std::array<tbe, 5> const blocks{b("a"), b("b"), b("c"), b("d"), b("")};
+
+	int cnt = 0;
+	lt::aux::visit_block_iovecs(span<tbe const>(blocks)
+		, [&cnt] (span<span<char const>> iovec, int start_idx) {
+		TEST_EQUAL(cnt, 0);
+		TEST_EQUAL(start_idx, 0);
+		TEST_EQUAL(iovec.size(), 4);
+		TEST_EQUAL(join(iovec), "abcd");
+		++cnt;
+		return false;
+	});
+}

From 0d41aed6cc3d8a0ed622e78f80e8a1222ead8142 Mon Sep 17 00:00:00 2001
From: arvidn <arvid@libtorrent.org>
Date: Tue, 12 Mar 2024 01:04:56 +0100
Subject: [PATCH 7/7] extend the disk_io_stress_test to assert job callbacks

---
 tools/disk_io_stress_test.cpp | 50 +++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 11 deletions(-)

diff --git a/tools/disk_io_stress_test.cpp b/tools/disk_io_stress_test.cpp
index dba004db19f..2b8027f5d5e 100644
--- a/tools/disk_io_stress_test.cpp
+++ b/tools/disk_io_stress_test.cpp
@@ -234,12 +234,21 @@ int run_test(test_case const& t)
 		std::vector<char> write_buffer(lt::default_block_size);
 
 		int outstanding = 0;
+		std::set<int> in_flight;
 
 		lt::add_torrent_params atp;
 
-		disk_io->async_check_files(tor, &atp, lt::aux::vector<std::string, lt::file_index_t>{}
-			, [&](lt::status_t, lt::storage_error const&) { --outstanding; });
+		int job_idx = 0;
+		in_flight.insert(job_idx);
 		++outstanding;
+		disk_io->async_check_files(tor, &atp, lt::aux::vector<std::string, lt::file_index_t>{}
+			, [&, job_idx](lt::status_t, lt::storage_error const&) {
+				TORRENT_ASSERT(in_flight.count(job_idx));
+				in_flight.erase(job_idx);
+				TORRENT_ASSERT(outstanding > 0);
+				--outstanding;
+			});
+		++job_idx;
 		disk_io->submit_jobs();
 
 		while (outstanding > 0)
@@ -269,8 +278,13 @@ int run_test(test_case const& t)
 					auto const req = blocks_to_read.back();
 					blocks_to_read.erase(blocks_to_read.end() - 1);
 
-					disk_io->async_read(tor, req, [&, req](lt::disk_buffer_holder h, lt::storage_error const& ec)
+					in_flight.insert(job_idx);
+					++outstanding;
+					disk_io->async_read(tor, req, [&, req, job_idx](lt::disk_buffer_holder h, lt::storage_error const& ec)
 					{
+						TORRENT_ASSERT(in_flight.count(job_idx));
+						in_flight.erase(job_idx);
+						TORRENT_ASSERT(outstanding > 0);
 						--outstanding;
 						++job_counter;
 						if (ec)
@@ -288,8 +302,7 @@ int run_test(test_case const& t)
 							throw std::runtime_error("read buffer mismatch!");
 						}
 					});
-
-					++outstanding;
+					++job_idx;
 				}
 			}
 
@@ -300,9 +313,14 @@ int run_test(test_case const& t)
 
 				generate_block_fill(req, {write_buffer.data(), lt::default_block_size});
 
+				in_flight.insert(job_idx);
+				++outstanding;
 				disk_io->async_write(tor, req, write_buffer.data()
-					, {}, [&](lt::storage_error const& ec)
+					, {}, [&, job_idx](lt::storage_error const& ec)
 					{
+						TORRENT_ASSERT(in_flight.count(job_idx));
+						in_flight.erase(job_idx);
+						TORRENT_ASSERT(outstanding > 0);
 						--outstanding;
 						++job_counter;
 						if (ec)
@@ -313,6 +331,7 @@ int run_test(test_case const& t)
 							throw std::runtime_error("async_write failed");
 						}
 					});
+				++job_idx;
 				if (t.flags & test_mode::read_random_order)
 				{
 					std::uniform_int_distribution<> d(0, blocks_to_read.end_index());
@@ -329,28 +348,37 @@ int run_test(test_case const& t)
 					std::uniform_int_distribution<> d(0, blocks_to_read.end_index());
 					blocks_to_read.insert(blocks_to_read.begin() + d(random_engine), req);
 				}
-
-				++outstanding;
 			}
 
 			if ((t.flags & test_mode::flush_files) && (job_counter % 500) == 499)
 			{
-				disk_io->async_release_files(tor, [&]()
+				in_flight.insert(job_idx);
+				++outstanding;
+				disk_io->async_release_files(tor, [&, job_idx]()
 				{
+					TORRENT_ASSERT(in_flight.count(job_idx));
+					in_flight.erase(job_idx);
+					TORRENT_ASSERT(outstanding > 0);
 					--outstanding;
 					++job_counter;
 				});
+				++job_idx;
 			}
 
 			if ((t.flags & test_mode::clear_pieces) && (job_counter % 300) == 299)
 			{
 				lt::piece_index_t const p = blocks_to_write.front().piece;
-				disk_io->async_clear_piece(tor, p, [&](lt::piece_index_t)
+				in_flight.insert(job_idx);
+				++outstanding;
+				disk_io->async_clear_piece(tor, p, [&, job_idx](lt::piece_index_t)
 					{
+					TORRENT_ASSERT(in_flight.count(job_idx));
+					in_flight.erase(job_idx);
+					TORRENT_ASSERT(outstanding > 0);
 					--outstanding;
 					++job_counter;
 					});
-				++outstanding;
+				++job_idx;
 				// TODO: technically all blocks for this piece should be added
 				// to blocks_to_write again here
 			}