diff --git a/Cargo.lock b/Cargo.lock index d4cf149..1050f46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -228,7 +234,7 @@ version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "syn", @@ -758,6 +764,7 @@ dependencies = [ "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", + "ouroboros", "rand", "reqwest", "reqwest-middleware", @@ -839,6 +846,12 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -1497,6 +1510,30 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ouroboros" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0f050db9c44b97a94723127e6be766ac5c340c48f2c4bb3ffa11713744be59" +dependencies = [ + "aliasable", + "ouroboros_macro", + "static_assertions", +] + +[[package]] +name = "ouroboros_macro" +version = "0.18.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c7028bdd3d43083f6d8d4d5187680d0d3560d54df4cc9d752005268b41e64d0" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "proc-macro2-diagnostics", + "quote", + "syn", +] + [[package]] name = "page_size" version = "0.6.0" @@ -1623,6 +1660,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proc-macro2-diagnostics" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "version_check", + "yansi", +] + [[package]] name = "prost" version = "0.13.5" @@ -2312,6 +2362,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -2865,6 +2921,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "vt100" version = "0.16.2" @@ -3309,7 +3371,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "wit-parser", ] @@ -3320,7 +3382,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "indexmap 
2.13.0", "prettyplease", "syn", @@ -3387,6 +3449,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "yansi" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" + [[package]] name = "yoke" version = "0.8.1" diff --git a/Cargo.toml b/Cargo.toml index d837f7f..dcf7b55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ tracing-indicatif = "0.3.14" opentelemetry = { version = "0.29" } opentelemetry_sdk = { version = "0.29", features = ["rt-tokio"] } opentelemetry-otlp = { version = "0.29", default-features = false, features = ["http-proto", "trace", "reqwest-blocking-client"] } +ouroboros = "0.18" tracing-opentelemetry = { version = "0.30" } hashlink = "0.11.0" diff --git a/lib/cache/async_backed.rs b/lib/cache/async_backed.rs new file mode 100644 index 0000000..b28004a --- /dev/null +++ b/lib/cache/async_backed.rs @@ -0,0 +1,258 @@ +//! Concurrent deduplication cache for async computations. +//! +//! Given a key and an async factory, ensures the factory runs at most once per key. Subsequent +//! callers for the same key await the already-in-flight computation via a [`Shared`] future, +//! avoiding the race conditions inherent in `Notify`-based signalling. +//! +//! Note that this cache does not support automatic eviction. + +use std::{fmt::Debug, future::Future, hash::Hash, pin::Pin}; + +use futures::FutureExt as _; +use futures::future::Shared; + +/// Two-state slot: `InFlight` while a factory future is running, then promoted to `Ready` once +/// the future completes. The caller that inserted the `InFlight` variant is responsible for +/// awaiting the `Shared` future and performing the promotion to `Ready`. +enum Slot { + InFlight(Shared + Send>>>), + Ready(V), +} + +/// Deduplicating async cache. +/// +/// If [`get_or_init`](Self::get_or_init) is called concurrently for the same key, only one +/// invocation of the factory runs. All callers receive a clone of the result. +pub struct FutureBackedCache { + map: scc::HashMap>, +} + +impl Default for FutureBackedCache +where + K: Eq + Hash, + V: Clone + Send + 'static, +{ + fn default() -> Self { + Self { + map: scc::HashMap::default(), + } + } +} + +impl FutureBackedCache +where + K: Eq + Hash + Debug + Clone + Send + Sync + 'static, + V: Clone + Send + Sync + 'static, +{ + /// Get the cached value for `key`, or initialize it by running `factory`. + /// + /// If another caller is already computing the value for this key, this awaits the in-flight + /// computation instead of spawning a duplicate. + pub async fn get_or_init(&self, key: K, factory: F) -> V + where + F: FnOnce() -> Fut, + Fut: Future + Send + 'static, + { + // Fast path: value already cached. Uses a shared read lock on the bucket, released + // immediately after the closure returns. + let existing = self + .map + .read_async(&key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => return v, + Some(Err(shared)) => return shared.await, + None => {} + } + + // Slow path: use entry_async for atomic check-and-insert. This acquires an exclusive lock + // on the bucket, so no other caller can race between our check and insert. 
+ let shared = match self.map.entry_async(key.clone()).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => return v.clone(), + Slot::InFlight(shared) => shared.clone(), + }, + scc::hash_map::Entry::Vacant(vac) => { + let boxed: Pin + Send>> = Box::pin(factory()); + let shared = boxed.shared(); + let ret = shared.clone(); + vac.insert_entry(Slot::InFlight(shared)); + ret + } + }; + + let val: V = shared.await; + + // Promote to Ready so future callers hit the fast path and the Shared machinery can be + // dropped. + self.map + .update_async(&key, |_, slot| { + if matches!(slot, Slot::InFlight(_)) { + *slot = Slot::Ready(val.clone()); + } + }) + .await; + + val + } + + /// Like [`get_or_init`](Self::get_or_init), but for fallible factories. + /// + /// If the factory returns `Ok(v)`, the value is cached and returned. If it returns `Err(e)`, + /// **nothing is cached** and the error is propagated to the caller. + /// + /// Unlike `get_or_init`, concurrent callers are **not** deduplicated — each caller that + /// finds the key absent will invoke the factory independently. However, if a value was + /// previously cached (by either `get_or_init` or a successful `get_or_try_init`), it is + /// returned immediately without calling the factory. + /// + /// Note: the returned value may have been produced by a concurrent caller's factory, not the + /// one supplied to this call. + pub async fn get_or_try_init(&self, key: K, factory: F) -> Result + where + F: FnOnce() -> Fut, + Fut: Future> + Send + 'static, + { + // Fast path: value already cached or in-flight from an infallible init. + let existing = self + .map + .read_async(&key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => return Ok(v), + Some(Err(shared)) => return Ok(shared.await), + None => {} + } + + // Run the fallible factory (not deduplicated). + let val = factory().await?; + + // Attempt to cache. If another caller raced us and already inserted, + // return the existing value and discard ours. + match self.map.entry_async(key).await { + scc::hash_map::Entry::Occupied(occ) => match occ.get() { + Slot::Ready(v) => Ok(v.clone()), + // A concurrent `get_or_init` started an in-flight computation + // while our factory was running. + Slot::InFlight(shared) => Ok(shared.clone().await), + }, + scc::hash_map::Entry::Vacant(vac) => { + vac.insert_entry(Slot::Ready(val.clone())); + Ok(val) + } + } + } + + /// Get the cached value for `key` if it exists. + /// + /// - If the value is `Ready`, returns `Some(v)` immediately. + /// - If the value is `InFlight`, awaits the in-flight computation and returns `Some(v)`. + /// - If the key is absent, returns `None`. + pub async fn get(&self, key: &K) -> Option { + let existing = self + .map + .read_async(key, |_, slot| match slot { + Slot::Ready(v) => Ok(v.clone()), + Slot::InFlight(shared) => Err(shared.clone()), + }) + .await; + + match existing { + Some(Ok(v)) => Some(v), + Some(Err(shared)) => Some(shared.await), + None => None, + } + } + + /// Returns the number of entries in the cache (both `Ready` and `InFlight`). + #[must_use] + pub fn len(&self) -> usize { + self.map.len() + } + + /// Returns `true` if the cache contains no entries. + #[must_use] + pub fn is_empty(&self) -> bool { + self.map.is_empty() + } + + /// Synchronously remove the entry for `key`, returning `true` if it was present. 
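// Hedged sketch (not from the patch): a test exercising the deduplication guarantee documented
// on `get_or_init`, written against the assumed full signature
// `get_or_init<F, Fut>(&self, key: K, factory: F) -> V where F: FnOnce() -> Fut,
// Fut: Future<Output = V> + Send + 'static` (reconstructed from the declaration above).
// It would live alongside the `tests` module below.
#[tokio::test]
async fn concurrent_get_or_init_runs_factory_once() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicUsize, Ordering};

    let cache = Arc::new(FutureBackedCache::<u32, String>::default());
    let calls = Arc::new(AtomicUsize::new(0));

    let call = |cache: Arc<FutureBackedCache<u32, String>>, calls: Arc<AtomicUsize>| async move {
        cache
            .get_or_init(7, move || async move {
                // Count how many times the factory actually runs.
                calls.fetch_add(1, Ordering::SeqCst);
                "value".to_owned()
            })
            .await
    };

    let (a, b) = tokio::join!(
        call(Arc::clone(&cache), Arc::clone(&calls)),
        call(Arc::clone(&cache), Arc::clone(&calls))
    );

    assert_eq!(a, "value");
    assert_eq!(b, "value");
    // The second caller awaited the first caller's in-flight computation.
    assert_eq!(calls.load(Ordering::SeqCst), 1);
}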
+ /// + /// Suitable for use in contexts where async is not available (e.g. inside + /// [`StatelessDrop::delete`](crate::drop_ward::StatelessDrop::delete)). + pub fn remove_sync(&self, key: &K) -> bool { + self.map.remove_sync(key).is_some() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn try_init_ok_caches_value() { + let cache = FutureBackedCache::::default(); + let result: Result = cache + .get_or_try_init(1, || async { Ok("hello".to_owned()) }) + .await; + assert_eq!(result.unwrap(), "hello", "should return Ok value"); + + // Value should now be cached (get returns it without factory) + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "hello", "value should be in cache"); + } + + #[tokio::test] + async fn try_init_err_does_not_cache() { + let cache = FutureBackedCache::::default(); + let result: Result = cache.get_or_try_init(1, || async { Err("boom") }).await; + assert_eq!(result.unwrap_err(), "boom", "should return the error"); + + // Cache should be empty — error was not stored + assert!(cache.is_empty(), "cache should have no entries after error"); + assert!(cache.get(&1).await.is_none(), "key should not exist"); + } + + #[tokio::test] + async fn try_init_err_then_retry_ok() { + let cache = FutureBackedCache::::default(); + + // First call: factory fails + let r1: Result = cache.get_or_try_init(1, || async { Err("fail") }).await; + assert!(r1.is_err(), "first call should fail"); + + // Second call: factory succeeds + let r2: Result = cache + .get_or_try_init(1, || async { Ok("recovered".to_owned()) }) + .await; + assert_eq!(r2.unwrap(), "recovered", "retry should succeed"); + + // Value should now be cached + let cached = cache.get(&1).await; + assert_eq!(cached.unwrap(), "recovered"); + } + + #[tokio::test] + async fn try_init_returns_value_cached_by_init() { + let cache = FutureBackedCache::::default(); + + // Populate via infallible get_or_init + cache + .get_or_init(1, || async { "from_init".to_owned() }) + .await; + + // get_or_try_init should return the cached value without running factory + let result: Result = cache + .get_or_try_init(1, || async { panic!("factory should not run") }) + .await; + assert_eq!(result.unwrap(), "from_init"); + } +} diff --git a/lib/cache/mod.rs b/lib/cache/mod.rs index e0c1c97..5c48ee2 100644 --- a/lib/cache/mod.rs +++ b/lib/cache/mod.rs @@ -1,3 +1,5 @@ +/// Async-backed cache implementation. +pub mod async_backed; /// Cache eviction policies. pub mod eviction; /// File-backed cache implementation. diff --git a/lib/drop_ward.rs b/lib/drop_ward.rs new file mode 100644 index 0000000..4922e13 --- /dev/null +++ b/lib/drop_ward.rs @@ -0,0 +1,133 @@ +//! Automatic, type-directed cleanup driven by reference counting. +//! +//! [`DropWard`] tracks how many live references exist for a given key and invokes a cleanup +//! callback when a key's count reaches zero. The cleanup logic is selected at the type level +//! through a zero-sized "tag" type that implements [`StatelessDrop`], keeping the ward itself +//! generic over *what* it manages without storing per-key values. +//! +//! This is designed for resources whose lifecycle is bound to an external context (e.g. GPU device +//! handles, connection pools, graphics pipelines) where Rust's built-in `Drop` cannot be used +//! because cleanup requires access to that context. +//! +//! # Design rationale +//! +//! The tag type `T` is constrained to be zero-sized. It exists only to carry the [`StatelessDrop`] +//! 
implementation at the type level — no `T` value is ever constructed or stored. This means a +//! single `DropWard` instance adds no per-key overhead beyond the key and its `usize` count. +//! +//! # Example +//! +//! ```ignore +//! struct GpuTextureDrop; +//! +//! impl StatelessDrop for GpuTextureDrop { +//! fn delete(device: &wgpu::Device, _key: &TextureId) { +//! // e.g. flush a deferred-destruction queue +//! device.poll(wgpu::Maintain::Wait); +//! } +//! } +//! +//! let mut ward: DropWard = DropWard::new(device); +//! +//! ward.inc(texture_id); // → 1 +//! ward.inc(texture_id); // → 2 +//! ward.dec(&texture_id); // → Some(1) +//! ward.dec(&texture_id); // → Some(0), calls GpuTextureDrop::delete(&device, &texture_id) +//! ``` + +use std::marker::PhantomData; + +use rustc_hash::FxHashMap; + +/// Type-level hook for cleanup that requires an external context. +/// +/// Implement this on a zero-sized tag type. The tag is never instantiated — it only selects which +/// `delete` implementation a [`DropWard`] will call. +pub trait StatelessDrop { + /// Called exactly once when a key's reference count reaches zero. + /// + /// `ctx` is the shared context owned by the [`DropWard`]. `key` is the key whose count just + /// reached zero. This callback fires synchronously inside [`DropWard::dec`]; avoid blocking or + /// panicking if the ward is used on a hot path. + fn delete(ctx: &Ctx, key: &K); +} + +/// A reference-counted key set that triggers [`StatelessDrop::delete`] on the associated context +/// when any key's count drops to zero. +/// +/// # Type parameters +/// +/// - `Ctx` — shared context passed to `T::delete` (e.g. a device handle). +/// - `K` — the key type being reference-counted. +/// - `T` — a **zero-sized** tag type carrying the cleanup logic. +/// Will fail to compile if `size_of::() != 0`. +/// +/// # Concurrency +/// +/// Not thread-safe. All access requires `&mut self`. Wrap in a `Mutex` or similar if shared across +/// threads. +/// +#[derive(Debug, Clone)] +pub struct DropWard { + map: FxHashMap, + ctx: Ctx, + _marker: PhantomData, +} + +impl DropWard +where + K: Eq + std::hash::Hash, + T: StatelessDrop, +{ + /// Compile-time guard: `T` must be zero-sized. + const _ASSERT_ZST: () = assert!(size_of::() == 0, "T must be zero-sized"); + + /// Create a new ward that will pass `ctx` to `T::delete` on cleanup. + pub fn new(ctx: Ctx) -> Self { + Self { + map: FxHashMap::default(), + ctx, + _marker: PhantomData, + } + } + + /// Increment the reference count for `key`, inserting it with a count + /// of 1 if it does not exist. + /// + /// Returns the count **after** incrementing. + pub fn inc(&mut self, key: K) -> usize { + *self + .map + .entry(key) + .and_modify(|count| *count += 1) + .or_insert(1) + } + + fn dec_by(&mut self, key: &K, by: usize) -> Option { + let curr = *self.map.get(key)?; + let new_count = curr.saturating_sub(by); + if new_count == 0 { + self.map.remove(key); + T::delete(&self.ctx, key); + } else if let Some(slot) = self.map.get_mut(key) { + *slot = new_count; + } + Some(new_count) + } + + /// Decrement the reference count for `key`. + /// + /// If the count reaches zero, the key is removed and `T::delete` is + /// called synchronously with the ward's context. Returns `Some(0)` in + /// this case — the key will no longer be tracked. + /// + /// Returns `None` if `key` was not present (no-op). + pub fn dec(&mut self, key: &K) -> Option { + self.dec_by(key, 1) + } + + /// Decrement the reference count for `key` by `count`. 
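// Hedged sketch (not from the patch): the module-level example spelled out with explicit
// generics, using a hypothetical `ConnPool` context and `ConnId` key. It assumes the
// reconstructed shapes `StatelessDrop<Ctx, K>` and `DropWard<Ctx, K, T>`, matching the
// `StatelessDrop<&'a FutureBackedCache, InodeAddr>` implementation in `lib/fs/async_fs.rs`.
struct ConnPool;
type ConnId = u64;

/// Zero-sized tag selecting the cleanup logic; never instantiated.
struct ConnDrop;

impl StatelessDrop<ConnPool, ConnId> for ConnDrop {
    fn delete(_pool: &ConnPool, key: &ConnId) {
        // Cleanup that needs the external context runs here, exactly once per key.
        println!("closing connection {key}");
    }
}

fn drop_ward_demo() {
    let mut ward: DropWard<ConnPool, ConnId, ConnDrop> = DropWard::new(ConnPool);
    assert_eq!(ward.inc(42), 1);
    assert_eq!(ward.inc(42), 2);
    assert_eq!(ward.dec(&42), Some(1));
    // The count hits zero: the key is removed and `ConnDrop::delete` fires synchronously.
    assert_eq!(ward.dec(&42), Some(0));
    // Further decrements of an untracked key are a no-op.
    assert_eq!(ward.dec(&42), None);
}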
+ pub fn dec_count(&mut self, key: &K, count: usize) -> Option { + self.dec_by(key, count) + } +} diff --git a/lib/fs/async_fs.rs b/lib/fs/async_fs.rs new file mode 100644 index 0000000..0a90291 --- /dev/null +++ b/lib/fs/async_fs.rs @@ -0,0 +1,350 @@ +//! Async `INode` Table which supports concurrent access and modification. + +use std::ffi::{OsStr, OsString}; +use std::future::Future; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use bytes::Bytes; + +use crate::cache::async_backed::FutureBackedCache; +use crate::drop_ward::StatelessDrop; +use crate::fs::{ + AsyncFsStats, DirEntry, FileHandle, INode, INodeType, InodeAddr, LoadedAddr, OpenFlags, + dcache::DCache, +}; + +/// A reader for an open file, returned by [`FsDataProvider::open`]. +/// +/// Implementors provide the actual data for read operations. Dropping the +/// reader releases any resources held for the open file. +pub trait FileReader: Send + Sync + 'static { + /// Read up to `size` bytes starting at byte `offset`. + fn read( + &self, + offset: u64, + size: u32, + ) -> impl Future> + Send; +} + +/// A data provider for [`AsyncFs`] that fetches inode data on cache misses. +pub trait FsDataProvider: Clone + Send + Sync + 'static { + /// The reader type returned by [`open`](Self::open). + type Reader: FileReader; + + /// Look up a child inode by name within the given parent directory. + fn lookup( + &self, + parent: INode, + name: &OsStr, + ) -> impl Future> + Send; + + /// Open a file and return a reader for subsequent read calls. + fn open( + &self, + inode: INode, + flags: OpenFlags, + ) -> impl Future> + Send; +} + +/// Zero-sized tag whose [`StatelessDrop`] implementation automatically evicts +/// an inode from the inode table when its reference count reaches zero. +pub struct InodeForget; + +impl<'a> StatelessDrop<&'a FutureBackedCache, InodeAddr> for InodeForget { + fn delete(inode_table: &&'a FutureBackedCache, addr: &InodeAddr) { + inode_table.remove_sync(addr); + } +} + +/// A looked-up inode whose lifetime must be managed by the caller. +/// +/// Each `TrackedINode` returned by [`AsyncFs::lookup`] represents one +/// reference that the FUSE kernel holds. The caller must balance it by +/// decrementing the [`InodeLifecycle`] ward when the kernel sends `forget`. +#[derive(Debug, Clone, Copy)] +pub struct TrackedINode { + /// The resolved inode data. + pub inode: INode, +} + +/// An open file that provides read access. +/// +/// Returned by [`AsyncFs::open`]. The caller owns this handle and uses +/// [`read`](Self::read) to fetch data. Dropping the handle releases +/// the underlying reader when the last `Arc` clone is gone. +#[derive(Debug, Clone)] +pub struct OpenFile { + /// The raw file handle number, suitable for returning to the FUSE kernel. + pub fh: FileHandle, + /// The reader backing this open file. + pub reader: Arc, +} + +impl OpenFile { + /// Read up to `size` bytes starting at byte `offset`. + pub async fn read(&self, offset: u64, size: u32) -> Result { + self.reader.read(offset, size).await + } +} + +mod inode_lifecycle_impl { + #![allow(clippy::future_not_send, clippy::mem_forget)] + use ouroboros::self_referencing; + + use crate::cache::async_backed::FutureBackedCache; + use crate::drop_ward::DropWard; + use crate::fs::InodeAddr; + + use super::{INode, InodeForget}; + + /// Co-located inode table and reference-count ward. + /// + /// The ward borrows the table directly (no `Arc`) via `ouroboros`. 
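// Hedged sketch (not from the patch): a minimal in-memory `FileReader`, assuming the trait's
// read method resolves to `impl Future<Output = Result<Bytes, std::io::Error>> + Send` (the
// `Output` type in the declaration above was garbled in formatting). Kept here for reference
// next to the traits it implements.
struct BytesReader {
    data: Bytes,
}

impl FileReader for BytesReader {
    async fn read(&self, offset: u64, size: u32) -> Result<Bytes, std::io::Error> {
        // Clamp the requested window to the buffer; reads past EOF yield an empty slice.
        let len = self.data.len() as u64;
        let start = offset.min(len) as usize;
        let end = (start + size as usize).min(self.data.len());
        // `Bytes::slice` returns a reference-counted sub-slice, so no bytes are copied.
        Ok(self.data.slice(start..end))
    }
}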
+ /// When `dec` reaches zero for a key, [`InodeForget::delete`] synchronously + /// removes that inode from the table. + #[self_referencing] + pub struct InodeLifecycle { + pub(super) table: FutureBackedCache, + #[borrows(table)] + #[not_covariant] + pub(super) ward: + DropWard<&'this FutureBackedCache, InodeAddr, InodeForget>, + } +} + +pub use inode_lifecycle_impl::InodeLifecycle; + +impl InodeLifecycle { + /// Increment the reference count for an inode address. + pub fn inc(&mut self, addr: InodeAddr) -> usize { + self.with_ward_mut(|ward| ward.inc(addr)) + } + + /// Decrement the reference count for an inode address. + /// + /// When the count reaches zero, the inode is automatically evicted + /// from the table via [`InodeForget::delete`]. + pub fn dec(&mut self, addr: &InodeAddr) -> Option { + self.with_ward_mut(|ward| ward.dec(addr)) + } + + /// Read-only access to the underlying inode table. + #[must_use] + pub fn table(&self) -> &FutureBackedCache { + self.borrow_table() + } +} + +/// An asynchronous filesystem cache mapping `InodeAddr` to `INode`. +/// +/// Uses two [`FutureBackedCache`] layers: +/// - `inode_table` stores resolved inodes by address, used by [`loaded_inode`](Self::loaded_inode). +/// - `lookup_cache` stores lookup results by `(parent_addr, name)`, ensuring `dp.lookup()` is only +/// called on a true cache miss (not already cached or in-flight). +/// +/// The [`DCache`] sits in front as a synchronous fast path mapping `(parent, name)` to child addr. +pub struct AsyncFs<'tbl, DP: FsDataProvider> { + /// Canonical addr -> `INode` map. Used by `loaded_inode()` to retrieve inodes by address. + inode_table: &'tbl FutureBackedCache, + + /// Deduplicating lookup cache keyed by `(parent_addr, child_name)`. The factory is + /// `dp.lookup()`, so the data provider is only called on a true cache miss. + lookup_cache: FutureBackedCache<(InodeAddr, OsString), INode>, + + /// Directory entry cache, mapping `(parent, name)` to child inode address. + directory_cache: DCache, + + /// The data provider used to fetch inode data on cache misses. + data_provider: DP, + + /// Monotonically increasing file handle counter. Starts at 1 (0 is reserved). + next_fh: AtomicU64, +} + +impl<'tbl, DP: FsDataProvider> AsyncFs<'tbl, DP> { + /// Create a new `AsyncFs`, seeding the root inode into the table. + pub async fn new( + data_provider: DP, + root: INode, + inode_table: &'tbl FutureBackedCache, + ) -> Self { + inode_table + .get_or_init(root.addr, || async move { root }) + .await; + + Self { + inode_table, + lookup_cache: FutureBackedCache::default(), + directory_cache: DCache::new(), + data_provider, + next_fh: AtomicU64::new(1), + } + } + + /// Get the total number of inodes currently stored in the inode table. + #[must_use] + pub fn inode_count(&self) -> usize { + self.inode_table.len() + } + + /// Return filesystem statistics. + /// + /// Reports the current inode count from the cache. Block-related + /// fields default to values appropriate for a virtual read-only + /// filesystem (4 KiB blocks, no free space). + #[must_use] + pub fn statfs(&self) -> AsyncFsStats { + AsyncFsStats { + block_size: 4096, + total_blocks: 0, + free_blocks: 0, + available_blocks: 0, + total_inodes: self.inode_count() as u64, + free_inodes: 0, + max_filename_length: 255, + } + } + + /// Asynchronously look up an inode by name within a parent directory. + /// + /// Resolution order: + /// 1. Directory cache (synchronous fast path) + /// 2. 
Lookup cache (`get_or_try_init` — calls `dp.lookup()` only on a true miss) + /// 3. On success, populates inode table and directory cache + pub async fn lookup( + &self, + parent: LoadedAddr, + name: &OsStr, + ) -> Result { + let parent_ino = self.loaded_inode(parent).await?; + debug_assert!( + matches!(parent_ino.itype, INodeType::Directory), + "parent inode should be a directory" + ); + + if let Some(dentry) = self.directory_cache.lookup(parent, name) + && let Some(inode) = self.inode_table.get(&dentry.ino.0).await + { + return Ok(TrackedINode { inode }); + } + // Inode was evicted from the table — fall through to the slow path. + + let name_owned = name.to_os_string(); + let name_for_cache = name_owned.clone(); + let lookup_key = (parent.0, name_owned.clone()); + let dp = self.data_provider.clone(); + + let child = self + .lookup_cache + .get_or_try_init(lookup_key, || async move { + dp.lookup(parent_ino, &name_owned).await + }) + .await?; + + self.inode_table + .get_or_init(child.addr, || async move { child }) + .await; + + self.directory_cache + .insert( + parent, + name_for_cache, + LoadedAddr(child.addr), + matches!(child.itype, INodeType::Directory), + ) + .await; + + Ok(TrackedINode { inode: child }) + } + + /// Retrieve an inode that is expected to already be loaded. + /// + /// If the inode is currently in-flight (being loaded by another caller), this awaits + /// completion. Returns an error if the inode is not in the table at all. + pub async fn loaded_inode(&self, addr: LoadedAddr) -> Result { + self.inode_table.get(&addr.0).await.ok_or_else(|| { + tracing::error!( + inode = ?addr.0, + "inode not found in table — this is a programming bug" + ); + std::io::Error::new( + std::io::ErrorKind::NotFound, + format!("inode {:?} not found in table", addr.0), + ) + }) + } + + /// Return the attributes of the inode at `addr`. + /// + /// This is the getattr entry point for the filesystem. Returns the + /// cached [`INode`] directly — callers at the FUSE boundary are + /// responsible for converting to `fuser::FileAttr`. + pub async fn getattr(&self, addr: LoadedAddr) -> Result { + self.loaded_inode(addr).await + } + + /// Open a file for reading. + /// + /// Validates the inode is not a directory, delegates to the data provider + /// to create a [`FileReader`], and returns an [`OpenFile`] that the caller + /// owns. Reads go through [`OpenFile::read`]. + pub async fn open( + &self, + addr: LoadedAddr, + flags: OpenFlags, + ) -> Result, std::io::Error> { + let inode = self.loaded_inode(addr).await?; + if inode.itype == INodeType::Directory { + return Err(std::io::Error::from_raw_os_error(libc::EISDIR)); + } + let reader = self.data_provider.open(inode, flags).await?; + let fh = self.next_fh.fetch_add(1, Ordering::Relaxed); + Ok(OpenFile { + fh, + reader: Arc::new(reader), + }) + } + + /// Iterate directory entries for `parent`, starting from `offset`. + /// + /// Entries are yielded in name-sorted order. For each entry, `filler` is + /// called with the [`DirEntry`] and the next offset value. If `filler` + /// returns `true` (indicating the caller's buffer is full), iteration + /// stops early. + /// + /// Only reads from the directory cache and inode table — does not trigger + /// remote fetches. Entries must have been previously populated via + /// [`lookup`](Self::lookup). + /// + /// TODO(MES-746): Implement `opendir` and `releasedir` to snapshot directory contents and + /// avoid racing with `lookup`/`createfile`. 
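// Hedged sketch (not from the patch): constructing the `ouroboros`-backed `InodeLifecycle` and
// balancing a kernel `forget`. This assumes the constructor generated by `#[self_referencing]`
// takes the owned `table` plus a closure that builds the borrowing `ward` field; the method
// names `inc`, `dec`, and `table` come from the impl above.
async fn lifecycle_demo(root: INode) {
    use crate::drop_ward::DropWard;

    let mut lifecycle = InodeLifecycle::new(
        FutureBackedCache::default(),
        // The closure receives the `&'this` table and returns the ward that borrows it.
        |table| DropWard::new(table),
    );

    // Seed the inode, as `AsyncFs::new` does for the root.
    lifecycle
        .table()
        .get_or_init(root.addr, || async move { root })
        .await;

    // Each `lookup` reply handed to the kernel adds one reference.
    lifecycle.inc(root.addr);
    // A `forget(addr, 1)` from the kernel drops it to zero: `InodeForget::delete` evicts the
    // inode from the table synchronously.
    assert_eq!(lifecycle.dec(&root.addr), Some(0));
    assert_eq!(lifecycle.table().len(), 0);
}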
+ pub async fn readdir( + &self, + parent: LoadedAddr, + offset: u64, + mut filler: impl FnMut(DirEntry<'_>, u64) -> bool, + ) -> Result<(), std::io::Error> { + let parent_inode = self.loaded_inode(parent).await?; + if parent_inode.itype != INodeType::Directory { + return Err(std::io::Error::from_raw_os_error(libc::ENOTDIR)); + } + + let mut children = self.directory_cache.readdir(parent).await; + children.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + #[expect( + clippy::cast_possible_truncation, + reason = "offset fits in usize on supported 64-bit platforms" + )] + for (i, (name, dvalue)) in children.iter().enumerate().skip(offset as usize) { + let inode = self.loaded_inode(dvalue.ino).await?; + let next_offset = (i + 1) as u64; + if filler(DirEntry { name, inode }, next_offset) { + break; + } + } + + Ok(()) + } +} diff --git a/lib/fs/dcache.rs b/lib/fs/dcache.rs new file mode 100644 index 0000000..5138e80 --- /dev/null +++ b/lib/fs/dcache.rs @@ -0,0 +1,65 @@ +use std::ffi::{OsStr, OsString}; + +use crate::fs::LoadedAddr; + +/// Cached metadata for a directory entry. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DValue { + /// Inode address of this entry. + pub ino: LoadedAddr, + /// Whether this entry is itself a directory. + pub is_dir: bool, +} + +/// In-memory directory entry cache mapping `(parent, name)` to child metadata. +/// +/// Backed by [`scc::HashMap`] for atomic upsert on insert. The `readdir` +/// implementation scans the entire map and filters by parent — this is O(n) +/// over the cache size rather than O(log n + k) with an ordered index, but +/// guarantees that `insert` never creates a window where an entry is absent. +#[derive(Default)] +pub struct DCache { + cache: scc::HashMap<(LoadedAddr, OsString), DValue>, +} + +impl DCache { + /// Creates an empty directory cache. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Looks up a single child entry by parent inode and name. + #[must_use] + pub fn lookup(&self, parent_ino: LoadedAddr, name: &OsStr) -> Option { + let key = (parent_ino, name.to_os_string()); + self.cache.read_sync(&key, |_, v| v.clone()) + } + + /// Atomically inserts or overwrites a child entry in the cache. + pub async fn insert( + &self, + parent_ino: LoadedAddr, + name: OsString, + ino: LoadedAddr, + is_dir: bool, + ) { + let key = (parent_ino, name); + let value = DValue { ino, is_dir }; + self.cache.upsert_async(key, value).await; + } + + /// Returns all cached children of `parent_ino` as `(name, value)` pairs. + pub async fn readdir(&self, parent_ino: LoadedAddr) -> Vec<(OsString, DValue)> { + let mut entries = Vec::new(); + self.cache + .iter_async(|key, value| { + if key.0 == parent_ino { + entries.push((key.1.clone(), value.clone())); + } + true + }) + .await; + entries + } +} diff --git a/lib/fs/mod.rs b/lib/fs/mod.rs new file mode 100644 index 0000000..b3dd8e7 --- /dev/null +++ b/lib/fs/mod.rs @@ -0,0 +1,186 @@ +//! Useful filesystem generalizations. +/// Async filesystem cache with concurrent inode management. +pub mod async_fs; +/// Directory entry cache for fast parent-child lookups. +pub mod dcache; + +pub use async_fs::{InodeForget, InodeLifecycle, OpenFile, TrackedINode}; + +use std::ffi::OsStr; +use std::time::SystemTime; + +use bitflags::bitflags; + +/// Type representing an inode identifier. +pub type InodeAddr = u64; + +/// Represents an inode address that has been loaded into the inode table. 
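// Hedged sketch (not from the patch): how a FUSE readdir handler would drive the offset/filler
// contract documented on `AsyncFs::readdir`. `push_entry` stands in for the real reply buffer
// and returns `true` once it is full.
async fn readdir_into_buffer<DP: async_fs::FsDataProvider>(
    fs: &async_fs::AsyncFs<'_, DP>,
    parent: LoadedAddr,
    offset: u64,
    mut push_entry: impl FnMut(&std::ffi::OsStr, INode, u64) -> bool,
) -> Result<(), std::io::Error> {
    fs.readdir(parent, offset, |entry, next_offset| {
        // `next_offset` is what the kernel passes back to resume after this entry.
        push_entry(entry.name, entry.inode, next_offset)
    })
    .await
}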
+/// +/// This newtype wrapper distinguishes inode addresses that are known to exist +/// in the [`async_fs::AsyncFs`] inode table from raw [`InodeAddr`] values. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct LoadedAddr(pub InodeAddr); + +/// Type representing a file handle. +pub type FileHandle = u64; + +bitflags! { + /// Permission bits for an inode, similar to Unix file permissions. + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub struct InodePerms: u16 { + /// Other: execute permission. + const OTHER_EXECUTE = 1 << 0; + /// Other: write permission. + const OTHER_WRITE = 1 << 1; + /// Other: read permission. + const OTHER_READ = 1 << 2; + + /// Group: execute permission. + const GROUP_EXECUTE = 1 << 3; + /// Group: write permission. + const GROUP_WRITE = 1 << 4; + /// Group: read permission. + const GROUP_READ = 1 << 5; + + /// Owner: execute permission. + const OWNER_EXECUTE = 1 << 6; + /// Owner: write permission. + const OWNER_WRITE = 1 << 7; + /// Owner: read permission. + const OWNER_READ = 1 << 8; + + /// Sticky bit. + const STICKY = 1 << 9; + /// Set-group-ID bit. + const SETGID = 1 << 10; + /// Set-user-ID bit. + const SETUID = 1 << 11; + + /// Other: read, write, and execute. + const OTHER_RWX = Self::OTHER_READ.bits() + | Self::OTHER_WRITE.bits() + | Self::OTHER_EXECUTE.bits(); + /// Group: read, write, and execute. + const GROUP_RWX = Self::GROUP_READ.bits() + | Self::GROUP_WRITE.bits() + | Self::GROUP_EXECUTE.bits(); + /// Owner: read, write, and execute. + const OWNER_RWX = Self::OWNER_READ.bits() + | Self::OWNER_WRITE.bits() + | Self::OWNER_EXECUTE.bits(); + } +} + +bitflags! { + /// Flags for opening a file, similar to Unix open(2) flags. + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub struct OpenFlags: i32 { + /// Open for reading only. + const RDONLY = libc::O_RDONLY; + /// Open for writing only. + const WRONLY = libc::O_WRONLY; + /// Open for reading and writing. + const RDWR = libc::O_RDWR; + + /// Append on each write. + const APPEND = libc::O_APPEND; + /// Truncate to zero length. + const TRUNC = libc::O_TRUNC; + /// Create file if it does not exist. + const CREAT = libc::O_CREAT; + /// Error if file already exists (with `CREAT`). + const EXCL = libc::O_EXCL; + + /// Non-blocking mode. + const NONBLOCK = libc::O_NONBLOCK; + /// Synchronous writes. + const SYNC = libc::O_SYNC; + /// Synchronous data integrity writes. + const DSYNC = libc::O_DSYNC; + /// Do not follow symlinks. + const NOFOLLOW = libc::O_NOFOLLOW; + /// Set close-on-exec. + const CLOEXEC = libc::O_CLOEXEC; + /// Fail if not a directory. + const DIRECTORY = libc::O_DIRECTORY; + + /// Do not update access time (Linux only). + #[cfg(target_os = "linux")] + const NOATIME = libc::O_NOATIME; + } +} + +/// The type of an inode entry in the filesystem. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum INodeType { + /// A regular file. + File, + /// A directory. + Directory, + /// A symbolic link. + Symlink, +} + +/// Representation of an inode. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct INode { + /// The address of this inode, which serves as its unique identifier. + pub addr: InodeAddr, + /// The permissions associated with this inode, represented as a bitfield. + pub permissions: InodePerms, + /// The user ID of the owner of this inode. + pub uid: u32, + /// The group ID of the owner of this inode. + pub gid: u32, + /// The time this inode was created at. 
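// Illustrative sketch (not from the patch): the `InodePerms` bits are laid out exactly like the
// low 12 bits of a Unix mode word, so converting for a getattr-style reply is a plain bit copy.
fn perms_to_mode(perms: InodePerms) -> u16 {
    perms.bits()
}

fn mode_to_perms(mode: u16) -> InodePerms {
    // Drops anything outside the 12 permission/setuid/setgid/sticky bits.
    InodePerms::from_bits_truncate(mode)
}

fn perms_round_trip_demo() {
    let rwxr_xr_x = InodePerms::OWNER_RWX
        | InodePerms::GROUP_READ
        | InodePerms::GROUP_EXECUTE
        | InodePerms::OTHER_READ
        | InodePerms::OTHER_EXECUTE;
    assert_eq!(perms_to_mode(rwxr_xr_x), 0o755);
    assert_eq!(
        mode_to_perms(0o4750),
        InodePerms::SETUID
            | InodePerms::OWNER_RWX
            | InodePerms::GROUP_READ
            | InodePerms::GROUP_EXECUTE
    );
}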
+ pub create_time: SystemTime, + /// The time this inode was last modified at. + pub last_modified_at: SystemTime, + /// The parent inode address, if any. This is `None` for the root inode. + pub parent: Option, + /// The size of the file represented by this inode, in bytes. + pub size: u64, + /// Additional information about the type of this inode (e.g., file vs directory). + pub itype: INodeType, +} + +impl INode { + /// Check if this inode is the root inode (i.e., has no parent). + #[must_use] + pub fn is_root(&self) -> bool { + self.parent.is_none() + } +} + +/// A directory entry yielded by [`async_fs::AsyncFs::readdir`]. +/// +/// Borrows the entry name from the directory cache's iteration buffer. +#[derive(Debug, Clone, Copy)] +pub struct DirEntry<'a> { + /// The name of this entry within its parent directory. + pub name: &'a OsStr, + /// The full inode data for this entry. + pub inode: INode, +} + +/// Filesystem statistics returned by [`async_fs::AsyncFs::statfs`]. +/// +/// Block-related sizes are in units of `block_size` bytes. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct AsyncFsStats { + /// Filesystem block size (bytes). + pub block_size: u32, + /// Total number of data blocks. + pub total_blocks: u64, + /// Number of free blocks. + pub free_blocks: u64, + /// Number of blocks available to unprivileged users. + pub available_blocks: u64, + /// Total number of file nodes (inodes). + pub total_inodes: u64, + /// Number of free file nodes. + pub free_inodes: u64, + /// Maximum filename length (bytes). + pub max_filename_length: u32, +} diff --git a/lib/lib.rs b/lib/lib.rs index f7388bd..40b1e8f 100644 --- a/lib/lib.rs +++ b/lib/lib.rs @@ -2,4 +2,7 @@ /// Caching primitives for git-fs. pub mod cache; +pub mod drop_ward; +/// Filesystem abstractions and caching layers. +pub mod fs; pub mod io;
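// Hedged sketch (not from the patch): building the root inode that `AsyncFs::new` seeds the
// table with. The field names come from the `INode` definition above; the concrete values
// (inode number 1, 0755 permissions, uid/gid 0) are only illustrative defaults.
fn example_root_inode() -> crate::fs::INode {
    use std::time::SystemTime;

    use crate::fs::{INode, INodeType, InodePerms};

    let now = SystemTime::now();
    INode {
        // FUSE conventionally reserves inode 1 for the filesystem root.
        addr: 1,
        permissions: InodePerms::OWNER_RWX
            | InodePerms::GROUP_READ
            | InodePerms::GROUP_EXECUTE
            | InodePerms::OTHER_READ
            | InodePerms::OTHER_EXECUTE,
        uid: 0,
        gid: 0,
        create_time: now,
        last_modified_at: now,
        // No parent: exactly the condition `INode::is_root` checks.
        parent: None,
        size: 0,
        itype: INodeType::Directory,
    }
}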