Skip to main content

lattices/
tombstone.rs

1//! Shared tombstone set implementations for efficient storage of deleted keys.
2//!
3//! This module provides specialized tombstone storage implementations that can be used
4//! with both [`crate::set_union_with_tombstones::SetUnionWithTombstones`] and
5//! [`crate::map_union_with_tombstones::MapUnionWithTombstones`].
6//!
7//! # Choosing a Tombstone Implementation
8//!
9//! This module provides several specialized tombstone storage implementations optimized for different key types:
10//!
11//! ## For Integer Keys (u64)
12//! Use [`RoaringTombstoneSet`]:
13//! - Extremely space-efficient bitmap compression
14//! - Fast O(1) lookups and efficient bitmap OR operations during merge
15//! - Works with u64 keys (other integer types can be cast to u64)
16//!
17//! ## For String Keys
18//! Use [`FstTombstoneSet<String>`]:
19//! - Compressed finite state transducer storage
20//! - Zero false positives (collision-free)
21//! - Efficient union operations for merging
22//! - Maintains sorted order
23//! - **Note:** For arbitrary byte sequences, encode them as hex or base64 strings first
24//!
25//! ## For Other Types
26//! Use [`std::collections::HashSet`] for tombstones:
27//! - Works with any `Hash + Eq` key type
28//! - No compression, but simple and flexible
29//!
30//! Alternatively, you can hash to 64-bit integers and follow instructions for that case above,
31//! understanding there's a tiny risk of hash collision which could result in keys being
32//! tombstoned (deleted) incorrectly.
33//!
34//! ## Performance Characteristics
35//!
36//! | Implementation | Space Efficiency | Merge Speed | Lookup Speed | False Positives |
37//! |----------------|------------------|-------------|--------------|-----------------|
38//! | RoaringBitmap  | Excellent        | Excellent   | Excellent    | None            |
39//! | FST            | Very Good        | Good        | Very Good    | None            |
40//! | HashSet        | Poor             | Good        | Excellent    | None            |
41//!
42//! # Performance Considerations
43//!
44//! - **RoaringBitmap:** Optimized for dense integer sets. Very fast for all operations.
45//! - **FST:** The `extend()` operation rebuilds the entire FST, so batch your insertions.
46//!   Use `from_iter()` when possible for better performance.
47//!
48//! # Thread Safety
49//!
50//! Both implementations are `Send` and `Sync` when their contained types are.
51//! They do not use interior mutability.
52
53use std::string::String;
54
55use fst::{IntoStreamer, Set as FstSet, SetBuilder, Streamer};
56use roaring::RoaringTreemap;
57
58use crate::cc_traits::Len;
59
60/// Trait for tombstone set implementations that support efficient union operations.
61///
62/// This trait abstracts over different tombstone storage strategies, allowing
63/// specialized implementations like [`RoaringTombstoneSet`] and [`FstTombstoneSet`]
64/// to provide optimized merge operations for their respective key types.
65///
66/// Implementors must provide:
67/// - A way to check membership (`contains`)
68/// - A way to union with another tombstone set (`union_with`)
69/// - Standard collection traits (`Len`, `Extend`, etc.)
70pub trait TombstoneSet<Key>: Len + Extend<Key> {
71    /// Check if a key is in the tombstone set.
72    fn contains(&self, key: &Key) -> bool;
73
74    /// Union this tombstone set with another, modifying self in place.
75    /// Returns the old length before the union.
76    fn union_with(&mut self, other: &Self) -> usize;
77}
78
79/// A tombstone set backed by [`RoaringTreemap`] for u64 integer keys.
80/// This provides space-efficient bitmap compression for integer tombstones.
81#[derive(Default, Clone, Debug)]
82pub struct RoaringTombstoneSet {
83    bitmap: RoaringTreemap,
84}
85
86impl RoaringTombstoneSet {
87    /// Create a new empty `RoaringTombstoneSet`.
88    pub fn new() -> Self {
89        Self {
90            bitmap: RoaringTreemap::new(),
91        }
92    }
93
94    /// Check if an item is in the tombstone set.
95    pub fn contains(&self, item: &u64) -> bool {
96        self.bitmap.contains(*item)
97    }
98
99    /// Insert an item into the tombstone set.
100    pub fn insert(&mut self, item: u64) -> bool {
101        self.bitmap.insert(item)
102    }
103}
104
105impl TombstoneSet<u64> for RoaringTombstoneSet {
106    fn contains(&self, key: &u64) -> bool {
107        self.bitmap.contains(*key)
108    }
109
110    fn union_with(&mut self, other: &Self) -> usize {
111        let old_len = self.len();
112        self.bitmap = &self.bitmap | &other.bitmap;
113        old_len
114    }
115}
116
117impl Extend<u64> for RoaringTombstoneSet {
118    fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
119        self.bitmap.extend(iter);
120    }
121}
122
123impl Len for RoaringTombstoneSet {
124    fn len(&self) -> usize {
125        self.bitmap.len() as usize
126    }
127}
128
129impl IntoIterator for RoaringTombstoneSet {
130    type Item = u64;
131    type IntoIter = roaring::treemap::IntoIter;
132
133    fn into_iter(self) -> Self::IntoIter {
134        self.bitmap.into_iter()
135    }
136}
137
138impl FromIterator<u64> for RoaringTombstoneSet {
139    fn from_iter<T: IntoIterator<Item = u64>>(iter: T) -> Self {
140        Self {
141            bitmap: RoaringTreemap::from_iter(iter),
142        }
143    }
144}
145
146/// A tombstone set backed by FST (Finite State Transducer) for byte string keys.
147/// This provides space-efficient storage with zero false positives for any type
148/// that can be serialized to bytes (strings, serialized structs, etc.).
149/// FST maintains keys in sorted order and supports efficient set operations.
150///
151/// ## Performance Notes
152/// - The `extend()` operation rebuilds the entire FST, so batch your insertions when possible
153/// - Union operations are efficient and create a new compressed FST
154/// - Lookups are very fast (logarithmic in the number of keys)
155#[cfg(feature = "alloc")]
156#[derive(Clone, Debug)]
157pub struct FstTombstoneSet<Item> {
158    fst: FstSet<alloc::vec::Vec<u8>>,
159    _phantom: core::marker::PhantomData<Item>,
160}
161
162#[cfg(feature = "alloc")]
163impl<Item> Default for FstTombstoneSet<Item> {
164    fn default() -> Self {
165        Self::new()
166    }
167}
168
169#[cfg(feature = "alloc")]
170impl<Item> FstTombstoneSet<Item> {
171    /// Create a new empty `FstTombstoneSet`.
172    pub fn new() -> Self {
173        Self {
174            fst: FstSet::default(),
175            _phantom: core::marker::PhantomData,
176        }
177    }
178
179    /// Create from an existing FST set.
180    pub(crate) fn from_fst(fst: FstSet<alloc::vec::Vec<u8>>) -> Self {
181        Self {
182            fst,
183            _phantom: core::marker::PhantomData,
184        }
185    }
186
187    /// Check if an item is in the tombstone set.
188    pub fn contains(&self, item: &[u8]) -> bool {
189        self.fst.contains(item)
190    }
191
192    /// Get the number of items in the set.
193    pub fn len(&self) -> usize {
194        self.fst.len()
195    }
196
197    /// Check if the set is empty.
198    pub fn is_empty(&self) -> bool {
199        self.fst.is_empty()
200    }
201
202    /// Union this FST with another, returning a new FST.
203    fn union_internal(&self, other: &Self) -> Self {
204        let union_stream = self.fst.op().add(&other.fst).union();
205        let mut builder = SetBuilder::memory();
206        let mut stream = union_stream.into_stream();
207        while let Some(key) = stream.next() {
208            // Union stream produces sorted keys, so insert should not fail
209            builder
210                .insert(key)
211                .expect("union stream keys are sorted, insert should not fail");
212        }
213        Self::from_fst(
214            FstSet::new(
215                builder
216                    .into_inner()
217                    .expect("memory builder should not fail"),
218            )
219            .expect("FST construction from valid builder should not fail"),
220        )
221    }
222}
223
224#[cfg(feature = "alloc")]
225impl TombstoneSet<String> for FstTombstoneSet<String> {
226    fn contains(&self, key: &String) -> bool {
227        self.fst.contains(key.as_bytes())
228    }
229
230    fn union_with(&mut self, other: &Self) -> usize {
231        let old_len = self.len();
232        *self = self.union_internal(other);
233        old_len
234    }
235}
236
237#[cfg(feature = "alloc")]
238impl Len for FstTombstoneSet<String> {
239    fn len(&self) -> usize {
240        self.fst.len()
241    }
242}
243
244// For String items
245#[cfg(feature = "alloc")]
246impl Extend<String> for FstTombstoneSet<String> {
247    fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
248        let mut keys: alloc::vec::Vec<_> = self.fst.stream().into_strs().unwrap();
249        keys.extend(iter);
250        keys.sort();
251        keys.dedup();
252
253        let mut builder = SetBuilder::memory();
254        for key in keys {
255            // FST builder insert only fails if keys are not sorted, which we ensure above
256            builder
257                .insert(&key)
258                .expect("keys are sorted, insert should not fail");
259        }
260        // Memory builder and FST construction should not fail for valid sorted keys
261        self.fst = FstSet::new(
262            builder
263                .into_inner()
264                .expect("memory builder should not fail"),
265        )
266        .expect("FST construction from valid builder should not fail");
267    }
268}
269
270#[cfg(feature = "alloc")]
271impl FromIterator<String> for FstTombstoneSet<String> {
272    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
273        let mut keys: alloc::vec::Vec<_> = iter.into_iter().collect();
274        keys.sort();
275        keys.dedup();
276
277        let mut builder = SetBuilder::memory();
278        for key in &keys {
279            // FST builder insert only fails if keys are not sorted, which we ensure above
280            builder
281                .insert(key)
282                .expect("keys are sorted, insert should not fail");
283        }
284        Self::from_fst(
285            FstSet::new(
286                builder
287                    .into_inner()
288                    .expect("memory builder should not fail"),
289            )
290            .expect("FST construction from valid builder should not fail"),
291        )
292    }
293}
294
295#[cfg(feature = "alloc")]
296impl IntoIterator for FstTombstoneSet<String> {
297    type Item = String;
298    type IntoIter = std::vec::IntoIter<String>;
299
300    fn into_iter(self) -> Self::IntoIter {
301        // Convert FST keys to strings
302        self.fst
303            .stream()
304            .into_strs()
305            .expect("FST contains valid UTF-8 strings")
306            .into_iter()
307    }
308}
309
310// Implement TombstoneSet for HashSet to support generic key types
311#[cfg(feature = "std")]
312impl<K> TombstoneSet<K> for std::collections::HashSet<K>
313where
314    K: Eq + std::hash::Hash + Clone,
315{
316    fn contains(&self, key: &K) -> bool {
317        std::collections::HashSet::contains(self, key)
318    }
319
320    fn union_with(&mut self, other: &Self) -> usize {
321        let old_len = self.len();
322        #[expect(
323            clippy::disallowed_methods,
324            reason = "nondeterministic iteration order, fine to insert into set"
325        )]
326        for item in other.iter() {
327            self.insert(item.clone());
328        }
329        old_len
330    }
331}