lattices/
tombstone.rs

1//! Shared tombstone set implementations for efficient storage of deleted keys.
2//!
3//! This module provides specialized tombstone storage implementations that can be used
4//! with both [`crate::set_union_with_tombstones::SetUnionWithTombstones`] and
5//! [`crate::map_union_with_tombstones::MapUnionWithTombstones`].
6//!
7//! # Choosing a Tombstone Implementation
8//!
9//! This module provides several specialized tombstone storage implementations optimized for different key types:
10//!
11//! ## For Integer Keys (u64)
12//! Use [`RoaringTombstoneSet`]:
13//! - Extremely space-efficient bitmap compression
14//! - Fast O(1) lookups and efficient bitmap OR operations during merge
15//! - Works with u64 keys (other integer types can be cast to u64)
16//!
17//! ## For String Keys
18//! Use [`FstTombstoneSet<String>`]:
19//! - Compressed finite state transducer storage
20//! - Zero false positives (collision-free)
21//! - Efficient union operations for merging
22//! - Maintains sorted order
23//! - **Note:** For arbitrary byte sequences, encode them as hex or base64 strings first
24//!
25//! ## For Other Types
26//! Use [`std::collections::HashSet`] for tombstones:
27//! - Works with any `Hash + Eq` key type
28//! - No compression, but simple and flexible
29//!
30//! Alternatively, you can hash to 64-bit integers and follow instructions for that case above,
31//! understanding there's a tiny risk of hash collision which could result in keys being
32//! tombstoned (deleted) incorrectly.
33//!
34//! ## Performance Characteristics
35//!
36//! | Implementation | Space Efficiency | Merge Speed | Lookup Speed | False Positives |
37//! |----------------|------------------|-------------|--------------|-----------------|
38//! | RoaringBitmap  | Excellent        | Excellent   | Excellent    | None            |
39//! | FST            | Very Good        | Good        | Very Good    | None            |
40//! | HashSet        | Poor             | Good        | Excellent    | None            |
41//!
42//! # Performance Considerations
43//!
44//! - **RoaringBitmap:** Optimized for dense integer sets. Very fast for all operations.
45//! - **FST:** The `extend()` operation rebuilds the entire FST, so batch your insertions.
46//!   Use `from_iter()` when possible for better performance.
47//!
48//! # Thread Safety
49//!
50//! Both implementations are `Send` and `Sync` when their contained types are.
51//! They do not use interior mutability.
52
53use std::collections::HashSet;
54
55use fst::{IntoStreamer, Set as FstSet, SetBuilder, Streamer};
56use roaring::RoaringTreemap;
57
58use crate::cc_traits::Len;
59
60/// Trait for tombstone set implementations that support efficient union operations.
61///
62/// This trait abstracts over different tombstone storage strategies, allowing
63/// specialized implementations like [`RoaringTombstoneSet`] and [`FstTombstoneSet`]
64/// to provide optimized merge operations for their respective key types.
65///
66/// Implementors must provide:
67/// - A way to check membership (`contains`)
68/// - A way to union with another tombstone set (`union_with`)
69/// - Standard collection traits (`Len`, `Extend`, etc.)
70pub trait TombstoneSet<Key>: Len + Extend<Key> {
71    /// Check if a key is in the tombstone set.
72    fn contains(&self, key: &Key) -> bool;
73
74    /// Union this tombstone set with another, modifying self in place.
75    /// Returns the old length before the union.
76    fn union_with(&mut self, other: &Self) -> usize;
77}
78
79/// A tombstone set backed by [`RoaringTreemap`] for u64 integer keys.
80/// This provides space-efficient bitmap compression for integer tombstones.
81#[derive(Default, Clone, Debug)]
82pub struct RoaringTombstoneSet {
83    bitmap: RoaringTreemap,
84}
85
86impl RoaringTombstoneSet {
87    /// Create a new empty `RoaringTombstoneSet`.
88    pub fn new() -> Self {
89        Self {
90            bitmap: RoaringTreemap::new(),
91        }
92    }
93
94    /// Check if an item is in the tombstone set.
95    pub fn contains(&self, item: &u64) -> bool {
96        self.bitmap.contains(*item)
97    }
98
99    /// Insert an item into the tombstone set.
100    pub fn insert(&mut self, item: u64) -> bool {
101        self.bitmap.insert(item)
102    }
103}
104
105impl TombstoneSet<u64> for RoaringTombstoneSet {
106    fn contains(&self, key: &u64) -> bool {
107        self.bitmap.contains(*key)
108    }
109
110    fn union_with(&mut self, other: &Self) -> usize {
111        let old_len = self.len();
112        self.bitmap = &self.bitmap | &other.bitmap;
113        old_len
114    }
115}
116
117impl Extend<u64> for RoaringTombstoneSet {
118    fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
119        self.bitmap.extend(iter);
120    }
121}
122
123impl Len for RoaringTombstoneSet {
124    fn len(&self) -> usize {
125        self.bitmap.len() as usize
126    }
127}
128
129impl IntoIterator for RoaringTombstoneSet {
130    type Item = u64;
131    type IntoIter = roaring::treemap::IntoIter;
132
133    fn into_iter(self) -> Self::IntoIter {
134        self.bitmap.into_iter()
135    }
136}
137
138impl FromIterator<u64> for RoaringTombstoneSet {
139    fn from_iter<T: IntoIterator<Item = u64>>(iter: T) -> Self {
140        Self {
141            bitmap: RoaringTreemap::from_iter(iter),
142        }
143    }
144}
145
146/// A tombstone set backed by FST (Finite State Transducer) for byte string keys.
147/// This provides space-efficient storage with zero false positives for any type
148/// that can be serialized to bytes (strings, serialized structs, etc.).
149/// FST maintains keys in sorted order and supports efficient set operations.
150///
151/// ## Performance Notes
152/// - The `extend()` operation rebuilds the entire FST, so batch your insertions when possible
153/// - Union operations are efficient and create a new compressed FST
154/// - Lookups are very fast (logarithmic in the number of keys)
155#[derive(Clone, Debug)]
156pub struct FstTombstoneSet<Item> {
157    fst: FstSet<Vec<u8>>,
158    _phantom: std::marker::PhantomData<Item>,
159}
160
161impl<Item> Default for FstTombstoneSet<Item> {
162    fn default() -> Self {
163        Self::new()
164    }
165}
166
167impl<Item> FstTombstoneSet<Item> {
168    /// Create a new empty `FstTombstoneSet`.
169    pub fn new() -> Self {
170        Self {
171            fst: FstSet::default(),
172            _phantom: std::marker::PhantomData,
173        }
174    }
175
176    /// Create from an existing FST set.
177    pub(crate) fn from_fst(fst: FstSet<Vec<u8>>) -> Self {
178        Self {
179            fst,
180            _phantom: std::marker::PhantomData,
181        }
182    }
183
184    /// Check if an item is in the tombstone set.
185    pub fn contains(&self, item: &[u8]) -> bool {
186        self.fst.contains(item)
187    }
188
189    /// Get the number of items in the set.
190    pub fn len(&self) -> usize {
191        self.fst.len()
192    }
193
194    /// Check if the set is empty.
195    pub fn is_empty(&self) -> bool {
196        self.fst.is_empty()
197    }
198
199    /// Union this FST with another, returning a new FST.
200    fn union_internal(&self, other: &Self) -> Self {
201        let union_stream = self.fst.op().add(&other.fst).union();
202        let mut builder = SetBuilder::memory();
203        let mut stream = union_stream.into_stream();
204        while let Some(key) = stream.next() {
205            // Union stream produces sorted keys, so insert should not fail
206            builder
207                .insert(key)
208                .expect("union stream keys are sorted, insert should not fail");
209        }
210        Self::from_fst(
211            FstSet::new(
212                builder
213                    .into_inner()
214                    .expect("memory builder should not fail"),
215            )
216            .expect("FST construction from valid builder should not fail"),
217        )
218    }
219}
220
221/// Helper trait to convert keys to byte slices for FST operations.
222pub trait AsBytes {
223    /// Convert the key to a byte slice.
224    fn as_bytes(&self) -> &[u8];
225}
226
227impl AsBytes for String {
228    fn as_bytes(&self) -> &[u8] {
229        self.as_bytes()
230    }
231}
232
233impl TombstoneSet<String> for FstTombstoneSet<String> {
234    fn contains(&self, key: &String) -> bool {
235        self.fst.contains(key.as_bytes())
236    }
237
238    fn union_with(&mut self, other: &Self) -> usize {
239        let old_len = self.len();
240        *self = self.union_internal(other);
241        old_len
242    }
243}
244
245impl Len for FstTombstoneSet<String> {
246    fn len(&self) -> usize {
247        self.fst.len()
248    }
249}
250
251// For String items
252impl Extend<String> for FstTombstoneSet<String> {
253    fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
254        let mut keys: Vec<_> = self.fst.stream().into_strs().unwrap();
255        keys.extend(iter);
256        keys.sort();
257        keys.dedup();
258
259        let mut builder = SetBuilder::memory();
260        for key in keys {
261            // FST builder insert only fails if keys are not sorted, which we ensure above
262            builder
263                .insert(key)
264                .expect("keys are sorted, insert should not fail");
265        }
266        // Memory builder and FST construction should not fail for valid sorted keys
267        self.fst = FstSet::new(
268            builder
269                .into_inner()
270                .expect("memory builder should not fail"),
271        )
272        .expect("FST construction from valid builder should not fail");
273    }
274}
275
276impl FromIterator<String> for FstTombstoneSet<String> {
277    fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
278        let mut keys: Vec<_> = iter.into_iter().collect();
279        keys.sort();
280        keys.dedup();
281
282        let mut builder = SetBuilder::memory();
283        for key in keys {
284            // FST builder insert only fails if keys are not sorted, which we ensure above
285            builder
286                .insert(key)
287                .expect("keys are sorted, insert should not fail");
288        }
289        Self::from_fst(
290            FstSet::new(
291                builder
292                    .into_inner()
293                    .expect("memory builder should not fail"),
294            )
295            .expect("FST construction from valid builder should not fail"),
296        )
297    }
298}
299
300impl IntoIterator for FstTombstoneSet<String> {
301    type Item = String;
302    type IntoIter = std::vec::IntoIter<String>;
303
304    fn into_iter(self) -> Self::IntoIter {
305        // Convert FST keys to strings
306        self.fst
307            .stream()
308            .into_strs()
309            .expect("FST contains valid UTF-8 strings")
310            .into_iter()
311    }
312}
313
314// Implement TombstoneSet for HashSet to support generic key types
315impl<K> TombstoneSet<K> for HashSet<K>
316where
317    K: Eq + std::hash::Hash + Clone,
318{
319    fn contains(&self, key: &K) -> bool {
320        HashSet::contains(self, key)
321    }
322
323    fn union_with(&mut self, other: &Self) -> usize {
324        let old_len = self.len();
325        for item in other.iter() {
326            self.insert(item.clone());
327        }
328        old_len
329    }
330}