lattices/tombstone.rs
1//! Shared tombstone set implementations for efficient storage of deleted keys.
2//!
3//! This module provides specialized tombstone storage implementations that can be used
4//! with both [`crate::set_union_with_tombstones::SetUnionWithTombstones`] and
5//! [`crate::map_union_with_tombstones::MapUnionWithTombstones`].
6//!
7//! # Choosing a Tombstone Implementation
8//!
9//! This module provides several specialized tombstone storage implementations optimized for different key types:
10//!
11//! ## For Integer Keys (u64)
12//! Use [`RoaringTombstoneSet`]:
13//! - Extremely space-efficient bitmap compression
14//! - Fast O(1) lookups and efficient bitmap OR operations during merge
15//! - Works with u64 keys (other integer types can be cast to u64)
16//!
17//! ## For String Keys
18//! Use [`FstTombstoneSet<String>`]:
19//! - Compressed finite state transducer storage
20//! - Zero false positives (collision-free)
21//! - Efficient union operations for merging
22//! - Maintains sorted order
23//! - **Note:** For arbitrary byte sequences, encode them as hex or base64 strings first
24//!
25//! ## For Other Types
26//! Use [`std::collections::HashSet`] for tombstones:
27//! - Works with any `Hash + Eq` key type
28//! - No compression, but simple and flexible
29//!
30//! Alternatively, you can hash to 64-bit integers and follow instructions for that case above,
31//! understanding there's a tiny risk of hash collision which could result in keys being
32//! tombstoned (deleted) incorrectly.
33//!
34//! ## Performance Characteristics
35//!
36//! | Implementation | Space Efficiency | Merge Speed | Lookup Speed | False Positives |
37//! |----------------|------------------|-------------|--------------|-----------------|
38//! | RoaringBitmap | Excellent | Excellent | Excellent | None |
39//! | FST | Very Good | Good | Very Good | None |
40//! | HashSet | Poor | Good | Excellent | None |
41//!
42//! # Performance Considerations
43//!
44//! - **RoaringBitmap:** Optimized for dense integer sets. Very fast for all operations.
45//! - **FST:** The `extend()` operation rebuilds the entire FST, so batch your insertions.
46//! Use `from_iter()` when possible for better performance.
47//!
48//! # Thread Safety
49//!
50//! Both implementations are `Send` and `Sync` when their contained types are.
51//! They do not use interior mutability.
52
53use std::string::String;
54
55use fst::{IntoStreamer, Set as FstSet, SetBuilder, Streamer};
56use roaring::RoaringTreemap;
57
58use crate::cc_traits::Len;
59
60/// Trait for tombstone set implementations that support efficient union operations.
61///
62/// This trait abstracts over different tombstone storage strategies, allowing
63/// specialized implementations like [`RoaringTombstoneSet`] and [`FstTombstoneSet`]
64/// to provide optimized merge operations for their respective key types.
65///
66/// Implementors must provide:
67/// - A way to check membership (`contains`)
68/// - A way to union with another tombstone set (`union_with`)
69/// - Standard collection traits (`Len`, `Extend`, etc.)
70pub trait TombstoneSet<Key>: Len + Extend<Key> {
71 /// Check if a key is in the tombstone set.
72 fn contains(&self, key: &Key) -> bool;
73
74 /// Union this tombstone set with another, modifying self in place.
75 /// Returns the old length before the union.
76 fn union_with(&mut self, other: &Self) -> usize;
77}
78
79/// A tombstone set backed by [`RoaringTreemap`] for u64 integer keys.
80/// This provides space-efficient bitmap compression for integer tombstones.
81#[derive(Default, Clone, Debug)]
82pub struct RoaringTombstoneSet {
83 bitmap: RoaringTreemap,
84}
85
86impl RoaringTombstoneSet {
87 /// Create a new empty `RoaringTombstoneSet`.
88 pub fn new() -> Self {
89 Self {
90 bitmap: RoaringTreemap::new(),
91 }
92 }
93
94 /// Check if an item is in the tombstone set.
95 pub fn contains(&self, item: &u64) -> bool {
96 self.bitmap.contains(*item)
97 }
98
99 /// Insert an item into the tombstone set.
100 pub fn insert(&mut self, item: u64) -> bool {
101 self.bitmap.insert(item)
102 }
103}
104
105impl TombstoneSet<u64> for RoaringTombstoneSet {
106 fn contains(&self, key: &u64) -> bool {
107 self.bitmap.contains(*key)
108 }
109
110 fn union_with(&mut self, other: &Self) -> usize {
111 let old_len = self.len();
112 self.bitmap = &self.bitmap | &other.bitmap;
113 old_len
114 }
115}
116
117impl Extend<u64> for RoaringTombstoneSet {
118 fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
119 self.bitmap.extend(iter);
120 }
121}
122
123impl Len for RoaringTombstoneSet {
124 fn len(&self) -> usize {
125 self.bitmap.len() as usize
126 }
127}
128
129impl IntoIterator for RoaringTombstoneSet {
130 type Item = u64;
131 type IntoIter = roaring::treemap::IntoIter;
132
133 fn into_iter(self) -> Self::IntoIter {
134 self.bitmap.into_iter()
135 }
136}
137
138impl FromIterator<u64> for RoaringTombstoneSet {
139 fn from_iter<T: IntoIterator<Item = u64>>(iter: T) -> Self {
140 Self {
141 bitmap: RoaringTreemap::from_iter(iter),
142 }
143 }
144}
145
146/// A tombstone set backed by FST (Finite State Transducer) for byte string keys.
147/// This provides space-efficient storage with zero false positives for any type
148/// that can be serialized to bytes (strings, serialized structs, etc.).
149/// FST maintains keys in sorted order and supports efficient set operations.
150///
151/// ## Performance Notes
152/// - The `extend()` operation rebuilds the entire FST, so batch your insertions when possible
153/// - Union operations are efficient and create a new compressed FST
154/// - Lookups are very fast (logarithmic in the number of keys)
155#[cfg(feature = "alloc")]
156#[derive(Clone, Debug)]
157pub struct FstTombstoneSet<Item> {
158 fst: FstSet<alloc::vec::Vec<u8>>,
159 _phantom: core::marker::PhantomData<Item>,
160}
161
162#[cfg(feature = "alloc")]
163impl<Item> Default for FstTombstoneSet<Item> {
164 fn default() -> Self {
165 Self::new()
166 }
167}
168
169#[cfg(feature = "alloc")]
170impl<Item> FstTombstoneSet<Item> {
171 /// Create a new empty `FstTombstoneSet`.
172 pub fn new() -> Self {
173 Self {
174 fst: FstSet::default(),
175 _phantom: core::marker::PhantomData,
176 }
177 }
178
179 /// Create from an existing FST set.
180 pub(crate) fn from_fst(fst: FstSet<alloc::vec::Vec<u8>>) -> Self {
181 Self {
182 fst,
183 _phantom: core::marker::PhantomData,
184 }
185 }
186
187 /// Check if an item is in the tombstone set.
188 pub fn contains(&self, item: &[u8]) -> bool {
189 self.fst.contains(item)
190 }
191
192 /// Get the number of items in the set.
193 pub fn len(&self) -> usize {
194 self.fst.len()
195 }
196
197 /// Check if the set is empty.
198 pub fn is_empty(&self) -> bool {
199 self.fst.is_empty()
200 }
201
202 /// Union this FST with another, returning a new FST.
203 fn union_internal(&self, other: &Self) -> Self {
204 let union_stream = self.fst.op().add(&other.fst).union();
205 let mut builder = SetBuilder::memory();
206 let mut stream = union_stream.into_stream();
207 while let Some(key) = stream.next() {
208 // Union stream produces sorted keys, so insert should not fail
209 builder
210 .insert(key)
211 .expect("union stream keys are sorted, insert should not fail");
212 }
213 Self::from_fst(
214 FstSet::new(
215 builder
216 .into_inner()
217 .expect("memory builder should not fail"),
218 )
219 .expect("FST construction from valid builder should not fail"),
220 )
221 }
222}
223
224#[cfg(feature = "alloc")]
225impl TombstoneSet<String> for FstTombstoneSet<String> {
226 fn contains(&self, key: &String) -> bool {
227 self.fst.contains(key.as_bytes())
228 }
229
230 fn union_with(&mut self, other: &Self) -> usize {
231 let old_len = self.len();
232 *self = self.union_internal(other);
233 old_len
234 }
235}
236
237#[cfg(feature = "alloc")]
238impl Len for FstTombstoneSet<String> {
239 fn len(&self) -> usize {
240 self.fst.len()
241 }
242}
243
244// For String items
245#[cfg(feature = "alloc")]
246impl Extend<String> for FstTombstoneSet<String> {
247 fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
248 let mut keys: alloc::vec::Vec<_> = self.fst.stream().into_strs().unwrap();
249 keys.extend(iter);
250 keys.sort();
251 keys.dedup();
252
253 let mut builder = SetBuilder::memory();
254 for key in keys {
255 // FST builder insert only fails if keys are not sorted, which we ensure above
256 builder
257 .insert(&key)
258 .expect("keys are sorted, insert should not fail");
259 }
260 // Memory builder and FST construction should not fail for valid sorted keys
261 self.fst = FstSet::new(
262 builder
263 .into_inner()
264 .expect("memory builder should not fail"),
265 )
266 .expect("FST construction from valid builder should not fail");
267 }
268}
269
270#[cfg(feature = "alloc")]
271impl FromIterator<String> for FstTombstoneSet<String> {
272 fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
273 let mut keys: alloc::vec::Vec<_> = iter.into_iter().collect();
274 keys.sort();
275 keys.dedup();
276
277 let mut builder = SetBuilder::memory();
278 for key in &keys {
279 // FST builder insert only fails if keys are not sorted, which we ensure above
280 builder
281 .insert(key)
282 .expect("keys are sorted, insert should not fail");
283 }
284 Self::from_fst(
285 FstSet::new(
286 builder
287 .into_inner()
288 .expect("memory builder should not fail"),
289 )
290 .expect("FST construction from valid builder should not fail"),
291 )
292 }
293}
294
295#[cfg(feature = "alloc")]
296impl IntoIterator for FstTombstoneSet<String> {
297 type Item = String;
298 type IntoIter = std::vec::IntoIter<String>;
299
300 fn into_iter(self) -> Self::IntoIter {
301 // Convert FST keys to strings
302 self.fst
303 .stream()
304 .into_strs()
305 .expect("FST contains valid UTF-8 strings")
306 .into_iter()
307 }
308}
309
310// Implement TombstoneSet for HashSet to support generic key types
311#[cfg(feature = "std")]
312impl<K> TombstoneSet<K> for std::collections::HashSet<K>
313where
314 K: Eq + std::hash::Hash + Clone,
315{
316 fn contains(&self, key: &K) -> bool {
317 std::collections::HashSet::contains(self, key)
318 }
319
320 fn union_with(&mut self, other: &Self) -> usize {
321 let old_len = self.len();
322 #[expect(
323 clippy::disallowed_methods,
324 reason = "nondeterministic iteration order, fine to insert into set"
325 )]
326 for item in other.iter() {
327 self.insert(item.clone());
328 }
329 old_len
330 }
331}