lattices/tombstone.rs
1//! Shared tombstone set implementations for efficient storage of deleted keys.
2//!
3//! This module provides specialized tombstone storage implementations that can be used
4//! with both [`crate::set_union_with_tombstones::SetUnionWithTombstones`] and
5//! [`crate::map_union_with_tombstones::MapUnionWithTombstones`].
6//!
7//! # Choosing a Tombstone Implementation
8//!
9//! This module provides several specialized tombstone storage implementations optimized for different key types:
10//!
11//! ## For Integer Keys (u64)
12//! Use [`RoaringTombstoneSet`]:
13//! - Extremely space-efficient bitmap compression
14//! - Fast O(1) lookups and efficient bitmap OR operations during merge
15//! - Works with u64 keys (other integer types can be cast to u64)
16//!
17//! ## For String Keys
18//! Use [`FstTombstoneSet<String>`]:
19//! - Compressed finite state transducer storage
20//! - Zero false positives (collision-free)
21//! - Efficient union operations for merging
22//! - Maintains sorted order
23//! - **Note:** For arbitrary byte sequences, encode them as hex or base64 strings first
24//!
25//! ## For Other Types
26//! Use [`std::collections::HashSet`] for tombstones:
27//! - Works with any `Hash + Eq` key type
28//! - No compression, but simple and flexible
29//!
30//! Alternatively, you can hash to 64-bit integers and follow instructions for that case above,
31//! understanding there's a tiny risk of hash collision which could result in keys being
32//! tombstoned (deleted) incorrectly.
33//!
34//! ## Performance Characteristics
35//!
36//! | Implementation | Space Efficiency | Merge Speed | Lookup Speed | False Positives |
37//! |----------------|------------------|-------------|--------------|-----------------|
38//! | RoaringBitmap | Excellent | Excellent | Excellent | None |
39//! | FST | Very Good | Good | Very Good | None |
40//! | HashSet | Poor | Good | Excellent | None |
41//!
42//! # Performance Considerations
43//!
44//! - **RoaringBitmap:** Optimized for dense integer sets. Very fast for all operations.
45//! - **FST:** The `extend()` operation rebuilds the entire FST, so batch your insertions.
46//! Use `from_iter()` when possible for better performance.
47//!
48//! # Thread Safety
49//!
50//! Both implementations are `Send` and `Sync` when their contained types are.
51//! They do not use interior mutability.
52
53use std::collections::HashSet;
54
55use fst::{IntoStreamer, Set as FstSet, SetBuilder, Streamer};
56use roaring::RoaringTreemap;
57
58use crate::cc_traits::Len;
59
60/// Trait for tombstone set implementations that support efficient union operations.
61///
62/// This trait abstracts over different tombstone storage strategies, allowing
63/// specialized implementations like [`RoaringTombstoneSet`] and [`FstTombstoneSet`]
64/// to provide optimized merge operations for their respective key types.
65///
66/// Implementors must provide:
67/// - A way to check membership (`contains`)
68/// - A way to union with another tombstone set (`union_with`)
69/// - Standard collection traits (`Len`, `Extend`, etc.)
70pub trait TombstoneSet<Key>: Len + Extend<Key> {
71 /// Check if a key is in the tombstone set.
72 fn contains(&self, key: &Key) -> bool;
73
74 /// Union this tombstone set with another, modifying self in place.
75 /// Returns the old length before the union.
76 fn union_with(&mut self, other: &Self) -> usize;
77}
78
79/// A tombstone set backed by [`RoaringTreemap`] for u64 integer keys.
80/// This provides space-efficient bitmap compression for integer tombstones.
81#[derive(Default, Clone, Debug)]
82pub struct RoaringTombstoneSet {
83 bitmap: RoaringTreemap,
84}
85
86impl RoaringTombstoneSet {
87 /// Create a new empty `RoaringTombstoneSet`.
88 pub fn new() -> Self {
89 Self {
90 bitmap: RoaringTreemap::new(),
91 }
92 }
93
94 /// Check if an item is in the tombstone set.
95 pub fn contains(&self, item: &u64) -> bool {
96 self.bitmap.contains(*item)
97 }
98
99 /// Insert an item into the tombstone set.
100 pub fn insert(&mut self, item: u64) -> bool {
101 self.bitmap.insert(item)
102 }
103}
104
105impl TombstoneSet<u64> for RoaringTombstoneSet {
106 fn contains(&self, key: &u64) -> bool {
107 self.bitmap.contains(*key)
108 }
109
110 fn union_with(&mut self, other: &Self) -> usize {
111 let old_len = self.len();
112 self.bitmap = &self.bitmap | &other.bitmap;
113 old_len
114 }
115}
116
117impl Extend<u64> for RoaringTombstoneSet {
118 fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
119 self.bitmap.extend(iter);
120 }
121}
122
123impl Len for RoaringTombstoneSet {
124 fn len(&self) -> usize {
125 self.bitmap.len() as usize
126 }
127}
128
129impl IntoIterator for RoaringTombstoneSet {
130 type Item = u64;
131 type IntoIter = roaring::treemap::IntoIter;
132
133 fn into_iter(self) -> Self::IntoIter {
134 self.bitmap.into_iter()
135 }
136}
137
138impl FromIterator<u64> for RoaringTombstoneSet {
139 fn from_iter<T: IntoIterator<Item = u64>>(iter: T) -> Self {
140 Self {
141 bitmap: RoaringTreemap::from_iter(iter),
142 }
143 }
144}
145
146/// A tombstone set backed by FST (Finite State Transducer) for byte string keys.
147/// This provides space-efficient storage with zero false positives for any type
148/// that can be serialized to bytes (strings, serialized structs, etc.).
149/// FST maintains keys in sorted order and supports efficient set operations.
150///
151/// ## Performance Notes
152/// - The `extend()` operation rebuilds the entire FST, so batch your insertions when possible
153/// - Union operations are efficient and create a new compressed FST
154/// - Lookups are very fast (logarithmic in the number of keys)
155#[derive(Clone, Debug)]
156pub struct FstTombstoneSet<Item> {
157 fst: FstSet<Vec<u8>>,
158 _phantom: std::marker::PhantomData<Item>,
159}
160
161impl<Item> Default for FstTombstoneSet<Item> {
162 fn default() -> Self {
163 Self::new()
164 }
165}
166
167impl<Item> FstTombstoneSet<Item> {
168 /// Create a new empty `FstTombstoneSet`.
169 pub fn new() -> Self {
170 Self {
171 fst: FstSet::default(),
172 _phantom: std::marker::PhantomData,
173 }
174 }
175
176 /// Create from an existing FST set.
177 pub(crate) fn from_fst(fst: FstSet<Vec<u8>>) -> Self {
178 Self {
179 fst,
180 _phantom: std::marker::PhantomData,
181 }
182 }
183
184 /// Check if an item is in the tombstone set.
185 pub fn contains(&self, item: &[u8]) -> bool {
186 self.fst.contains(item)
187 }
188
189 /// Get the number of items in the set.
190 pub fn len(&self) -> usize {
191 self.fst.len()
192 }
193
194 /// Check if the set is empty.
195 pub fn is_empty(&self) -> bool {
196 self.fst.is_empty()
197 }
198
199 /// Union this FST with another, returning a new FST.
200 fn union_internal(&self, other: &Self) -> Self {
201 let union_stream = self.fst.op().add(&other.fst).union();
202 let mut builder = SetBuilder::memory();
203 let mut stream = union_stream.into_stream();
204 while let Some(key) = stream.next() {
205 // Union stream produces sorted keys, so insert should not fail
206 builder
207 .insert(key)
208 .expect("union stream keys are sorted, insert should not fail");
209 }
210 Self::from_fst(
211 FstSet::new(
212 builder
213 .into_inner()
214 .expect("memory builder should not fail"),
215 )
216 .expect("FST construction from valid builder should not fail"),
217 )
218 }
219}
220
221/// Helper trait to convert keys to byte slices for FST operations.
222pub trait AsBytes {
223 /// Convert the key to a byte slice.
224 fn as_bytes(&self) -> &[u8];
225}
226
227impl AsBytes for String {
228 fn as_bytes(&self) -> &[u8] {
229 self.as_bytes()
230 }
231}
232
233impl TombstoneSet<String> for FstTombstoneSet<String> {
234 fn contains(&self, key: &String) -> bool {
235 self.fst.contains(key.as_bytes())
236 }
237
238 fn union_with(&mut self, other: &Self) -> usize {
239 let old_len = self.len();
240 *self = self.union_internal(other);
241 old_len
242 }
243}
244
245impl Len for FstTombstoneSet<String> {
246 fn len(&self) -> usize {
247 self.fst.len()
248 }
249}
250
251// For String items
252impl Extend<String> for FstTombstoneSet<String> {
253 fn extend<T: IntoIterator<Item = String>>(&mut self, iter: T) {
254 let mut keys: Vec<_> = self.fst.stream().into_strs().unwrap();
255 keys.extend(iter);
256 keys.sort();
257 keys.dedup();
258
259 let mut builder = SetBuilder::memory();
260 for key in keys {
261 // FST builder insert only fails if keys are not sorted, which we ensure above
262 builder
263 .insert(key)
264 .expect("keys are sorted, insert should not fail");
265 }
266 // Memory builder and FST construction should not fail for valid sorted keys
267 self.fst = FstSet::new(
268 builder
269 .into_inner()
270 .expect("memory builder should not fail"),
271 )
272 .expect("FST construction from valid builder should not fail");
273 }
274}
275
276impl FromIterator<String> for FstTombstoneSet<String> {
277 fn from_iter<T: IntoIterator<Item = String>>(iter: T) -> Self {
278 let mut keys: Vec<_> = iter.into_iter().collect();
279 keys.sort();
280 keys.dedup();
281
282 let mut builder = SetBuilder::memory();
283 for key in keys {
284 // FST builder insert only fails if keys are not sorted, which we ensure above
285 builder
286 .insert(key)
287 .expect("keys are sorted, insert should not fail");
288 }
289 Self::from_fst(
290 FstSet::new(
291 builder
292 .into_inner()
293 .expect("memory builder should not fail"),
294 )
295 .expect("FST construction from valid builder should not fail"),
296 )
297 }
298}
299
300impl IntoIterator for FstTombstoneSet<String> {
301 type Item = String;
302 type IntoIter = std::vec::IntoIter<String>;
303
304 fn into_iter(self) -> Self::IntoIter {
305 // Convert FST keys to strings
306 self.fst
307 .stream()
308 .into_strs()
309 .expect("FST contains valid UTF-8 strings")
310 .into_iter()
311 }
312}
313
314// Implement TombstoneSet for HashSet to support generic key types
315impl<K> TombstoneSet<K> for HashSet<K>
316where
317 K: Eq + std::hash::Hash + Clone,
318{
319 fn contains(&self, key: &K) -> bool {
320 HashSet::contains(self, key)
321 }
322
323 fn union_with(&mut self, other: &Self) -> usize {
324 let old_len = self.len();
325 for item in other.iter() {
326 self.insert(item.clone());
327 }
328 old_len
329 }
330}