dfir_rs/compiled/pull/half_join_state/set.rs

1use std::collections::VecDeque;
2use std::collections::hash_map::Entry;
3
4use super::HalfJoinState;
5use crate::util::clear::Clear;
6
7type HashMap<K, V> = rustc_hash::FxHashMap<K, V>;
8
9use smallvec::{SmallVec, smallvec};
10
11#[derive(Debug)]
12pub struct HalfSetJoinState<Key, ValBuild, ValProbe> {
13    // Here a smallvec with inline storage of 1 is chosen.
14    // The rationale for this decision is that, I speculate, that joins possibly have a bimodal distribution with regards to how much key contention they have.
15    // That is, I think that there are many joins that have 1 value per key on LHS/RHS, and there are also a large category of joins that have multiple values per key.
16    // For the category of joins that have multiple values per key, it's not clear why they would only have 2, 3, 4, or N specific number of values per key. So there's no good number to set the smallvec storage to.
17    // Instead we can just focus on the first group of joins that have 1 value per key and get benefit there without hurting the other group too much with excessive memory usage.
18    /// Table to probe, vec val contains all matches.
19    table: HashMap<Key, SmallVec<[ValBuild; 1]>>,
20    /// Not-yet emitted matches.
21    current_matches: VecDeque<(Key, ValProbe, ValBuild)>,
22    len: usize,
23}
24impl<Key, ValBuild, ValProbe> Default for HalfSetJoinState<Key, ValBuild, ValProbe> {
25    fn default() -> Self {
26        Self {
27            table: HashMap::default(),
28            current_matches: VecDeque::default(),
29            len: 0,
30        }
31    }
32}
33impl<Key, ValBuild, ValProbe> Clear for HalfSetJoinState<Key, ValBuild, ValProbe> {
34    fn clear(&mut self) {
35        self.table.clear();
36        self.current_matches.clear();
37        self.len = 0;
38    }
39}
40impl<Key, ValBuild, ValProbe> HalfJoinState<Key, ValBuild, ValProbe>
41    for HalfSetJoinState<Key, ValBuild, ValProbe>
42where
43    Key: Clone + Eq + std::hash::Hash,
44    ValBuild: Clone + Eq,
45    ValProbe: Clone,
46{
47    fn build(&mut self, k: Key, v: &ValBuild) -> bool {
48        let entry = self.table.entry(k);
49
50        match entry {
51            Entry::Occupied(mut e) => {
52                let vec = e.get_mut();
53
54                if !vec.contains(v) {
55                    vec.push(v.clone());
56                    self.len += 1;
57                    return true;
58                }
59            }
60            Entry::Vacant(e) => {
61                e.insert(smallvec![v.clone()]);
62                self.len += 1;
63                return true;
64            }
65        };
66
67        false
68    }
69
70    fn probe(&mut self, k: &Key, v: &ValProbe) -> Option<(Key, ValProbe, ValBuild)> {
71        // TODO: We currently don't free/shrink the self.current_matches vecdeque to save time.
72        // This mean it will grow to eventually become the largest number of matches in a single probe call.
73        // Maybe we should clear this memory at the beginning of every tick/periodically?
74        let mut iter = self
75            .table
76            .get(k)?
77            .iter()
78            .map(|valbuild| (k.clone(), v.clone(), valbuild.clone()));
79
80        let first = iter.next();
81
82        self.current_matches.extend(iter);
83
84        first
85    }
86
87    fn full_probe(&self, k: &Key) -> std::slice::Iter<'_, ValBuild> {
88        let Some(sv) = self.table.get(k) else {
89            return [].iter();
90        };
91
92        sv.iter()
93    }
94
95    fn pop_match(&mut self) -> Option<(Key, ValProbe, ValBuild)> {
96        self.current_matches.pop_front()
97    }
98
99    fn len(&self) -> usize {
100        self.len
101    }
102
103    fn iter(&self) -> std::collections::hash_map::Iter<'_, Key, SmallVec<[ValBuild; 1]>> {
104        self.table.iter()
105    }
106}