Skip to main content

dfir_rs/scheduled/
metrics.rs

1//! Runtime metrics for DFIR.
2
3use std::cell::Cell;
4use std::pin::Pin;
5use std::rc::Rc;
6use std::task::{Context, Poll};
7
8use dfir_lang::graph_ids::{GraphNodeId, GraphSubgraphId};
9use pin_project_lite::pin_project;
10use slotmap::SecondaryMap;
11use web_time::{Duration, Instant};
12
/// Metrics for a [`Dfir`](super::context::Dfir) graph instance.
///
/// Call [`Dfir::metrics`](super::context::Dfir::metrics) for reference-counted continually-updated metrics,
/// or call [`Dfir::metrics_intervals`](super::context::Dfir::metrics_intervals) to obtain a [`DfirMetricsIntervals`] handle, and use
/// [`DfirMetricsIntervals::take_interval`] to retrieve metrics for successive intervals.
///
/// Cloning copies the value held by every metric at that moment, so a clone is an independent snapshot that no
/// longer follows later updates to the original.
#[derive(Default, Clone)]
#[non_exhaustive]
pub struct DfirMetrics {
    /// Per-subgraph metrics, keyed by [`GraphSubgraphId`].
    pub subgraphs: SecondaryMap<GraphSubgraphId, SubgraphMetrics>,
    /// Per-handoff metrics, keyed by the handoff's [`GraphNodeId`].
    pub handoffs: SecondaryMap<GraphNodeId, HandoffMetrics>,
}
26
27impl DfirMetrics {
28    /// Subtracts `other` from self.
29    pub(super) fn diff(&mut self, other: &Self) {
30        for (sg_id, prev_sg_metrics) in other.subgraphs.iter() {
31            if let Some(curr_sg_metrics) = self.subgraphs.get_mut(sg_id) {
32                curr_sg_metrics.diff(prev_sg_metrics);
33            }
34        }
35        for (handoff_id, prev_handoff_metrics) in other.handoffs.iter() {
36            if let Some(curr_handoff_metrics) = self.handoffs.get_mut(handoff_id) {
37                curr_handoff_metrics.diff(prev_handoff_metrics);
38            }
39        }
40    }
41}
42
/// A handle into a DFIR instance's metrics, where each call to [`Self::take_interval`] ends the current interval and
/// returns its metrics. Obtained via [`Dfir::metrics_intervals`](super::context::Dfir::metrics_intervals).
///
/// The first call to `take_interval` returns metrics since this DFIR instance was created. Each subsequent call to
/// `take_interval` returns metrics since the previous call.
///
/// Cloning the handle "forks" it from the original, as afterwards each interval may return different metrics
/// depending on when exactly `take_interval` is called. The clone keeps sharing the live metrics with the original
/// but carries its own copy of the previous snapshot.
#[derive(Clone)]
pub struct DfirMetricsIntervals {
    /// `curr` is continually updating (via shared ownership).
    pub(super) curr: Rc<DfirMetrics>,
    /// `prev` is an unchanging snapshot in time. `None` for "since creation".
    ///
    /// Replaced with a fresh snapshot of `curr` on every call to [`Self::take_interval`].
    pub(super) prev: Option<DfirMetrics>,
}
58
59impl DfirMetricsIntervals {
60    /// Ends the current interval and returns the accumulated metrics across the interval.
61    ///
62    /// The first call to `take_interval` returns metrics since this DFIR instance was created. Each subsequent call to
63    /// `take_interval` returns metrics since the previous call.
64    pub fn take_interval(&mut self) -> DfirMetrics {
65        let mut curr = self.curr.as_ref().clone();
66        if let Some(prev) = self.prev.replace(curr.clone()) {
67            curr.diff(&prev);
68        }
69        curr
70    }
71
72    /// Returns a reference-counted handle to the original continually-updated runtime metrics for this DFIR instance.
73    ///
74    /// See [`Dfir::metrics`](super::context::Dfir::metrics).
75    pub fn all_metrics(&self) -> Rc<DfirMetrics> {
76        Rc::clone(&self.curr)
77    }
78}
79
/// Declarative macro that generates a metrics struct whose fields are `Cell`s, plus one getter per field and a
/// private `diff` method.
///
/// Every field must carry a `#[diff(...)]` attribute, written after the field's doc comments and before any other
/// attributes. Its argument is forwarded to `define_metrics_diff_field!`, which decides how `diff` treats the field.
/// The field's doc comments end up on the generated getter; the field itself is hidden from the docs.
macro_rules! define_metrics {
    (
        $( #[$outer:meta] )*
        pub struct $name:ident {
            $(
                $( #[doc = $docs:literal] )*
                #[diff($mode:ident)]
                $( #[$extra:meta] )*
                $field:ident: Cell<$value:ty>,
            )*
        }
    ) => {
        $( #[$outer] )*
        #[derive(Default, Debug, Clone)]
        #[non_exhaustive] // Leaves room to add more metrics later.
        pub struct $name {
            $(
                #[doc(hidden)] // Public so generated code can reach it; callers should use the getter instead.
                $( #[$extra] )*
                pub $field: Cell<$value>,
            )*
        }

        impl $name {
            $(
                $( #[doc = $docs] )*
                pub fn $field(&self) -> $value {
                    Cell::get(&self.$field)
                }
            )*

            // Applies each field's `#[diff(...)]` mode, using `baseline` as the value to compare against.
            fn diff(&mut self, baseline: &Self) {
                $(
                    define_metrics_diff_field!($mode, $field, self, baseline);
                )*
            }
        }
    };
}
120
/// Applies one field's `#[diff(...)]` mode while diffing two metrics structs.
///
/// * `curr`: expands to nothing, so `$slf`'s value is left as it is.
/// * `total`: subtracts `$other`'s value from `$slf`'s value in place. Debug builds first assert that `$other`'s
///   value does not exceed `$slf`'s.
macro_rules! define_metrics_diff_field {
    (curr, $field:ident, $slf:ident, $other:ident) => {};
    (total, $field:ident, $slf:ident, $other:ident) => {
        debug_assert!($other.$field.get() <= $slf.$field.get());
        $slf.$field.set($slf.$field.get() - $other.$field.get());
    };
}
128
define_metrics! {
    /// Per-handoff metrics.
    ///
    /// Each value is stored in a `Cell` and read through the getter method of the same name.
    pub struct HandoffMetrics {
        /// Number of items currently in the handoff.
        ///
        /// This is a point-in-time value: interval metrics report it as-is rather than as a difference.
        #[diff(curr)]
        curr_items_count: Cell<usize>,

        /// Total number of items read out of the handoff.
        ///
        /// This is a cumulative total: interval metrics report only the amount added since the previous snapshot.
        #[diff(total)]
        total_items_count: Cell<usize>,
    }
}
141
define_metrics! {
    /// Per-subgraph metrics.
    ///
    /// Each value is stored in a `Cell` and read through the getter method of the same name. Every field here is a
    /// cumulative total, so interval metrics report only the amount added since the previous snapshot.
    pub struct SubgraphMetrics {
        /// Number of times the subgraph has run.
        #[diff(total)]
        total_run_count: Cell<usize>,

        /// Time elapsed during polling (when the subgraph is actively doing work).
        #[diff(total)]
        total_poll_duration: Cell<Duration>,

        /// Number of times the subgraph has been polled.
        #[diff(total)]
        total_poll_count: Cell<usize>,

        /// Time elapsed during idle (when the subgraph has yielded and is waiting for async events).
        #[diff(total)]
        total_idle_duration: Cell<Duration>,

        /// Number of times the subgraph has been idle.
        #[diff(total)]
        total_idle_count: Cell<usize>,
    }
}
166
pin_project! {
    /// Helper struct which instruments a future to track polling times.
    ///
    /// Poll and idle statistics are accumulated into the borrowed [`SubgraphMetrics`] each time the wrapper is
    /// polled.
    #[doc(hidden)]
    pub struct InstrumentSubgraph<'a, Fut> {
        // The wrapped future; pinned so it can be polled through the projection.
        #[pin]
        future: Fut,
        // When the most recent poll finished, i.e. when the current idle period began. `None` until the first poll.
        idle_start: Option<Instant>,
        // Metrics updated on every poll.
        metrics: &'a SubgraphMetrics,
    }
}
177
178impl<'a, Fut> InstrumentSubgraph<'a, Fut> {
179    /// Wrap a future to track per-subgraph poll and idle durations.
180    pub fn new(future: Fut, metrics: &'a SubgraphMetrics) -> Self {
181        Self {
182            future,
183            idle_start: None,
184            metrics,
185        }
186    }
187}
188
189impl<'a, Fut> Future for InstrumentSubgraph<'a, Fut>
190where
191    Fut: Future,
192{
193    type Output = Fut::Output;
194    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
195        let this = self.project();
196
197        // End idle duration.
198        if let Some(idle_start) = this.idle_start {
199            this.metrics
200                .total_idle_duration
201                .update(|x| x + idle_start.elapsed());
202            this.metrics.total_idle_count.update(|x| x + 1);
203        }
204
205        // Begin poll duration.
206        let poll_start = Instant::now();
207        let out = this.future.poll(cx);
208
209        // End poll duration.
210        this.metrics
211            .total_poll_duration
212            .update(|x| x + poll_start.elapsed());
213        this.metrics.total_poll_count.update(|x| x + 1);
214
215        // Begin idle duration.
216        this.idle_start.replace(Instant::now());
217
218        out
219    }
220}
221
#[cfg(test)]
mod test {
    use dfir_lang::graph_ids::{GraphNodeId, GraphSubgraphId};
    use slotmap::SlotMap;

    use super::*;

    /// Checks that [`DfirMetricsIntervals::take_interval`] reports metrics since creation on the first call and
    /// per-interval differences afterwards, for counts and durations alike, while leaving current-value metrics
    /// undiffed.
    #[test]
    fn test_dfir_metrics_intervals() {
        // Create slotmaps to generate valid keys.
        let mut sg_map: SlotMap<GraphSubgraphId, ()> = SlotMap::with_key();
        let mut node_map: SlotMap<GraphNodeId, ()> = SlotMap::with_key();
        let sg_id = sg_map.insert(());
        let handoff_id = node_map.insert(());

        let mut metrics = DfirMetrics::default();
        metrics.subgraphs.insert(
            sg_id,
            SubgraphMetrics {
                total_run_count: Cell::new(5),
                total_poll_count: Cell::new(10),
                total_idle_count: Cell::new(2),
                total_poll_duration: Cell::new(Duration::from_millis(500)),
                total_idle_duration: Cell::new(Duration::from_millis(200)),
            },
        );
        metrics.handoffs.insert(
            handoff_id,
            HandoffMetrics {
                curr_items_count: Cell::new(3),
                total_items_count: Cell::new(100),
            },
        );
        let metrics = Rc::new(metrics);

        let mut intervals = DfirMetricsIntervals {
            curr: Rc::clone(&metrics),
            prev: None,
        };

        // First iteration - captures initial state
        let first = intervals.take_interval();
        let sg_metrics = &first.subgraphs[sg_id];
        assert_eq!(sg_metrics.total_run_count(), 5);
        assert_eq!(sg_metrics.total_poll_count(), 10);
        assert_eq!(sg_metrics.total_idle_count(), 2);
        assert_eq!(sg_metrics.total_poll_duration(), Duration::from_millis(500));
        assert_eq!(sg_metrics.total_idle_duration(), Duration::from_millis(200));
        let hoff_metrics = &first.handoffs[handoff_id];
        assert_eq!(hoff_metrics.total_items_count(), 100);
        assert_eq!(hoff_metrics.curr_items_count(), 3);

        // Simulate more work being done.
        let sg_metrics = &metrics.subgraphs[sg_id];
        sg_metrics.total_run_count.set(12);
        sg_metrics.total_poll_count.set(25);
        sg_metrics.total_idle_count.set(7);
        sg_metrics
            .total_poll_duration
            .set(Duration::from_millis(1200));
        sg_metrics
            .total_idle_duration
            .set(Duration::from_millis(600));
        let hoff_metrics = &metrics.handoffs[handoff_id];
        hoff_metrics.total_items_count.set(250);
        hoff_metrics.curr_items_count.set(10);

        // Second iteration - should return the diff
        let second = intervals.take_interval();
        let sg_metrics = &second.subgraphs[sg_id];
        assert_eq!(sg_metrics.total_run_count(), 7); // 12 - 5
        assert_eq!(sg_metrics.total_poll_count(), 15); // 25 - 10
        assert_eq!(sg_metrics.total_idle_count(), 5); // 7 - 2
        // Durations are cumulative totals too, so they should be diffed as well.
        assert_eq!(sg_metrics.total_poll_duration(), Duration::from_millis(700)); // 1200 - 500
        assert_eq!(sg_metrics.total_idle_duration(), Duration::from_millis(400)); // 600 - 200

        let hoff_metrics = &second.handoffs[handoff_id];
        // total_items_count should be diffed
        assert_eq!(hoff_metrics.total_items_count(), 150); // 250 - 100
        // curr_items_count should NOT be diffed (it's a current value, not cumulative)
        assert_eq!(hoff_metrics.curr_items_count(), 10);

        // Third iteration - no work since the previous call, so every cumulative total should diff to zero
        let third = intervals.take_interval();
        let sg_metrics = &third.subgraphs[sg_id];
        assert_eq!(sg_metrics.total_run_count(), 0);
        assert_eq!(sg_metrics.total_poll_count(), 0);
        assert_eq!(sg_metrics.total_idle_count(), 0);
        assert_eq!(sg_metrics.total_poll_duration(), Duration::ZERO);
        assert_eq!(sg_metrics.total_idle_duration(), Duration::ZERO);
        let hoff_metrics = &third.handoffs[handoff_id];
        assert_eq!(hoff_metrics.total_items_count(), 0);
        // The current value is still reported as-is.
        assert_eq!(hoff_metrics.curr_items_count(), 10);

        // The live metrics themselves are never modified by taking intervals.
        assert_eq!(metrics.subgraphs[sg_id].total_run_count(), 12);
        assert_eq!(metrics.handoffs[handoff_id].total_items_count(), 250);
    }
}