Skip to main content

hashiverse_lib/tools/
hyper_log_log.rs

1//! # Compact probabilistic cardinality estimation
2//!
3//! A HyperLogLog sketch backed by Blake3 hashing. Used where we need an approximate count
4//! of distinct items (for example, unique viewers of a post or unique voters on a piece of
5//! feedback) without storing the items themselves — the sketch is a fixed 64 bytes
6//! regardless of how many distinct inputs have been seen.
7//!
8//! Configured with 2^6 = 64 registers (yielding ~7.8% standard error), which keeps the
9//! sketch small enough to embed freely in RPC responses and on-disk records while still
10//! being accurate enough for UI-grade counts.
11
12use crate::tools::hashing;
13
14/// A HyperLogLog probabilistic cardinality estimator.
15///
16/// Uses 64 registers (2^6), giving ~7.8% standard error.
17/// Backed by Blake3 hashing.
18const NUM_REGISTERS_POWER: u8 = 6;
19const NUM_REGISTERS: usize = 1 << NUM_REGISTERS_POWER; // 64
20
21/// Alpha constant for bias correction with m=64
22const ALPHA: f64 = 0.709;
23
24#[derive(Clone, Debug)]
25pub struct HyperLogLog {
26    registers: [u8; NUM_REGISTERS],
27}
28
29impl HyperLogLog {
30    pub fn new() -> Self {
31        Self { registers: [0; NUM_REGISTERS] }
32    }
33
34    pub fn insert(&mut self, data: &[u8]) {
35        let hash = hashing::hash(data);
36        let hash_bytes = hash.0;
37
38        // Use first byte bits for register index (we need 6 bits for 64 registers)
39        let register_index = (hash_bytes[0] as usize) & (NUM_REGISTERS - 1);
40
41        // Count leading zeros in the remaining bits (starting from bit 6 onward).
42        // We reconstruct a u64 from bytes 1..9 for the leading-zero count.
43        let remaining_value = u64::from_be_bytes([
44            hash_bytes[1], hash_bytes[2], hash_bytes[3], hash_bytes[4],
45            hash_bytes[5], hash_bytes[6], hash_bytes[7], hash_bytes[8],
46        ]);
47
48        // leading_zeros + 1 (HLL convention: rho function)
49        let leading_zeros_plus_one = (remaining_value.leading_zeros() + 1) as u8;
50
51        if leading_zeros_plus_one > self.registers[register_index] {
52            self.registers[register_index] = leading_zeros_plus_one;
53        }
54    }
55
56    pub fn count(&self) -> u64 {
57        let num_registers_f64 = NUM_REGISTERS as f64;
58
59        // Harmonic mean of 2^(-register[i])
60        let harmonic_sum: f64 = self.registers.iter().map(|&register_value| {
61            2.0_f64.powi(-(register_value as i32))
62        }).sum();
63
64        let raw_estimate = ALPHA * num_registers_f64 * num_registers_f64 / harmonic_sum;
65
66        // Small range correction: use linear counting if estimate is small and there are zero registers
67        if raw_estimate <= 2.5 * num_registers_f64 {
68            let num_zero_registers = self.registers.iter().filter(|&&v| v == 0).count();
69            if num_zero_registers > 0 {
70                // Linear counting
71                let linear_count = num_registers_f64 * (num_registers_f64 / num_zero_registers as f64).ln();
72                return linear_count as u64;
73            }
74        }
75
76        raw_estimate as u64
77    }
78
79    pub fn merge(&mut self, other: &HyperLogLog) {
80        for i in 0..NUM_REGISTERS {
81            if other.registers[i] > self.registers[i] {
82                self.registers[i] = other.registers[i];
83            }
84        }
85    }
86}
87
88#[cfg(test)]
89mod tests {
90    use super::*;
91
92    #[test]
93    fn test_hyperloglog_empty() {
94        let hll = HyperLogLog::new();
95        assert_eq!(hll.count(), 0);
96    }
97
98    #[test]
99    fn test_hyperloglog_single_insert() {
100        let mut hll = HyperLogLog::new();
101        hll.insert(b"hello");
102        assert!(hll.count() >= 1);
103    }
104
105    #[test]
106    fn test_hyperloglog_duplicate_inserts() {
107        let mut hll = HyperLogLog::new();
108        for _ in 0..1000 {
109            hll.insert(b"same_value");
110        }
111        // Duplicates should not inflate the count
112        assert_eq!(hll.count(), 1);
113    }
114
115    #[test]
116    fn test_hyperloglog_distinct_values() {
117        let mut hll = HyperLogLog::new();
118        let num_distinct = 1000;
119        for i in 0..num_distinct {
120            hll.insert(format!("value_{}", i).as_bytes());
121        }
122        let estimated_count = hll.count();
123        // With 64 registers (~7.8% standard error), allow 25% tolerance for test stability
124        let lower_bound = (num_distinct as f64 * 0.75) as u64;
125        let upper_bound = (num_distinct as f64 * 1.25) as u64;
126        assert!(
127            estimated_count >= lower_bound && estimated_count <= upper_bound,
128            "Expected count near {}, got {} (bounds: {}-{})", num_distinct, estimated_count, lower_bound, upper_bound
129        );
130    }
131
132    #[test]
133    fn test_hyperloglog_merge() {
134        let mut hll_a = HyperLogLog::new();
135        let mut hll_b = HyperLogLog::new();
136
137        for i in 0..500 {
138            hll_a.insert(format!("a_{}", i).as_bytes());
139        }
140        for i in 0..500 {
141            hll_b.insert(format!("b_{}", i).as_bytes());
142        }
143
144        let count_a = hll_a.count();
145        let count_b = hll_b.count();
146
147        hll_a.merge(&hll_b);
148        let merged_count = hll_a.count();
149
150        // Merged count should be roughly count_a + count_b (no overlap)
151        assert!(merged_count > count_a);
152        assert!(merged_count > count_b);
153        let expected = 1000;
154        let lower_bound = (expected as f64 * 0.75) as u64;
155        let upper_bound = (expected as f64 * 1.25) as u64;
156        assert!(
157            merged_count >= lower_bound && merged_count <= upper_bound,
158            "Expected merged count near {}, got {} (bounds: {}-{})", expected, merged_count, lower_bound, upper_bound
159        );
160    }
161
162    #[test]
163    fn test_hyperloglog_merge_with_overlap() {
164        let mut hll_a = HyperLogLog::new();
165        let mut hll_b = HyperLogLog::new();
166
167        // Both see the same 500 items
168        for i in 0..500 {
169            hll_a.insert(format!("shared_{}", i).as_bytes());
170            hll_b.insert(format!("shared_{}", i).as_bytes());
171        }
172
173        hll_a.merge(&hll_b);
174        let merged_count = hll_a.count();
175
176        // Should still be ~500, not ~1000
177        let lower_bound = (500_f64 * 0.75) as u64;
178        let upper_bound = (500_f64 * 1.25) as u64;
179        assert!(
180            merged_count >= lower_bound && merged_count <= upper_bound,
181            "Expected merged count near 500, got {} (bounds: {}-{})", merged_count, lower_bound, upper_bound
182        );
183    }
184
185    #[test]
186    fn test_hyperloglog_small_counts() {
187        for expected in [2, 5, 10, 20, 50] {
188            let mut hll = HyperLogLog::new();
189            for i in 0..expected {
190                hll.insert(format!("item_{}", i).as_bytes());
191            }
192            let estimated_count = hll.count();
193            // Wider tolerance for small counts
194            let lower_bound = (expected as f64 * 0.5) as u64;
195            let upper_bound = (expected as f64 * 2.0) as u64;
196            assert!(
197                estimated_count >= lower_bound && estimated_count <= upper_bound,
198                "For {} items: expected count near {}, got {} (bounds: {}-{})", expected, expected, estimated_count, lower_bound, upper_bound
199            );
200        }
201    }
202}