hashiverse_lib/tools/
hyper_log_log.rs1use crate::tools::hashing;
13
14const NUM_REGISTERS_POWER: u8 = 6;
19const NUM_REGISTERS: usize = 1 << NUM_REGISTERS_POWER; const ALPHA: f64 = 0.709;
23
24#[derive(Clone, Debug)]
25pub struct HyperLogLog {
26 registers: [u8; NUM_REGISTERS],
27}
28
29impl HyperLogLog {
30 pub fn new() -> Self {
31 Self { registers: [0; NUM_REGISTERS] }
32 }
33
34 pub fn insert(&mut self, data: &[u8]) {
35 let hash = hashing::hash(data);
36 let hash_bytes = hash.0;
37
38 let register_index = (hash_bytes[0] as usize) & (NUM_REGISTERS - 1);
40
41 let remaining_value = u64::from_be_bytes([
44 hash_bytes[1], hash_bytes[2], hash_bytes[3], hash_bytes[4],
45 hash_bytes[5], hash_bytes[6], hash_bytes[7], hash_bytes[8],
46 ]);
47
48 let leading_zeros_plus_one = (remaining_value.leading_zeros() + 1) as u8;
50
51 if leading_zeros_plus_one > self.registers[register_index] {
52 self.registers[register_index] = leading_zeros_plus_one;
53 }
54 }
55
56 pub fn count(&self) -> u64 {
57 let num_registers_f64 = NUM_REGISTERS as f64;
58
59 let harmonic_sum: f64 = self.registers.iter().map(|®ister_value| {
61 2.0_f64.powi(-(register_value as i32))
62 }).sum();
63
64 let raw_estimate = ALPHA * num_registers_f64 * num_registers_f64 / harmonic_sum;
65
66 if raw_estimate <= 2.5 * num_registers_f64 {
68 let num_zero_registers = self.registers.iter().filter(|&&v| v == 0).count();
69 if num_zero_registers > 0 {
70 let linear_count = num_registers_f64 * (num_registers_f64 / num_zero_registers as f64).ln();
72 return linear_count as u64;
73 }
74 }
75
76 raw_estimate as u64
77 }
78
79 pub fn merge(&mut self, other: &HyperLogLog) {
80 for i in 0..NUM_REGISTERS {
81 if other.registers[i] > self.registers[i] {
82 self.registers[i] = other.registers[i];
83 }
84 }
85 }
86}
87
88#[cfg(test)]
89mod tests {
90 use super::*;
91
92 #[test]
93 fn test_hyperloglog_empty() {
94 let hll = HyperLogLog::new();
95 assert_eq!(hll.count(), 0);
96 }
97
98 #[test]
99 fn test_hyperloglog_single_insert() {
100 let mut hll = HyperLogLog::new();
101 hll.insert(b"hello");
102 assert!(hll.count() >= 1);
103 }
104
105 #[test]
106 fn test_hyperloglog_duplicate_inserts() {
107 let mut hll = HyperLogLog::new();
108 for _ in 0..1000 {
109 hll.insert(b"same_value");
110 }
111 assert_eq!(hll.count(), 1);
113 }
114
115 #[test]
116 fn test_hyperloglog_distinct_values() {
117 let mut hll = HyperLogLog::new();
118 let num_distinct = 1000;
119 for i in 0..num_distinct {
120 hll.insert(format!("value_{}", i).as_bytes());
121 }
122 let estimated_count = hll.count();
123 let lower_bound = (num_distinct as f64 * 0.75) as u64;
125 let upper_bound = (num_distinct as f64 * 1.25) as u64;
126 assert!(
127 estimated_count >= lower_bound && estimated_count <= upper_bound,
128 "Expected count near {}, got {} (bounds: {}-{})", num_distinct, estimated_count, lower_bound, upper_bound
129 );
130 }
131
132 #[test]
133 fn test_hyperloglog_merge() {
134 let mut hll_a = HyperLogLog::new();
135 let mut hll_b = HyperLogLog::new();
136
137 for i in 0..500 {
138 hll_a.insert(format!("a_{}", i).as_bytes());
139 }
140 for i in 0..500 {
141 hll_b.insert(format!("b_{}", i).as_bytes());
142 }
143
144 let count_a = hll_a.count();
145 let count_b = hll_b.count();
146
147 hll_a.merge(&hll_b);
148 let merged_count = hll_a.count();
149
150 assert!(merged_count > count_a);
152 assert!(merged_count > count_b);
153 let expected = 1000;
154 let lower_bound = (expected as f64 * 0.75) as u64;
155 let upper_bound = (expected as f64 * 1.25) as u64;
156 assert!(
157 merged_count >= lower_bound && merged_count <= upper_bound,
158 "Expected merged count near {}, got {} (bounds: {}-{})", expected, merged_count, lower_bound, upper_bound
159 );
160 }
161
162 #[test]
163 fn test_hyperloglog_merge_with_overlap() {
164 let mut hll_a = HyperLogLog::new();
165 let mut hll_b = HyperLogLog::new();
166
167 for i in 0..500 {
169 hll_a.insert(format!("shared_{}", i).as_bytes());
170 hll_b.insert(format!("shared_{}", i).as_bytes());
171 }
172
173 hll_a.merge(&hll_b);
174 let merged_count = hll_a.count();
175
176 let lower_bound = (500_f64 * 0.75) as u64;
178 let upper_bound = (500_f64 * 1.25) as u64;
179 assert!(
180 merged_count >= lower_bound && merged_count <= upper_bound,
181 "Expected merged count near 500, got {} (bounds: {}-{})", merged_count, lower_bound, upper_bound
182 );
183 }
184
185 #[test]
186 fn test_hyperloglog_small_counts() {
187 for expected in [2, 5, 10, 20, 50] {
188 let mut hll = HyperLogLog::new();
189 for i in 0..expected {
190 hll.insert(format!("item_{}", i).as_bytes());
191 }
192 let estimated_count = hll.count();
193 let lower_bound = (expected as f64 * 0.5) as u64;
195 let upper_bound = (expected as f64 * 2.0) as u64;
196 assert!(
197 estimated_count >= lower_bound && estimated_count <= upper_bound,
198 "For {} items: expected count near {}, got {} (bounds: {}-{})", expected, expected, estimated_count, lower_bound, upper_bound
199 );
200 }
201 }
202}