Rust implementation of the CVM algorithm for counting distinct elements in a stream

Merge pull request #12 from urschrei/shugel/push-yoztvtyqqvol

Update rand dep

authored by urschrei.bsky.social and committed by GitHub 691ed194 4b6da578

Changed files
+10 -10
benches
src
+1 -2
Cargo.toml
···
13   edition = "2024"
14
15   [dependencies]
16 - rand = "0.8.5"
17   regex = "1.10.4"
18   clap = { version = "4.5.4", features = ["cargo"] }
19   rustc-hash = "2.1.1"
20
21   [dev-dependencies]
22 - rand = "0.8.5"
23   criterion = "0.5.1"
24
25   [lib]
···
13   edition = "2024"
14
15   [dependencies]
16 + rand = { version = "0.9.1", features = ["std_rng"] }
17   regex = "1.10.4"
18   clap = { version = "4.5.4", features = ["cargo"] }
19   rustc-hash = "2.1.1"
20
21   [dev-dependencies]
22   criterion = "0.5.1"
23
24   [lib]
+3 -3
benches/benchmarks.rs
···
 8
 9   use criterion::Criterion;
10   use cvmcount::CVM;
11 - use rand::{Rng, thread_rng};
12   use regex::Regex;
13
14   use std::collections::HashSet;
15
16   // generate 1 million 7-digit random positive integers
17   fn generate_random_numbers() -> Vec<i32> {
18 -     let mut rng = thread_rng();
19
20       (0..1_000_000)
21 -         .map(|_| rng.gen_range(1_000_000..10_000_000))
22           .collect()
23   }
24
···
 8
 9   use criterion::Criterion;
10   use cvmcount::CVM;
11 + use rand::{Rng, rng};
12   use regex::Regex;
13
14   use std::collections::HashSet;
15
16   // generate 1 million 7-digit random positive integers
17   fn generate_random_numbers() -> Vec<i32> {
18 +     let mut rng = rng();
19
20       (0..1_000_000)
21 +         .map(|_| rng.random_range(1_000_000..10_000_000))
22           .collect()
23   }
24
+5 -4
src/lib.rs
···
  6   mod treap;
  7
  8   use crate::treap::Treap;
  9   use rand::rngs::StdRng;
 10 - use rand::{Rng, SeedableRng};
 11
 12   /// Specification for confidence level in the CVM algorithm
 13   #[derive(Debug, Clone, Copy)]
···
227               buf_size: bufsize,
228               buf: Treap::new(),
229               probability: 1.0,
230 -             rng: StdRng::from_entropy(),
231           }
232       }
233       /// Add an element, potentially updating the unique element count
···
240           if self.buf.contains(&elem) {
241               self.buf.remove(&elem);
242           }
243 -         if self.rng.gen_bool(self.probability) {
244               self.buf.insert(elem, &mut self.rng);
245           }
246           while self.buf.len() == self.buf_size {
···
252       fn clear_about_half(&mut self) {
253           // Need to capture rng reference to use in closure
254           let rng = &mut self.rng;
255 -         self.buf.retain(|_| rng.gen_bool(0.5));
256       }
257       /// Process an entire iterator of owned values and return the final estimate
258       ///
···
  6   mod treap;
  7
  8   use crate::treap::Treap;
  9 + use rand::Rng;
 10 + use rand::SeedableRng;
 11   use rand::rngs::StdRng;
 12
 13   /// Specification for confidence level in the CVM algorithm
 14   #[derive(Debug, Clone, Copy)]
···
228               buf_size: bufsize,
229               buf: Treap::new(),
230               probability: 1.0,
231 +             rng: StdRng::from_os_rng(),
232           }
233       }
234       /// Add an element, potentially updating the unique element count
···
241           if self.buf.contains(&elem) {
242               self.buf.remove(&elem);
243           }
244 +         if self.rng.random_bool(self.probability) {
245               self.buf.insert(elem, &mut self.rng);
246           }
247           while self.buf.len() == self.buf_size {
···
253       fn clear_about_half(&mut self) {
254           // Need to capture rng reference to use in closure
255           let rng = &mut self.rng;
256 +         self.buf.retain(|_| rng.random_bool(0.5));
257       }
258       /// Process an entire iterator of owned values and return the final estimate
259       ///
+1 -1
src/treap.rs
···
84
85       /// Insert a key with a random priority
86       pub fn insert<R: Rng>(&mut self, key: T, rng: &mut R) {
87 -         let priority = rng.r#gen();
88           self.root = Self::insert_node(self.root.take(), key, priority);
89           self.size += 1;
90       }
···
84
85       /// Insert a key with a random priority
86       pub fn insert<R: Rng>(&mut self, key: T, rng: &mut R) {
87 +         let priority = rng.random();
88           self.root = Self::insert_node(self.root.take(), key, priority);
89           self.size += 1;
90       }