Rust implementation of the CVM algorithm for counting distinct elements in a stream
at main 2.9 kB view raw
1use clap::{Command, arg, crate_version, value_parser}; 2use regex::Regex; 3use std::fs::File; 4use std::io::BufRead; 5use std::io::BufReader; 6use std::path::Path; 7use std::path::PathBuf; 8 9use cvmcount::CVM; 10 11fn open_file<P>(filename: P) -> BufReader<File> 12where 13 P: AsRef<Path>, 14{ 15 let f = File::open(filename).expect("Couldn't read from file"); 16 BufReader::new(f) 17} 18 19fn line_to_word(re: &Regex, cvm: &mut CVM<String>, line: &str) { 20 let words = line.split(' '); 21 words.for_each(|word| { 22 let clean_word = re.replace_all(word, "").to_lowercase(); 23 cvm.process_element(clean_word) 24 }) 25} 26 27fn main() { 28 // Generate a CLI, and get input filename to process 29 let params = Command::new("CVM") 30 .version(crate_version!()) 31 .author("Stephan Hügel <urschrei@gmail.com>") 32 .about("Use the CVM algorithm to estimate the number of unique tokens in a stream") 33 .arg(arg!(-t --tokens <FILE> "A text file containing words") 34 .required(true) 35 .value_parser(value_parser!(PathBuf))) 36 .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications") 37 .required(true) 38 .value_parser(clap::value_parser!(f64)) 39 ) 40 .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications") 41 .required(true) 42 .value_parser(value_parser!(f64))) 43 .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, the more accurate the results") 44 .required(true) 45 .value_parser(value_parser!(usize))) 46 .get_matches(); 47 48 let input_file = params.get_one::<PathBuf>("tokens").unwrap(); 49 let epsilon = params.get_one::<f64>("epsilon").unwrap(); 50 let delta = params.get_one::<f64>("delta").unwrap(); 51 let stream_size = params.get_one::<usize>("streamsize").unwrap(); 52 let mut counter: CVM<String> = CVM::new(*epsilon, *delta, *stream_size); 53 let re = Regex::new(r"[^\w\s]").unwrap(); 54 55 let br = open_file(input_file); 56 br.lines() 57 .for_each(|line| line_to_word(&re, &mut counter, &line.unwrap())); 58 println!( 59 "Unique tokens: {:?}", 60 counter.calculate_final_result() as i32 61 ); 62}