src/main.rs at main · urschrei.bsky.social/cvmcount

Rust implementation of the CVM algorithm for counting distinct elements in a stream
cvmcount / src / main.rs
at main 2.9 kB view raw
 1use clap::{Command, arg, crate_version, value_parser};
 2use regex::Regex;
 3use std::fs::File;
 4use std::io::BufRead;
 5use std::io::BufReader;
 6use std::path::Path;
 7use std::path::PathBuf;
 8
 9use cvmcount::CVM;
10
11fn open_file<P>(filename: P) -> BufReader<File>
12where
13    P: AsRef<Path>,
14{
15    let f = File::open(filename).expect("Couldn't read from file");
16    BufReader::new(f)
17}
18
19fn line_to_word(re: &Regex, cvm: &mut CVM<String>, line: &str) {
20    let words = line.split(' ');
21    words.for_each(|word| {
22        let clean_word = re.replace_all(word, "").to_lowercase();
23        cvm.process_element(clean_word)
24    })
25}
26
27fn main() {
28    // Generate a CLI, and get input filename to process
29    let params = Command::new("CVM")
30        .version(crate_version!())
31        .author("Stephan Hügel <urschrei@gmail.com>")
32        .about("Use the CVM algorithm to estimate the number of unique tokens in a stream")
33        .arg(arg!(-t --tokens <FILE> "A text file containing words")
34            .required(true)
35            .value_parser(value_parser!(PathBuf)))
36        .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications")
37            .required(true)
38            .value_parser(clap::value_parser!(f64))
39            )
40        .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications")
41            .required(true)
42            .value_parser(value_parser!(f64)))
43        .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, the more accurate the results")
44            .required(true)
45            .value_parser(value_parser!(usize)))
46        .get_matches();
47
48    let input_file = params.get_one::<PathBuf>("tokens").unwrap();
49    let epsilon = params.get_one::<f64>("epsilon").unwrap();
50    let delta = params.get_one::<f64>("delta").unwrap();
51    let stream_size = params.get_one::<usize>("streamsize").unwrap();
52    let mut counter: CVM<String> = CVM::new(*epsilon, *delta, *stream_size);
53    let re = Regex::new(r"[^\w\s]").unwrap();
54
55    let br = open_file(input_file);
56    br.lines()
57        .for_each(|line| line_to_word(&re, &mut counter, &line.unwrap()));
58    println!(
59        "Unique tokens: {:?}",
60        counter.calculate_final_result() as i32
61    );
62}