code complexity & repetition analysis tool

clone detector (rabin-karp)

+420
+260
crates/core/src/cloner/detector.rs
use crate::Result;
use crate::cloner::rolling_hash::{RollingHash, token_hash};
use crate::tokenizer::{Language, Token, Tokenizer};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;

/// A detected code clone
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Clone {
    /// Unique ID for this clone group
    pub id: usize,
    /// Number of tokens in the cloned sequence
    pub length: usize,
    /// All locations where this clone appears
    pub locations: Vec<CloneLocation>,
    /// Hash value of the clone (for deduplication)
    #[serde(skip)]
    pub hash: u64,
}

/// Location of a code clone
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct CloneLocation {
    /// File path
    pub file: PathBuf,
    /// Starting line number
    pub start_line: usize,
    /// Ending line number
    pub end_line: usize,
}

pub struct CloneDetector {
    /// Minimum number of tokens to consider as a clone
    _min_tokens: usize,
    /// Window size for rolling hash
    window_size: usize,
}

impl Default for CloneDetector {
    fn default() -> Self {
        Self { _min_tokens: 30, window_size: 30 }
    }
}

impl CloneDetector {
    pub fn new(min_tokens: usize) -> Self {
        Self { _min_tokens: min_tokens, window_size: min_tokens }
    }

    /// Detect clones in a single file
    pub fn detect_in_file(&self, source: &str, language: Language, file_path: PathBuf) -> Result<Vec<Clone>> {
        let tokens = Tokenizer::new(source, language).tokenize()?;
        let significant_tokens: Vec<&Token> =
            tokens.iter().filter(|t| t.token_type.is_significant()).collect();

        if significant_tokens.len() < self.window_size {
            return Ok(Vec::new());
        }

        let mut hash_map: HashMap<u64, Vec<(usize, usize)>> = HashMap::new();
        let mut rh = RollingHash::new(self.window_size);

        let token_hashes: Vec<u64> = significant_tokens.iter().map(|t| token_hash(&t.text)).collect();

        rh.init(&token_hashes[0..self.window_size]);
        let start_line = significant_tokens[0].line;
        let end_line = significant_tokens[self.window_size - 1].line;
        hash_map.entry(rh.get()).or_default().push((start_line, end_line));

        for i in self.window_size..token_hashes.len() {
            let hash = rh.roll(token_hashes[i - self.window_size], token_hashes[i]);
            let start_line = significant_tokens[i - self.window_size + 1].line;
            let end_line = significant_tokens[i].line;
            hash_map.entry(hash).or_default().push((start_line, end_line));
        }

        let mut clones = Vec::new();
        let mut clone_id = 0;

        for (hash, locations) in hash_map {
            if locations.len() > 1 {
                clone_id += 1;
                clones.push(Clone {
                    id: clone_id,
                    length: self.window_size,
                    locations: locations
                        .into_iter()
                        .map(|(start, end)| CloneLocation { file: file_path.clone(), start_line: start, end_line: end })
                        .collect(),
                    hash,
                });
            }
        }

        Ok(clones)
    }

    /// Detect clones across multiple files
    pub fn detect_across_files(&self, files: &[(PathBuf, String, Language)]) -> Result<Vec<Clone>> {
        let mut global_hash_map: HashMap<u64, Vec<CloneLocation>> = HashMap::new();

        for (file_path, source, language) in files {
            let tokens = Tokenizer::new(source, *language).tokenize()?;
            let significant_tokens: Vec<&Token> =
                tokens.iter().filter(|t| t.token_type.is_significant()).collect();

            if significant_tokens.len() < self.window_size {
                continue;
            }

            let mut rh = RollingHash::new(self.window_size);

            let token_hashes: Vec<u64> = significant_tokens.iter().map(|t| token_hash(&t.text)).collect();

            rh.init(&token_hashes[0..self.window_size]);
            let start_line = significant_tokens[0].line;
            let end_line = significant_tokens[self.window_size - 1].line;
            global_hash_map.entry(rh.get()).or_default().push(CloneLocation {
                file: file_path.clone(),
                start_line,
                end_line,
            });

            for i in self.window_size..token_hashes.len() {
                let hash = rh.roll(token_hashes[i - self.window_size], token_hashes[i]);
                let start_line = significant_tokens[i - self.window_size + 1].line;
                let end_line = significant_tokens[i].line;
                global_hash_map.entry(hash).or_default().push(CloneLocation {
                    file: file_path.clone(),
                    start_line,
                    end_line,
                });
            }
        }

        let mut clones = Vec::new();
        let mut clone_id = 0;

        for (hash, mut locations) in global_hash_map {
            if locations.len() > 1 {
                locations.sort_by(|a, b| {
                    a.file
                        .cmp(&b.file)
                        .then(a.start_line.cmp(&b.start_line))
                        .then(a.end_line.cmp(&b.end_line))
                });
                locations.dedup();

                if locations.len() > 1 {
                    clone_id += 1;
                    clones.push(Clone { id: clone_id, length: self.window_size, locations, hash });
                }
            }
        }

        clones.sort_by(|a, b| b.locations.len().cmp(&a.locations.len()));
        Ok(clones)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_no_clones_in_simple_file() {
        let source = r#"
            fn simple() {
                let x = 5;
                let y = 10;
                return x + y;
            }
        "#;
        let detector = CloneDetector::new(10);
        let clones = detector
            .detect_in_file(source, Language::Rust, PathBuf::from("test.rs"))
            .unwrap();

        assert_eq!(clones.len(), 0);
    }

    #[test]
    fn test_detect_simple_clone() {
        let source = r#"
            fn process_a() {
                let x = input.get();
                let y = x * 2;
                let z = y + 5;
                return z;
            }

            fn process_b() {
                let x = input.get();
                let y = x * 2;
                let z = y + 5;
                return z;
            }
        "#;
        let detector = CloneDetector::new(5);
        let clones = detector
            .detect_in_file(source, Language::Rust, PathBuf::from("test.rs"))
            .unwrap();

        assert!(!clones.is_empty());

        for clone in &clones {
            assert!(clone.locations.len() >= 2);
        }
    }

    #[test]
    fn test_across_files() {
        let file1 = r#"
            fn helper() {
                for i in 0..10 {
                    println!("{}", i);
                }
            }
        "#;
        let file2 = r#"
            fn another() {
                for i in 0..10 {
                    println!("{}", i);
                }
            }
        "#;

        let files = vec![
            (PathBuf::from("file1.rs"), file1.to_string(), Language::Rust),
            (PathBuf::from("file2.rs"), file2.to_string(), Language::Rust),
        ];

        let detector = CloneDetector::new(5);
        let clones = detector.detect_across_files(&files).unwrap();

        if !clones.is_empty() {
            let has_cross_file = clones.iter().any(|clone| {
                let files: std::collections::HashSet<_> = clone.locations.iter().map(|l| &l.file).collect();
                files.len() > 1
            });
            assert!(has_cross_file, "Should detect clones across different files");
        }
    }

    #[test]
    fn test_min_tokens_threshold() {
        let source = "let x = 5; let y = 10; let x = 5; let y = 10;";

        let detector1 = CloneDetector::new(3);
        let clones1 = detector1
            .detect_in_file(source, Language::Rust, PathBuf::from("test.rs"))
            .unwrap();

        let detector2 = CloneDetector::new(100);
        let clones2 = detector2
            .detect_in_file(source, Language::Rust, PathBuf::from("test.rs"))
            .unwrap();

        assert!(clones2.len() <= clones1.len());
    }
}
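For orientation, here is a minimal sketch of how calling code inside the crate might drive the detector. The `report_clones` helper is hypothetical (it is not part of this diff), and it assumes `crate::Result` is a single-parameter alias as its use above suggests; everything else it touches — `CloneDetector::new`, `detect_across_files`, and the `Clone`/`CloneLocation` fields — is defined in the file above.

use std::path::PathBuf;

use crate::Result;
use crate::cloner::CloneDetector;
use crate::tokenizer::Language;

/// Hypothetical helper (not in this diff): scan a set of in-memory sources
/// and print every clone group, most-duplicated first (detect_across_files
/// sorts groups by occurrence count).
fn report_clones(sources: &[(PathBuf, String, Language)], min_tokens: usize) -> Result<()> {
    let detector = CloneDetector::new(min_tokens);
    for clone in detector.detect_across_files(sources)? {
        println!("clone #{}: {} tokens, {} occurrences", clone.id, clone.length, clone.locations.len());
        for loc in &clone.locations {
            println!("  {}:{}-{}", loc.file.display(), loc.start_line, loc.end_line);
        }
    }
    Ok(())
}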
+5
crates/core/src/cloner/mod.rs
pub mod detector;
pub mod rolling_hash;

pub use detector::{Clone, CloneDetector, CloneLocation};
pub use rolling_hash::RollingHash;
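These re-exports let the rest of the crate import from `cloner` directly rather than naming the submodules, e.g. at a hypothetical call site:

// Instead of crate::cloner::detector::CloneDetector etc.:
use crate::cloner::{Clone, CloneDetector, CloneLocation, RollingHash};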
+155
crates/core/src/cloner/rolling_hash.rs
/// Rabin-Karp rolling hash implementation for efficient string/token matching
///
/// This uses polynomial rolling hash with a large prime modulus to minimize collisions
/// while allowing O(1) hash updates when the window slides.
pub struct RollingHash {
    /// Base for polynomial hash (using 257 for byte values + 1)
    base: u64,
    /// Large prime modulus to reduce collisions
    modulus: u64,
    /// Current hash value
    hash: u64,
    /// Window size
    window_size: usize,
    /// Precomputed base^(window_size-1) mod modulus for removing leftmost element
    base_power: u64,
}

impl RollingHash {
    /// Create a new rolling hash with the specified window size
    pub fn new(window_size: usize) -> Self {
        let base = 257u64;
        let modulus = 1_000_000_007u64;

        let mut base_power = 1u64;
        for _ in 0..window_size.saturating_sub(1) {
            base_power = Self::mul_mod(base_power, base, modulus);
        }

        Self { base, modulus, hash: 0, window_size, base_power }
    }

    /// Initialize hash with a sequence of values
    pub fn init(&mut self, values: &[u64]) {
        self.hash = 0;
        for &val in values.iter().take(self.window_size) {
            self.hash = Self::mul_mod(self.hash, self.base, self.modulus);
            self.hash = Self::add_mod(self.hash, val, self.modulus);
        }
    }

    /// Roll the window: remove the leftmost value and add a new rightmost value
    /// Returns the new hash value
    pub fn roll(&mut self, old_value: u64, new_value: u64) -> u64 {
        let old_contrib = Self::mul_mod(old_value, self.base_power, self.modulus);
        self.hash = Self::sub_mod(self.hash, old_contrib, self.modulus);

        self.hash = Self::mul_mod(self.hash, self.base, self.modulus);
        self.hash = Self::add_mod(self.hash, new_value, self.modulus);

        self.hash
    }

    /// Get current hash value
    pub fn get(&self) -> u64 {
        self.hash
    }

    /// Multiply with modular arithmetic to prevent overflow
    fn mul_mod(a: u64, b: u64, modulus: u64) -> u64 {
        ((a as u128 * b as u128) % modulus as u128) as u64
    }

    /// Add with modular arithmetic
    fn add_mod(a: u64, b: u64, modulus: u64) -> u64 {
        ((a as u128 + b as u128) % modulus as u128) as u64
    }

    /// Subtract with modular arithmetic (handles underflow)
    fn sub_mod(a: u64, b: u64, modulus: u64) -> u64 {
        if a >= b { (a - b) % modulus } else { (modulus - (b - a) % modulus) % modulus }
    }
}

/// Compute a hash value for a token string (for use in rolling hash)
pub fn token_hash(token: &str) -> u64 {
    let mut hash = 5381u64;
    for byte in token.bytes() {
        hash = hash.wrapping_mul(33).wrapping_add(byte as u64);
    }
    hash
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_rolling_hash_basic() {
        let window_size = 3;
        let mut rh = RollingHash::new(window_size);

        let values = [1, 2, 3, 4, 5];
        rh.init(&values[0..3]);

        let hash1 = rh.get();
        assert_ne!(hash1, 0);

        let hash2 = rh.roll(1, 4);
        assert_ne!(hash2, 0);
        assert_ne!(hash1, hash2);

        let hash3 = rh.roll(2, 5);
        assert_ne!(hash3, 0);
        assert_ne!(hash2, hash3);
    }

    #[test]
    fn test_same_sequence_same_hash() {
        let mut rh1 = RollingHash::new(4);
        let mut rh2 = RollingHash::new(4);

        let seq = vec![10, 20, 30, 40];
        rh1.init(&seq);
        rh2.init(&seq);

        assert_eq!(rh1.get(), rh2.get());
    }

    #[test]
    fn test_different_sequences_different_hash() {
        let mut rh1 = RollingHash::new(4);
        let mut rh2 = RollingHash::new(4);

        rh1.init(&[1, 2, 3, 4]);
        rh2.init(&[1, 2, 3, 5]);

        assert_ne!(rh1.get(), rh2.get());
    }

    #[test]
    fn test_token_hash() {
        let h1 = token_hash("if");
        let h2 = token_hash("else");
        let h3 = token_hash("if");

        assert_ne!(h1, h2);
        assert_eq!(h1, h3);
    }

    #[test]
    fn test_rolling_preserves_pattern() {
        let mut rh = RollingHash::new(3);
        let values = vec![1, 2, 3, 4, 5, 6, 1, 2, 3];

        rh.init(&values[0..3]);
        let first_hash = rh.get();

        let mut current_hash = first_hash;
        for i in 3..9 {
            current_hash = rh.roll(values[i - 3], values[i]);
        }

        assert_eq!(current_hash, first_hash);
    }
}
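As a worked check of the arithmetic: for a window (t1, ..., tw) the hash is H = (t1*b^(w-1) + t2*b^(w-2) + ... + tw) mod M with b = 257 and M = 1_000_000_007. Rolling off t1 subtracts its contribution t1*b^(w-1), the multiply by b reweights the survivors, and the new token enters with weight b^0. One more test this suggests (not in the diff, just a sketch): rolling must agree with re-initializing directly on the new window.

#[test]
fn test_roll_matches_reinit() {
    // Rolling [1,2,3] -> [2,3,4] must give the same hash as hashing
    // [2,3,4] from scratch. With b = 257:
    //   H([1,2,3]) = 1*257^2 + 2*257 + 3 = 66566
    //   roll(1, 4) = (66566 - 1*257^2) * 257 + 4 = 132873 = H([2,3,4])
    let mut rolled = RollingHash::new(3);
    rolled.init(&[1, 2, 3]);
    rolled.roll(1, 4);

    let mut direct = RollingHash::new(3);
    direct.init(&[2, 3, 4]);

    assert_eq!(rolled.get(), direct.get());
}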