//! Tokenizer and language detection for the code complexity & repetition
//! analysis tool.

use crate::error::{MccabreError, Result};
use std::path::Path;

/// Source languages supported by the analyzer
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    Rust,
    JavaScript,
    TypeScript,
    Go,
    Java,
    Cpp,
}

impl Language {
    /// Detect language from file extension
    pub fn from_path(path: &Path) -> Result<Self> {
        let extension = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| MccabreError::UnsupportedFileType(path.to_string_lossy().to_string()))?;

        match extension {
            "rs" => Ok(Language::Rust),
            "js" | "jsx" | "mjs" | "cjs" => Ok(Language::JavaScript),
            "ts" | "tsx" => Ok(Language::TypeScript),
            "go" => Ok(Language::Go),
            "java" => Ok(Language::Java),
            "cpp" | "cc" | "cxx" | "c++" | "hpp" | "h" | "hh" | "hxx" => Ok(Language::Cpp),
            _ => Err(MccabreError::UnsupportedFileType(extension.to_string())),
        }
    }

    /// Get single-line comment prefix
    pub fn single_line_comment(&self) -> &'static str {
        match self {
            Language::Rust
            | Language::JavaScript
            | Language::TypeScript
            | Language::Go
            | Language::Java
            | Language::Cpp => "//",
        }
    }

    /// Get multi-line comment delimiters (start, end)
    pub fn multi_line_comment(&self) -> (&'static str, &'static str) {
        match self {
            Language::Rust
            | Language::JavaScript
            | Language::TypeScript
            | Language::Go
            | Language::Java
            | Language::Cpp => ("/*", "*/"),
        }
    }
}

/// Token categories produced by the tokenizer
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenType {
    // Control-flow keywords
    If,
    Else,
    ElseIf,
    While,
    For,
    Loop,
    Match,
    Switch,
    Case,
    Default,
    Catch,

    // Short-circuit and conditional operators
    LogicalAnd,
    LogicalOr,
    Ternary,

    // Any other run of operator characters
    Operator(String),

    // Names and values
    Identifier(String),
    Literal(String),

    // Punctuation
    LeftBrace,
    RightBrace,
    LeftParen,
    RightParen,
    LeftBracket,
    RightBracket,
    Semicolon,
    Comma,

    // Trivia (excluded from clone detection) and a catch-all for anything
    // the tokenizer does not recognize
    Comment,
    Whitespace,
    Newline,
    Unknown(char),
}

impl TokenType {
    /// Returns true if this token contributes to cyclomatic complexity
    pub fn is_decision_point(&self) -> bool {
        matches!(
            self,
            TokenType::If
                | TokenType::ElseIf
                | TokenType::While
                | TokenType::For
                | TokenType::Loop
                | TokenType::Match
                | TokenType::Switch
                | TokenType::Case
                | TokenType::Catch
                | TokenType::LogicalAnd
                | TokenType::LogicalOr
                | TokenType::Ternary
        )
    }

    /// Returns true if this token should be included in clone detection
    pub fn is_significant(&self) -> bool {
        !matches!(self, TokenType::Comment | TokenType::Whitespace | TokenType::Newline)
    }
}

/// A single lexical token with its source position
#[derive(Debug, Clone)]
pub struct Token {
    pub token_type: TokenType,
    pub line: usize,
    pub column: usize,
    pub text: String,
}

/// A lightweight tokenizer for C-family syntax, shared across all
/// supported languages
pub struct Tokenizer {
    source: Vec<char>,
    position: usize,
    line: usize,
    column: usize,
    // Reserved for language-specific lexing rules; not consulted yet
    _language: Language,
}

impl Tokenizer {
    /// Create a tokenizer over the given source text
    pub fn new(source: &str, language: Language) -> Self {
        Self { source: source.chars().collect(), position: 0, line: 1, column: 1, _language: language }
    }

    /// Consume the entire source and return the token stream
    pub fn tokenize(mut self) -> Result<Vec<Token>> {
        let mut tokens = Vec::new();

        while !self.is_at_end() {
            if let Some(token) = self.next_token()? {
                tokens.push(token);
            }
        }

        Ok(tokens)
    }

    fn next_token(&mut self) -> Result<Option<Token>> {
        let start_line = self.line;
        let start_column = self.column;
        let start_pos = self.position;
        let ch = self.current()?;

        // Whitespace: newlines become their own tokens so line-oriented
        // metrics stay cheap; any other run of whitespace collapses into a
        // single-space token.
        if ch.is_whitespace() {
            if ch == '\n' {
                self.advance();
                return Ok(Some(Token {
                    token_type: TokenType::Newline,
                    line: start_line,
                    column: start_column,
                    text: "\n".to_string(),
                }));
            } else {
                while !self.is_at_end() && self.current()?.is_whitespace() && self.current()? != '\n' {
                    self.advance();
                }
                return Ok(Some(Token {
                    token_type: TokenType::Whitespace,
                    line: start_line,
                    column: start_column,
                    text: " ".to_string(),
                }));
            }
        }

        // Comments: `//` runs to end of line; `/* ... */` may span lines and
        // is tolerated even when the closing `*/` is missing at end of input.
        if ch == '/' {
            if self.peek() == Some('/') {
                while !self.is_at_end() && self.current()? != '\n' {
                    self.advance();
                }
                return Ok(Some(Token {
                    token_type: TokenType::Comment,
                    line: start_line,
                    column: start_column,
                    text: "//".to_string(),
                }));
            } else if self.peek() == Some('*') {
                self.advance();
                self.advance();
                while !self.is_at_end() {
                    if self.current()? == '*' && self.peek() == Some('/') {
                        self.advance();
                        self.advance();
                        break;
                    }
                    self.advance();
                }
                return Ok(Some(Token {
                    token_type: TokenType::Comment,
                    line: start_line,
                    column: start_column,
                    text: "/**/".to_string(),
                }));
            }
        }

        // String and character literals, honoring backslash escapes. Note
        // that a bare `'` (e.g. a Rust lifetime) is lexed as if it opened a
        // char literal, which can over-consume input.
        if ch == '"' || ch == '\'' {
            let quote = ch;
            self.advance();
            while !self.is_at_end() && self.current()? != quote {
                if self.current()? == '\\' {
                    // Skip the backslash and whatever character it escapes.
                    self.advance();
                    if !self.is_at_end() {
                        self.advance();
                    }
                } else {
                    self.advance();
                }
            }
            // Consume the closing quote, if present.
            if !self.is_at_end() {
                self.advance();
            }
            let text: String = self.source[start_pos..self.position].iter().collect();
            return Ok(Some(Token {
                token_type: TokenType::Literal(text.clone()),
                line: start_line,
                column: start_column,
                text,
            }));
        }

        // Numeric literals: digits plus `.`, `_`, and alphanumerics, which
        // also covers hex (0xff), separators (1_000), and suffixes (10u32).
        if ch.is_ascii_digit() {
            while !self.is_at_end()
                && (self.current()?.is_ascii_alphanumeric() || self.current()? == '.' || self.current()? == '_')
            {
                self.advance();
            }
            let text: String = self.source[start_pos..self.position].iter().collect();
            return Ok(Some(Token {
                token_type: TokenType::Literal(text.clone()),
                line: start_line,
                column: start_column,
                text,
            }));
        }

        // Identifiers and keywords.
        if ch.is_alphabetic() || ch == '_' {
            while !self.is_at_end() && (self.current()?.is_alphanumeric() || self.current()? == '_') {
                self.advance();
            }
            let text: String = self.source[start_pos..self.position].iter().collect();
            let token_type = self.classify_keyword(&text);
            return Ok(Some(Token { token_type, line: start_line, column: start_column, text }));
        }

        // Punctuation, short-circuit operators, and generic operator runs.
        let token_type = match ch {
            '{' => {
                self.advance();
                TokenType::LeftBrace
            }
            '}' => {
                self.advance();
                TokenType::RightBrace
            }
            '(' => {
                self.advance();
                TokenType::LeftParen
            }
            ')' => {
                self.advance();
                TokenType::RightParen
            }
            '[' => {
                self.advance();
                TokenType::LeftBracket
            }
            ']' => {
                self.advance();
                TokenType::RightBracket
            }
            ';' => {
                self.advance();
                TokenType::Semicolon
            }
            ',' => {
                self.advance();
                TokenType::Comma
            }
            // NOTE: `?` is always treated as a ternary, so Rust's `?` error
            // propagation is also counted as a decision point.
            '?' => {
                self.advance();
                TokenType::Ternary
            }
            '&' if self.peek() == Some('&') => {
                self.advance();
                self.advance();
                TokenType::LogicalAnd
            }
            '|' if self.peek() == Some('|') => {
                self.advance();
                self.advance();
                TokenType::LogicalOr
            }
            _ => {
                // Greedily coalesce runs of operator characters (e.g. `+=`, `<<=`).
                let op_chars = "+-*/%=<>!&|^~";
                if op_chars.contains(ch) {
                    while !self.is_at_end() && op_chars.contains(self.current()?) {
                        self.advance();
                    }
                    let text: String = self.source[start_pos..self.position].iter().collect();
                    TokenType::Operator(text)
                } else {
                    self.advance();
                    TokenType::Unknown(ch)
                }
            }
        };

        let text: String = self.source[start_pos..self.position].iter().collect();
        Ok(Some(Token { token_type, line: start_line, column: start_column, text }))
    }

    fn classify_keyword(&self, word: &str) -> TokenType {
        match word {
            "if" => TokenType::If,
            "else" => TokenType::Else,
            // `elif` is not a keyword in any supported language (`else if`
            // lexes as two tokens), so this arm only fires on identifiers
            // literally named `elif`.
            "elif" => TokenType::ElseIf,
            "while" => TokenType::While,
            "for" => TokenType::For,
            "loop" => TokenType::Loop,
            "match" => TokenType::Match,
            "switch" => TokenType::Switch,
            "case" => TokenType::Case,
            "default" => TokenType::Default,
            "catch" => TokenType::Catch,
            _ => TokenType::Identifier(word.to_string()),
        }
    }

    /// The character at the current position, or an error at end of input
    fn current(&self) -> Result<char> {
        self.source
            .get(self.position)
            .copied()
            .ok_or_else(|| MccabreError::TokenizationError("Unexpected end of input".to_string()))
    }

    /// The character after the current position, if any
    fn peek(&self) -> Option<char> {
        self.source.get(self.position + 1).copied()
    }

    /// Move past the current character, updating line/column bookkeeping
    fn advance(&mut self) {
        if let Some(ch) = self.source.get(self.position) {
            if *ch == '\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
            self.position += 1;
        }
    }

    fn is_at_end(&self) -> bool {
        self.position >= self.source.len()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_language_detection() {
        assert_eq!(Language::from_path(Path::new("test.rs")).unwrap(), Language::Rust);
        assert_eq!(Language::from_path(Path::new("test.js")).unwrap(), Language::JavaScript);
        assert_eq!(Language::from_path(Path::new("test.ts")).unwrap(), Language::TypeScript);
        assert_eq!(Language::from_path(Path::new("test.go")).unwrap(), Language::Go);
        assert_eq!(Language::from_path(Path::new("test.java")).unwrap(), Language::Java);
        assert_eq!(Language::from_path(Path::new("test.cpp")).unwrap(), Language::Cpp);
    }
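
    // Added sketch: `from_path` should reject extensions it does not know,
    // as well as paths with no extension at all. Only `is_err()` is checked
    // here, since the exact error variant is an implementation detail.
    #[test]
    fn test_language_detection_rejects_unknown() {
        assert!(Language::from_path(Path::new("test.py")).is_err());
        assert!(Language::from_path(Path::new("Makefile")).is_err());
    }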

    #[test]
    fn test_tokenize_simple() {
        let source = "if (x > 5) { return true; }";
        let tokenizer = Tokenizer::new(source, Language::Rust);
        let tokens = tokenizer.tokenize().unwrap();

        let significant: Vec<_> = tokens.iter().filter(|t| t.token_type.is_significant()).collect();

        assert!(!significant.is_empty());
        assert!(tokens.iter().any(|t| matches!(t.token_type, TokenType::If)));
    }
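
    // Added sketch: the tokenizer tracks 1-based line/column positions, with
    // newlines emitted as their own tokens, so identifiers on consecutive
    // lines should report lines 1 and 2.
    #[test]
    fn test_line_tracking() {
        let tokens = Tokenizer::new("a\nb", Language::Rust).tokenize().unwrap();
        assert!(tokens
            .iter()
            .any(|t| matches!(t.token_type, TokenType::Identifier(_)) && t.line == 1 && t.column == 1));
        assert!(tokens
            .iter()
            .any(|t| matches!(t.token_type, TokenType::Identifier(_)) && t.line == 2 && t.column == 1));
    }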

    #[test]
    fn test_decision_points() {
        let source = "if (x && y || z) { while (true) { } }";
        let tokenizer = Tokenizer::new(source, Language::Rust);
        let tokens = tokenizer.tokenize().unwrap();
        let decision_count = tokens.iter().filter(|t| t.token_type.is_decision_point()).count();
        // `if`, `&&`, `||`, and `while` each count once.
        assert_eq!(decision_count, 4);
    }
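
    // Added sketch: under the classic McCabe formula, cyclomatic complexity
    // is one more than the number of decision points, so straight-line code
    // with no branches should score 1.
    #[test]
    fn test_straight_line_complexity() {
        let tokens = Tokenizer::new("let x = 1; let y = x;", Language::Rust).tokenize().unwrap();
        let decisions = tokens.iter().filter(|t| t.token_type.is_decision_point()).count();
        assert_eq!(decisions + 1, 1);
    }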

    #[test]
    fn test_comments() {
        let source = r#"
// Single line comment
/* Multi-line
   comment */
let x = 5;
"#;
        let tokenizer = Tokenizer::new(source, Language::Rust);
        let tokens = tokenizer.tokenize().unwrap();

        let comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.token_type, TokenType::Comment))
            .collect();

        assert_eq!(comments.len(), 2);
    }
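
    // Added sketch: an unterminated block comment is tolerated rather than
    // reported as an error; the comment token simply runs to end of input.
    #[test]
    fn test_unterminated_block_comment() {
        let tokens = Tokenizer::new("/* never closed", Language::Rust).tokenize().unwrap();
        assert!(tokens.iter().any(|t| matches!(t.token_type, TokenType::Comment)));
    }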

    #[test]
    fn test_strings() {
        let source = r#"let s = "hello \"world\""; let c = 'x';"#;
        let tokenizer = Tokenizer::new(source, Language::Rust);
        let tokens = tokenizer.tokenize().unwrap();

        let literals: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t.token_type, TokenType::Literal(_)))
            .collect();

        assert!(literals.len() >= 2);
    }
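
    // Added sketch: runs of operator characters coalesce greedily, so `+=`
    // should arrive as a single Operator token rather than `+` then `=`.
    #[test]
    fn test_operator_coalescing() {
        let tokens = Tokenizer::new("a += b", Language::Rust).tokenize().unwrap();
        assert!(tokens
            .iter()
            .any(|t| matches!(&t.token_type, TokenType::Operator(op) if op == "+=")));
    }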
}