rust-based ai-native terminal for cloud infrastructure operations, with an integrated agentic engine for DevOps assistance www.infraware.dev
ai llm dev rust cloudops opensource devops cloud os
at main 620 lines 18 kB view raw
1//! Input classification: Command vs Natural Language 2//! 3//! This module provides a simple classifier to determine if user input 4//! should be sent to the shell or to the LLM backend. 5//! 6//! The classification uses several heuristics: 7//! 1. Question marks (?, ¿) indicate natural language 8//! 2. Non-ASCII characters suggest non-English queries 9//! 3. Long phrases without shell operators are likely natural language 10//! 4. Question words (how, what, why, etc.) indicate queries 11//! 5. Shell syntax (pipes, flags, paths) indicates commands 12 13use once_cell::sync::Lazy; 14use regex::RegexSet; 15 16/// Represents the type of user input 17#[derive(Debug, Clone, PartialEq, Eq)] 18pub enum InputType { 19 /// A shell command to be executed 20 Command(String), 21 /// Natural language query for the LLM 22 NaturalLanguage(String), 23 /// Empty input 24 Empty, 25} 26 27/// Precompiled regex patterns for efficient classification 28struct Patterns { 29 /// Patterns indicating natural language 30 natural_language: RegexSet, 31 /// Patterns indicating shell command syntax 32 command_syntax: RegexSet, 33 /// Shell operators 34 shell_operators: RegexSet, 35} 36 37/// Global compiled patterns (initialized once) 38static PATTERNS: Lazy<Patterns> = Lazy::new(|| Patterns { 39 natural_language: RegexSet::new([ 40 r"[\?¿]", // Question marks (universal) 41 r"(?i)^(how|what|why|when|where|who|which)\s", // Question words 42 r"(?i)^(can you|could you|would you|will you)\s", // Request phrases 43 r"(?i)(please|help me|show me|explain)\s", // Polite phrases 44 r"(?i)\s(a|an|the)\s", // Articles (indicate prose) 45 ]) 46 .expect("Failed to compile natural_language patterns"), 47 48 command_syntax: RegexSet::new([ 49 r"^[a-zA-Z0-9_-]+\s+--?[a-zA-Z]", // Flags: cmd --flag, cmd -f 50 r"^\.{1,2}/", // Relative paths: ./, ../ 51 r"^/[a-zA-Z]", // Absolute paths: /usr/bin 52 r"^\$[A-Z_]", // Env var start: $HOME 53 r"^[a-z]+=$", // Env assignment: FOO= 54 ]) 55 .expect("Failed to compile command_syntax patterns"), 56 57 shell_operators: RegexSet::new([ 58 r"\|", // Pipe 59 r"&&|\|\|", // Logical operators 60 r"[<>]", // Redirects 61 r";", // Command separator 62 ]) 63 .expect("Failed to compile shell_operators patterns"), 64}); 65 66/// Returns `true` when `input` contains a `?` that is NOT part of the 67/// shell exit-code variable `$?`. 68fn has_real_question_mark(input: &str) -> bool { 69 let stripped = input.replace("$?", ""); 70 stripped.contains('?') 71} 72 73/// Simple input classifier 74/// 75/// Uses heuristics to determine if input is a command or natural language. 76/// Natural language queries are sent to the LLM, commands to the shell. 77#[derive(Debug, Default)] 78pub struct InputClassifier; 79 80impl InputClassifier { 81 /// Create a new classifier 82 pub fn new() -> Self { 83 Self 84 } 85 86 /// Classify user input as Command, NaturalLanguage, or Empty 87 pub fn classify(&self, input: &str) -> InputType { 88 let trimmed = input.trim(); 89 90 // Empty input 91 if trimmed.is_empty() { 92 return InputType::Empty; 93 } 94 95 // Check for explicit '?' prefix (user explicitly wants LLM) 96 if let Some(query) = trimmed.strip_prefix('?') { 97 let query = query.trim(); 98 if !query.is_empty() { 99 return InputType::NaturalLanguage(query.to_string()); 100 } 101 } 102 103 // Check for natural language indicators 104 if self.is_natural_language(trimmed) { 105 return InputType::NaturalLanguage(trimmed.to_string()); 106 } 107 108 // Default to command 109 InputType::Command(trimmed.to_string()) 110 } 111 112 /// Check if input is likely natural language. 113 /// 114 /// Uses layered heuristics: explicit question marks are the strongest NL 115 /// signal and override command detection. When the first word is a known 116 /// command, weaker signals (articles, phrase length) are ignored so that 117 /// inputs like `cat a file` or `sudo apt install a package` stay 118 /// classified as commands. 119 fn is_natural_language(&self, input: &str) -> bool { 120 let words: Vec<&str> = input.split_whitespace().collect(); 121 let first_word = words.first().copied().unwrap_or(""); 122 let starts_with_command = self.looks_like_command(first_word); 123 124 // 1. Explicit question marks → strong NL signal, overrides command detection. 125 // Ignore `$?` (shell exit-code variable) — it is not a real question mark. 126 if (has_real_question_mark(input) || input.contains('¿')) 127 && !PATTERNS.shell_operators.is_match(input) 128 { 129 return true; 130 } 131 132 // 2. If the first word is a known command, classify as command. 133 // Articles, question words and phrase length are weak signals that 134 // must not override an unambiguous command prefix. 135 if starts_with_command { 136 return false; 137 } 138 139 // 3. NL patterns (question words, request phrases, articles) → NL 140 // Only reached when the first word is NOT a known command. 141 if PATTERNS.natural_language.is_match(input) && !PATTERNS.shell_operators.is_match(input) { 142 return true; 143 } 144 145 // 4. Explicit command syntax → not NL 146 if PATTERNS.command_syntax.is_match(input) { 147 return false; 148 } 149 150 // 5. Contains shell operators → command 151 if PATTERNS.shell_operators.is_match(input) { 152 return false; 153 } 154 155 // 6. Contains non-ASCII characters → likely non-English NL 156 // (e.g., "chi sono io" in Italian, "什么是" in Chinese) 157 if !input.is_ascii() { 158 return true; 159 } 160 161 // 7. Long phrase (>5 words) without shell operators → likely NL 162 if words.len() > 5 { 163 return true; 164 } 165 166 // 8. Medium phrase (3-5 words) → likely NL 167 // (known-command-first inputs already returned at step 2) 168 if words.len() >= 3 { 169 return true; 170 } 171 172 false 173 } 174 175 /// Check if a word looks like a Unix command name 176 fn looks_like_command(&self, word: &str) -> bool { 177 // Commands are typically lowercase alphanumeric with optional hyphens/underscores 178 // Very short words (1-2 chars) are often commands (ls, cd, rm, cp, mv) 179 if word.len() <= 2 && word.chars().all(|c| c.is_ascii_lowercase()) { 180 return true; 181 } 182 183 // Known commands and tools (shell builtins, coreutils, package 184 // managers, container tooling, networking, etc.). Kept sorted 185 // alphabetically for easy maintenance. M-DOCUMENTED-MAGIC: this 186 // list biases the classifier toward "command" for 3+ word inputs 187 // that start with a recognized executable name. 188 const COMMON_COMMANDS: &[&str] = &[ 189 "adduser", 190 "alias", 191 "apk", 192 "apt", 193 "apt-get", 194 "ar", 195 "awk", 196 "base64", 197 "bat", 198 "bc", 199 "brew", 200 "cal", 201 "cargo", 202 "cat", 203 "cd", 204 "chmod", 205 "chown", 206 "chsh", 207 "clang", 208 "clear", 209 "cmake", 210 "code", 211 "cp", 212 "crontab", 213 "curl", 214 "cut", 215 "date", 216 "dd", 217 "deluser", 218 "df", 219 "diff", 220 "dmesg", 221 "dnf", 222 "docker", 223 "dpkg", 224 "du", 225 "echo", 226 "emacs", 227 "env", 228 "exa", 229 "exit", 230 "export", 231 "expr", 232 "false", 233 "fd", 234 "file", 235 "find", 236 "flatpak", 237 "free", 238 "fuser", 239 "fzf", 240 "gcc", 241 "git", 242 "go", 243 "grep", 244 "groupadd", 245 "groupdel", 246 "gunzip", 247 "gzip", 248 "head", 249 "help", 250 "history", 251 "hostname", 252 "hostnamectl", 253 "htop", 254 "htpasswd", 255 "ifconfig", 256 "info", 257 "insmod", 258 "ip", 259 "iptables", 260 "java", 261 "javac", 262 "journalctl", 263 "jq", 264 "kill", 265 "killall", 266 "kubectl", 267 "ldd", 268 "less", 269 "ln", 270 "locale", 271 "loginctl", 272 "ls", 273 "lsblk", 274 "lsmod", 275 "lsof", 276 "make", 277 "man", 278 "md5sum", 279 "mkdir", 280 "modprobe", 281 "more", 282 "mount", 283 "mv", 284 "nano", 285 "netstat", 286 "nft", 287 "nice", 288 "nm", 289 "nmap", 290 "node", 291 "nohup", 292 "npm", 293 "openssl", 294 "pacman", 295 "passwd", 296 "patch", 297 "pgrep", 298 "pip", 299 "pkill", 300 "podman", 301 "printenv", 302 "printf", 303 "ps", 304 "pwd", 305 "python", 306 "python3", 307 "readelf", 308 "renice", 309 "rg", 310 "rm", 311 "rmmod", 312 "route", 313 "rpm", 314 "rsync", 315 "rustc", 316 "scp", 317 "screen", 318 "sed", 319 "seq", 320 "service", 321 "sha256sum", 322 "sleep", 323 "snap", 324 "sort", 325 "source", 326 "ss", 327 "ssh", 328 "stat", 329 "strace", 330 "strings", 331 "strip", 332 "su", 333 "subl", 334 "sudo", 335 "sysctl", 336 "systemctl", 337 "tail", 338 "tar", 339 "tee", 340 "test", 341 "timedatectl", 342 "tmux", 343 "top", 344 "touch", 345 "tr", 346 "true", 347 "umount", 348 "uname", 349 "uniq", 350 "unzip", 351 "update-alternatives", 352 "useradd", 353 "userdel", 354 "vi", 355 "vim", 356 "watch", 357 "wc", 358 "wget", 359 "which", 360 "whereis", 361 "who", 362 "whoami", 363 "xargs", 364 "yarn", 365 "yes", 366 "yq", 367 "zip", 368 "zypper", 369 ]; 370 371 COMMON_COMMANDS.contains(&word.to_lowercase().as_str()) 372 } 373} 374 375#[cfg(test)] 376mod tests { 377 use super::*; 378 379 #[test] 380 fn test_empty_input() { 381 let classifier = InputClassifier::new(); 382 assert_eq!(classifier.classify(""), InputType::Empty); 383 assert_eq!(classifier.classify(" "), InputType::Empty); 384 } 385 386 #[test] 387 fn test_explicit_query_prefix() { 388 let classifier = InputClassifier::new(); 389 assert_eq!( 390 classifier.classify("? how do I list files"), 391 InputType::NaturalLanguage("how do I list files".to_string()) 392 ); 393 assert_eq!( 394 classifier.classify("?chi sono io"), 395 InputType::NaturalLanguage("chi sono io".to_string()) 396 ); 397 } 398 399 #[test] 400 fn test_question_marks() { 401 let classifier = InputClassifier::new(); 402 assert!(matches!( 403 classifier.classify("how do I list files?"), 404 InputType::NaturalLanguage(_) 405 )); 406 assert!(matches!( 407 classifier.classify("what is docker?"), 408 InputType::NaturalLanguage(_) 409 )); 410 } 411 412 #[test] 413 fn test_non_ascii() { 414 let classifier = InputClassifier::new(); 415 // Italian 416 assert!(matches!( 417 classifier.classify("chi sono io"), 418 InputType::NaturalLanguage(_) 419 )); 420 // Spanish 421 assert!(matches!( 422 classifier.classify("cómo listar archivos"), 423 InputType::NaturalLanguage(_) 424 )); 425 } 426 427 #[test] 428 fn test_commands() { 429 let classifier = InputClassifier::new(); 430 assert!(matches!( 431 classifier.classify("ls -la"), 432 InputType::Command(_) 433 )); 434 assert!(matches!( 435 classifier.classify("docker ps"), 436 InputType::Command(_) 437 )); 438 assert!(matches!( 439 classifier.classify("git status"), 440 InputType::Command(_) 441 )); 442 assert!(matches!( 443 classifier.classify("cat /etc/passwd"), 444 InputType::Command(_) 445 )); 446 } 447 448 #[test] 449 fn test_shell_operators() { 450 let classifier = InputClassifier::new(); 451 assert!(matches!( 452 classifier.classify("ls | grep foo"), 453 InputType::Command(_) 454 )); 455 assert!(matches!( 456 classifier.classify("cat file > output"), 457 InputType::Command(_) 458 )); 459 assert!(matches!( 460 classifier.classify("cmd1 && cmd2"), 461 InputType::Command(_) 462 )); 463 } 464 465 #[test] 466 fn test_long_phrases() { 467 let classifier = InputClassifier::new(); 468 assert!(matches!( 469 classifier.classify("show me all the docker containers running"), 470 InputType::NaturalLanguage(_) 471 )); 472 } 473 474 #[test] 475 fn test_question_words() { 476 let classifier = InputClassifier::new(); 477 assert!(matches!( 478 classifier.classify("how to list files"), 479 InputType::NaturalLanguage(_) 480 )); 481 assert!(matches!( 482 classifier.classify("what is kubernetes"), 483 InputType::NaturalLanguage(_) 484 )); 485 assert!(matches!( 486 classifier.classify("why is my container failing"), 487 InputType::NaturalLanguage(_) 488 )); 489 } 490 491 #[test] 492 fn test_polite_phrases() { 493 let classifier = InputClassifier::new(); 494 assert!(matches!( 495 classifier.classify("please help me"), 496 InputType::NaturalLanguage(_) 497 )); 498 assert!(matches!( 499 classifier.classify("can you explain docker"), 500 InputType::NaturalLanguage(_) 501 )); 502 } 503 504 #[test] 505 fn test_whoami_classification() { 506 let classifier = InputClassifier::new(); 507 // "whoami" is a known command 508 assert!(matches!( 509 classifier.classify("whoami"), 510 InputType::Command(_) 511 )); 512 // "chi sono io" is Italian (non-ASCII 'ì' not present, but space pattern) 513 // Actually "chi sono io" is all ASCII, but it's 3 words and "chi" is not a command 514 assert!(matches!( 515 classifier.classify("chi sono io"), 516 InputType::NaturalLanguage(_) 517 )); 518 } 519 520 #[test] 521 fn test_which_command_not_classified_as_nl() { 522 let classifier = InputClassifier::new(); 523 // "which" is both a question word AND a Unix command. 524 // When used as first word it must be classified as Command. 525 assert!(matches!( 526 classifier.classify("which python"), 527 InputType::Command(_) 528 )); 529 assert!(matches!( 530 classifier.classify("which node"), 531 InputType::Command(_) 532 )); 533 } 534 535 #[test] 536 fn test_who_command_not_classified_as_nl() { 537 let classifier = InputClassifier::new(); 538 assert!(matches!( 539 classifier.classify("who am i"), 540 InputType::Command(_) 541 )); 542 } 543 544 #[test] 545 fn test_command_with_article_not_classified_as_nl() { 546 let classifier = InputClassifier::new(); 547 // Commands containing articles ("a", "an", "the") must stay commands 548 assert!(matches!( 549 classifier.classify("cat a file.txt"), 550 InputType::Command(_) 551 )); 552 assert!(matches!( 553 classifier.classify("touch a new_file"), 554 InputType::Command(_) 555 )); 556 assert!(matches!( 557 classifier.classify("sudo apt install a package"), 558 InputType::Command(_) 559 )); 560 assert!(matches!( 561 classifier.classify("mkdir the directory"), 562 InputType::Command(_) 563 )); 564 } 565 566 #[test] 567 fn test_container_commands() { 568 let classifier = InputClassifier::new(); 569 // Commands commonly used inside Docker/container environments 570 assert!(matches!( 571 classifier.classify("service nginx restart"), 572 InputType::Command(_) 573 )); 574 assert!(matches!( 575 classifier.classify("dpkg --list"), 576 InputType::Command(_) 577 )); 578 assert!(matches!( 579 classifier.classify("ip addr show"), 580 InputType::Command(_) 581 )); 582 assert!(matches!( 583 classifier.classify("useradd -m newuser"), 584 InputType::Command(_) 585 )); 586 assert!(matches!( 587 classifier.classify("apk add curl"), 588 InputType::Command(_) 589 )); 590 } 591 592 #[test] 593 fn test_question_mark_overrides_command_prefix() { 594 let classifier = InputClassifier::new(); 595 // Even if the first word is a command, a question mark is a strong NL signal 596 assert!(matches!( 597 classifier.classify("git what branch am I on?"), 598 InputType::NaturalLanguage(_) 599 )); 600 } 601 602 #[test] 603 fn test_dollar_question_mark_not_classified_as_nl() { 604 let classifier = InputClassifier::new(); 605 // $? is the shell exit-code variable, not a real question mark 606 assert!(matches!( 607 classifier.classify("echo $?"), 608 InputType::Command(_) 609 )); 610 assert!(matches!( 611 classifier.classify("test $? -eq 0"), 612 InputType::Command(_) 613 )); 614 // But a real question mark after a command should still trigger NL 615 assert!(matches!( 616 classifier.classify("echo what is this?"), 617 InputType::NaturalLanguage(_) 618 )); 619 } 620}