Rust-based, AI-native terminal for cloud infrastructure operations, with an integrated agentic engine for DevOps assistance.
www.infraware.dev
ai
llm
dev
rust
cloudops
opensource
devops
cloud
os
1//! Input classification: Command vs Natural Language
2//!
3//! This module provides a simple classifier to determine if user input
4//! should be sent to the shell or to the LLM backend.
5//!
6//! The classification uses several heuristics:
7//! 1. Question marks (?, ¿) indicate natural language
8//! 2. Non-ASCII characters suggest non-English queries
9//! 3. Long phrases without shell operators are likely natural language
10//! 4. Question words (how, what, why, etc.) indicate queries
11//! 5. Shell syntax (pipes, flags, paths) indicates commands
12
13use once_cell::sync::Lazy;
14use regex::RegexSet;
15
/// Outcome of classifying one piece of user input.
///
/// The non-empty variants carry the text that should be forwarded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InputType {
    /// Text to be executed as a shell command.
    Command(String),
    /// Free-form text to be answered by the LLM.
    NaturalLanguage(String),
    /// Nothing was entered.
    Empty,
}
26
27/// Precompiled regex patterns for efficient classification
28struct Patterns {
29 /// Patterns indicating natural language
30 natural_language: RegexSet,
31 /// Patterns indicating shell command syntax
32 command_syntax: RegexSet,
33 /// Shell operators
34 shell_operators: RegexSet,
35}
36
37/// Global compiled patterns (initialized once)
38static PATTERNS: Lazy<Patterns> = Lazy::new(|| Patterns {
39 natural_language: RegexSet::new([
40 r"[\?¿]", // Question marks (universal)
41 r"(?i)^(how|what|why|when|where|who|which)\s", // Question words
42 r"(?i)^(can you|could you|would you|will you)\s", // Request phrases
43 r"(?i)(please|help me|show me|explain)\s", // Polite phrases
44 r"(?i)\s(a|an|the)\s", // Articles (indicate prose)
45 ])
46 .expect("Failed to compile natural_language patterns"),
47
48 command_syntax: RegexSet::new([
49 r"^[a-zA-Z0-9_-]+\s+--?[a-zA-Z]", // Flags: cmd --flag, cmd -f
50 r"^\.{1,2}/", // Relative paths: ./, ../
51 r"^/[a-zA-Z]", // Absolute paths: /usr/bin
52 r"^\$[A-Z_]", // Env var start: $HOME
53 r"^[a-z]+=$", // Env assignment: FOO=
54 ])
55 .expect("Failed to compile command_syntax patterns"),
56
57 shell_operators: RegexSet::new([
58 r"\|", // Pipe
59 r"&&|\|\|", // Logical operators
60 r"[<>]", // Redirects
61 r";", // Command separator
62 ])
63 .expect("Failed to compile shell_operators patterns"),
64});
65
/// Returns `true` when `input` contains a `?` that is NOT part of the
/// shell exit-code variable `$?`.
///
/// A `?` counts as "real" unless the character immediately before it is `$`.
/// The input is scanned in place; no copy of the string is made.
fn has_real_question_mark(input: &str) -> bool {
    input
        .match_indices('?')
        // `pos` is the byte offset of an ASCII `?`, so it is always a valid
        // char boundary for slicing.
        .any(|(pos, _)| !input[..pos].ends_with('$'))
}
72
/// Heuristic input classifier.
///
/// Decides whether a line of input is a shell command or a natural
/// language query: queries go to the LLM, commands go to the shell.
/// The type carries no state.
#[derive(Debug, Default)]
pub struct InputClassifier;
79
80impl InputClassifier {
81 /// Create a new classifier
82 pub fn new() -> Self {
83 Self
84 }
85
86 /// Classify user input as Command, NaturalLanguage, or Empty
87 pub fn classify(&self, input: &str) -> InputType {
88 let trimmed = input.trim();
89
90 // Empty input
91 if trimmed.is_empty() {
92 return InputType::Empty;
93 }
94
95 // Check for explicit '?' prefix (user explicitly wants LLM)
96 if let Some(query) = trimmed.strip_prefix('?') {
97 let query = query.trim();
98 if !query.is_empty() {
99 return InputType::NaturalLanguage(query.to_string());
100 }
101 }
102
103 // Check for natural language indicators
104 if self.is_natural_language(trimmed) {
105 return InputType::NaturalLanguage(trimmed.to_string());
106 }
107
108 // Default to command
109 InputType::Command(trimmed.to_string())
110 }
111
112 /// Check if input is likely natural language.
113 ///
114 /// Uses layered heuristics: explicit question marks are the strongest NL
115 /// signal and override command detection. When the first word is a known
116 /// command, weaker signals (articles, phrase length) are ignored so that
117 /// inputs like `cat a file` or `sudo apt install a package` stay
118 /// classified as commands.
119 fn is_natural_language(&self, input: &str) -> bool {
120 let words: Vec<&str> = input.split_whitespace().collect();
121 let first_word = words.first().copied().unwrap_or("");
122 let starts_with_command = self.looks_like_command(first_word);
123
124 // 1. Explicit question marks → strong NL signal, overrides command detection.
125 // Ignore `$?` (shell exit-code variable) — it is not a real question mark.
126 if (has_real_question_mark(input) || input.contains('¿'))
127 && !PATTERNS.shell_operators.is_match(input)
128 {
129 return true;
130 }
131
132 // 2. If the first word is a known command, classify as command.
133 // Articles, question words and phrase length are weak signals that
134 // must not override an unambiguous command prefix.
135 if starts_with_command {
136 return false;
137 }
138
139 // 3. NL patterns (question words, request phrases, articles) → NL
140 // Only reached when the first word is NOT a known command.
141 if PATTERNS.natural_language.is_match(input) && !PATTERNS.shell_operators.is_match(input) {
142 return true;
143 }
144
145 // 4. Explicit command syntax → not NL
146 if PATTERNS.command_syntax.is_match(input) {
147 return false;
148 }
149
150 // 5. Contains shell operators → command
151 if PATTERNS.shell_operators.is_match(input) {
152 return false;
153 }
154
155 // 6. Contains non-ASCII characters → likely non-English NL
156 // (e.g., "chi sono io" in Italian, "什么是" in Chinese)
157 if !input.is_ascii() {
158 return true;
159 }
160
161 // 7. Long phrase (>5 words) without shell operators → likely NL
162 if words.len() > 5 {
163 return true;
164 }
165
166 // 8. Medium phrase (3-5 words) → likely NL
167 // (known-command-first inputs already returned at step 2)
168 if words.len() >= 3 {
169 return true;
170 }
171
172 false
173 }
174
175 /// Check if a word looks like a Unix command name
176 fn looks_like_command(&self, word: &str) -> bool {
177 // Commands are typically lowercase alphanumeric with optional hyphens/underscores
178 // Very short words (1-2 chars) are often commands (ls, cd, rm, cp, mv)
179 if word.len() <= 2 && word.chars().all(|c| c.is_ascii_lowercase()) {
180 return true;
181 }
182
183 // Known commands and tools (shell builtins, coreutils, package
184 // managers, container tooling, networking, etc.). Kept sorted
185 // alphabetically for easy maintenance. M-DOCUMENTED-MAGIC: this
186 // list biases the classifier toward "command" for 3+ word inputs
187 // that start with a recognized executable name.
188 const COMMON_COMMANDS: &[&str] = &[
189 "adduser",
190 "alias",
191 "apk",
192 "apt",
193 "apt-get",
194 "ar",
195 "awk",
196 "base64",
197 "bat",
198 "bc",
199 "brew",
200 "cal",
201 "cargo",
202 "cat",
203 "cd",
204 "chmod",
205 "chown",
206 "chsh",
207 "clang",
208 "clear",
209 "cmake",
210 "code",
211 "cp",
212 "crontab",
213 "curl",
214 "cut",
215 "date",
216 "dd",
217 "deluser",
218 "df",
219 "diff",
220 "dmesg",
221 "dnf",
222 "docker",
223 "dpkg",
224 "du",
225 "echo",
226 "emacs",
227 "env",
228 "exa",
229 "exit",
230 "export",
231 "expr",
232 "false",
233 "fd",
234 "file",
235 "find",
236 "flatpak",
237 "free",
238 "fuser",
239 "fzf",
240 "gcc",
241 "git",
242 "go",
243 "grep",
244 "groupadd",
245 "groupdel",
246 "gunzip",
247 "gzip",
248 "head",
249 "help",
250 "history",
251 "hostname",
252 "hostnamectl",
253 "htop",
254 "htpasswd",
255 "ifconfig",
256 "info",
257 "insmod",
258 "ip",
259 "iptables",
260 "java",
261 "javac",
262 "journalctl",
263 "jq",
264 "kill",
265 "killall",
266 "kubectl",
267 "ldd",
268 "less",
269 "ln",
270 "locale",
271 "loginctl",
272 "ls",
273 "lsblk",
274 "lsmod",
275 "lsof",
276 "make",
277 "man",
278 "md5sum",
279 "mkdir",
280 "modprobe",
281 "more",
282 "mount",
283 "mv",
284 "nano",
285 "netstat",
286 "nft",
287 "nice",
288 "nm",
289 "nmap",
290 "node",
291 "nohup",
292 "npm",
293 "openssl",
294 "pacman",
295 "passwd",
296 "patch",
297 "pgrep",
298 "pip",
299 "pkill",
300 "podman",
301 "printenv",
302 "printf",
303 "ps",
304 "pwd",
305 "python",
306 "python3",
307 "readelf",
308 "renice",
309 "rg",
310 "rm",
311 "rmmod",
312 "route",
313 "rpm",
314 "rsync",
315 "rustc",
316 "scp",
317 "screen",
318 "sed",
319 "seq",
320 "service",
321 "sha256sum",
322 "sleep",
323 "snap",
324 "sort",
325 "source",
326 "ss",
327 "ssh",
328 "stat",
329 "strace",
330 "strings",
331 "strip",
332 "su",
333 "subl",
334 "sudo",
335 "sysctl",
336 "systemctl",
337 "tail",
338 "tar",
339 "tee",
340 "test",
341 "timedatectl",
342 "tmux",
343 "top",
344 "touch",
345 "tr",
346 "true",
347 "umount",
348 "uname",
349 "uniq",
350 "unzip",
351 "update-alternatives",
352 "useradd",
353 "userdel",
354 "vi",
355 "vim",
356 "watch",
357 "wc",
358 "wget",
359 "which",
360 "whereis",
361 "who",
362 "whoami",
363 "xargs",
364 "yarn",
365 "yes",
366 "yq",
367 "zip",
368 "zypper",
369 ];
370
371 COMMON_COMMANDS.contains(&word.to_lowercase().as_str())
372 }
373}
374
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every input is classified as a shell command.
    fn assert_commands(inputs: &[&str]) {
        let classifier = InputClassifier::new();
        for input in inputs {
            let result = classifier.classify(input);
            assert!(
                matches!(result, InputType::Command(_)),
                "expected Command for {:?}, got {:?}",
                input,
                result
            );
        }
    }

    /// Asserts that every input is classified as natural language.
    fn assert_natural_language(inputs: &[&str]) {
        let classifier = InputClassifier::new();
        for input in inputs {
            let result = classifier.classify(input);
            assert!(
                matches!(result, InputType::NaturalLanguage(_)),
                "expected NaturalLanguage for {:?}, got {:?}",
                input,
                result
            );
        }
    }

    #[test]
    fn test_empty_input() {
        let classifier = InputClassifier::new();
        assert_eq!(classifier.classify(""), InputType::Empty);
        assert_eq!(classifier.classify("   "), InputType::Empty);
    }

    #[test]
    fn test_explicit_query_prefix() {
        // The leading `?` marker is stripped from the forwarded query.
        let classifier = InputClassifier::new();
        assert_eq!(
            classifier.classify("? how do I list files"),
            InputType::NaturalLanguage("how do I list files".to_string())
        );
        assert_eq!(
            classifier.classify("?chi sono io"),
            InputType::NaturalLanguage("chi sono io".to_string())
        );
    }

    #[test]
    fn test_question_marks() {
        assert_natural_language(&["how do I list files?", "what is docker?"]);
    }

    #[test]
    fn test_non_ascii() {
        assert_natural_language(&[
            // Italian; this phrase happens to be plain ASCII
            "chi sono io",
            // Spanish
            "cómo listar archivos",
        ]);
    }

    #[test]
    fn test_commands() {
        assert_commands(&["ls -la", "docker ps", "git status", "cat /etc/passwd"]);
    }

    #[test]
    fn test_shell_operators() {
        assert_commands(&["ls | grep foo", "cat file > output", "cmd1 && cmd2"]);
    }

    #[test]
    fn test_long_phrases() {
        assert_natural_language(&["show me all the docker containers running"]);
    }

    #[test]
    fn test_question_words() {
        assert_natural_language(&[
            "how to list files",
            "what is kubernetes",
            "why is my container failing",
        ]);
    }

    #[test]
    fn test_polite_phrases() {
        assert_natural_language(&["please help me", "can you explain docker"]);
    }

    #[test]
    fn test_whoami_classification() {
        // "whoami" is a known command
        assert_commands(&["whoami"]);
        // "chi sono io" (Italian) is all ASCII, but it is three words long
        // and "chi" is not a known command
        assert_natural_language(&["chi sono io"]);
    }

    #[test]
    fn test_which_command_not_classified_as_nl() {
        // "which" is both a question word AND a Unix command.
        // When used as first word it must be classified as Command.
        assert_commands(&["which python", "which node"]);
    }

    #[test]
    fn test_who_command_not_classified_as_nl() {
        assert_commands(&["who am i"]);
    }

    #[test]
    fn test_command_with_article_not_classified_as_nl() {
        // Commands containing articles ("a", "an", "the") must stay commands
        assert_commands(&[
            "cat a file.txt",
            "touch a new_file",
            "sudo apt install a package",
            "mkdir the directory",
        ]);
    }

    #[test]
    fn test_container_commands() {
        // Commands commonly used inside Docker/container environments
        assert_commands(&[
            "service nginx restart",
            "dpkg --list",
            "ip addr show",
            "useradd -m newuser",
            "apk add curl",
        ]);
    }

    #[test]
    fn test_question_mark_overrides_command_prefix() {
        // Even if the first word is a command, a question mark is a strong NL signal
        assert_natural_language(&["git what branch am I on?"]);
    }

    #[test]
    fn test_dollar_question_mark_not_classified_as_nl() {
        // $? is the shell exit-code variable, not a real question mark
        assert_commands(&["echo $?", "test $? -eq 0"]);
        // But a real question mark after a command should still trigger NL
        assert_natural_language(&["echo what is this?"]);
    }
}