const std = @import("std");

/// Decode one UTF-8 codepoint from the start of `bytes`.
///
/// Returns the decoded codepoint together with the number of bytes consumed.
/// * Empty input yields `.{ .codepoint = 0, .len = 0 }`.
/// * Any malformed sequence (stray continuation byte, invalid lead byte,
///   truncated sequence, bad continuation byte, overlong encoding, surrogate,
///   or value above U+10FFFF) yields U+FFFD with `len = 1`, so the caller
///   resynchronises on the very next byte and never skips valid data.
pub fn decodeUtf8(bytes: []const u8) struct { codepoint: u21, len: u3 } {
    if (bytes.len == 0) return .{ .codepoint = 0, .len = 0 };

    const b0 = bytes[0];
    if (b0 < 0x80) return .{ .codepoint = b0, .len = 1 };

    decode: {
        // The lead byte fixes the sequence length; 0x80...0xBF (continuation
        // bytes) and 0xF8...0xFF can never start a sequence.
        const len: u3 = switch (b0) {
            0xC0...0xDF => 2,
            0xE0...0xEF => 3,
            0xF0...0xF7 => 4,
            else => break :decode,
        };
        if (bytes.len < len) break :decode;

        // Payload bits of the lead byte: 0x1F, 0x0F, 0x07 for lengths 2, 3, 4.
        var cp: u21 = b0 & (@as(u8, 0x7F) >> len);
        for (bytes[1..len]) |b| {
            // Every trailing byte must look like 10xxxxxx.
            if ((b & 0xC0) != 0x80) break :decode;
            cp = (cp << 6) | (b & 0x3F);
        }

        // Reject overlong encodings: the value must actually need `len` bytes.
        const min_cp: u21 = switch (len) {
            2 => 0x80,
            3 => 0x800,
            else => 0x10000,
        };
        if (cp < min_cp) break :decode;
        // Reject values outside the Unicode scalar range.
        if (cp > 0x10FFFF) break :decode;
        if (cp >= 0xD800 and cp <= 0xDFFF) break :decode;

        return .{ .codepoint = cp, .len = len };
    }

    return .{ .codepoint = 0xFFFD, .len = 1 }; // replacement char
}

/// Check if a codepoint is an ASCII letter.
pub fn isAlpha(cp: u21) bool {
    return switch (cp) {
        'a'...'z', 'A'...'Z' => true,
        else => false,
    };
}

/// Check if a codepoint is an ASCII uppercase letter.
pub fn isUpper(cp: u21) bool {
    return switch (cp) {
        'A'...'Z' => true,
        else => false,
    };
}

/// Map an ASCII uppercase letter to lowercase; every other codepoint is
/// returned unchanged.
pub fn toLower(cp: u21) u21 {
    return if (isUpper(cp)) cp + ('a' - 'A') else cp;
}

/// Check if a codepoint is an ASCII decimal digit.
pub fn isDigit(cp: u21) bool {
    return switch (cp) {
        '0'...'9' => true,
        else => false,
    };
}

/// Check if a codepoint is space, tab, line feed, carriage return, or form
/// feed (0x0C). Vertical tab (0x0B) is not included.
pub fn isWhitespace(cp: u21) bool {
    return switch (cp) {
        ' ', '\t', '\n', '\r', 0x0C => true,
        else => false,
    };
}

/// Check if a codepoint is a non-ASCII Latin letter in U+00C0...U+024F
/// (letters with diacritics, common in ES/FR/DE).
///
/// The multiplication sign (U+00D7) and division sign (U+00F7) sit inside that
/// range but are not letters, so they are excluded.
pub fn isLatinExtended(cp: u21) bool {
    return switch (cp) {
        0xC0...0xD6, 0xD8...0xF6, 0xF8...0x024F => true,
        else => false,
    };
}

/// Detect language from text using a character-presence heuristic.
///
/// Returns "es" when the text contains at least one letter (ASCII or
/// `isLatinExtended`) and at least one Spanish marker: n-tilde (U+00F1 /
/// U+00D1) or inverted punctuation (U+00BF / U+00A1). Returns "en" otherwise,
/// including for empty or letter-free text. Both results are string literals,
/// so the caller does not own or free them.
pub fn detectLanguage(text: []const u8) []const u8 {
    var has_letter = false;
    var has_spanish_marker = false;

    var i: usize = 0;
    while (i < text.len) {
        const dec = decodeUtf8(text[i..]);
        if (dec.len == 0) break; // guard against a non-advancing decode

        const cp = dec.codepoint;
        if (isAlpha(cp) or isLatinExtended(cp)) has_letter = true;
        switch (cp) {
            0xF1, 0xD1 => has_spanish_marker = true, // n tilde / N tilde
            0xBF, 0xA1 => has_spanish_marker = true, // inverted ? / !
            else => {},
        }

        // Only presence matters, so stop as soon as the answer is known.
        if (has_letter and has_spanish_marker) return "es";

        i += dec.len;
    }

    return "en";
}

test "detectLanguage basics" {
    try std.testing.expectEqualStrings("en", detectLanguage("Hello world, this is a test."));
    try std.testing.expectEqualStrings("es", detectLanguage("El ni\xc3\xb1o corri\xc3\xb3 r\xc3\xa1pido.")); // niño corrió rápido
    try std.testing.expectEqualStrings("es", detectLanguage("\xc2\xbfComo estas?")); // ¿Como estas?
}

test "decodeUtf8 rejects malformed sequences without skipping bytes" {
    // A lead byte followed by a non-continuation byte consumes only the lead.
    const bad = decodeUtf8("\xc3A");
    try std.testing.expectEqual(@as(u21, 0xFFFD), bad.codepoint);
    try std.testing.expectEqual(@as(u3, 1), bad.len);

    // Overlong encoding of U+0000.
    const overlong = decodeUtf8("\xc0\x80");
    try std.testing.expectEqual(@as(u21, 0xFFFD), overlong.codepoint);
    try std.testing.expectEqual(@as(u3, 1), overlong.len);
}