src/utils/unicode.zig at main · metaend.eth.xyz/fantasma

Anonymize your writing style. Zig WASM engine detects authorship markers, fine-tuned LLM rewrites to remove them. Runs entirely in-browser. fantasma.qstorage.quilibrium.com/

wasm privacy qwen zig

fork atom

fantasma / src / utils / unicode.zig

at main 76 lines 2.8 kB view raw

wrap content

chris Initial commit: Zig WASM stylometric engine + browser frontend 4w ago

49a28200

 1const std = @import("std");
 2
 3/// Decode one UTF-8 codepoint, returning the codepoint and byte length.
 4pub fn decodeUtf8(bytes: []const u8) struct { codepoint: u21, len: u3 } {
 5    if (bytes.len == 0) return .{ .codepoint = 0, .len = 0 };
 6    const b0 = bytes[0];
 7    if (b0 < 0x80) return .{ .codepoint = b0, .len = 1 };
 8    if (b0 & 0xE0 == 0xC0 and bytes.len >= 2)
 9        return .{ .codepoint = (@as(u21, b0 & 0x1F) << 6) | (bytes[1] & 0x3F), .len = 2 };
10    if (b0 & 0xF0 == 0xE0 and bytes.len >= 3)
11        return .{ .codepoint = (@as(u21, b0 & 0x0F) << 12) | (@as(u21, bytes[1] & 0x3F) << 6) | (bytes[2] & 0x3F), .len = 3 };
12    if (b0 & 0xF8 == 0xF0 and bytes.len >= 4)
13        return .{ .codepoint = (@as(u21, b0 & 0x07) << 18) | (@as(u21, bytes[1] & 0x3F) << 12) | (@as(u21, bytes[2] & 0x3F) << 6) | (bytes[3] & 0x3F), .len = 4 };
14    return .{ .codepoint = 0xFFFD, .len = 1 }; // replacement char
15}
16
/// Report whether `cp` is an ASCII letter (`A`-`Z` or `a`-`z`).
/// Non-ASCII letters are not considered.
pub fn isAlpha(cp: u21) bool {
    return switch (cp) {
        'A'...'Z', 'a'...'z' => true,
        else => false,
    };
}
21
/// Report whether `cp` is an ASCII uppercase letter (`A`-`Z`).
pub fn isUpper(cp: u21) bool {
    return switch (cp) {
        'A'...'Z' => true,
        else => false,
    };
}
25
/// Map an ASCII uppercase letter to its lowercase counterpart.
/// Every other codepoint is returned unchanged.
pub fn toLower(cp: u21) u21 {
    return switch (cp) {
        // The two ASCII cases are a fixed distance apart.
        'A'...'Z' => cp + ('a' - 'A'),
        else => cp,
    };
}
30
/// Report whether `cp` is an ASCII decimal digit (`0`-`9`).
pub fn isDigit(cp: u21) bool {
    if (cp < '0') return false;
    return cp <= '9';
}
34
/// Report whether `cp` is one of the ASCII whitespace characters recognised
/// here: space, tab, line feed, carriage return, or form feed.
pub fn isWhitespace(cp: u21) bool {
    return switch (cp) {
        ' ', '\t', '\n', '\r' => true,
        0x0C => true, // form feed
        else => false,
    };
}
38
/// Check if a codepoint is a non-ASCII Latin letter, i.e. a Latin character
/// with diacritics or a related letter form (common in ES/FR/DE).
///
/// Accepts U+00C0 through U+024F, except U+00D7 (multiplication sign) and
/// U+00F7 (division sign): those two sit inside the range but are math
/// operators, not letters.
pub fn isLatinExtended(cp: u21) bool {
    return switch (cp) {
        0xC0...0xD6, 0xD8...0xF6, 0xF8...0x024F => true,
        else => false,
    };
}
43
/// Detect language from text using marker-character heuristics.
///
/// Returns "es" (Spanish) when the text contains at least one letter (as
/// judged by `isAlpha` or `isLatinExtended`) and at least one Spanish marker:
/// n-tilde (U+00F1 / U+00D1) or inverted punctuation (U+00BF / U+00A1).
/// Returns "en" (English, the default) otherwise, including for empty input
/// and for input that contains no letters at all.
///
/// Only the presence of each kind of character matters, so the scan tracks
/// flags instead of counters (no counter can overflow on very large inputs)
/// and stops as soon as the answer is known.
pub fn detectLanguage(text: []const u8) []const u8 {
    var has_alpha = false;
    var has_spanish_marker = false;

    var i: usize = 0;
    while (i < text.len) {
        const dec = decodeUtf8(text[i..]);
        // A zero length would mean no progress; stop rather than loop forever.
        if (dec.len == 0) break;
        const cp = dec.codepoint;

        if (!has_alpha and (isAlpha(cp) or isLatinExtended(cp))) has_alpha = true;

        switch (cp) {
            0xF1, 0xD1 => has_spanish_marker = true, // n tilde / N tilde
            0xBF, 0xA1 => has_spanish_marker = true, // inverted ? / !
            else => {},
        }

        // Both conditions for Spanish are met; the rest of the text cannot
        // change the outcome.
        if (has_alpha and has_spanish_marker) return "es";

        i += dec.len;
    }

    return "en";
}
71
test "detectLanguage basics" {
    // Table of inputs and the language code each is expected to map to.
    const Case = struct { text: []const u8, expected: []const u8 };
    const cases = [_]Case{
        .{ .text = "Hello world, this is a test.", .expected = "en" },
        .{ .text = "El ni\xc3\xb1o corri\xc3\xb3 r\xc3\xa1pido.", .expected = "es" }, // niño corrió rápido
        .{ .text = "\xc2\xbfComo estas?", .expected = "es" }, // ¿Como estas?
    };

    for (cases) |case| {
        try std.testing.expectEqualStrings(case.expected, detectLanguage(case.text));
    }
}