sdk/tools/utf16le/utf16le.cpp at master · huwcampbell.com/reactos

huwcampbell.com / reactos
fork atom
Reactos
fork atom
reactos / sdk / tools / utf16le / utf16le.cpp
at master 315 lines 9.0 kB view raw
wrap content
Andriy Shevchenko [REACTOS] Fix typos in comments (#5591) 2y ago
2ea03b5b
  1/*
  2 * Usage: utf16le inputfile outputfile [nobom]
  3 *
  4 * This is a tool and is compiled using the host compiler,
  5 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
  6 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
  7 * to utf-16 LE and especially made for automatic conversions of
  8 * INF-files from utf-8 to utf-16LE (so we can continue to
  9 * store the INF files in utf-8 for subversion).
 10 *
 11 * Author: Matthias Kupfer (mkupfer@reactos.org)
 12 */
 13
 14#include <fstream>
 15#include <iostream>
 16#include <string.h>
 17
 18//#define DISPLAY_DETECTED_UNICODE
 19
 20using namespace std;
 21
 22#ifdef _MSC_VER
 23#define strcasecmp _stricmp
 24#endif
 25
 26class utf_converter
 27{
 28public:
 29    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
 30    // due to ambiguous BOM
 31    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
 32    enum err_types { none, iopen, oopen, eof, read, write, decode };
 33    enum bom_types { bom, nobom };
 34protected:
 35    err_types error;
 36    enc_types encoding;
 37    bom_types bom_type;
 38    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
 39    std::streamsize fill;
 40    fstream inputfile,outputfile;
 41    static const unsigned char utf8table[64];
 42public:
 43    utf_converter(string ifname, string ofname, bom_types ofbom = bom, enc_types enc = detect) : error(none), bom_type(ofbom), encoding(enc), fill(0), index(0)
 44    {
 45        enc_types tmp_enc;
 46        inputfile.open(ifname.c_str(), ios::in | ios::binary);
 47        if (!inputfile)
 48        {
 49            error = iopen;
 50            return;
 51        }
 52        outputfile.open(ofname.c_str(), ios::out | ios::binary);
 53        if (!outputfile)
 54        {
 55            error = oopen;
 56            return;
 57        }
 58        tmp_enc = getBOM();
 59        if (enc != detect)
 60        {
 61            if (enc != tmp_enc)
 62                cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << endl;
 63        }
 64        else
 65            encoding = tmp_enc;
 66    }
 67    err_types getError()
 68    {
 69        return error;
 70    }
 71    enc_types getBOM()
 72    {
 73        index = 0;
 74        /* first byte can also detect with:
 75        if ((buffer[0] & 0x11) || !buffer[0]))
 76        valid values are 0xef, 0xff, 0xfe, 0x00
 77        */
 78        inputfile.read(reinterpret_cast<char*>(&buffer),4);
 79        fill = inputfile.gcount();
 80        // stupid utf8 bom
 81        if ((fill > 2) &&
 82            (buffer[0] == 0xef) &&
 83            (buffer[1] == 0xbb) &&
 84            (buffer[2] == 0xbf))
 85        {
 86            index += 3;
 87            fill -=3;
 88#ifdef DISPLAY_DETECTED_UNICODE
 89            cerr << "UTF-8 BOM found" << endl;
 90#endif
 91            return utf8;
 92        }
 93        if ((fill > 1) &&
 94            (buffer[0] == 0xfe) &&
 95            (buffer[1] == 0xff))
 96        {
 97            index += 2;
 98            fill -= 2;
 99#ifdef DISPLAY_DETECTED_UNICODE
100            cerr << "UTF-16BE BOM found" << endl;
101#endif
102            return utf16be;
103        }
104        if ((fill > 1) &&
105            (buffer[0] == 0xff) &&
106            (buffer[1] == 0xfe))
107        {
108            if ((fill == 4) &&
109                (buffer[2] == 0x00) &&
110                (buffer[3] == 0x00))
111            {
112                cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << endl;
113                fill = 0;
114                index = 0;
115                return utf32le;
116            }
117            fill -= 2;
118            index += 2;
119#ifdef DISPLAY_DETECTED_UNICODE
120            cerr << "UTF-16LE BOM found" << endl;
121#endif
122            return utf16le;
123        }
124        if ((fill == 4) &&
125            (buffer[0] == 0x00) &&
126            (buffer[1] == 0x00) &&
127            (buffer[2] == 0xfe) &&
128            (buffer[3] == 0xff))
129        {
130            fill = 0;
131            index = 0;
132#ifdef DISPLAY_DETECTED_UNICODE
133            cerr << "UTF-32BE BOM found" << endl;
134#endif
135            return utf32be;
136        }
137        return utf8; // no valid bom so use utf8 as default
138    }
139    std::streamsize getByte(unsigned char &c)
140    {
141        if (fill)
142        {
143            index %= 4;
144            --fill;
145            c = buffer[index++];
146            return 1;
147        } else
148        {
149            inputfile.read(reinterpret_cast<char*>(&c),1);
150            return inputfile.gcount();
151        }
152    }
153    std::streamsize getWord(unsigned short &w)
154    {
155        unsigned char c[2];
156        if (!getByte(c[0]))
157                return 0;
158        if (!getByte(c[1]))
159                return 1;
160        if (encoding == utf16le)
161            w = c[0] | (c[1] << 8);
162        else
163            w = c[1] | (c[0] << 8);
164        return 2;
165    }
166    std::streamsize getDWord(wchar_t &d)
167    {
168        unsigned char c[4];
169        for (int i=0;i<4;i++)
170            if (!getByte(c[i]))
171                    return i;
172        if (encoding == utf32le)
173            d = c[0] | (c[1] << 8) | (c[2] << 16) | (c[3] << 24);
174        else
175            d = c[3] | (c[2] << 8) | (c[1] << 16) | (c[0] << 24);
176        return 4;
177    }
178    wchar_t get_wchar_t()
179    {
180        wchar_t ret = (wchar_t)-1;
181        switch (encoding)
182        {
183            case detect: // if still unknown
184                encoding = utf8; // assume utf8 as default
185            case utf8:
186                unsigned char c, tmp;
187                if (!getByte(tmp))
188                    return ret;
189                // table for 64 bytes (all 11xxxxxx resp. >=192)
190                // resulting byte is determined:
191                // lower 3 bits: number of following bytes (max.8) 0=error
192                // upper 5 bits: data filled with 0
193                if (tmp & 0x80)
194                {
195                    if ((tmp & 0xc0) != 0xc0)
196                    {
197                        cerr << "UTF-8 Error: invalid data byte" << endl;
198                        return ret;
199                    }
200                    unsigned char i = utf8table[tmp & 0x3f];
201                    ret = i >> 3;
202                    i &= 7;
203                    while (i--)
204                    {
205                        ret <<= 6;
206                        if (!getByte(c))
207                            return wchar_t(-1);
208                        ret |= c & 0x3f;
209                    }
210                    return ret;
211                }
212                else
213                    return wchar_t(tmp);
214            case utf16le:
215            case utf16be:
216                unsigned short w,w2;
217                if (getWord(w) != 2)
218                    return ret;
219                if ((w & 0xfc00) == 0xd800) // high surrogate first
220                {
221                    if (getWord(w2) != 2)
222                        return ret;
223                    if ((w2 & 0xfc00) != 0xdc00)
224                    {
225                        cerr << "UTF-16 Error: invalid low surrogate" << endl;
226                        return ret;
227                    }
228                    return (((w & 0x3ff) + 0x40) << 10) | (w2 & 0x3ff);
229                }
230                return w;
231            case utf32le:
232            case utf32be:
233                if (getDWord(ret) != 4)
234                    return wchar_t (-1);
235                return ret;
236        }
237        return ret;
238    }
239    void convert2utf16le()
240    {
241        unsigned char buffer[2] = { 0xff, 0xfe };
242
243        if (bom_type == bom)
244        {
245            outputfile.write(reinterpret_cast<char*>(&buffer), 2); // write BOM
246        }
247
248        wchar_t c = get_wchar_t();
249
250        while (!inputfile.eof())
251        {
252            buffer[0] = c & 0xff;
253            buffer[1] = (c >> 8) & 0xff; // create utf16-le char
254            outputfile.write(reinterpret_cast<char*>(&buffer),2); // write char
255            c = get_wchar_t();
256        }
257    }
258    ~utf_converter()
259    {
260        if (inputfile)
261            inputfile.close();
262        if (outputfile)
263            outputfile.close();
264    }
265};
266
267const unsigned char utf_converter::utf8table[64] = {
2681, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
269129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
2702, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
2713, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
272};
273
274
275int main(int argc, char* argv[])
276{
277    utf_converter::err_types err;
278
279    if (argc < 3)
280    {
281        cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282        return -1;
283    }
284
285    utf_converter::bom_types bom_type = utf_converter::bom;
286
287    if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288    {
289        bom_type = utf_converter::nobom;
290    }
291
292    utf_converter conv(argv[1], argv[2], bom_type);
293
294    if ((err = conv.getError())!=utf_converter::none)
295    {
296        switch (err)
297        {
298            case utf_converter::iopen:
299                cerr << "Couldn't open input file." << endl;
300                break;
301            case utf_converter::oopen:
302                cerr << "Couldn't open output file." << endl;
303                break;
304            default:
305                cerr << "Unknown error." << endl;
306        }
307        return -1;
308    }
309    else
310    {
311        conv.convert2utf16le();
312    }
313
314    return 0;
315}