Reactos
/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 * Passing "nobom" as third argument suppresses the BOM in the output.
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */
13
14#include <fstream>
15#include <iostream>
16#include <string.h>
17
18//#define DISPLAY_DETECTED_UNICODE
19
20using namespace std;
21
22#ifdef _MSC_VER
23#define strcasecmp _stricmp
24#endif
25
/**
 * Reads a UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) encoded file and writes it
 * out as UTF-16LE.
 *
 * The constructor opens both files and inspects the first (up to four) bytes
 * of the input for a byte order mark. Input without a recognised BOM is
 * treated as UTF-8. Call getError() after construction, then
 * convert2utf16le() to perform the conversion.
 */
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    // Outcome of decoding one code point from the input.
    enum read_status { got_char, bad_char, no_more };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // number of not yet consumed bytes in buffer
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Reads four bytes and combines them according to the current encoding
    // (little endian for utf32le, big endian otherwise). Returns the number
    // of bytes that could be read; d is only assigned when that number is 4.
    std::streamsize readDWord(unsigned long &d)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        // Widen before shifting so that a set top bit never overflows an int.
        const unsigned long b0 = c[0], b1 = c[1], b2 = c[2], b3 = c[3];
        if (encoding == utf32le)
            d = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
        else
            d = b3 | (b2 << 8) | (b1 << 16) | (b0 << 24);
        return 4;
    }

    // Decodes one UTF-8 sequence. Following bytes are not validated; only
    // their lower six bits are used.
    read_status readUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return no_more;
        if (!(lead & 0x80))
        {
            cp = lead; // plain 7 bit character
            return got_char;
        }
        if ((lead & 0xc0) != 0xc0)
        {
            // 10xxxxxx may only follow a lead byte
            std::cerr << "UTF-8 Error: invalid data byte" << std::endl;
            return bad_char;
        }
        // table entry: lower 3 bits = number of following bytes,
        //              upper 5 bits = data bits carried by the lead byte
        const unsigned char entry = utf8table[lead & 0x3f];
        unsigned long value = entry >> 3;
        for (unsigned char follow = entry & 7; follow; --follow)
        {
            unsigned char next;
            if (!getByte(next))
                return no_more; // input ends inside a sequence
            value = (value << 6) | (next & 0x3f);
        }
        cp = value;
        return got_char;
    }

    // Decodes one UTF-16 code unit or surrogate pair.
    read_status readUtf16(unsigned long &cp)
    {
        unsigned short first, second;
        if (getWord(first) != 2)
            return no_more;
        if ((first & 0xfc00) != 0xd800) // not a high surrogate
        {
            cp = first;
            return got_char;
        }
        if (getWord(second) != 2)
            return no_more;
        if ((second & 0xfc00) != 0xdc00)
        {
            std::cerr << "UTF-16 Error: invalid low surrogate" << std::endl;
            return bad_char;
        }
        // adding 0x40 before the shift contributes the 0x10000 offset
        cp = (((first & 0x3ffUL) + 0x40) << 10) | (second & 0x3ffUL);
        return got_char;
    }

    // Decodes the next code point of the input in the current encoding.
    // Returns no_more when the input is exhausted (also when it ends in the
    // middle of a character), bad_char when invalid data was consumed.
    read_status readCodePoint(unsigned long &cp)
    {
        switch (encoding)
        {
        case detect:         // if still unknown
            encoding = utf8; // assume utf8 as default
            // fall through
        case utf8:
            return readUtf8(cp);
        case utf16le:
        case utf16be:
            return readUtf16(cp);
        case utf32le:
        case utf32be:
            return (readDWord(cp) == 4) ? got_char : no_more;
        }
        return no_more;
    }

    // Writes one 16 bit code unit in little endian byte order.
    void writeUnit(unsigned long unit)
    {
        const char bytes[2] = {
            static_cast<char>(unit & 0xff),
            static_cast<char>((unit >> 8) & 0xff)
        };
        outputfile.write(bytes, 2);
    }
public:
    /**
     * Opens ifname for reading and ofname for writing (both binary) and
     * determines the input encoding.
     *
     * @param ifname input file name
     * @param ofname output file name
     * @param ofbom  bom to start the output with a BOM, nobom to omit it
     * @param enc    detect to rely on the BOM of the input; any other value
     *               forces that encoding (a warning is printed when it does
     *               not match what the BOM check returned)
     *
     * On failure getError() returns iopen or oopen.
     */
    utf_converter(const std::string &ifname, const std::string &ofname,
                  bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), buffer(), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types tmp_enc = getBOM();
        if (enc != detect)
        {
            if (enc != tmp_enc)
                std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
        }
        else
            encoding = tmp_enc;
    }

    /** Returns the current error state (none when everything is fine). */
    err_types getError()
    {
        return error;
    }

    /**
     * Reads up to four bytes from the input into the internal buffer and
     * checks them for a BOM. A recognised BOM is skipped, all other buffered
     * bytes stay available to getByte().
     *
     * @return the encoding indicated by the BOM, utf8 if there is none
     */
    enc_types getBOM()
    {
        index = 0;
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();
        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            // FF FE 00 00 is the UTF-32LE BOM, but also a UTF-16LE BOM
            // followed by U+0000
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    /**
     * Fetches the next input byte, serving bytes left over from the BOM
     * check before reading from the file.
     *
     * @return 1 if c was assigned, 0 if no byte could be read
     */
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    /**
     * Fetches two bytes and combines them little endian if the encoding is
     * utf16le, big endian otherwise.
     *
     * @return number of bytes read; w is only assigned when it is 2
     */
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = c[0] | (c[1] << 8);
        else
            w = c[1] | (c[0] << 8);
        return 2;
    }

    /**
     * Fetches four bytes and combines them little endian if the encoding is
     * utf32le, big endian otherwise.
     *
     * @return number of bytes read; d is only assigned when it is 4
     */
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const std::streamsize got = readDWord(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    /**
     * Decodes the next character of the input.
     *
     * @return the character, or wchar_t(-1) at the end of the input or on
     *         invalid data (the two cases cannot be told apart here)
     */
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (readCodePoint(cp) != got_char)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    /**
     * Converts the remaining input to UTF-16LE and writes it to the output
     * file, preceded by a BOM unless nobom was requested.
     *
     * Code points above U+FFFF are written as surrogate pairs. Invalid input
     * and values beyond U+10FFFF are replaced by U+FFFD. Conversion ends
     * when no further character can be read. If the output stream reports a
     * failure, the error state becomes write.
     */
    void convert2utf16le()
    {
        if (bom_type == bom)
            writeUnit(0xfeff); // write BOM

        for (;;)
        {
            unsigned long cp = 0;
            // Stop on the decoder's own result instead of inputfile.eof():
            // for inputs shorter than four bytes eof is already set by
            // getBOM() while unconverted bytes still sit in the buffer.
            const read_status status = readCodePoint(cp);
            if (status == no_more)
                break;
            if (status == got_char && cp > 0x10ffff)
                std::cerr << "UTF Error: code point beyond U+10FFFF replaced" << std::endl;
            if (status == bad_char || cp > 0x10ffff)
                cp = 0xfffd; // replacement character
            if (cp > 0xffff)
            {
                cp -= 0x10000;
                writeUnit(0xd800 | (cp >> 10));   // high surrogate
                writeUnit(0xdc00 | (cp & 0x3ff)); // low surrogate
            }
            else
                writeUnit(cp);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }
    // No destructor needed: the fstream members close their files themselves.
};

// Lookup table for UTF-8 lead bytes, indexed by (lead byte & 0x3f), i.e. it
// covers all bytes 11xxxxxx (>= 192). Each entry is (data << 3) | count:
//   lower 3 bits: number of following bytes (1..7)
//   upper 5 bits: data bits contained in the lead byte
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};
273
274
275int main(int argc, char* argv[])
276{
277 utf_converter::err_types err;
278
279 if (argc < 3)
280 {
281 cout << "usage: " << argv[0] << " inputfile outputfile" << endl;
282 return -1;
283 }
284
285 utf_converter::bom_types bom_type = utf_converter::bom;
286
287 if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
288 {
289 bom_type = utf_converter::nobom;
290 }
291
292 utf_converter conv(argv[1], argv[2], bom_type);
293
294 if ((err = conv.getError())!=utf_converter::none)
295 {
296 switch (err)
297 {
298 case utf_converter::iopen:
299 cerr << "Couldn't open input file." << endl;
300 break;
301 case utf_converter::oopen:
302 cerr << "Couldn't open output file." << endl;
303 break;
304 default:
305 cerr << "Unknown error." << endl;
306 }
307 return -1;
308 }
309 else
310 {
311 conv.convert2utf16le();
312 }
313
314 return 0;
315}