// ReactOS, "master" (hosting-page header of the original listing: 315 lines, 9.0 kB)
/*
 * Usage: utf16le inputfile outputfile [nobom]
 *
 * This is a tool and is compiled using the host compiler,
 * i.e. on Linux gcc and not mingw-gcc (cross-compiler).
 * It's a converter from utf-8, utf-16 (LE/BE) and utf-32 (LE/BE)
 * to utf-16 LE and especially made for automatic conversions of
 * INF-files from utf-8 to utf-16LE (so we can furthermore
 * store the INF files in utf-8 for subversion).
 *
 * Passing "nobom" (case-insensitive) as third argument suppresses the
 * UTF-16LE byte order mark at the start of the output file.
 *
 * Author: Matthias Kupfer (mkupfer@reactos.org)
 */

#include <fstream>
#include <iostream>
#include <string>
#include <string.h>

//#define DISPLAY_DETECTED_UNICODE

#ifdef _MSC_VER
#define strcasecmp _stricmp
#else
#include <strings.h> // POSIX home of strcasecmp
#endif

// Reads a text file in UTF-8, UTF-16 (LE/BE) or UTF-32 (LE/BE) and writes
// it back as UTF-16LE. The input encoding is taken from the BOM unless the
// caller forces one; input without a BOM is treated as UTF-8.
class utf_converter
{
public:
    // detect can detect utf-8 and both utf-16 variants, but assume utf-32 only
    // due to ambiguous BOM
    enum enc_types { detect, utf8, utf16le, utf16be, utf32le, utf32be };
    enum err_types { none, iopen, oopen, eof, read, write, decode };
    enum bom_types { bom, nobom };
protected:
    // Outcome of decoding one character from the input.
    enum decode_state { decoded, at_end, malformed };

    err_types error;
    enc_types encoding;
    bom_types bom_type;
    unsigned char buffer[4], index; // need 4 char buffer for optional BOM handling
    std::streamsize fill;           // bytes of buffer not yet handed out by getByte()
    std::fstream inputfile, outputfile;
    static const unsigned char utf8table[64];

    // Reports a malformed sequence on stderr and remembers it in `error`.
    decode_state malformedInput(const char *message)
    {
        std::cerr << message << std::endl;
        error = decode;
        return malformed;
    }

    // Reports an input that ends in the middle of a character.
    decode_state truncatedInput()
    {
        std::cerr << "UTF Error: truncated character at end of input" << std::endl;
        error = eof;
        return at_end;
    }

    // Reads four bytes and assembles them according to `encoding`
    // (utf32le => little endian, anything else => big endian).
    // Returns the number of bytes obtained; `value` is only written when
    // all four were available. Works on unsigned long so that no shift can
    // overflow a signed int.
    std::streamsize readUInt32(unsigned long &value)
    {
        unsigned char c[4];
        for (int i = 0; i < 4; i++)
            if (!getByte(c[i]))
                return i;
        if (encoding == utf32le)
            value = static_cast<unsigned long>(c[0]) |
                    (static_cast<unsigned long>(c[1]) << 8) |
                    (static_cast<unsigned long>(c[2]) << 16) |
                    (static_cast<unsigned long>(c[3]) << 24);
        else
            value = static_cast<unsigned long>(c[3]) |
                    (static_cast<unsigned long>(c[2]) << 8) |
                    (static_cast<unsigned long>(c[1]) << 16) |
                    (static_cast<unsigned long>(c[0]) << 24);
        return 4;
    }

    // Decodes one UTF-8 sequence.
    decode_state decodeUtf8(unsigned long &cp)
    {
        unsigned char lead;
        if (!getByte(lead))
            return at_end;
        if (!(lead & 0x80))
        {
            cp = lead; // plain 7 bit character
            return decoded;
        }
        if ((lead & 0xc0) != 0xc0)
            return malformedInput("UTF-8 Error: invalid data byte");
        // table for 64 bytes (all 11xxxxxx resp. >=192)
        // resulting byte is determined:
        // lower 3 bits: number of following bytes, upper 5 bits: data
        const unsigned char entry = utf8table[lead & 0x3f];
        unsigned int following = entry & 7;
        // Lead bytes 0xf8..0xff announce 4 to 7 following bytes; such
        // sequences can never yield a code point UTF-16 is able to store.
        if (following > 3)
            return malformedInput("UTF-8 Error: invalid lead byte");
        cp = entry >> 3;
        while (following--)
        {
            unsigned char next;
            if (!getByte(next))
                return truncatedInput();
            if ((next & 0xc0) != 0x80)
                return malformedInput("UTF-8 Error: invalid continuation byte");
            cp = (cp << 6) | (next & 0x3f);
        }
        return decoded;
    }

    // Decodes one UTF-16 character, combining a surrogate pair if present.
    // A low surrogate that is not preceded by a high one is passed through.
    decode_state decodeUtf16(unsigned long &cp)
    {
        unsigned short first = 0, second = 0;
        std::streamsize got = getWord(first);
        if (got == 0)
            return at_end;
        if (got != 2)
            return truncatedInput();
        if ((first & 0xfc00) != 0xd800)
        {
            cp = first;
            return decoded;
        }
        // high surrogate first, the low one has to follow
        if (getWord(second) != 2)
            return truncatedInput();
        if ((second & 0xfc00) != 0xdc00)
            return malformedInput("UTF-16 Error: invalid low surrogate");
        cp = 0x10000UL + (static_cast<unsigned long>(first & 0x3ff) << 10) + (second & 0x3ff);
        return decoded;
    }

    // Decodes one UTF-32 character.
    decode_state decodeUtf32(unsigned long &cp)
    {
        const std::streamsize got = readUInt32(cp);
        if (got == 0)
            return at_end;
        if (got != 4)
            return truncatedInput();
        return decoded;
    }

    // Decodes the next character of the input into `cp`.
    // Returns `decoded` on success, `at_end` when the input is exhausted
    // (or ends inside a character) and `malformed` when bytes were consumed
    // that do not form a code point up to U+10FFFF.
    decode_state decodeNext(unsigned long &cp)
    {
        decode_state state = at_end;
        switch (encoding)
        {
            case detect:         // if still unknown
                encoding = utf8; // assume utf8 as default
                /* fall through */
            case utf8:
                state = decodeUtf8(cp);
                break;
            case utf16le:
            case utf16be:
                state = decodeUtf16(cp);
                break;
            case utf32le:
            case utf32be:
                state = decodeUtf32(cp);
                break;
        }
        if (state == decoded && cp > 0x10ffffUL)
            state = malformedInput("UTF Error: code point out of range");
        return state;
    }

    // Writes one 16 bit code unit to the output, low byte first.
    void putUnit(unsigned long unit)
    {
        unsigned char bytes[2];
        bytes[0] = static_cast<unsigned char>(unit & 0xff);
        bytes[1] = static_cast<unsigned char>((unit >> 8) & 0xff);
        outputfile.write(reinterpret_cast<const char*>(bytes), 2);
    }
public:
    // Opens both files and determines the input encoding.
    //   ifname/ofname: paths of the input and output file
    //   ofbom:         whether convert2utf16le() starts the output with a BOM
    //   enc:           input encoding; `detect` takes it from the BOM
    // On failure getError() returns iopen or oopen.
    utf_converter(const std::string &ifname, const std::string &ofname,
                  bom_types ofbom = bom, enc_types enc = detect)
        : error(none), encoding(enc), bom_type(ofbom), index(0), fill(0)
    {
        inputfile.open(ifname.c_str(), std::ios::in | std::ios::binary);
        if (!inputfile)
        {
            error = iopen;
            return;
        }
        outputfile.open(ofname.c_str(), std::ios::out | std::ios::binary);
        if (!outputfile)
        {
            error = oopen;
            return;
        }
        const enc_types tmp_enc = getBOM();
        if (enc != detect)
        {
            if (enc != tmp_enc)
                std::cerr << "Warning: UTF-BOM doesn't match encoding setting, but given encoding forced" << std::endl;
        }
        else
            encoding = tmp_enc;
    }

    // Returns the most recent error recorded by the converter.
    err_types getError()
    {
        return error;
    }

    // Reads up to four bytes from the input into `buffer` and looks for a
    // byte order mark. The bytes of a recognised BOM are skipped; whatever
    // else was read stays buffered and is handed out by getByte().
    // Returns the encoding named by the BOM, or utf8 when there is none.
    enc_types getBOM()
    {
        index = 0;
        /* first byte can also detect with:
           if ((buffer[0] & 0x11) || !buffer[0]))
           valid values are 0xef, 0xff, 0xfe, 0x00
        */
        inputfile.read(reinterpret_cast<char*>(buffer), 4);
        fill = inputfile.gcount();
        // stupid utf8 bom
        if ((fill > 2) &&
            (buffer[0] == 0xef) &&
            (buffer[1] == 0xbb) &&
            (buffer[2] == 0xbf))
        {
            index += 3;
            fill -= 3;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-8 BOM found" << std::endl;
#endif
            return utf8;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xfe) &&
            (buffer[1] == 0xff))
        {
            index += 2;
            fill -= 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16BE BOM found" << std::endl;
#endif
            return utf16be;
        }
        if ((fill > 1) &&
            (buffer[0] == 0xff) &&
            (buffer[1] == 0xfe))
        {
            // ff fe 00 00 is the UTF-32LE BOM, but also a UTF-16LE BOM
            // followed by a NUL character
            if ((fill == 4) &&
                (buffer[2] == 0x00) &&
                (buffer[3] == 0x00))
            {
                std::cerr << "UTF Error: ambiguous BOM UTF-16 or UTF-32; assume UTF-32" << std::endl;
                fill = 0;
                index = 0;
                return utf32le;
            }
            fill -= 2;
            index += 2;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-16LE BOM found" << std::endl;
#endif
            return utf16le;
        }
        if ((fill == 4) &&
            (buffer[0] == 0x00) &&
            (buffer[1] == 0x00) &&
            (buffer[2] == 0xfe) &&
            (buffer[3] == 0xff))
        {
            fill = 0;
            index = 0;
#ifdef DISPLAY_DETECTED_UNICODE
            std::cerr << "UTF-32BE BOM found" << std::endl;
#endif
            return utf32be;
        }
        return utf8; // no valid bom so use utf8 as default
    }

    // Fetches the next input byte, serving the bytes left over from the
    // BOM probe before touching the file again.
    // Returns 1 on success and 0 when no byte is available.
    std::streamsize getByte(unsigned char &c)
    {
        if (fill)
        {
            index %= 4;
            --fill;
            c = buffer[index++];
            return 1;
        }
        inputfile.read(reinterpret_cast<char*>(&c), 1);
        return inputfile.gcount();
    }

    // Reads a 16 bit unit (little endian for utf16le, big endian otherwise).
    // Returns the number of bytes obtained; `w` is only written for 2.
    std::streamsize getWord(unsigned short &w)
    {
        unsigned char c[2];
        if (!getByte(c[0]))
            return 0;
        if (!getByte(c[1]))
            return 1;
        if (encoding == utf16le)
            w = static_cast<unsigned short>(c[0] | (c[1] << 8));
        else
            w = static_cast<unsigned short>(c[1] | (c[0] << 8));
        return 2;
    }

    // Reads a 32 bit unit (little endian for utf32le, big endian otherwise).
    // Returns the number of bytes obtained; `d` is only written for 4.
    // Note that the value is narrowed where wchar_t has less than 32 bits.
    std::streamsize getDWord(wchar_t &d)
    {
        unsigned long value = 0;
        const std::streamsize got = readUInt32(value);
        if (got == 4)
            d = static_cast<wchar_t>(value);
        return got;
    }

    // Returns the next decoded character, or wchar_t(-1) at the end of the
    // input or on malformed data. The value is narrowed where wchar_t has
    // less than 32 bits; convert2utf16le() does not rely on this function.
    wchar_t get_wchar_t()
    {
        unsigned long cp = 0;
        if (decodeNext(cp) != decoded)
            return wchar_t(-1);
        return static_cast<wchar_t>(cp);
    }

    // Converts the remaining input to UTF-16LE and writes it to the output
    // file, preceded by a BOM unless `nobom` was requested.
    // Characters above U+FFFF are written as surrogate pairs; malformed
    // input is replaced by U+FFFD. If writing fails, getError() returns
    // `write` afterwards.
    void convert2utf16le()
    {
        if (bom_type == bom)
            putUnit(0xfeff); // write BOM

        // The loop is driven by the decoder and not by inputfile.eof():
        // the BOM probe already hits end-of-file on inputs shorter than
        // four bytes although their content is still waiting in `buffer`.
        for (;;)
        {
            unsigned long cp = 0;
            const decode_state state = decodeNext(cp);
            if (state == at_end)
                break;
            if (state == malformed)
                cp = 0xfffd; // REPLACEMENT CHARACTER

            if (cp > 0xffff)
            {
                cp -= 0x10000UL;
                putUnit(0xd800 | (cp >> 10));   // high surrogate
                putUnit(0xdc00 | (cp & 0x3ff)); // low surrogate
            }
            else
                putUnit(cp);
        }

        outputfile.flush();
        if (!outputfile)
            error = write;
    }

    // Closes whichever files are still open.
    ~utf_converter()
    {
        if (inputfile.is_open())
            inputfile.close();
        if (outputfile.is_open())
            outputfile.close();
    }
};

// Indexed by the low six bits of a UTF-8 lead byte (0xc0..0xff):
// bits 0-2 hold the number of following bytes, bits 3-7 the payload bits
// carried by the lead byte itself.
const unsigned char utf_converter::utf8table[64] = {
    1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
    129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249,
    2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
    3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 5, 13, 6, 7
};


// Entry point: utf16le inputfile outputfile [nobom]
// Returns 0 on success and -1 when the arguments are incomplete, a file
// cannot be opened or the output cannot be written.
int main(int argc, char* argv[])
{
    if (argc < 3)
    {
        std::cout << "usage: " << argv[0] << " inputfile outputfile [nobom]" << std::endl;
        return -1;
    }

    utf_converter::bom_types bom_type = utf_converter::bom;

    if (argc == 4 && strcasecmp(argv[3], "nobom") == 0)
    {
        bom_type = utf_converter::nobom;
    }

    utf_converter conv(argv[1], argv[2], bom_type);

    const utf_converter::err_types err = conv.getError();
    if (err != utf_converter::none)
    {
        switch (err)
        {
            case utf_converter::iopen:
                std::cerr << "Couldn't open input file." << std::endl;
                break;
            case utf_converter::oopen:
                std::cerr << "Couldn't open output file." << std::endl;
                break;
            default:
                std::cerr << "Unknown error." << std::endl;
        }
        return -1;
    }

    conv.convert2utf16le();

    // Decoding problems were already reported as warnings during the
    // conversion; only an unusable output file makes the tool fail.
    if (conv.getError() == utf_converter::write)
    {
        std::cerr << "Couldn't write output file." << std::endl;
        return -1;
    }

    return 0;
}