// ReactOS -- at listview -- 636 lines, 23 kB (raw view)
1// 2// read.cpp 3// 4// Copyright (c) Microsoft Corporation. All rights reserved. 5// 6// Defines _read(), which reads bytes from a file. 7// 8#include <corecrt_internal_lowio.h> 9 10// Lookup table for UTF-8 lead bytes 11// Probably preferable to just ask if the bits are set than use an entire 12// table, however the macros using this were #defined in the header so 13// removing this extern table would break apps compiled to an earlier verison. 14// 1 for pattern 110xxxxx - 1 trailbyte 15// 2 for pattern 1110xxxx - 2 trailbytes 16// 3 for pattern 11110xxx - 3 trailbytes 17// 0 for everything else, including invalid patterns. 18// We return 0 for invalid patterns because we rely on MultiByteToWideChar to 19// do the validations. 20extern "C" { char _lookuptrailbytes[256] = 21{ 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 32 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 33 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 34 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0 35}; } 36 37 38static void store_lookahead(int const fh, char const c) throw() 39{ 40 _pipe_lookahead(fh)[0] = c; 41} 42 43static void store_lookahead(int const fh, wchar_t const c) throw() 44{ 45 char const* const byte_pointer = reinterpret_cast<char const*>(&c); 46 _pipe_lookahead(fh)[0] = byte_pointer[0]; 47 _pipe_lookahead(fh)[1] = byte_pointer[1]; 48 _pipe_lookahead(fh)[2] = LF; // Mark as empty 
49} 50 51 52 53static int __cdecl translate_utf16_from_console_nolock( 54 int const fh, 55 _Inout_updates_(count) wchar_t* const buffer, 56 size_t const count 57 ) throw() 58{ 59 // The translation can be performend in-place, because we are converting 60 // CRLF sequences into LF, so the resulting text will never be longer than 61 // any corresponding source text. 62 wchar_t* const buffer_end = buffer + count; 63 64 wchar_t* source_it = buffer; 65 wchar_t* result_it = buffer; 66 67 while (source_it < buffer_end) 68 { 69 // If at any point during translation we encounter a Ctrl+Z, we stop 70 // translating immediately: 71 if (*source_it == CTRLZ) 72 { 73 _osfile(fh) |= FEOFLAG; 74 break; 75 } 76 77 // When a CR character is encountered, we must check to see if the next 78 // character is an LF. If it is, then we skip the CR and copy only the 79 // LF: 80 if (*source_it == CR && source_it + 1 < buffer_end && *(source_it + 1) == LF) 81 { 82 source_it += 2; 83 *result_it++ = LF; 84 continue; 85 } 86 87 // Otherwise, we just copy the character: 88 *result_it++ = *source_it++; 89 } 90 91 // Return the number of bytes that we translated: 92 return static_cast<int>((result_it - buffer) * sizeof(wchar_t)); 93} 94 95 96 97template <typename Character> 98static int __cdecl translate_text_mode_nolock( 99 _In_ int const fh, 100 _Pre_writable_size_(count) _Post_readable_byte_size_(return) Character* const buffer, 101 _In_ size_t const count 102 ) throw() 103{ 104 HANDLE const os_handle = reinterpret_cast<HANDLE>(_osfhnd(fh)); 105 106 // If there is an LF at the beginning of the buffer, set the CRLF flag: 107 if (count != 0 && *buffer == LF) 108 { 109 _osfile(fh) |= FCRLF; 110 } 111 else 112 { 113 _osfile(fh) &= ~FCRLF; 114 } 115 116 // The translation can be performend in-place, because we are converting 117 // CRLF sequences into LF, so the resulting text will never be longer than 118 // any corresponding source text. 
119 Character* const buffer_end = buffer + count; 120 121 Character* source_it = buffer; 122 Character* result_it = buffer; 123 124 while (source_it < buffer_end) 125 { 126 // If during translation we encounter a Ctrl+Z, we stop translating 127 // immeidately. For devices, we need to just set the Ctrl+Z flag; 128 // for other files, we just copy the Ctrl+Z as a normal character 129 // before returning: 130 if (*source_it == CTRLZ) 131 { 132 if ((_osfile(fh) & FDEV) == 0) 133 { 134 _osfile(fh) |= FEOFLAG; 135 } 136 else 137 { 138 *result_it++ = *source_it++; 139 } 140 141 break; 142 } 143 144 // If the character is not a CR, then we can simply copy it: 145 if (*source_it != CR) 146 { 147 *result_it++ = *source_it++; 148 continue; 149 } 150 151 // Otherwise, the character is a CR. We need to look-ahead to see if 152 // the next character is an LF, so that we can perform the CRLF => LF 153 // translation. First, handle the easy case where the CR does not 154 // appear at the end of the buffer: 155 if (source_it + 1 < buffer_end) 156 { 157 if (*(source_it + 1) == LF) 158 { 159 source_it += 2; 160 *result_it++ = LF; // Convert CRLF => LF 161 } 162 else 163 { 164 *result_it++ = *source_it++; 165 } 166 167 continue; 168 } 169 170 // This is the hard case: The CR is at the end of the buffer. We need 171 // to peek ahead to see if the next character is an LF: 172 ++source_it; 173 174 Character peek; 175 DWORD peek_size; 176 if (!ReadFile(os_handle, &peek, sizeof(peek), &peek_size, nullptr) || peek_size == 0) 177 { 178 // We couldn't peek ahead; just store the CR: 179 *result_it++ = CR; 180 continue; 181 } 182 183 // The peek succeeded. What we do next depends on whether the file is 184 // seekable or not. 
First we handle the case where the file does not 185 // allow seeking: 186 if (_osfile(fh) & (FDEV | FPIPE)) 187 { 188 // If the peek character is an LF, then we just need to copy that 189 // character to the output buffer: 190 if (peek == LF) 191 { 192 *result_it++ = LF; 193 } 194 // Otherwise, it was some other character. We need to write the CR 195 // to the output buffer, then we need to store the peek character 196 // for later retrieval: 197 else 198 { 199 *result_it++ = CR; 200 store_lookahead(fh, peek); 201 } 202 } 203 // If the file does allow seeking, then we handle the peek differently. 204 // For seekable files, we translate the CRLF => LF by eliminating the 205 // CR. If the peek character is an LF, we simply do not write it to 206 // the output buffer; instead, we will seek backwards to unpeek the 207 // character, then let the LF get retrieved during the next call to 208 // read(). 209 else 210 { 211 // However, if the buffer is currenty empty, then this is a one- 212 // character read, so we store the LF in order that we make progress 213 if (peek == LF && result_it == buffer) 214 { 215 *result_it++ = LF; 216 } 217 // Otherwise, we do what is described above: we seek backwards and 218 // write the CR if and only if the peek character was not an LF: 219 else 220 { 221 _lseeki64_nolock(fh, -1 * static_cast<int>(sizeof(Character)), FILE_CURRENT); 222 if (peek != LF) 223 { 224 *result_it++ = CR; 225 } 226 } 227 } 228 } 229 230 // Return the number of bytes that we translated: 231 return static_cast<int>((result_it - buffer) * sizeof(Character)); 232} 233 234 235 236_Success_(return != -1) 237static int __cdecl translate_ansi_or_utf8_nolock( 238 int const fh, 239 _In_reads_(source_count) char* const source_buffer, 240 size_t const source_count, 241 _Pre_writable_size_(result_count) _Post_readable_byte_size_(return) wchar_t* const result_buffer, 242 size_t const result_count 243 ) throw() 244{ 245 int const text_mode_translation_result_size = 
translate_text_mode_nolock(fh, source_buffer, source_count); 246 247 // If we read no characters, then we are done: 248 if (text_mode_translation_result_size == 0) 249 { 250 return 0; 251 } 252 253 // If the file is open in ANSI mode, then no further translation is 254 // required; we can simply return the number of bytes that we read. 255 // Even though there is no translation, there may still be 256 // characters in the buffer due to CRLF translation (a CR without 257 // a LF would 'unget' the would-be LF). 258 // text_mode_translation_result_size has already been adjusted for 259 // CRLF translation by translate_text_mode_nolock(). 260 if (_textmode(fh) == __crt_lowio_text_mode::ansi) 261 { 262 return text_mode_translation_result_size; 263 } 264 265 // Otherwise, the file is open in UTF-8 mode and we read a nonzero number 266 // of characters. We need to translate from UTF-8 to UTF-16. To do this, 267 // we first need to hunt for the end of the translatable buffer. This may 268 // not be result_it, because we may have read a partial multibyte UTF-8 269 // character. 270 char* result_it = source_buffer + text_mode_translation_result_size - 1; 271 272 // If the last character is an independent character, then we can 273 // translate the entire buffer: 274 if (_utf8_is_independent(*result_it)) 275 { 276 ++result_it; // Reset the result_it 277 } 278 // Otherwise, we have to find the end of the last full UTF-8 character 279 // that was read: 280 else 281 { 282 // Walk backwards from the end of the buffer until we find a lead byte: 283 unsigned counter = 1; 284 while (!_utf8_is_leadbyte(*result_it) && counter <= 4 && result_it >= source_buffer) 285 { 286 --result_it; 287 ++counter; 288 } 289 290 // Now that we've found the last lead byte, determine whether the 291 // character is complete or incomplete. We compute the number of 292 // trailbytes... 
293 unsigned const trailbyte_count = _utf8_no_of_trailbytes(static_cast<const unsigned char>(*result_it)); 294 if (trailbyte_count == 0) 295 { 296 // Oh, apparently that wasn't a lead byte; the file contains invalid 297 // UTF-8 character sequences: 298 errno = EILSEQ; 299 return -1; 300 } 301 302 // If the lead byte plus the remaining bytes form a full set, then we 303 // can translate the entire buffer: 304 if (trailbyte_count + 1 == counter) 305 { 306 result_it += counter; 307 } 308 // Otherwise, the last character is incomplete, so we will not include 309 // this character in the result. We unget the last characters, either 310 // by seeking backwards if the file is seekable, or by buffering the 311 // characters. Note that result_it currently points one-past-the-end 312 // of the translatable buffer, because it points to the lead byte of 313 // the partially read character. 314 else 315 { 316 // If the file does not support seeking, buffer the characters: 317 if (_osfile(fh) & (FDEV | FPIPE)) 318 { 319 _pipe_lookahead(fh)[0] = *result_it++; 320 321 if (counter >= 2) 322 { 323 _pipe_lookahead(fh)[1] = *result_it++; 324 } 325 326 if (counter == 3) 327 { 328 _pipe_lookahead(fh)[2] = *result_it++; 329 } 330 331 // Now that we've buffered the characters, seek the end iterator 332 // back to the actual end of the translatable sequence: 333 result_it -= counter; 334 335 } 336 // If the file does support seeking, we can just seek backwards so 337 // that the next read will get the characters directly: 338 else 339 { 340 _lseeki64_nolock(fh, -static_cast<int>(counter), FILE_CURRENT); 341 } 342 } 343 } 344 345 // Finally, we can translate the characters into the result buffer: 346 int const characters_translated = static_cast<int>(__acrt_MultiByteToWideChar( 347 CP_UTF8, 348 0, 349 source_buffer, 350 static_cast<DWORD>(result_it - source_buffer), 351 result_buffer, 352 static_cast<DWORD>(result_count))); 353 354 if (characters_translated == 0) 355 { 356 
__acrt_errno_map_os_error(GetLastError()); 357 return -1; 358 } 359 360 _utf8translations(fh) = (characters_translated != static_cast<int>(result_it - source_buffer)); 361 362 // MultiByteToWideChar returns the number of wide characters that 363 // it produced; we need to return the number of bytes: 364 return characters_translated * sizeof(wchar_t); 365} 366 367 368 369// Reads bytes from a file. This function attempts to read enough bytes to fill 370// the provided buffer. If the file is in text mode, CRLF sequences are mapped 371// to LF, thus affecting the number of characters read. This mapping does not 372// affect the file pointer. 373// 374// Returns the number of bytes read, which may be less than the number of bytes 375// requested if EOF was reached or if the file is in text mode. Returns -1 and 376// sets errno on failure. 377extern "C" int __cdecl _read(int const fh, void* const buffer, unsigned const buffer_size) 378{ 379 _CHECK_FH_CLEAR_OSSERR_RETURN(fh, EBADF, -1); 380 _VALIDATE_CLEAR_OSSERR_RETURN(fh >= 0 && (unsigned)fh < (unsigned)_nhandle, EBADF, -1); 381 _VALIDATE_CLEAR_OSSERR_RETURN(_osfile(fh) & FOPEN, EBADF, -1); 382 _VALIDATE_CLEAR_OSSERR_RETURN(buffer_size <= INT_MAX, EINVAL, -1); 383 384 __acrt_lowio_lock_fh(fh); 385 int result = -1; 386 __try 387 { 388 if ((_osfile(fh) & FOPEN) == 0) 389 { 390 errno = EBADF; 391 _doserrno = 0; 392 _ASSERTE(("Invalid file descriptor. 
File possibly closed by a different thread",0)); 393 __leave; 394 } 395 396 result = _read_nolock(fh, buffer, buffer_size); 397 } 398 __finally 399 { 400 __acrt_lowio_unlock_fh(fh); 401 } 402 __endtry 403 return result; 404} 405 406 407 408extern "C" int __cdecl _read_nolock( 409 int const fh, 410 void* const result_buffer, 411 unsigned const result_buffer_size 412 ) 413{ 414 _CHECK_FH_CLEAR_OSSERR_RETURN(fh, EBADF, -1 ); 415 _VALIDATE_CLEAR_OSSERR_RETURN(fh >= 0 && (unsigned)fh < (unsigned)_nhandle, EBADF, -1); 416 _VALIDATE_CLEAR_OSSERR_RETURN(_osfile(fh) & FOPEN, EBADF, -1); 417 _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer_size <= INT_MAX, EINVAL, -1); 418 419 // If there is no data to be written or if the file is at EOF, no work to do: 420 if (result_buffer_size == 0 || (_osfile(fh) & FEOFLAG)) 421 return 0; 422 423 _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer != nullptr, EINVAL, -1); 424 425 426 HANDLE const os_handle = reinterpret_cast<HANDLE>(_osfhnd(fh)); 427 __crt_lowio_text_mode const text_mode = _textmode(fh); 428 429 430 __crt_unique_heap_ptr<char> owned_internal_buffer; 431 432 char* internal_buffer; 433 unsigned internal_buffer_remaining; 434 switch (text_mode) 435 { 436 case __crt_lowio_text_mode::utf8: 437 // For UTF-8 files, we need two buffers, because after reading we need 438 // to convert the text into Unicode. MultiByteToWideChar doesn't do 439 // in-place conversions. 440 // 441 // The multibyte to wide character conversion may double the size of the 442 // text, hence we halve the size here. 443 // 444 // Since we are reading a UTF-8 stream, the number of bytes read may 445 // vary from 'size' characters to 'size/4' characters. For this reason, 446 // if we need to read 'size' characters, we will allocate an MBCS buffer 447 // of size 'size'. In case the size is zero, we will use four as a 448 // minimum value. This will make sure we don't overflow when we read 449 // from a pipe. 
450 // 451 // In this case, the number of wide characters that we can read is 452 // size / 2. This means that we require a buffer of size size / 2. 453 454 // For UTF-8 the count always needs to be an even number: 455 _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer_size % 2 == 0, EINVAL, -1); 456 457 internal_buffer_remaining = (result_buffer_size / 2) < 4 458 ? 4 459 : (result_buffer_size/2); 460 461 owned_internal_buffer = _malloc_crt_t(char, internal_buffer_remaining); 462 internal_buffer = owned_internal_buffer.get(); 463 if (!internal_buffer) 464 { 465 errno = ENOMEM; 466 _doserrno = ERROR_NOT_ENOUGH_MEMORY; 467 return -1; 468 } 469 470 _startpos(fh) = _lseeki64_nolock(fh, 0, FILE_CURRENT); 471 break; 472 473 case __crt_lowio_text_mode::utf16le: 474 // For UTF-16 the count always needs to be an even number: 475 _VALIDATE_CLEAR_OSSERR_RETURN((result_buffer_size % 2) == 0, EINVAL, -1); 476 477 // For UTF-16 files, we can directly use the input buffer: 478 internal_buffer_remaining = result_buffer_size; 479 internal_buffer = static_cast<char*>(result_buffer); 480 break; 481 482 default: 483 // For ANSI files, we can directly use the input buffer: 484 internal_buffer_remaining = result_buffer_size; 485 internal_buffer = static_cast<char*>(result_buffer); 486 break; 487 } 488 489 wchar_t* wide_internal_buffer = reinterpret_cast<wchar_t*>(internal_buffer); 490 491 int bytes_read = 0; 492 493 // We may have buffered look-ahead characters during the last read. If 494 // so, read them into the buffer and set the look-ahead buffers back to 495 // empty state (with the value of LF): 496 // 497 // CRT_REFACTOR This look-ahead buffering could use additional work, but 498 // will require nonlocal changes, so that work is not included in this 499 // changeset. 
500 if ((_osfile(fh) & (FPIPE | FDEV)) && 501 _pipe_lookahead(fh)[0] != LF && 502 internal_buffer_remaining != 0) 503 { 504 *internal_buffer++ = _pipe_lookahead(fh)[0]; 505 ++bytes_read; 506 --internal_buffer_remaining; 507 _pipe_lookahead(fh)[0] = LF; 508 509 // For UTF-16, there may be an additional look-ahead character 510 // bufferred. For UTF-8, there may be two more: 511 if (text_mode != __crt_lowio_text_mode::ansi && 512 _pipe_lookahead(fh)[1] != LF && 513 internal_buffer_remaining != 0) 514 { 515 *internal_buffer++ = _pipe_lookahead(fh)[1]; 516 ++bytes_read; 517 --internal_buffer_remaining; 518 _pipe_lookahead(fh)[1] = LF; 519 520 if (text_mode == __crt_lowio_text_mode::utf8 && 521 _pipe_lookahead(fh)[2] != LF && 522 internal_buffer_remaining != 0) 523 { 524 *internal_buffer++ = _pipe_lookahead(fh)[2]; 525 ++bytes_read; 526 --internal_buffer_remaining; 527 _pipe_lookahead(fh)[2] = LF; 528 } 529 } 530 } 531 532 DWORD console_mode; 533 bool const from_console = 534 _isatty(fh) && 535 (_osfile(fh) & FTEXT) && 536 GetConsoleMode(os_handle, &console_mode); 537 538 // Read the data directly from the console: 539 if (from_console && text_mode == __crt_lowio_text_mode::utf16le) 540 { 541 DWORD console_characters_read; 542 if (!ReadConsoleW( 543 os_handle, 544 internal_buffer, 545 internal_buffer_remaining / sizeof(wchar_t), 546 &console_characters_read, 547 nullptr)) 548 { 549 __acrt_errno_map_os_error(GetLastError()); 550 return -1; 551 } 552 553 // In UTF-16 mode, the return value is the actual number of wide 554 // characters read; we need the number of bytes: 555 bytes_read += console_characters_read * sizeof(wchar_t); 556 } 557 // Otherwise, read the data from the file normally: 558 else 559 { 560 DWORD bytes_read_from_file; 561 if (!ReadFile( 562 os_handle, 563 internal_buffer, 564 internal_buffer_remaining, 565 &bytes_read_from_file, 566 nullptr 567 ) || bytes_read_from_file > result_buffer_size) 568 { 569 DWORD const last_error = GetLastError(); 570 if 
(last_error == ERROR_ACCESS_DENIED) 571 { 572 // ERROR_ACCESS_DENIED occurs if the file is open with the wrong 573 // read/write mode. For this error, we should return EBADF, not 574 // the EACCES that will be set by __acrt_errno_map_os_error: 575 errno = EBADF; 576 _doserrno = last_error; 577 return -1; 578 579 } 580 else if (last_error == ERROR_BROKEN_PIPE) 581 { 582 // Return 0 if ERROR_BROKEN_PIPE occurs. It means the handle is 583 // a read handle on a pipe for which all write handles have been 584 // closed and all data has been read: 585 return 0; 586 } 587 else 588 { 589 // Otherwise, map the error normally and return: 590 __acrt_errno_map_os_error(last_error); 591 return -1; 592 } 593 } 594 595 bytes_read += bytes_read_from_file; 596 } 597 598 599 // If the file is open in binary mode, no translation is required, so we 600 // can skip all of the rest of this function: 601 if ((_osfile(fh) & FTEXT) == 0) 602 return bytes_read; 603 604 605 // Perform the CRLF => LF translation and convert to the required 606 // encoding (UTF-8 must be converted to UTF-16). This first case 607 // handles UTF-8 and ANSI: 608 if (text_mode != __crt_lowio_text_mode::utf16le) 609 { 610 return translate_ansi_or_utf8_nolock( 611 fh, 612 internal_buffer, 613 bytes_read, 614 static_cast<wchar_t*>(result_buffer), 615 result_buffer_size / sizeof(wchar_t)); 616 } 617 618 // The text mode is __crt_lowio_text_mode::utf16le and we are reading from the 619 // console: 620 else if (from_console) 621 { 622 return translate_utf16_from_console_nolock( 623 fh, 624 wide_internal_buffer, 625 bytes_read / sizeof(wchar_t)); 626 } 627 // Otherwise, the text mode is __crt_lowio_text_mode::utf16le and we are NOT 628 // reading from the console: 629 else 630 { 631 return translate_text_mode_nolock( 632 fh, 633 wide_internal_buffer, 634 bytes_read / sizeof(wchar_t)); 635 } 636}