sdk/lib/ucrt/lowio/read.cpp at listview · huwcampbell.com/reactos

huwcampbell.com / reactos
fork atom
Reactos
fork atom
reactos / sdk / lib / ucrt / lowio / read.cpp
at listview 636 lines 23 kB view raw
wrap content
Timo Kreuzer [UCRT] Make SEH blocks ReactOS PSEH compatible 1y ago
e98e9000
  1//
  2// read.cpp
  3//
  4//      Copyright (c) Microsoft Corporation. All rights reserved.
  5//
  6// Defines _read(), which reads bytes from a file.
  7//
  8#include <corecrt_internal_lowio.h>
  9
 10// Lookup table for UTF-8 lead bytes
 11// Probably preferable to just ask if the bits are set than use an entire
 12// table, however the macros using this were #defined in the header so
 13// removing this extern table would break apps compiled to an earlier verison.
 14//    1 for pattern 110xxxxx - 1 trailbyte
 15//    2 for pattern 1110xxxx - 2 trailbytes
 16//    3 for pattern 11110xxx - 3 trailbytes
 17//    0 for everything else, including invalid patterns.
 18// We return 0 for invalid patterns because we rely on MultiByteToWideChar to
 19// do the validations.
 20extern "C" { char _lookuptrailbytes[256] =
 21{
 22    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 23    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 24    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 25    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 26    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 27    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 28    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 29    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 30    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 31    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
 32    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 33    1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 34    3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0
 35}; }
 36
 37
 38static void store_lookahead(int const fh, char const c) throw()
 39{
 40    _pipe_lookahead(fh)[0] = c;
 41}
 42
 43static void store_lookahead(int const fh, wchar_t const c) throw()
 44{
 45    char const* const byte_pointer = reinterpret_cast<char const*>(&c);
 46    _pipe_lookahead(fh)[0] = byte_pointer[0];
 47    _pipe_lookahead(fh)[1] = byte_pointer[1];
 48    _pipe_lookahead(fh)[2] = LF; // Mark as empty
 49}
 50
 51
 52
 53static int __cdecl translate_utf16_from_console_nolock(
 54                            int      const  fh,
 55    _Inout_updates_(count)  wchar_t* const  buffer,
 56                            size_t   const  count
 57    ) throw()
 58{
 59    // The translation can be performend in-place, because we are converting
 60    // CRLF sequences into LF, so the resulting text will never be longer than
 61    // any corresponding source text.
 62    wchar_t* const buffer_end = buffer + count;
 63
 64    wchar_t* source_it = buffer;
 65    wchar_t* result_it = buffer;
 66
 67    while (source_it < buffer_end)
 68    {
 69        // If at any point during translation we encounter a Ctrl+Z, we stop
 70        // translating immediately:
 71        if (*source_it == CTRLZ)
 72        {
 73            _osfile(fh) |= FEOFLAG;
 74            break;
 75        }
 76
 77        // When a CR character is encountered, we must check to see if the next
 78        // character is an LF.  If it is, then we skip the CR and copy only the
 79        // LF:
 80        if (*source_it == CR && source_it + 1 < buffer_end && *(source_it + 1) == LF)
 81        {
 82            source_it += 2;
 83            *result_it++ = LF;
 84            continue;
 85        }
 86
 87        // Otherwise, we just copy the character:
 88        *result_it++ = *source_it++;
 89    }
 90
 91    // Return the number of bytes that we translated:
 92    return static_cast<int>((result_it - buffer) * sizeof(wchar_t));
 93}
 94
 95
 96
 97template <typename Character>
 98static int __cdecl translate_text_mode_nolock(
 99    _In_                                                         int        const fh,
100    _Pre_writable_size_(count) _Post_readable_byte_size_(return) Character* const buffer,
101    _In_                                                         size_t     const count
102    ) throw()
103{
104    HANDLE const os_handle = reinterpret_cast<HANDLE>(_osfhnd(fh));
105
106    // If there is an LF at the beginning of the buffer, set the CRLF flag:
107    if (count != 0 && *buffer == LF)
108    {
109        _osfile(fh) |= FCRLF;
110    }
111    else
112    {
113        _osfile(fh) &= ~FCRLF;
114    }
115
116    // The translation can be performend in-place, because we are converting
117    // CRLF sequences into LF, so the resulting text will never be longer than
118    // any corresponding source text.
119    Character* const buffer_end = buffer + count;
120
121    Character* source_it = buffer;
122    Character* result_it = buffer;
123
124    while (source_it < buffer_end)
125    {
126        // If during translation we encounter a Ctrl+Z, we stop translating
127        // immeidately.  For devices, we need to just set the Ctrl+Z flag;
128        // for other files, we just copy the Ctrl+Z as a normal character
129        // before returning:
130        if (*source_it == CTRLZ)
131        {
132            if ((_osfile(fh) & FDEV) == 0)
133            {
134                _osfile(fh) |= FEOFLAG;
135            }
136            else
137            {
138                *result_it++ = *source_it++;
139            }
140
141            break;
142        }
143
144        // If the character is not a CR, then we can simply copy it:
145        if (*source_it != CR)
146        {
147            *result_it++ = *source_it++;
148            continue;
149        }
150
151        // Otherwise, the character is a CR.  We need to look-ahead to see if
152        // the next character is an LF, so that we can perform the CRLF => LF
153        // translation.  First, handle the easy case where the CR does not
154        // appear at the end of the buffer:
155        if (source_it + 1 < buffer_end)
156        {
157            if (*(source_it + 1) == LF)
158            {
159                source_it += 2;
160                *result_it++ = LF; // Convert CRLF => LF
161            }
162            else
163            {
164                *result_it++ = *source_it++;
165            }
166
167            continue;
168        }
169
170        // This is the hard case:  The CR is at the end of the buffer.  We need
171        // to peek ahead to see if the next character is an LF:
172        ++source_it;
173
174        Character peek;
175        DWORD     peek_size;
176        if (!ReadFile(os_handle, &peek, sizeof(peek), &peek_size, nullptr) || peek_size == 0)
177        {
178            // We couldn't peek ahead; just store the CR:
179            *result_it++ = CR;
180            continue;
181        }
182
183        // The peek succeeded.  What we do next depends on whether the file is
184        // seekable or not.  First we handle the case where the file does not
185        // allow seeking:
186        if (_osfile(fh) & (FDEV | FPIPE))
187        {
188            // If the peek character is an LF, then we just need to copy that
189            // character to the output buffer:
190            if (peek == LF)
191            {
192                *result_it++ = LF;
193            }
194            // Otherwise, it was some other character.  We need to write the CR
195            // to the output buffer, then we need to store the peek character
196            // for later retrieval:
197            else
198            {
199                *result_it++ = CR;
200                store_lookahead(fh, peek);
201            }
202        }
203        // If the file does allow seeking, then we handle the peek differently.
204        // For seekable files, we translate the CRLF => LF by eliminating the
205        // CR.  If the peek character is an LF, we simply do not write it to
206        // the output buffer; instead, we will seek backwards to unpeek the
207        // character, then let the LF get retrieved during the next call to
208        // read().
209        else
210        {
211            // However, if the buffer is currenty empty, then this is a one-
212            // character read, so we store the LF in order that we make progress
213            if (peek == LF && result_it == buffer)
214            {
215                *result_it++ = LF;
216            }
217            // Otherwise, we do what is described above:  we seek backwards and
218            // write the CR if and only if the peek character was not an LF:
219            else
220            {
221                _lseeki64_nolock(fh, -1 * static_cast<int>(sizeof(Character)), FILE_CURRENT);
222                if (peek != LF)
223                {
224                    *result_it++ = CR;
225                }
226            }
227        }
228    }
229
230    // Return the number of bytes that we translated:
231    return static_cast<int>((result_it - buffer) * sizeof(Character));
232}
233
234
235
236_Success_(return != -1)
237static int __cdecl translate_ansi_or_utf8_nolock(
238                                                                        int      const fh,
239    _In_reads_(source_count)                                            char*    const source_buffer,
240                                                                        size_t   const source_count,
241    _Pre_writable_size_(result_count) _Post_readable_byte_size_(return) wchar_t* const result_buffer,
242                                                                        size_t   const result_count
243    ) throw()
244{
245    int const text_mode_translation_result_size = translate_text_mode_nolock(fh, source_buffer, source_count);
246
247    // If we read no characters, then we are done:
248    if (text_mode_translation_result_size == 0)
249    {
250        return 0;
251    }
252
253    // If the file is open in ANSI mode, then no further translation is
254    // required; we can simply return the number of bytes that we read.
255    // Even though there is no translation, there may still be
256    // characters in the buffer due to CRLF translation (a CR without
257    // a LF would 'unget' the would-be LF).
258    // text_mode_translation_result_size has already been adjusted for
259    // CRLF translation by translate_text_mode_nolock().
260    if (_textmode(fh) == __crt_lowio_text_mode::ansi)
261    {
262        return text_mode_translation_result_size;
263    }
264
265    // Otherwise, the file is open in UTF-8 mode and we read a nonzero number
266    // of characters.  We need to translate from UTF-8 to UTF-16.  To do this,
267    // we first need to hunt for the end of the translatable buffer.  This may
268    // not be result_it, because we may have read a partial multibyte UTF-8
269    // character.
270    char* result_it = source_buffer + text_mode_translation_result_size - 1;
271
272    // If the last character is an independent character, then we can
273    // translate the entire buffer:
274    if (_utf8_is_independent(*result_it))
275    {
276        ++result_it; // Reset the result_it
277    }
278    // Otherwise, we have to find the end of the last full UTF-8 character
279    // that was read:
280    else
281    {
282        // Walk backwards from the end of the buffer until we find a lead byte:
283        unsigned counter = 1;
284        while (!_utf8_is_leadbyte(*result_it) && counter <= 4 && result_it >= source_buffer)
285        {
286            --result_it;
287            ++counter;
288        }
289
290        // Now that we've found the last lead byte, determine whether the
291        // character is complete or incomplete.  We compute the number of
292        // trailbytes...
293        unsigned const trailbyte_count = _utf8_no_of_trailbytes(static_cast<const unsigned char>(*result_it));
294        if (trailbyte_count == 0)
295        {
296            // Oh, apparently that wasn't a lead byte; the file contains invalid
297            // UTF-8 character sequences:
298            errno = EILSEQ;
299            return -1;
300        }
301
302        // If the lead byte plus the remaining bytes form a full set, then we
303        // can translate the entire buffer:
304        if (trailbyte_count + 1 == counter)
305        {
306            result_it += counter;
307        }
308        // Otherwise, the last character is incomplete, so we will not include
309        // this character in the result.  We unget the last characters, either
310        // by seeking backwards if the file is seekable, or by buffering the
311        // characters.  Note that result_it currently points one-past-the-end
312        // of the translatable buffer, because it points to the lead byte of
313        // the partially read character.
314        else
315        {
316            // If the file does not support seeking, buffer the characters:
317            if (_osfile(fh) & (FDEV | FPIPE))
318            {
319                _pipe_lookahead(fh)[0] = *result_it++;
320
321                if (counter >= 2)
322                {
323                    _pipe_lookahead(fh)[1] = *result_it++;
324                }
325
326                if (counter == 3)
327                {
328                    _pipe_lookahead(fh)[2] = *result_it++;
329                }
330
331                // Now that we've buffered the characters, seek the end iterator
332                // back to the actual end of the translatable sequence:
333                result_it -= counter;
334
335            }
336            // If the file does support seeking, we can just seek backwards so
337            // that the next read will get the characters directly:
338            else
339            {
340                _lseeki64_nolock(fh, -static_cast<int>(counter), FILE_CURRENT);
341            }
342        }
343    }
344
345    // Finally, we can translate the characters into the result buffer:
346    int const characters_translated = static_cast<int>(__acrt_MultiByteToWideChar(
347            CP_UTF8,
348            0,
349            source_buffer,
350            static_cast<DWORD>(result_it - source_buffer),
351            result_buffer,
352            static_cast<DWORD>(result_count)));
353
354    if (characters_translated == 0)
355    {
356        __acrt_errno_map_os_error(GetLastError());
357        return -1;
358    }
359
360    _utf8translations(fh) = (characters_translated != static_cast<int>(result_it - source_buffer));
361
362    // MultiByteToWideChar returns the number of wide characters that
363    // it produced; we need to return the number of bytes:
364    return characters_translated * sizeof(wchar_t);
365}
366
367
368
369// Reads bytes from a file.  This function attempts to read enough bytes to fill
370// the provided buffer.  If the file is in text mode, CRLF sequences are mapped
371// to LF, thus affecting the number of characters read.  This mapping does not
372// affect the file pointer.
373//
374// Returns the number of bytes read, which may be less than the number of bytes
375// requested if EOF was reached or if the file is in text mode.  Returns -1 and
376// sets errno on failure.
377extern "C" int __cdecl _read(int const fh, void* const buffer, unsigned const buffer_size)
378{
379    _CHECK_FH_CLEAR_OSSERR_RETURN(fh, EBADF, -1);
380    _VALIDATE_CLEAR_OSSERR_RETURN(fh >= 0 && (unsigned)fh < (unsigned)_nhandle, EBADF, -1);
381    _VALIDATE_CLEAR_OSSERR_RETURN(_osfile(fh) & FOPEN, EBADF, -1);
382    _VALIDATE_CLEAR_OSSERR_RETURN(buffer_size <= INT_MAX, EINVAL, -1);
383
384    __acrt_lowio_lock_fh(fh);
385    int result = -1;
386    __try
387    {
388        if ((_osfile(fh) & FOPEN) == 0)
389        {
390            errno = EBADF;
391            _doserrno = 0;
392            _ASSERTE(("Invalid file descriptor. File possibly closed by a different thread",0));
393            __leave;
394        }
395
396        result = _read_nolock(fh, buffer, buffer_size);
397    }
398    __finally
399    {
400        __acrt_lowio_unlock_fh(fh);
401    }
402    __endtry
403    return result;
404}
405
406
407
408extern "C" int __cdecl _read_nolock(
409    int      const fh,
410    void*    const result_buffer,
411    unsigned const result_buffer_size
412    )
413{
414    _CHECK_FH_CLEAR_OSSERR_RETURN(fh, EBADF, -1 );
415    _VALIDATE_CLEAR_OSSERR_RETURN(fh >= 0 && (unsigned)fh < (unsigned)_nhandle, EBADF, -1);
416    _VALIDATE_CLEAR_OSSERR_RETURN(_osfile(fh) & FOPEN, EBADF, -1);
417    _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer_size <= INT_MAX, EINVAL, -1);
418
419    // If there is no data to be written or if the file is at EOF, no work to do:
420    if (result_buffer_size == 0 || (_osfile(fh) & FEOFLAG))
421        return 0;
422
423    _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer != nullptr, EINVAL, -1);
424
425
426    HANDLE const os_handle = reinterpret_cast<HANDLE>(_osfhnd(fh));
427    __crt_lowio_text_mode const text_mode = _textmode(fh);
428
429
430    __crt_unique_heap_ptr<char> owned_internal_buffer;
431
432    char*    internal_buffer;
433    unsigned internal_buffer_remaining;
434    switch (text_mode)
435    {
436    case __crt_lowio_text_mode::utf8:
437        // For UTF-8 files, we need two buffers, because after reading we need
438        // to convert the text into Unicode.  MultiByteToWideChar doesn't do
439        // in-place conversions.
440        //
441        // The multibyte to wide character conversion may double the size of the
442        // text, hence we halve the size here.
443        //
444        // Since we are reading a UTF-8 stream, the number of bytes read may
445        // vary from 'size' characters to 'size/4' characters.  For this reason,
446        // if we need to read 'size' characters, we will allocate an MBCS buffer
447        // of size 'size'.  In case the size is zero, we will use four as a
448        // minimum value.  This will make sure we don't overflow when we read
449        // from a pipe.
450        //
451        // In this case, the number of wide characters that we can read is
452        // size / 2.  This means that we require a buffer of size size / 2.
453
454        // For UTF-8 the count always needs to be an even number:
455        _VALIDATE_CLEAR_OSSERR_RETURN(result_buffer_size % 2 == 0, EINVAL, -1);
456
457        internal_buffer_remaining = (result_buffer_size / 2) < 4
458            ? 4
459            : (result_buffer_size/2);
460
461        owned_internal_buffer = _malloc_crt_t(char, internal_buffer_remaining);
462        internal_buffer = owned_internal_buffer.get();
463        if (!internal_buffer)
464        {
465            errno = ENOMEM;
466            _doserrno = ERROR_NOT_ENOUGH_MEMORY;
467            return -1;
468        }
469
470        _startpos(fh) = _lseeki64_nolock(fh, 0, FILE_CURRENT);
471        break;
472
473    case __crt_lowio_text_mode::utf16le:
474        // For UTF-16 the count always needs to be an even number:
475        _VALIDATE_CLEAR_OSSERR_RETURN((result_buffer_size % 2) == 0, EINVAL, -1);
476
477        // For UTF-16 files, we can directly use the input buffer:
478        internal_buffer_remaining = result_buffer_size;
479        internal_buffer           = static_cast<char*>(result_buffer);
480        break;
481
482    default:
483        // For ANSI files, we can directly use the input buffer:
484        internal_buffer_remaining = result_buffer_size;
485        internal_buffer           = static_cast<char*>(result_buffer);
486        break;
487    }
488
489    wchar_t* wide_internal_buffer = reinterpret_cast<wchar_t*>(internal_buffer);
490
491    int bytes_read = 0;
492
493    // We may have buffered look-ahead characters during the last read.  If
494    // so, read them into the buffer and set the look-ahead buffers back to
495    // empty state (with the value of LF):
496    //
497    // CRT_REFACTOR This look-ahead buffering could use additional work, but
498    // will require nonlocal changes, so that work is not included in this
499    // changeset.
500    if ((_osfile(fh) & (FPIPE | FDEV)) &&
501        _pipe_lookahead(fh)[0] != LF &&
502        internal_buffer_remaining != 0)
503    {
504        *internal_buffer++ = _pipe_lookahead(fh)[0];
505        ++bytes_read;
506        --internal_buffer_remaining;
507        _pipe_lookahead(fh)[0] = LF;
508
509        // For UTF-16, there may be an additional look-ahead character
510        // bufferred.  For UTF-8, there may be two more:
511        if (text_mode != __crt_lowio_text_mode::ansi &&
512            _pipe_lookahead(fh)[1] != LF &&
513            internal_buffer_remaining != 0)
514        {
515            *internal_buffer++ = _pipe_lookahead(fh)[1];
516            ++bytes_read;
517            --internal_buffer_remaining;
518            _pipe_lookahead(fh)[1] = LF;
519
520            if (text_mode == __crt_lowio_text_mode::utf8 &&
521                _pipe_lookahead(fh)[2] != LF &&
522                internal_buffer_remaining != 0)
523            {
524                *internal_buffer++ = _pipe_lookahead(fh)[2];
525                ++bytes_read;
526                --internal_buffer_remaining;
527                _pipe_lookahead(fh)[2] = LF;
528            }
529        }
530    }
531
532    DWORD console_mode;
533    bool const from_console =
534        _isatty(fh) &&
535        (_osfile(fh) & FTEXT) &&
536        GetConsoleMode(os_handle, &console_mode);
537
538    // Read the data directly from the console:
539    if (from_console && text_mode == __crt_lowio_text_mode::utf16le)
540    {
541        DWORD console_characters_read;
542        if (!ReadConsoleW(
543                os_handle,
544                internal_buffer,
545                internal_buffer_remaining / sizeof(wchar_t),
546                &console_characters_read,
547                nullptr))
548        {
549            __acrt_errno_map_os_error(GetLastError());
550            return -1;
551        }
552
553        // In UTF-16 mode, the return value is the actual number of wide
554        // characters read; we need the number of bytes:
555        bytes_read += console_characters_read * sizeof(wchar_t);
556    }
557    // Otherwise, read the data from the file normally:
558    else
559    {
560        DWORD bytes_read_from_file;
561        if (!ReadFile(
562                os_handle,
563                internal_buffer,
564                internal_buffer_remaining,
565                &bytes_read_from_file,
566                nullptr
567            ) || bytes_read_from_file > result_buffer_size)
568        {
569            DWORD const last_error = GetLastError();
570            if (last_error == ERROR_ACCESS_DENIED)
571            {
572                // ERROR_ACCESS_DENIED occurs if the file is open with the wrong
573                // read/write mode.  For this error, we should return EBADF, not
574                // the EACCES that will be set by __acrt_errno_map_os_error:
575                errno = EBADF;
576                _doserrno = last_error;
577                return -1;
578
579            }
580            else if (last_error == ERROR_BROKEN_PIPE)
581            {
582                // Return 0 if ERROR_BROKEN_PIPE occurs.  It means the handle is
583                // a read handle on a pipe for which all write handles have been
584                // closed and all data has been read:
585                return 0;
586            }
587            else
588            {
589                // Otherwise, map the error normally and return:
590                __acrt_errno_map_os_error(last_error);
591                return -1;
592            }
593        }
594
595        bytes_read += bytes_read_from_file;
596    }
597
598
599    // If the file is open in binary mode, no translation is required, so we
600    // can skip all of the rest of this function:
601    if ((_osfile(fh) & FTEXT) == 0)
602        return bytes_read;
603
604
605    // Perform the CRLF => LF translation and convert to the required
606    // encoding (UTF-8 must be converted to UTF-16).  This first case
607    // handles UTF-8 and ANSI:
608    if (text_mode != __crt_lowio_text_mode::utf16le)
609    {
610        return translate_ansi_or_utf8_nolock(
611            fh,
612            internal_buffer,
613            bytes_read,
614            static_cast<wchar_t*>(result_buffer),
615            result_buffer_size / sizeof(wchar_t));
616    }
617
618    // The text mode is __crt_lowio_text_mode::utf16le and we are reading from the
619    // console:
620    else if (from_console)
621    {
622        return translate_utf16_from_console_nolock(
623            fh,
624            wide_internal_buffer,
625            bytes_read / sizeof(wchar_t));
626    }
627    // Otherwise, the text mode is __crt_lowio_text_mode::utf16le and we are NOT
628    // reading from the console:
629    else
630    {
631        return translate_text_mode_nolock(
632            fh,
633            wide_internal_buffer,
634            bytes_read / sizeof(wchar_t));
635    }
636}