yaml-cpp

FORK: A YAML parser and emitter in C++
git clone https://git.neptards.moe/neptards/yaml-cpp.git
Log | Files | Refs | README | LICENSE

stream.cpp (11907B)


      1 #include <iostream>
      2 
      3 #include "stream.h"
      4 
      // Size of the prefetch buffer allocated by the Stream constructor and the
      // number of bytes requested from the input streambuf per refill.  The
      // #ifndef guard lets a build supply its own value ahead of this point.
      5 #ifndef YAML_PREFETCH_SIZE
      6 #define YAML_PREFETCH_SIZE 2048
      7 #endif
      8 
      // S_ARRAY_SIZE(A): element count of the array A.
      // S_ARRAY_END(A): pointer one past the last element of the array A.
      9 #define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A)))
     10 #define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A))
     11 
      // U+FFFD REPLACEMENT CHARACTER: queued in place of code units that cannot
      // be decoded, and in place of the code point equal to Stream::eof().
     12 #define CP_REPLACEMENT_CHARACTER (0xFFFD)
     13 
     14 namespace YAML {
     // States of the encoding-detection automaton that the Stream constructor
     // runs over the leading bytes of its input.  Each value doubles as the row
     // index into s_introFinalState, s_introTransitions and s_introUngetCount,
     // so the numbering is spelled out explicitly.
     enum UtfIntroState {
       uis_start = 0,
       uis_utfbe_b1 = 1,
       uis_utf32be_b2 = 2,
       uis_utf32be_bom3 = 3,
       uis_utf32be = 4,
       uis_utf16be = 5,
       uis_utf16be_bom1 = 6,
       uis_utfle_bom1 = 7,
       uis_utf16le_bom2 = 8,
       uis_utf32le_bom3 = 9,
       uis_utf16le = 10,
       uis_utf32le = 11,
       uis_utf8_imp = 12,
       uis_utf16le_imp = 13,
       uis_utf32le_imp3 = 14,
       uis_utf8_bom1 = 15,
       uis_utf8_bom2 = 16,
       uis_utf8 = 17,
       uis_error = 18
     };
     36 
     // Input classes fed to the encoding-detection automaton.  Each value is the
     // column index into s_introTransitions and s_introUngetCount; uictMax is
     // the number of classes and is used as the column count of those tables.
     enum UtfIntroCharType {
       uict00 = 0,
       uictBB = 1,
       uictBF = 2,
       uictEF = 3,
       uictFE = 4,
       uictFF = 5,
       uictAscii = 6,
       uictOther = 7,
       uictMax = 8
     };
     48 
     // Indexed by UtfIntroState: true for the states in which the detection loop
     // in the Stream constructor stops reading.  The table is only ever read, so
     // it is declared const.
     static const bool s_introFinalState[] = {
         false,  // uis_start
         false,  // uis_utfbe_b1
         false,  // uis_utf32be_b2
         false,  // uis_utf32be_bom3
         true,   // uis_utf32be
         true,   // uis_utf16be
         false,  // uis_utf16be_bom1
         false,  // uis_utfle_bom1
         false,  // uis_utf16le_bom2
         false,  // uis_utf32le_bom3
         true,   // uis_utf16le
         true,   // uis_utf32le
         false,  // uis_utf8_imp
         false,  // uis_utf16le_imp
         false,  // uis_utf32le_imp3
         false,  // uis_utf8_bom1
         false,  // uis_utf8_bom2
         true,   // uis_utf8
         true,   // uis_error
     };
     70 
     71 static UtfIntroState s_introTransitions[][uictMax] = {
     72     // uict00,           uictBB,           uictBF,           uictEF,
     73     // uictFE,           uictFF,           uictAscii,        uictOther
     74     {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1,
     75      uis_utfle_bom1, uis_utf8_imp, uis_utf8},
     76     {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
     77      uis_utf16be, uis_utf8},
     78     {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8,
     79      uis_utf8, uis_utf8},
     80     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8,
     81      uis_utf8},
     82     {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be,
     83      uis_utf32be, uis_utf32be, uis_utf32be},
     84     {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be,
     85      uis_utf16be, uis_utf16be, uis_utf16be},
     86     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8,
     87      uis_utf8},
     88     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8,
     89      uis_utf8, uis_utf8},
     90     {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
     91      uis_utf16le, uis_utf16le, uis_utf16le},
     92     {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
     93      uis_utf16le, uis_utf16le, uis_utf16le},
     94     {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
     95      uis_utf16le, uis_utf16le, uis_utf16le},
     96     {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le,
     97      uis_utf32le, uis_utf32le, uis_utf32le},
     98     {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
     99      uis_utf8, uis_utf8},
    100     {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
    101      uis_utf16le, uis_utf16le, uis_utf16le},
    102     {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le,
    103      uis_utf16le, uis_utf16le, uis_utf16le},
    104     {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
    105      uis_utf8},
    106     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
    107      uis_utf8},
    108     {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8,
    109      uis_utf8},
    110 };
    111 
    112 static char s_introUngetCount[][uictMax] = {
    113     // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
    114     {0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
    115     {3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4},
    116     {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
    117     {2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2},
    118     {0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2},
    119     {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1},
    120     {0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3},
    121     {4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2},
    122     {3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1},
    123 };
    124 
    125 inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) {
    126   if (std::istream::traits_type::eof() == ch) {
    127     return uictOther;
    128   }
    129 
    130   switch (ch) {
    131     case 0:
    132       return uict00;
    133     case 0xBB:
    134       return uictBB;
    135     case 0xBF:
    136       return uictBF;
    137     case 0xEF:
    138       return uictEF;
    139     case 0xFE:
    140       return uictFE;
    141     case 0xFF:
    142       return uictFF;
    143   }
    144 
    145   if ((ch > 0) && (ch < 0xFF)) {
    146     return uictAscii;
    147   }
    148 
    149   return uictOther;
    150 }
    151 
    // Builds a single UTF-8 byte from the code point `ch`.
    //   lead_bits - number of leading 1 bits in the byte's marker (0 for a
    //               single-byte character, 1 for a continuation byte, 2..4 for
    //               the first byte of a 2..4 byte sequence)
    //   rshift    - how far `ch` is shifted right to bring this byte's payload
    //               bits into the low positions
    // The payload occupies the 8 - (lead_bits + 1) low bits of the result.
    inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits,
                           unsigned char rshift) {
      // Shifting 0xFF00 right by lead_bits leaves exactly lead_bits ones at the
      // top of the low byte.
      const unsigned marker = (0xFF00u >> lead_bits) & 0xFFu;
      const unsigned long payloadMask = 0x7Fu >> lead_bits;
      const unsigned long payload = (ch >> rshift) & payloadMask;
      const unsigned char byte = static_cast<unsigned char>(marker | payload);
      return static_cast<char>(byte);
    }
    160 
    161 inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) {
    162   // We are not allowed to queue the Stream::eof() codepoint, so
    163   // replace it with CP_REPLACEMENT_CHARACTER
    164   if (static_cast<unsigned long>(Stream::eof()) == ch) {
    165     ch = CP_REPLACEMENT_CHARACTER;
    166   }
    167 
    168   if (ch < 0x80) {
    169     q.push_back(Utf8Adjust(ch, 0, 0));
    170   } else if (ch < 0x800) {
    171     q.push_back(Utf8Adjust(ch, 2, 6));
    172     q.push_back(Utf8Adjust(ch, 1, 0));
    173   } else if (ch < 0x10000) {
    174     q.push_back(Utf8Adjust(ch, 3, 12));
    175     q.push_back(Utf8Adjust(ch, 1, 6));
    176     q.push_back(Utf8Adjust(ch, 1, 0));
    177   } else {
    178     q.push_back(Utf8Adjust(ch, 4, 18));
    179     q.push_back(Utf8Adjust(ch, 1, 12));
    180     q.push_back(Utf8Adjust(ch, 1, 6));
    181     q.push_back(Utf8Adjust(ch, 1, 0));
    182   }
    183 }
    184 
    185 Stream::Stream(std::istream& input)
    186     : m_input(input),
    187       m_mark{},
    188       m_charSet{},
    189       m_readahead{},
    190       m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]),
    191       m_nPrefetchedAvailable(0),
    192       m_nPrefetchedUsed(0) {
    193   using char_traits = std::istream::traits_type;
    194 
    195   if (!input)
    196     return;
    197 
    198   // Determine (or guess) the character-set by reading the BOM, if any.  See
    199   // the YAML specification for the determination algorithm.
    200   char_traits::int_type intro[4]{};
    201   int nIntroUsed = 0;
    202   UtfIntroState state = uis_start;
    203   for (; !s_introFinalState[state];) {
    204     std::istream::int_type ch = input.get();
    205     intro[nIntroUsed++] = ch;
    206     UtfIntroCharType charType = IntroCharTypeOf(ch);
    207     UtfIntroState newState = s_introTransitions[state][charType];
    208     int nUngets = s_introUngetCount[state][charType];
    209     if (nUngets > 0) {
    210       input.clear();
    211       for (; nUngets > 0; --nUngets) {
    212         if (char_traits::eof() != intro[--nIntroUsed])
    213           input.putback(char_traits::to_char_type(intro[nIntroUsed]));
    214       }
    215     }
    216     state = newState;
    217   }
    218 
    219   switch (state) {
    220     case uis_utf8:
    221       m_charSet = utf8;
    222       break;
    223     case uis_utf16le:
    224       m_charSet = utf16le;
    225       break;
    226     case uis_utf16be:
    227       m_charSet = utf16be;
    228       break;
    229     case uis_utf32le:
    230       m_charSet = utf32le;
    231       break;
    232     case uis_utf32be:
    233       m_charSet = utf32be;
    234       break;
    235     default:
    236       m_charSet = utf8;
    237       break;
    238   }
    239 
    240   ReadAheadTo(0);
    241 }
    242 
    243 Stream::~Stream() { delete[] m_pPrefetched; }
    244 
    245 char Stream::peek() const {
    246   if (m_readahead.empty()) {
    247     return Stream::eof();
    248   }
    249 
    250   return m_readahead[0];
    251 }
    252 
    253 Stream::operator bool() const {
    254   return m_input.good() ||
    255          (!m_readahead.empty() && m_readahead[0] != Stream::eof());
    256 }
    257 
    258 // get
    259 // . Extracts a character from the stream and updates our position
    260 char Stream::get() {
    261   char ch = peek();
    262   AdvanceCurrent();
    263   m_mark.column++;
    264 
    265   if (ch == '\n') {
    266     m_mark.column = 0;
    267     m_mark.line++;
    268   }
    269 
    270   return ch;
    271 }
    272 
    273 // get
    274 // . Extracts 'n' characters from the stream and updates our position
    275 std::string Stream::get(int n) {
    276   std::string ret;
    277   if (n > 0) {
    278     ret.reserve(static_cast<std::string::size_type>(n));
    279     for (int i = 0; i < n; i++)
    280       ret += get();
    281   }
    282   return ret;
    283 }
    284 
    285 // eat
    286 // . Eats 'n' characters and updates our position.
    287 void Stream::eat(int n) {
    288   for (int i = 0; i < n; i++)
    289     get();
    290 }
    291 
    292 void Stream::AdvanceCurrent() {
    293   if (!m_readahead.empty()) {
    294     m_readahead.pop_front();
    295     m_mark.pos++;
    296   }
    297 
    298   ReadAheadTo(0);
    299 }
    300 
    301 bool Stream::_ReadAheadTo(size_t i) const {
    302   while (m_input.good() && (m_readahead.size() <= i)) {
    303     switch (m_charSet) {
    304       case utf8:
    305         StreamInUtf8();
    306         break;
    307       case utf16le:
    308         StreamInUtf16();
    309         break;
    310       case utf16be:
    311         StreamInUtf16();
    312         break;
    313       case utf32le:
    314         StreamInUtf32();
    315         break;
    316       case utf32be:
    317         StreamInUtf32();
    318         break;
    319     }
    320   }
    321 
    322   // signal end of stream
    323   if (!m_input.good())
    324     m_readahead.push_back(Stream::eof());
    325 
    326   return m_readahead.size() > i;
    327 }
    328 
    329 void Stream::StreamInUtf8() const {
    330   unsigned char b = GetNextByte();
    331   if (m_input.good()) {
    332     m_readahead.push_back(static_cast<char>(b));
    333   }
    334 }
    335 
    336 void Stream::StreamInUtf16() const {
    337   unsigned long ch = 0;
    338   unsigned char bytes[2];
    339   int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
    340 
    341   bytes[0] = GetNextByte();
    342   bytes[1] = GetNextByte();
    343   if (!m_input.good()) {
    344     return;
    345   }
    346   ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
    347        static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
    348 
    349   if (ch >= 0xDC00 && ch < 0xE000) {
    350     // Trailing (low) surrogate...ugh, wrong order
    351     QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
    352     return;
    353   }
    354 
    355   if (ch >= 0xD800 && ch < 0xDC00) {
    356     // ch is a leading (high) surrogate
    357 
    358     // Four byte UTF-8 code point
    359 
    360     // Read the trailing (low) surrogate
    361     for (;;) {
    362       bytes[0] = GetNextByte();
    363       bytes[1] = GetNextByte();
    364       if (!m_input.good()) {
    365         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
    366         return;
    367       }
    368       unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
    369                             static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
    370       if (chLow < 0xDC00 || chLow >= 0xE000) {
    371         // Trouble...not a low surrogate.  Dump a REPLACEMENT CHARACTER into the
    372         // stream.
    373         QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
    374 
    375         // Deal with the next UTF-16 unit
    376         if (chLow < 0xD800 || chLow >= 0xE000) {
    377           // Easiest case: queue the codepoint and return
    378           QueueUnicodeCodepoint(m_readahead, ch);
    379           return;
    380         }
    381         // Start the loop over with the new high surrogate
    382         ch = chLow;
    383         continue;
    384       }
    385 
    386       // Select the payload bits from the high surrogate
    387       ch &= 0x3FF;
    388       ch <<= 10;
    389 
    390       // Include bits from low surrogate
    391       ch |= (chLow & 0x3FF);
    392 
    393       // Add the surrogacy offset
    394       ch += 0x10000;
    395       break;
    396     }
    397   }
    398 
    399   QueueUnicodeCodepoint(m_readahead, ch);
    400 }
    401 
    // Views an unsigned-char buffer as the char* that the streambuf read
    // functions expect.  The address is unchanged.
    inline char* ReadBuffer(unsigned char* pBuffer) {
      void* const raw = pBuffer;
      return static_cast<char*>(raw);
    }
    405 
    406 unsigned char Stream::GetNextByte() const {
    407   if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) {
    408     std::streambuf* pBuf = m_input.rdbuf();
    409     m_nPrefetchedAvailable = static_cast<std::size_t>(
    410         pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE));
    411     m_nPrefetchedUsed = 0;
    412     if (!m_nPrefetchedAvailable) {
    413       m_input.setstate(std::ios_base::eofbit);
    414     }
    415 
    416     if (0 == m_nPrefetchedAvailable) {
    417       return 0;
    418     }
    419   }
    420 
    421   return m_pPrefetched[m_nPrefetchedUsed++];
    422 }
    423 
    424 void Stream::StreamInUtf32() const {
    425   static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}};
    426 
    427   unsigned long ch = 0;
    428   unsigned char bytes[4];
    429   int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
    430 
    431   bytes[0] = GetNextByte();
    432   bytes[1] = GetNextByte();
    433   bytes[2] = GetNextByte();
    434   bytes[3] = GetNextByte();
    435   if (!m_input.good()) {
    436     return;
    437   }
    438 
    439   for (int i = 0; i < 4; ++i) {
    440     ch <<= 8;
    441     ch |= bytes[pIndexes[i]];
    442   }
    443 
    444   QueueUnicodeCodepoint(m_readahead, ch);
    445 }
    446 }  // namespace YAML