// stream.cpp
1 #include <iostream> 2 3 #include "stream.h" 4 5 #ifndef YAML_PREFETCH_SIZE 6 #define YAML_PREFETCH_SIZE 2048 7 #endif 8 9 #define S_ARRAY_SIZE(A) (sizeof(A) / sizeof(*(A))) 10 #define S_ARRAY_END(A) ((A) + S_ARRAY_SIZE(A)) 11 12 #define CP_REPLACEMENT_CHARACTER (0xFFFD) 13 14 namespace YAML { 15 enum UtfIntroState { 16 uis_start, 17 uis_utfbe_b1, 18 uis_utf32be_b2, 19 uis_utf32be_bom3, 20 uis_utf32be, 21 uis_utf16be, 22 uis_utf16be_bom1, 23 uis_utfle_bom1, 24 uis_utf16le_bom2, 25 uis_utf32le_bom3, 26 uis_utf16le, 27 uis_utf32le, 28 uis_utf8_imp, 29 uis_utf16le_imp, 30 uis_utf32le_imp3, 31 uis_utf8_bom1, 32 uis_utf8_bom2, 33 uis_utf8, 34 uis_error 35 }; 36 37 enum UtfIntroCharType { 38 uict00, 39 uictBB, 40 uictBF, 41 uictEF, 42 uictFE, 43 uictFF, 44 uictAscii, 45 uictOther, 46 uictMax 47 }; 48 49 static bool s_introFinalState[] = { 50 false, // uis_start 51 false, // uis_utfbe_b1 52 false, // uis_utf32be_b2 53 false, // uis_utf32be_bom3 54 true, // uis_utf32be 55 true, // uis_utf16be 56 false, // uis_utf16be_bom1 57 false, // uis_utfle_bom1 58 false, // uis_utf16le_bom2 59 false, // uis_utf32le_bom3 60 true, // uis_utf16le 61 true, // uis_utf32le 62 false, // uis_utf8_imp 63 false, // uis_utf16le_imp 64 false, // uis_utf32le_imp3 65 false, // uis_utf8_bom1 66 false, // uis_utf8_bom2 67 true, // uis_utf8 68 true, // uis_error 69 }; 70 71 static UtfIntroState s_introTransitions[][uictMax] = { 72 // uict00, uictBB, uictBF, uictEF, 73 // uictFE, uictFF, uictAscii, uictOther 74 {uis_utfbe_b1, uis_utf8, uis_utf8, uis_utf8_bom1, uis_utf16be_bom1, 75 uis_utfle_bom1, uis_utf8_imp, uis_utf8}, 76 {uis_utf32be_b2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 77 uis_utf16be, uis_utf8}, 78 {uis_utf32be, uis_utf8, uis_utf8, uis_utf8, uis_utf32be_bom3, uis_utf8, 79 uis_utf8, uis_utf8}, 80 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf32be, uis_utf8, 81 uis_utf8}, 82 {uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, uis_utf32be, 83 uis_utf32be, uis_utf32be, 
uis_utf32be}, 84 {uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, uis_utf16be, 85 uis_utf16be, uis_utf16be, uis_utf16be}, 86 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16be, uis_utf8, 87 uis_utf8}, 88 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf16le_bom2, uis_utf8, 89 uis_utf8, uis_utf8}, 90 {uis_utf32le_bom3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, 91 uis_utf16le, uis_utf16le, uis_utf16le}, 92 {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, 93 uis_utf16le, uis_utf16le, uis_utf16le}, 94 {uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, 95 uis_utf16le, uis_utf16le, uis_utf16le}, 96 {uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, uis_utf32le, 97 uis_utf32le, uis_utf32le, uis_utf32le}, 98 {uis_utf16le_imp, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 99 uis_utf8, uis_utf8}, 100 {uis_utf32le_imp3, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, 101 uis_utf16le, uis_utf16le, uis_utf16le}, 102 {uis_utf32le, uis_utf16le, uis_utf16le, uis_utf16le, uis_utf16le, 103 uis_utf16le, uis_utf16le, uis_utf16le}, 104 {uis_utf8, uis_utf8_bom2, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 105 uis_utf8}, 106 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 107 uis_utf8}, 108 {uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, uis_utf8, 109 uis_utf8}, 110 }; 111 112 static char s_introUngetCount[][uictMax] = { 113 // uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther 114 {0, 1, 1, 0, 0, 0, 0, 1}, {0, 2, 2, 2, 2, 2, 2, 2}, 115 {3, 3, 3, 3, 0, 3, 3, 3}, {4, 4, 4, 4, 4, 0, 4, 4}, 116 {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1}, 117 {2, 2, 2, 2, 2, 0, 2, 2}, {2, 2, 2, 2, 0, 2, 2, 2}, 118 {0, 1, 1, 1, 1, 1, 1, 1}, {0, 2, 2, 2, 2, 2, 2, 2}, 119 {1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1, 1, 1}, 120 {0, 2, 2, 2, 2, 2, 2, 2}, {0, 3, 3, 3, 3, 3, 3, 3}, 121 {4, 4, 4, 4, 4, 4, 4, 4}, {2, 0, 2, 2, 2, 2, 2, 2}, 122 {3, 3, 0, 3, 3, 3, 3, 3}, {1, 1, 1, 1, 1, 
1, 1, 1}, 123 }; 124 125 inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch) { 126 if (std::istream::traits_type::eof() == ch) { 127 return uictOther; 128 } 129 130 switch (ch) { 131 case 0: 132 return uict00; 133 case 0xBB: 134 return uictBB; 135 case 0xBF: 136 return uictBF; 137 case 0xEF: 138 return uictEF; 139 case 0xFE: 140 return uictFE; 141 case 0xFF: 142 return uictFF; 143 } 144 145 if ((ch > 0) && (ch < 0xFF)) { 146 return uictAscii; 147 } 148 149 return uictOther; 150 } 151 152 inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, 153 unsigned char rshift) { 154 const unsigned char header = 155 static_cast<unsigned char>(((1 << lead_bits) - 1) << (8 - lead_bits)); 156 const unsigned char mask = (0xFF >> (lead_bits + 1)); 157 return static_cast<char>( 158 static_cast<unsigned char>(header | ((ch >> rshift) & mask))); 159 } 160 161 inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch) { 162 // We are not allowed to queue the Stream::eof() codepoint, so 163 // replace it with CP_REPLACEMENT_CHARACTER 164 if (static_cast<unsigned long>(Stream::eof()) == ch) { 165 ch = CP_REPLACEMENT_CHARACTER; 166 } 167 168 if (ch < 0x80) { 169 q.push_back(Utf8Adjust(ch, 0, 0)); 170 } else if (ch < 0x800) { 171 q.push_back(Utf8Adjust(ch, 2, 6)); 172 q.push_back(Utf8Adjust(ch, 1, 0)); 173 } else if (ch < 0x10000) { 174 q.push_back(Utf8Adjust(ch, 3, 12)); 175 q.push_back(Utf8Adjust(ch, 1, 6)); 176 q.push_back(Utf8Adjust(ch, 1, 0)); 177 } else { 178 q.push_back(Utf8Adjust(ch, 4, 18)); 179 q.push_back(Utf8Adjust(ch, 1, 12)); 180 q.push_back(Utf8Adjust(ch, 1, 6)); 181 q.push_back(Utf8Adjust(ch, 1, 0)); 182 } 183 } 184 185 Stream::Stream(std::istream& input) 186 : m_input(input), 187 m_mark{}, 188 m_charSet{}, 189 m_readahead{}, 190 m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), 191 m_nPrefetchedAvailable(0), 192 m_nPrefetchedUsed(0) { 193 using char_traits = std::istream::traits_type; 194 195 if (!input) 196 return; 197 198 // 
Determine (or guess) the character-set by reading the BOM, if any. See 199 // the YAML specification for the determination algorithm. 200 char_traits::int_type intro[4]{}; 201 int nIntroUsed = 0; 202 UtfIntroState state = uis_start; 203 for (; !s_introFinalState[state];) { 204 std::istream::int_type ch = input.get(); 205 intro[nIntroUsed++] = ch; 206 UtfIntroCharType charType = IntroCharTypeOf(ch); 207 UtfIntroState newState = s_introTransitions[state][charType]; 208 int nUngets = s_introUngetCount[state][charType]; 209 if (nUngets > 0) { 210 input.clear(); 211 for (; nUngets > 0; --nUngets) { 212 if (char_traits::eof() != intro[--nIntroUsed]) 213 input.putback(char_traits::to_char_type(intro[nIntroUsed])); 214 } 215 } 216 state = newState; 217 } 218 219 switch (state) { 220 case uis_utf8: 221 m_charSet = utf8; 222 break; 223 case uis_utf16le: 224 m_charSet = utf16le; 225 break; 226 case uis_utf16be: 227 m_charSet = utf16be; 228 break; 229 case uis_utf32le: 230 m_charSet = utf32le; 231 break; 232 case uis_utf32be: 233 m_charSet = utf32be; 234 break; 235 default: 236 m_charSet = utf8; 237 break; 238 } 239 240 ReadAheadTo(0); 241 } 242 243 Stream::~Stream() { delete[] m_pPrefetched; } 244 245 char Stream::peek() const { 246 if (m_readahead.empty()) { 247 return Stream::eof(); 248 } 249 250 return m_readahead[0]; 251 } 252 253 Stream::operator bool() const { 254 return m_input.good() || 255 (!m_readahead.empty() && m_readahead[0] != Stream::eof()); 256 } 257 258 // get 259 // . Extracts a character from the stream and updates our position 260 char Stream::get() { 261 char ch = peek(); 262 AdvanceCurrent(); 263 m_mark.column++; 264 265 if (ch == '\n') { 266 m_mark.column = 0; 267 m_mark.line++; 268 } 269 270 return ch; 271 } 272 273 // get 274 // . 
Extracts 'n' characters from the stream and updates our position 275 std::string Stream::get(int n) { 276 std::string ret; 277 if (n > 0) { 278 ret.reserve(static_cast<std::string::size_type>(n)); 279 for (int i = 0; i < n; i++) 280 ret += get(); 281 } 282 return ret; 283 } 284 285 // eat 286 // . Eats 'n' characters and updates our position. 287 void Stream::eat(int n) { 288 for (int i = 0; i < n; i++) 289 get(); 290 } 291 292 void Stream::AdvanceCurrent() { 293 if (!m_readahead.empty()) { 294 m_readahead.pop_front(); 295 m_mark.pos++; 296 } 297 298 ReadAheadTo(0); 299 } 300 301 bool Stream::_ReadAheadTo(size_t i) const { 302 while (m_input.good() && (m_readahead.size() <= i)) { 303 switch (m_charSet) { 304 case utf8: 305 StreamInUtf8(); 306 break; 307 case utf16le: 308 StreamInUtf16(); 309 break; 310 case utf16be: 311 StreamInUtf16(); 312 break; 313 case utf32le: 314 StreamInUtf32(); 315 break; 316 case utf32be: 317 StreamInUtf32(); 318 break; 319 } 320 } 321 322 // signal end of stream 323 if (!m_input.good()) 324 m_readahead.push_back(Stream::eof()); 325 326 return m_readahead.size() > i; 327 } 328 329 void Stream::StreamInUtf8() const { 330 unsigned char b = GetNextByte(); 331 if (m_input.good()) { 332 m_readahead.push_back(static_cast<char>(b)); 333 } 334 } 335 336 void Stream::StreamInUtf16() const { 337 unsigned long ch = 0; 338 unsigned char bytes[2]; 339 int nBigEnd = (m_charSet == utf16be) ? 
0 : 1; 340 341 bytes[0] = GetNextByte(); 342 bytes[1] = GetNextByte(); 343 if (!m_input.good()) { 344 return; 345 } 346 ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | 347 static_cast<unsigned long>(bytes[1 ^ nBigEnd]); 348 349 if (ch >= 0xDC00 && ch < 0xE000) { 350 // Trailing (low) surrogate...ugh, wrong order 351 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); 352 return; 353 } 354 355 if (ch >= 0xD800 && ch < 0xDC00) { 356 // ch is a leading (high) surrogate 357 358 // Four byte UTF-8 code point 359 360 // Read the trailing (low) surrogate 361 for (;;) { 362 bytes[0] = GetNextByte(); 363 bytes[1] = GetNextByte(); 364 if (!m_input.good()) { 365 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); 366 return; 367 } 368 unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | 369 static_cast<unsigned long>(bytes[1 ^ nBigEnd]); 370 if (chLow < 0xDC00 || chLow >= 0xE000) { 371 // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the 372 // stream. 
373 QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); 374 375 // Deal with the next UTF-16 unit 376 if (chLow < 0xD800 || chLow >= 0xE000) { 377 // Easiest case: queue the codepoint and return 378 QueueUnicodeCodepoint(m_readahead, ch); 379 return; 380 } 381 // Start the loop over with the new high surrogate 382 ch = chLow; 383 continue; 384 } 385 386 // Select the payload bits from the high surrogate 387 ch &= 0x3FF; 388 ch <<= 10; 389 390 // Include bits from low surrogate 391 ch |= (chLow & 0x3FF); 392 393 // Add the surrogacy offset 394 ch += 0x10000; 395 break; 396 } 397 } 398 399 QueueUnicodeCodepoint(m_readahead, ch); 400 } 401 402 inline char* ReadBuffer(unsigned char* pBuffer) { 403 return reinterpret_cast<char*>(pBuffer); 404 } 405 406 unsigned char Stream::GetNextByte() const { 407 if (m_nPrefetchedUsed >= m_nPrefetchedAvailable) { 408 std::streambuf* pBuf = m_input.rdbuf(); 409 m_nPrefetchedAvailable = static_cast<std::size_t>( 410 pBuf->sgetn(ReadBuffer(m_pPrefetched), YAML_PREFETCH_SIZE)); 411 m_nPrefetchedUsed = 0; 412 if (!m_nPrefetchedAvailable) { 413 m_input.setstate(std::ios_base::eofbit); 414 } 415 416 if (0 == m_nPrefetchedAvailable) { 417 return 0; 418 } 419 } 420 421 return m_pPrefetched[m_nPrefetchedUsed++]; 422 } 423 424 void Stream::StreamInUtf32() const { 425 static int indexes[2][4] = {{3, 2, 1, 0}, {0, 1, 2, 3}}; 426 427 unsigned long ch = 0; 428 unsigned char bytes[4]; 429 int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0]; 430 431 bytes[0] = GetNextByte(); 432 bytes[1] = GetNextByte(); 433 bytes[2] = GetNextByte(); 434 bytes[3] = GetNextByte(); 435 if (!m_input.good()) { 436 return; 437 } 438 439 for (int i = 0; i < 4; ++i) { 440 ch <<= 8; 441 ch |= bytes[pIndexes[i]]; 442 } 443 444 QueueUnicodeCodepoint(m_readahead, ch); 445 } 446 } // namespace YAML