emitterutils.cpp (13250B)
1 #include <algorithm> 2 #include <iomanip> 3 #include <sstream> 4 5 #include "emitterutils.h" 6 #include "exp.h" 7 #include "indentation.h" 8 #include "regex_yaml.h" 9 #include "regeximpl.h" 10 #include "stringsource.h" 11 #include "yaml-cpp/binary.h" // IWYU pragma: keep 12 #include "yaml-cpp/null.h" 13 #include "yaml-cpp/ostream_wrapper.h" 14 15 namespace YAML { 16 namespace Utils { 17 namespace { 18 enum { REPLACEMENT_CHARACTER = 0xFFFD }; 19 20 bool IsAnchorChar(int ch) { // test for ns-anchor-char 21 switch (ch) { 22 case ',': 23 case '[': 24 case ']': 25 case '{': 26 case '}': // c-flow-indicator 27 case ' ': 28 case '\t': // s-white 29 case 0xFEFF: // c-byte-order-mark 30 case 0xA: 31 case 0xD: // b-char 32 return false; 33 case 0x85: 34 return true; 35 } 36 37 if (ch < 0x20) { 38 return false; 39 } 40 41 if (ch < 0x7E) { 42 return true; 43 } 44 45 if (ch < 0xA0) { 46 return false; 47 } 48 if (ch >= 0xD800 && ch <= 0xDFFF) { 49 return false; 50 } 51 if ((ch & 0xFFFE) == 0xFFFE) { 52 return false; 53 } 54 if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) { 55 return false; 56 } 57 if (ch > 0x10FFFF) { 58 return false; 59 } 60 61 return true; 62 } 63 64 int Utf8BytesIndicated(char ch) { 65 int byteVal = static_cast<unsigned char>(ch); 66 switch (byteVal >> 4) { 67 case 0: 68 case 1: 69 case 2: 70 case 3: 71 case 4: 72 case 5: 73 case 6: 74 case 7: 75 return 1; 76 case 12: 77 case 13: 78 return 2; 79 case 14: 80 return 3; 81 case 15: 82 return 4; 83 default: 84 return -1; 85 } 86 } 87 88 bool IsTrailingByte(char ch) { return (ch & 0xC0) == 0x80; } 89 90 bool GetNextCodePointAndAdvance(int& codePoint, 91 std::string::const_iterator& first, 92 std::string::const_iterator last) { 93 if (first == last) 94 return false; 95 96 int nBytes = Utf8BytesIndicated(*first); 97 if (nBytes < 1) { 98 // Bad lead byte 99 ++first; 100 codePoint = REPLACEMENT_CHARACTER; 101 return true; 102 } 103 104 if (nBytes == 1) { 105 codePoint = *first++; 106 return true; 107 } 108 109 // Gather bits from trailing bytes 110 codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes)); 111 ++first; 112 --nBytes; 113 for (; nBytes > 0; ++first, --nBytes) { 114 if ((first == last) || !IsTrailingByte(*first)) { 115 codePoint = REPLACEMENT_CHARACTER; 116 break; 117 } 118 codePoint <<= 6; 119 codePoint |= *first & 0x3F; 120 } 121 122 // Check for illegal code points 123 if (codePoint > 0x10FFFF) 124 codePoint = REPLACEMENT_CHARACTER; 125 else if (codePoint >= 0xD800 && codePoint <= 0xDFFF) 126 codePoint = REPLACEMENT_CHARACTER; 127 else if ((codePoint & 0xFFFE) == 0xFFFE) 128 codePoint = REPLACEMENT_CHARACTER; 129 else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) 130 codePoint = REPLACEMENT_CHARACTER; 131 return true; 132 } 133 134 void WriteCodePoint(ostream_wrapper& out, int codePoint) { 135 if (codePoint < 0 || codePoint > 0x10FFFF) { 136 codePoint = REPLACEMENT_CHARACTER; 137 } 138 if (codePoint <= 0x7F) { 139 out << static_cast<char>(codePoint); 140 } else if (codePoint <= 0x7FF) { 141 out << static_cast<char>(0xC0 | (codePoint >> 6)) 142 << static_cast<char>(0x80 | (codePoint & 0x3F)); 143 } else if (codePoint <= 0xFFFF) { 144 out << static_cast<char>(0xE0 | (codePoint >> 12)) 145 << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F)) 146 << static_cast<char>(0x80 | (codePoint & 0x3F)); 147 } else { 148 out << static_cast<char>(0xF0 | (codePoint >> 18)) 149 << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F)) 150 << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F)) 151 << static_cast<char>(0x80 | (codePoint & 0x3F)); 152 } 153 } 154 155 bool IsValidPlainScalar(const std::string& str, FlowType::value flowType, 156 bool allowOnlyAscii) { 157 // check against null 158 if (IsNullString(str)) { 159 return false; 160 } 161 162 // check the start 163 const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow() 164 : Exp::PlainScalar()); 165 if (!start.Matches(str)) { 166 return false; 167 } 168 169 // and check the end for plain whitespace (which can't be faithfully kept in a 170 // plain scalar) 171 if (!str.empty() && *str.rbegin() == ' ') { 172 return false; 173 } 174 175 // then check until something is disallowed 176 static const RegEx& disallowed_flow = 177 Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) | 178 Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() | 179 Exp::Tab(); 180 static const RegEx& disallowed_block = 181 Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) | 182 Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() | 183 Exp::Tab(); 184 const RegEx& disallowed = 185 flowType == FlowType::Flow ? disallowed_flow : disallowed_block; 186 187 StringCharSource buffer(str.c_str(), str.size()); 188 while (buffer) { 189 if (disallowed.Matches(buffer)) { 190 return false; 191 } 192 if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) { 193 return false; 194 } 195 ++buffer; 196 } 197 198 return true; 199 } 200 201 bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) { 202 // TODO: check for non-printable characters? 203 return std::none_of(str.begin(), str.end(), [=](char ch) { 204 return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch))) || 205 (ch == '\n'); 206 }); 207 } 208 209 bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType, 210 bool escapeNonAscii) { 211 if (flowType == FlowType::Flow) { 212 return false; 213 } 214 215 // TODO: check for non-printable characters? 216 return std::none_of(str.begin(), str.end(), [=](char ch) { 217 return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch))); 218 }); 219 } 220 221 std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) { 222 const uint32_t leadOffset = 0xD800 - (0x10000 >> 10); 223 224 return { 225 leadOffset | (codePoint >> 10), 226 0xDC00 | (codePoint & 0x3FF), 227 }; 228 } 229 230 void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) { 231 static const char hexDigits[] = "0123456789abcdef"; 232 233 out << "\\"; 234 int digits = 8; 235 if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) { 236 out << "x"; 237 digits = 2; 238 } else if (codePoint < 0xFFFF) { 239 out << "u"; 240 digits = 4; 241 } else if (stringEscapingStyle != StringEscaping::JSON) { 242 out << "U"; 243 digits = 8; 244 } else { 245 auto surrogatePair = EncodeUTF16SurrogatePair(codePoint); 246 WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle); 247 WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle); 248 return; 249 } 250 251 // Write digits into the escape sequence 252 for (; digits > 0; --digits) 253 out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF]; 254 } 255 256 bool WriteAliasName(ostream_wrapper& out, const std::string& str) { 257 int codePoint; 258 for (std::string::const_iterator i = str.begin(); 259 GetNextCodePointAndAdvance(codePoint, i, str.end());) { 260 if (!IsAnchorChar(codePoint)) { 261 return false; 262 } 263 264 WriteCodePoint(out, codePoint); 265 } 266 return true; 267 } 268 } // namespace 269 270 StringFormat::value ComputeStringFormat(const std::string& str, 271 EMITTER_MANIP strFormat, 272 FlowType::value flowType, 273 bool escapeNonAscii) { 274 switch (strFormat) { 275 case Auto: 276 if (IsValidPlainScalar(str, flowType, escapeNonAscii)) { 277 return StringFormat::Plain; 278 } 279 return StringFormat::DoubleQuoted; 280 case SingleQuoted: 281 if (IsValidSingleQuotedScalar(str, escapeNonAscii)) { 282 return StringFormat::SingleQuoted; 283 } 284 return StringFormat::DoubleQuoted; 285 case DoubleQuoted: 286 return StringFormat::DoubleQuoted; 287 case Literal: 288 if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) { 289 return StringFormat::Literal; 290 } 291 return StringFormat::DoubleQuoted; 292 default: 293 break; 294 } 295 296 return StringFormat::DoubleQuoted; 297 } 298 299 bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) { 300 out << "'"; 301 int codePoint; 302 for (std::string::const_iterator i = str.begin(); 303 GetNextCodePointAndAdvance(codePoint, i, str.end());) { 304 if (codePoint == '\n') { 305 return false; // We can't handle a new line and the attendant indentation 306 // yet 307 } 308 309 if (codePoint == '\'') { 310 out << "''"; 311 } else { 312 WriteCodePoint(out, codePoint); 313 } 314 } 315 out << "'"; 316 return true; 317 } 318 319 bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str, 320 StringEscaping::value stringEscaping) { 321 out << "\""; 322 int codePoint; 323 for (std::string::const_iterator i = str.begin(); 324 GetNextCodePointAndAdvance(codePoint, i, str.end());) { 325 switch (codePoint) { 326 case '\"': 327 out << "\\\""; 328 break; 329 case '\\': 330 out << "\\\\"; 331 break; 332 case '\n': 333 out << "\\n"; 334 break; 335 case '\t': 336 out << "\\t"; 337 break; 338 case '\r': 339 out << "\\r"; 340 break; 341 case '\b': 342 out << "\\b"; 343 break; 344 case '\f': 345 out << "\\f"; 346 break; 347 default: 348 if (codePoint < 0x20 || 349 (codePoint >= 0x80 && 350 codePoint <= 0xA0)) { // Control characters and non-breaking space 351 WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); 352 } else if (codePoint == 0xFEFF) { // Byte order marks (ZWNS) should be 353 // escaped (YAML 1.2, sec. 5.2) 354 WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); 355 } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) { 356 WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping); 357 } else { 358 WriteCodePoint(out, codePoint); 359 } 360 } 361 } 362 out << "\""; 363 return true; 364 } 365 366 bool WriteLiteralString(ostream_wrapper& out, const std::string& str, 367 std::size_t indent) { 368 out << "|\n"; 369 int codePoint; 370 for (std::string::const_iterator i = str.begin(); 371 GetNextCodePointAndAdvance(codePoint, i, str.end());) { 372 if (codePoint == '\n') { 373 out << "\n"; 374 } else { 375 out<< IndentTo(indent); 376 WriteCodePoint(out, codePoint); 377 } 378 } 379 return true; 380 } 381 382 bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) { 383 if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) { 384 out << ch; 385 } else if (ch == '\"') { 386 out << R"("\"")"; 387 } else if (ch == '\t') { 388 out << R"("\t")"; 389 } else if (ch == '\n') { 390 out << R"("\n")"; 391 } else if (ch == '\b') { 392 out << R"("\b")"; 393 } else if (ch == '\r') { 394 out << R"("\r")"; 395 } else if (ch == '\f') { 396 out << R"("\f")"; 397 } else if (ch == '\\') { 398 out << R"("\\")"; 399 } else if (0x20 <= ch && ch <= 0x7e) { 400 out << "\"" << ch << "\""; 401 } else { 402 out << "\""; 403 WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle); 404 out << "\""; 405 } 406 return true; 407 } 408 409 bool WriteComment(ostream_wrapper& out, const std::string& str, 410 std::size_t postCommentIndent) { 411 const std::size_t curIndent = out.col(); 412 out << "#" << Indentation(postCommentIndent); 413 out.set_comment(); 414 int codePoint; 415 for (std::string::const_iterator i = str.begin(); 416 GetNextCodePointAndAdvance(codePoint, i, str.end());) { 417 if (codePoint == '\n') { 418 out << "\n" 419 << IndentTo(curIndent) << "#" << Indentation(postCommentIndent); 420 out.set_comment(); 421 } else { 422 WriteCodePoint(out, codePoint); 423 } 424 } 425 return true; 426 } 427 428 bool WriteAlias(ostream_wrapper& out, const std::string& str) { 429 out << "*"; 430 return WriteAliasName(out, str); 431 } 432 433 bool WriteAnchor(ostream_wrapper& out, const std::string& str) { 434 out << "&"; 435 return WriteAliasName(out, str); 436 } 437 438 bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) { 439 out << (verbatim ? "!<" : "!"); 440 StringCharSource buffer(str.c_str(), str.size()); 441 const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag(); 442 while (buffer) { 443 int n = reValid.Match(buffer); 444 if (n <= 0) { 445 return false; 446 } 447 448 while (--n >= 0) { 449 out << buffer[0]; 450 ++buffer; 451 } 452 } 453 if (verbatim) { 454 out << ">"; 455 } 456 return true; 457 } 458 459 bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix, 460 const std::string& tag) { 461 out << "!"; 462 StringCharSource prefixBuffer(prefix.c_str(), prefix.size()); 463 while (prefixBuffer) { 464 int n = Exp::URI().Match(prefixBuffer); 465 if (n <= 0) { 466 return false; 467 } 468 469 while (--n >= 0) { 470 out << prefixBuffer[0]; 471 ++prefixBuffer; 472 } 473 } 474 475 out << "!"; 476 StringCharSource tagBuffer(tag.c_str(), tag.size()); 477 while (tagBuffer) { 478 int n = Exp::Tag().Match(tagBuffer); 479 if (n <= 0) { 480 return false; 481 } 482 483 while (--n >= 0) { 484 out << tagBuffer[0]; 485 ++tagBuffer; 486 } 487 } 488 return true; 489 } 490 491 bool WriteBinary(ostream_wrapper& out, const Binary& binary) { 492 WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()), 493 StringEscaping::None); 494 return true; 495 } 496 } // namespace Utils 497 } // namespace YAML