yaml-cpp

FORK: A YAML parser and emitter in C++
git clone https://git.neptards.moe/neptards/yaml-cpp.git
Log | Files | Refs | README | LICENSE

emitterutils.cpp (13250B)


      1 #include <algorithm>
      2 #include <iomanip>
      3 #include <sstream>
      4 
      5 #include "emitterutils.h"
      6 #include "exp.h"
      7 #include "indentation.h"
      8 #include "regex_yaml.h"
      9 #include "regeximpl.h"
     10 #include "stringsource.h"
     11 #include "yaml-cpp/binary.h"  // IWYU pragma: keep
     12 #include "yaml-cpp/null.h"
     13 #include "yaml-cpp/ostream_wrapper.h"
     14 
     15 namespace YAML {
     16 namespace Utils {
     17 namespace {
     18 enum { REPLACEMENT_CHARACTER = 0xFFFD };
     19 
     20 bool IsAnchorChar(int ch) {  // test for ns-anchor-char
     21   switch (ch) {
     22     case ',':
     23     case '[':
     24     case ']':
     25     case '{':
     26     case '}':  // c-flow-indicator
     27     case ' ':
     28     case '\t':    // s-white
     29     case 0xFEFF:  // c-byte-order-mark
     30     case 0xA:
     31     case 0xD:  // b-char
     32       return false;
     33     case 0x85:
     34       return true;
     35   }
     36 
     37   if (ch < 0x20) {
     38     return false;
     39   }
     40 
     41   if (ch < 0x7E) {
     42     return true;
     43   }
     44 
     45   if (ch < 0xA0) {
     46     return false;
     47   }
     48   if (ch >= 0xD800 && ch <= 0xDFFF) {
     49     return false;
     50   }
     51   if ((ch & 0xFFFE) == 0xFFFE) {
     52     return false;
     53   }
     54   if ((ch >= 0xFDD0) && (ch <= 0xFDEF)) {
     55     return false;
     56   }
     57   if (ch > 0x10FFFF) {
     58     return false;
     59   }
     60 
     61   return true;
     62 }
     63 
     64 int Utf8BytesIndicated(char ch) {
     65   int byteVal = static_cast<unsigned char>(ch);
     66   switch (byteVal >> 4) {
     67     case 0:
     68     case 1:
     69     case 2:
     70     case 3:
     71     case 4:
     72     case 5:
     73     case 6:
     74     case 7:
     75       return 1;
     76     case 12:
     77     case 13:
     78       return 2;
     79     case 14:
     80       return 3;
     81     case 15:
     82       return 4;
     83     default:
     84       return -1;
     85   }
     86 }
     87 
     88 bool IsTrailingByte(char ch) { return (ch & 0xC0) == 0x80; }
     89 
     90 bool GetNextCodePointAndAdvance(int& codePoint,
     91                                 std::string::const_iterator& first,
     92                                 std::string::const_iterator last) {
     93   if (first == last)
     94     return false;
     95 
     96   int nBytes = Utf8BytesIndicated(*first);
     97   if (nBytes < 1) {
     98     // Bad lead byte
     99     ++first;
    100     codePoint = REPLACEMENT_CHARACTER;
    101     return true;
    102   }
    103 
    104   if (nBytes == 1) {
    105     codePoint = *first++;
    106     return true;
    107   }
    108 
    109   // Gather bits from trailing bytes
    110   codePoint = static_cast<unsigned char>(*first) & ~(0xFF << (7 - nBytes));
    111   ++first;
    112   --nBytes;
    113   for (; nBytes > 0; ++first, --nBytes) {
    114     if ((first == last) || !IsTrailingByte(*first)) {
    115       codePoint = REPLACEMENT_CHARACTER;
    116       break;
    117     }
    118     codePoint <<= 6;
    119     codePoint |= *first & 0x3F;
    120   }
    121 
    122   // Check for illegal code points
    123   if (codePoint > 0x10FFFF)
    124     codePoint = REPLACEMENT_CHARACTER;
    125   else if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
    126     codePoint = REPLACEMENT_CHARACTER;
    127   else if ((codePoint & 0xFFFE) == 0xFFFE)
    128     codePoint = REPLACEMENT_CHARACTER;
    129   else if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF)
    130     codePoint = REPLACEMENT_CHARACTER;
    131   return true;
    132 }
    133 
    134 void WriteCodePoint(ostream_wrapper& out, int codePoint) {
    135   if (codePoint < 0 || codePoint > 0x10FFFF) {
    136     codePoint = REPLACEMENT_CHARACTER;
    137   }
    138   if (codePoint <= 0x7F) {
    139     out << static_cast<char>(codePoint);
    140   } else if (codePoint <= 0x7FF) {
    141     out << static_cast<char>(0xC0 | (codePoint >> 6))
    142         << static_cast<char>(0x80 | (codePoint & 0x3F));
    143   } else if (codePoint <= 0xFFFF) {
    144     out << static_cast<char>(0xE0 | (codePoint >> 12))
    145         << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
    146         << static_cast<char>(0x80 | (codePoint & 0x3F));
    147   } else {
    148     out << static_cast<char>(0xF0 | (codePoint >> 18))
    149         << static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F))
    150         << static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F))
    151         << static_cast<char>(0x80 | (codePoint & 0x3F));
    152   }
    153 }
    154 
    155 bool IsValidPlainScalar(const std::string& str, FlowType::value flowType,
    156                         bool allowOnlyAscii) {
    157   // check against null
    158   if (IsNullString(str)) {
    159     return false;
    160   }
    161 
    162   // check the start
    163   const RegEx& start = (flowType == FlowType::Flow ? Exp::PlainScalarInFlow()
    164                                                    : Exp::PlainScalar());
    165   if (!start.Matches(str)) {
    166     return false;
    167   }
    168 
    169   // and check the end for plain whitespace (which can't be faithfully kept in a
    170   // plain scalar)
    171   if (!str.empty() && *str.rbegin() == ' ') {
    172     return false;
    173   }
    174 
    175   // then check until something is disallowed
    176   static const RegEx& disallowed_flow =
    177       Exp::EndScalarInFlow() | (Exp::BlankOrBreak() + Exp::Comment()) |
    178       Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
    179       Exp::Tab();
    180   static const RegEx& disallowed_block =
    181       Exp::EndScalar() | (Exp::BlankOrBreak() + Exp::Comment()) |
    182       Exp::NotPrintable() | Exp::Utf8_ByteOrderMark() | Exp::Break() |
    183       Exp::Tab();
    184   const RegEx& disallowed =
    185       flowType == FlowType::Flow ? disallowed_flow : disallowed_block;
    186 
    187   StringCharSource buffer(str.c_str(), str.size());
    188   while (buffer) {
    189     if (disallowed.Matches(buffer)) {
    190       return false;
    191     }
    192     if (allowOnlyAscii && (0x80 <= static_cast<unsigned char>(buffer[0]))) {
    193       return false;
    194     }
    195     ++buffer;
    196   }
    197 
    198   return true;
    199 }
    200 
    201 bool IsValidSingleQuotedScalar(const std::string& str, bool escapeNonAscii) {
    202   // TODO: check for non-printable characters?
    203   return std::none_of(str.begin(), str.end(), [=](char ch) {
    204     return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch))) ||
    205            (ch == '\n');
    206   });
    207 }
    208 
    209 bool IsValidLiteralScalar(const std::string& str, FlowType::value flowType,
    210                           bool escapeNonAscii) {
    211   if (flowType == FlowType::Flow) {
    212     return false;
    213   }
    214 
    215   // TODO: check for non-printable characters?
    216   return std::none_of(str.begin(), str.end(), [=](char ch) {
    217     return (escapeNonAscii && (0x80 <= static_cast<unsigned char>(ch)));
    218   });
    219 }
    220 
    221 std::pair<uint16_t, uint16_t> EncodeUTF16SurrogatePair(int codePoint) {
    222   const uint32_t leadOffset = 0xD800 - (0x10000 >> 10);
    223 
    224   return {
    225     leadOffset | (codePoint >> 10),
    226     0xDC00 | (codePoint & 0x3FF),
    227   };
    228 }
    229 
    230 void WriteDoubleQuoteEscapeSequence(ostream_wrapper& out, int codePoint, StringEscaping::value stringEscapingStyle) {
    231   static const char hexDigits[] = "0123456789abcdef";
    232 
    233   out << "\\";
    234   int digits = 8;
    235   if (codePoint < 0xFF && stringEscapingStyle != StringEscaping::JSON) {
    236     out << "x";
    237     digits = 2;
    238   } else if (codePoint < 0xFFFF) {
    239     out << "u";
    240     digits = 4;
    241   } else if (stringEscapingStyle != StringEscaping::JSON) {
    242     out << "U";
    243     digits = 8;
    244   } else {
    245     auto surrogatePair = EncodeUTF16SurrogatePair(codePoint);
    246     WriteDoubleQuoteEscapeSequence(out, surrogatePair.first, stringEscapingStyle);
    247     WriteDoubleQuoteEscapeSequence(out, surrogatePair.second, stringEscapingStyle);
    248     return;
    249   }
    250 
    251   // Write digits into the escape sequence
    252   for (; digits > 0; --digits)
    253     out << hexDigits[(codePoint >> (4 * (digits - 1))) & 0xF];
    254 }
    255 
    256 bool WriteAliasName(ostream_wrapper& out, const std::string& str) {
    257   int codePoint;
    258   for (std::string::const_iterator i = str.begin();
    259        GetNextCodePointAndAdvance(codePoint, i, str.end());) {
    260     if (!IsAnchorChar(codePoint)) {
    261       return false;
    262     }
    263 
    264     WriteCodePoint(out, codePoint);
    265   }
    266   return true;
    267 }
    268 }  // namespace
    269 
    270 StringFormat::value ComputeStringFormat(const std::string& str,
    271                                         EMITTER_MANIP strFormat,
    272                                         FlowType::value flowType,
    273                                         bool escapeNonAscii) {
    274   switch (strFormat) {
    275     case Auto:
    276       if (IsValidPlainScalar(str, flowType, escapeNonAscii)) {
    277         return StringFormat::Plain;
    278       }
    279       return StringFormat::DoubleQuoted;
    280     case SingleQuoted:
    281       if (IsValidSingleQuotedScalar(str, escapeNonAscii)) {
    282         return StringFormat::SingleQuoted;
    283       }
    284       return StringFormat::DoubleQuoted;
    285     case DoubleQuoted:
    286       return StringFormat::DoubleQuoted;
    287     case Literal:
    288       if (IsValidLiteralScalar(str, flowType, escapeNonAscii)) {
    289         return StringFormat::Literal;
    290       }
    291       return StringFormat::DoubleQuoted;
    292     default:
    293       break;
    294   }
    295 
    296   return StringFormat::DoubleQuoted;
    297 }
    298 
    299 bool WriteSingleQuotedString(ostream_wrapper& out, const std::string& str) {
    300   out << "'";
    301   int codePoint;
    302   for (std::string::const_iterator i = str.begin();
    303        GetNextCodePointAndAdvance(codePoint, i, str.end());) {
    304     if (codePoint == '\n') {
    305       return false;  // We can't handle a new line and the attendant indentation
    306                      // yet
    307     }
    308 
    309     if (codePoint == '\'') {
    310       out << "''";
    311     } else {
    312       WriteCodePoint(out, codePoint);
    313     }
    314   }
    315   out << "'";
    316   return true;
    317 }
    318 
    319 bool WriteDoubleQuotedString(ostream_wrapper& out, const std::string& str,
    320                              StringEscaping::value stringEscaping) {
    321   out << "\"";
    322   int codePoint;
    323   for (std::string::const_iterator i = str.begin();
    324        GetNextCodePointAndAdvance(codePoint, i, str.end());) {
    325     switch (codePoint) {
    326       case '\"':
    327         out << "\\\"";
    328         break;
    329       case '\\':
    330         out << "\\\\";
    331         break;
    332       case '\n':
    333         out << "\\n";
    334         break;
    335       case '\t':
    336         out << "\\t";
    337         break;
    338       case '\r':
    339         out << "\\r";
    340         break;
    341       case '\b':
    342         out << "\\b";
    343         break;
    344       case '\f':
    345         out << "\\f";
    346         break;
    347       default:
    348         if (codePoint < 0x20 ||
    349             (codePoint >= 0x80 &&
    350              codePoint <= 0xA0)) {  // Control characters and non-breaking space
    351           WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
    352         } else if (codePoint == 0xFEFF) {  // Byte order marks (ZWNS) should be
    353                                            // escaped (YAML 1.2, sec. 5.2)
    354           WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
    355         } else if (stringEscaping == StringEscaping::NonAscii && codePoint > 0x7E) {
    356           WriteDoubleQuoteEscapeSequence(out, codePoint, stringEscaping);
    357         } else {
    358           WriteCodePoint(out, codePoint);
    359         }
    360     }
    361   }
    362   out << "\"";
    363   return true;
    364 }
    365 
    366 bool WriteLiteralString(ostream_wrapper& out, const std::string& str,
    367                         std::size_t indent) {
    368   out << "|\n";
    369   int codePoint;
    370   for (std::string::const_iterator i = str.begin();
    371        GetNextCodePointAndAdvance(codePoint, i, str.end());) {
    372     if (codePoint == '\n') {
    373       out << "\n";
    374     } else {
    375       out<< IndentTo(indent);
    376       WriteCodePoint(out, codePoint);
    377     }
    378   }
    379   return true;
    380 }
    381 
    382 bool WriteChar(ostream_wrapper& out, char ch, StringEscaping::value stringEscapingStyle) {
    383   if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z')) {
    384     out << ch;
    385   } else if (ch == '\"') {
    386     out << R"("\"")";
    387   } else if (ch == '\t') {
    388     out << R"("\t")";
    389   } else if (ch == '\n') {
    390     out << R"("\n")";
    391   } else if (ch == '\b') {
    392     out << R"("\b")";
    393   } else if (ch == '\r') {
    394     out << R"("\r")";
    395   } else if (ch == '\f') {
    396     out << R"("\f")";
    397   } else if (ch == '\\') {
    398     out << R"("\\")";
    399   } else if (0x20 <= ch && ch <= 0x7e) {
    400     out << "\"" << ch << "\"";
    401   } else {
    402     out << "\"";
    403     WriteDoubleQuoteEscapeSequence(out, ch, stringEscapingStyle);
    404     out << "\"";
    405   }
    406   return true;
    407 }
    408 
    409 bool WriteComment(ostream_wrapper& out, const std::string& str,
    410                   std::size_t postCommentIndent) {
    411   const std::size_t curIndent = out.col();
    412   out << "#" << Indentation(postCommentIndent);
    413   out.set_comment();
    414   int codePoint;
    415   for (std::string::const_iterator i = str.begin();
    416        GetNextCodePointAndAdvance(codePoint, i, str.end());) {
    417     if (codePoint == '\n') {
    418       out << "\n"
    419           << IndentTo(curIndent) << "#" << Indentation(postCommentIndent);
    420       out.set_comment();
    421     } else {
    422       WriteCodePoint(out, codePoint);
    423     }
    424   }
    425   return true;
    426 }
    427 
    428 bool WriteAlias(ostream_wrapper& out, const std::string& str) {
    429   out << "*";
    430   return WriteAliasName(out, str);
    431 }
    432 
    433 bool WriteAnchor(ostream_wrapper& out, const std::string& str) {
    434   out << "&";
    435   return WriteAliasName(out, str);
    436 }
    437 
    438 bool WriteTag(ostream_wrapper& out, const std::string& str, bool verbatim) {
    439   out << (verbatim ? "!<" : "!");
    440   StringCharSource buffer(str.c_str(), str.size());
    441   const RegEx& reValid = verbatim ? Exp::URI() : Exp::Tag();
    442   while (buffer) {
    443     int n = reValid.Match(buffer);
    444     if (n <= 0) {
    445       return false;
    446     }
    447 
    448     while (--n >= 0) {
    449       out << buffer[0];
    450       ++buffer;
    451     }
    452   }
    453   if (verbatim) {
    454     out << ">";
    455   }
    456   return true;
    457 }
    458 
    459 bool WriteTagWithPrefix(ostream_wrapper& out, const std::string& prefix,
    460                         const std::string& tag) {
    461   out << "!";
    462   StringCharSource prefixBuffer(prefix.c_str(), prefix.size());
    463   while (prefixBuffer) {
    464     int n = Exp::URI().Match(prefixBuffer);
    465     if (n <= 0) {
    466       return false;
    467     }
    468 
    469     while (--n >= 0) {
    470       out << prefixBuffer[0];
    471       ++prefixBuffer;
    472     }
    473   }
    474 
    475   out << "!";
    476   StringCharSource tagBuffer(tag.c_str(), tag.size());
    477   while (tagBuffer) {
    478     int n = Exp::Tag().Match(tagBuffer);
    479     if (n <= 0) {
    480       return false;
    481     }
    482 
    483     while (--n >= 0) {
    484       out << tagBuffer[0];
    485       ++tagBuffer;
    486     }
    487   }
    488   return true;
    489 }
    490 
    491 bool WriteBinary(ostream_wrapper& out, const Binary& binary) {
    492   WriteDoubleQuotedString(out, EncodeBase64(binary.data(), binary.size()),
    493                           StringEscaping::None);
    494   return true;
    495 }
    496 }  // namespace Utils
    497 }  // namespace YAML