duckstation

duckstation, archived at the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

parse.cpp (198070B)


      1 #include "c4/yml/parse.hpp"
      2 #include "c4/error.hpp"
      3 #include "c4/utf.hpp"
      4 #include <c4/dump.hpp>
      5 
      6 #include <ctype.h>
      7 #include <stdarg.h>
      8 #include <stdio.h>
      9 
     10 #include "c4/yml/detail/parser_dbg.hpp"
     11 #ifdef RYML_DBG
     12 #include "c4/yml/detail/print.hpp"
     13 #endif
     14 
     15 #ifndef RYML_ERRMSG_SIZE
     16     #define RYML_ERRMSG_SIZE 1024
     17 #endif
     18 
     19 //#define RYML_WITH_TAB_TOKENS
     20 #ifdef RYML_WITH_TAB_TOKENS
     21 #define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__
     22 #define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with
     23 #else
     24 #define _RYML_WITH_TAB_TOKENS(...)
     25 #define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without
     26 #endif
     27 
     28 
     29 #if defined(_MSC_VER)
     30 #   pragma warning(push)
     31 #   pragma warning(disable: 4296/*expression is always 'boolean_value'*/)
     32 #elif defined(__clang__)
     33 #   pragma clang diagnostic push
     34 #   pragma clang diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
     35 #   pragma clang diagnostic ignored "-Wformat-nonliteral"
     36 #   pragma clang diagnostic ignored "-Wold-style-cast"
     37 #elif defined(__GNUC__)
     38 #   pragma GCC diagnostic push
     39 #   pragma GCC diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
     40 #   pragma GCC diagnostic ignored "-Wformat-nonliteral"
     41 #   pragma GCC diagnostic ignored "-Wold-style-cast"
     42 #   if __GNUC__ >= 7
     43 #       pragma GCC diagnostic ignored "-Wduplicated-branches"
     44 #   endif
     45 #endif
     46 
     47 namespace c4 {
     48 namespace yml {
     49 
     50 namespace {
     51 
     52 template<class DumpFn, class ...Args>
     53 void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args)
     54 {
     55     char writebuf[256];
     56     auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward<Args>(args)...);
     57     // resume writing if the results failed to fit the buffer
     58     if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte.
     59     {
     60         results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
     61         if(C4_UNLIKELY(results.bufsize > sizeof(writebuf)))
     62         {
     63             results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
     64         }
     65     }
     66 }
     67 
     68 bool _is_scalar_next__runk(csubstr s)
     69 {
     70     return !(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? ") || s == "-" || s.begins_with("- ") || s.begins_with(":\"") || s.begins_with(":'"));
     71 }
     72 
     73 bool _is_scalar_next__rseq_rval(csubstr s)
     74 {
     75     return !(s.begins_with_any("[{!&") || s.begins_with("? ") || s.begins_with("- ") || s == "-");
     76 }
     77 
     78 bool _is_scalar_next__rmap(csubstr s)
     79 {
     80     return !(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? ") _RYML_WITH_TAB_TOKENS(|| s.begins_with(":\t")));
     81 }
     82 
     83 bool _is_scalar_next__rmap_val(csubstr s)
     84 {
     85     return !(s.begins_with("- ") || s.begins_with_any("{[") || s == "-");
     86 }
     87 
     88 bool _is_doc_sep(csubstr s)
     89 {
     90     constexpr const csubstr dashes = "---";
     91     constexpr const csubstr ellipsis = "...";
     92     constexpr const csubstr whitesp = " \t";
     93     if(s.begins_with(dashes))
     94         return s == dashes || s.sub(3).begins_with_any(whitesp);
     95     else if(s.begins_with(ellipsis))
     96         return s == ellipsis || s.sub(3).begins_with_any(whitesp);
     97     return false;
     98 }
     99 
    100 /** @p i is set to the first non whitespace character after the line
    101  * @return the number of empty lines after the initial position */
    102 size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation)
    103 {
    104     RYML_ASSERT(r[*i] == '\n');
    105     size_t numnl_following = 0;
    106     ++(*i);
    107     for( ; *i < r.len; ++(*i))
    108     {
    109         if(r.str[*i] == '\n')
    110         {
    111             ++numnl_following;
    112             if(indentation) // skip the indentation after the newline
    113             {
    114                 size_t stop = *i + indentation;
    115                 for( ; *i < r.len; ++(*i))
    116                 {
    117                     if(r.str[*i] != ' ' && r.str[*i] != '\r')
    118                         break;
    119                     RYML_ASSERT(*i < stop);
    120                 }
    121                 C4_UNUSED(stop);
    122             }
    123         }
    124         else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r')  // skip leading whitespace
    125             ;
    126         else
    127             break;
    128     }
    129     return numnl_following;
    130 }
    131 
    132 } // anon namespace
    133 
    134 
    135 //-----------------------------------------------------------------------------
    136 
    137 Parser::~Parser()
    138 {
    139     _free();
    140     _clr();
    141 }
    142 
    143 Parser::Parser(Callbacks const& cb, ParserOptions opts)
    144     : m_options(opts)
    145     , m_file()
    146     , m_buf()
    147     , m_root_id(NONE)
    148     , m_tree()
    149     , m_stack(cb)
    150     , m_state()
    151     , m_key_tag_indentation(0)
    152     , m_key_tag2_indentation(0)
    153     , m_key_tag()
    154     , m_key_tag2()
    155     , m_val_tag_indentation(0)
    156     , m_val_tag()
    157     , m_key_anchor_was_before(false)
    158     , m_key_anchor_indentation(0)
    159     , m_key_anchor()
    160     , m_val_anchor_indentation(0)
    161     , m_val_anchor()
    162     , m_filter_arena()
    163     , m_newline_offsets()
    164     , m_newline_offsets_size(0)
    165     , m_newline_offsets_capacity(0)
    166     , m_newline_offsets_buf()
    167 {
    168     m_stack.push(State{});
    169     m_state = &m_stack.top();
    170 }
    171 
    172 Parser::Parser(Parser &&that)
    173     : m_options(that.m_options)
    174     , m_file(that.m_file)
    175     , m_buf(that.m_buf)
    176     , m_root_id(that.m_root_id)
    177     , m_tree(that.m_tree)
    178     , m_stack(std::move(that.m_stack))
    179     , m_state(&m_stack.top())
    180     , m_key_tag_indentation(that.m_key_tag_indentation)
    181     , m_key_tag2_indentation(that.m_key_tag2_indentation)
    182     , m_key_tag(that.m_key_tag)
    183     , m_key_tag2(that.m_key_tag2)
    184     , m_val_tag_indentation(that.m_val_tag_indentation)
    185     , m_val_tag(that.m_val_tag)
    186     , m_key_anchor_was_before(that.m_key_anchor_was_before)
    187     , m_key_anchor_indentation(that.m_key_anchor_indentation)
    188     , m_key_anchor(that.m_key_anchor)
    189     , m_val_anchor_indentation(that.m_val_anchor_indentation)
    190     , m_val_anchor(that.m_val_anchor)
    191     , m_filter_arena(that.m_filter_arena)
    192     , m_newline_offsets(that.m_newline_offsets)
    193     , m_newline_offsets_size(that.m_newline_offsets_size)
    194     , m_newline_offsets_capacity(that.m_newline_offsets_capacity)
    195     , m_newline_offsets_buf(that.m_newline_offsets_buf)
    196 {
    197     that._clr();
    198 }
    199 
    200 Parser::Parser(Parser const& that)
    201     : m_options(that.m_options)
    202     , m_file(that.m_file)
    203     , m_buf(that.m_buf)
    204     , m_root_id(that.m_root_id)
    205     , m_tree(that.m_tree)
    206     , m_stack(that.m_stack)
    207     , m_state(&m_stack.top())
    208     , m_key_tag_indentation(that.m_key_tag_indentation)
    209     , m_key_tag2_indentation(that.m_key_tag2_indentation)
    210     , m_key_tag(that.m_key_tag)
    211     , m_key_tag2(that.m_key_tag2)
    212     , m_val_tag_indentation(that.m_val_tag_indentation)
    213     , m_val_tag(that.m_val_tag)
    214     , m_key_anchor_was_before(that.m_key_anchor_was_before)
    215     , m_key_anchor_indentation(that.m_key_anchor_indentation)
    216     , m_key_anchor(that.m_key_anchor)
    217     , m_val_anchor_indentation(that.m_val_anchor_indentation)
    218     , m_val_anchor(that.m_val_anchor)
    219     , m_filter_arena()
    220     , m_newline_offsets()
    221     , m_newline_offsets_size()
    222     , m_newline_offsets_capacity()
    223     , m_newline_offsets_buf()
    224 {
    225     if(that.m_newline_offsets_capacity)
    226     {
    227         _resize_locations(that.m_newline_offsets_capacity);
    228         _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity == that.m_newline_offsets_capacity);
    229         memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
    230         m_newline_offsets_size = that.m_newline_offsets_size;
    231     }
    232     if(that.m_filter_arena.len)
    233     {
    234         _resize_filter_arena(that.m_filter_arena.len);
    235     }
    236 }
    237 
    238 Parser& Parser::operator=(Parser &&that)
    239 {
    240     _free();
    241     m_options = (that.m_options);
    242     m_file = (that.m_file);
    243     m_buf = (that.m_buf);
    244     m_root_id = (that.m_root_id);
    245     m_tree = (that.m_tree);
    246     m_stack = std::move(that.m_stack);
    247     m_state = (&m_stack.top());
    248     m_key_tag_indentation = (that.m_key_tag_indentation);
    249     m_key_tag2_indentation = (that.m_key_tag2_indentation);
    250     m_key_tag = (that.m_key_tag);
    251     m_key_tag2 = (that.m_key_tag2);
    252     m_val_tag_indentation = (that.m_val_tag_indentation);
    253     m_val_tag = (that.m_val_tag);
    254     m_key_anchor_was_before = (that.m_key_anchor_was_before);
    255     m_key_anchor_indentation = (that.m_key_anchor_indentation);
    256     m_key_anchor = (that.m_key_anchor);
    257     m_val_anchor_indentation = (that.m_val_anchor_indentation);
    258     m_val_anchor = (that.m_val_anchor);
    259     m_filter_arena = that.m_filter_arena;
    260     m_newline_offsets = (that.m_newline_offsets);
    261     m_newline_offsets_size = (that.m_newline_offsets_size);
    262     m_newline_offsets_capacity = (that.m_newline_offsets_capacity);
    263     m_newline_offsets_buf = (that.m_newline_offsets_buf);
    264     that._clr();
    265     return *this;
    266 }
    267 
    268 Parser& Parser::operator=(Parser const& that)
    269 {
    270     _free();
    271     m_options = (that.m_options);
    272     m_file = (that.m_file);
    273     m_buf = (that.m_buf);
    274     m_root_id = (that.m_root_id);
    275     m_tree = (that.m_tree);
    276     m_stack = that.m_stack;
    277     m_state = &m_stack.top();
    278     m_key_tag_indentation = (that.m_key_tag_indentation);
    279     m_key_tag2_indentation = (that.m_key_tag2_indentation);
    280     m_key_tag = (that.m_key_tag);
    281     m_key_tag2 = (that.m_key_tag2);
    282     m_val_tag_indentation = (that.m_val_tag_indentation);
    283     m_val_tag = (that.m_val_tag);
    284     m_key_anchor_was_before = (that.m_key_anchor_was_before);
    285     m_key_anchor_indentation = (that.m_key_anchor_indentation);
    286     m_key_anchor = (that.m_key_anchor);
    287     m_val_anchor_indentation = (that.m_val_anchor_indentation);
    288     m_val_anchor = (that.m_val_anchor);
    289     if(that.m_filter_arena.len > 0)
    290         _resize_filter_arena(that.m_filter_arena.len);
    291     if(that.m_newline_offsets_capacity > m_newline_offsets_capacity)
    292         _resize_locations(that.m_newline_offsets_capacity);
    293     _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity);
    294     _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_size);
    295     memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
    296     m_newline_offsets_size = that.m_newline_offsets_size;
    297     m_newline_offsets_buf = that.m_newline_offsets_buf;
    298     return *this;
    299 }
    300 
    301 void Parser::_clr()
    302 {
    303     m_options = {};
    304     m_file = {};
    305     m_buf = {};
    306     m_root_id = {};
    307     m_tree = {};
    308     m_stack.clear();
    309     m_state = {};
    310     m_key_tag_indentation = {};
    311     m_key_tag2_indentation = {};
    312     m_key_tag = {};
    313     m_key_tag2 = {};
    314     m_val_tag_indentation = {};
    315     m_val_tag = {};
    316     m_key_anchor_was_before = {};
    317     m_key_anchor_indentation = {};
    318     m_key_anchor = {};
    319     m_val_anchor_indentation = {};
    320     m_val_anchor = {};
    321     m_filter_arena = {};
    322     m_newline_offsets = {};
    323     m_newline_offsets_size = {};
    324     m_newline_offsets_capacity = {};
    325     m_newline_offsets_buf = {};
    326 }
    327 
    328 void Parser::_free()
    329 {
    330     if(m_newline_offsets)
    331     {
    332         _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
    333         m_newline_offsets = nullptr;
    334         m_newline_offsets_size = 0u;
    335         m_newline_offsets_capacity = 0u;
    336         m_newline_offsets_buf = 0u;
    337     }
    338     if(m_filter_arena.len)
    339     {
    340         _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
    341         m_filter_arena = {};
    342     }
    343     m_stack._free();
    344 }
    345 
    346 
    347 //-----------------------------------------------------------------------------
    348 void Parser::_reset()
    349 {
    350     _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() == 1);
    351     m_stack.clear();
    352     m_stack.push({});
    353     m_state = &m_stack.top();
    354     m_state->reset(m_file.str, m_root_id);
    355 
    356     m_key_tag_indentation = 0;
    357     m_key_tag2_indentation = 0;
    358     m_key_tag.clear();
    359     m_key_tag2.clear();
    360     m_val_tag_indentation = 0;
    361     m_val_tag.clear();
    362     m_key_anchor_was_before = false;
    363     m_key_anchor_indentation = 0;
    364     m_key_anchor.clear();
    365     m_val_anchor_indentation = 0;
    366     m_val_anchor.clear();
    367 
    368     if(m_options.locations())
    369     {
    370         _prepare_locations();
    371     }
    372 }
    373 
    374 //-----------------------------------------------------------------------------
    375 template<class DumpFn>
    376 void Parser::_fmt_msg(DumpFn &&dumpfn) const
    377 {
    378     auto const& lc = m_state->line_contents;
    379     csubstr contents = lc.stripped;
    380     if(contents.len)
    381     {
    382         // print the yaml src line
    383         size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col);
    384         if(m_file.len)
    385         {
    386             _parse_dump(dumpfn, "{}:", m_file);
    387             offs += m_file.len + 1;
    388         }
    389         _parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col);
    390         csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u));
    391         csubstr maybe_ellipsis = (contents.len < 80u ? csubstr{} : csubstr("..."));
    392         _parse_dump(dumpfn, "{}{}  (size={})\n", maybe_full_content, maybe_ellipsis, contents.len);
    393         // highlight the remaining portion of the previous line
    394         size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin());
    395         size_t lastcol = firstcol + lc.rem.len;
    396         for(size_t i = 0; i < offs + firstcol; ++i)
    397             dumpfn(" ");
    398         dumpfn("^");
    399         for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i)
    400             dumpfn("~");
    401         _parse_dump(dumpfn, "{}  (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1);
    402     }
    403     else
    404     {
    405         dumpfn("\n");
    406     }
    407 
    408 #ifdef RYML_DBG
    409     // next line: print the state flags
    410     {
    411         char flagbuf_[64];
    412         _parse_dump(dumpfn, "top state: {}\n", _prfl(flagbuf_, m_state->flags));
    413     }
    414 #endif
    415 }
    416 
    417 
    418 //-----------------------------------------------------------------------------
    419 template<class ...Args>
    420 void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const
    421 {
    422     char errmsg[RYML_ERRMSG_SIZE];
    423     detail::_SubstrWriter writer(errmsg);
    424     auto dumpfn = [&writer](csubstr s){ writer.append(s); };
    425     _parse_dump(dumpfn, fmt, args...);
    426     writer.append('\n');
    427     _fmt_msg(dumpfn);
    428     size_t len = writer.pos < RYML_ERRMSG_SIZE ? writer.pos : RYML_ERRMSG_SIZE;
    429     m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data);
    430 }
    431 
    432 //-----------------------------------------------------------------------------
    433 #ifdef RYML_DBG
    434 template<class ...Args>
    435 void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const
    436 {
    437     auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); };
    438     _parse_dump(dumpfn, fmt, args...);
    439     dumpfn("\n");
    440     _fmt_msg(dumpfn);
    441 }
    442 #endif
    443 
    444 //-----------------------------------------------------------------------------
    445 bool Parser::_finished_file() const
    446 {
    447     bool ret = m_state->pos.offset >= m_buf.len;
    448     if(ret)
    449     {
    450         _c4dbgp("finished file!!!");
    451     }
    452     return ret;
    453 }
    454 
    455 //-----------------------------------------------------------------------------
    456 bool Parser::_finished_line() const
    457 {
    458     return m_state->line_contents.rem.empty();
    459 }
    460 
    461 //-----------------------------------------------------------------------------
    462 void Parser::parse_in_place(csubstr file, substr buf, Tree *t, size_t node_id)
    463 {
    464     m_file = file;
    465     m_buf = buf;
    466     m_root_id = node_id;
    467     m_tree = t;
    468     _reset();
    469     while( ! _finished_file())
    470     {
    471         _scan_line();
    472         while( ! _finished_line())
    473             _handle_line();
    474         if(_finished_file())
    475             break; // it may have finished because of multiline blocks
    476         _line_ended();
    477     }
    478     _handle_finished_file();
    479 }
    480 
    481 //-----------------------------------------------------------------------------
    482 void Parser::_handle_finished_file()
    483 {
    484     _end_stream();
    485 }
    486 
    487 //-----------------------------------------------------------------------------
    488 void Parser::_handle_line()
    489 {
    490     _c4dbgq("\n-----------");
    491     _c4dbgt("handling line={}, offset={}B", m_state->pos.line, m_state->pos.offset);
    492     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! m_state->line_contents.rem.empty());
    493     if(has_any(RSEQ))
    494     {
    495         if(has_any(FLOW))
    496         {
    497             if(_handle_seq_flow())
    498                 return;
    499         }
    500         else
    501         {
    502             if(_handle_seq_blck())
    503                 return;
    504         }
    505     }
    506     else if(has_any(RMAP))
    507     {
    508         if(has_any(FLOW))
    509         {
    510             if(_handle_map_flow())
    511                 return;
    512         }
    513         else
    514         {
    515             if(_handle_map_blck())
    516                 return;
    517         }
    518     }
    519     else if(has_any(RUNK))
    520     {
    521         if(_handle_unk())
    522             return;
    523     }
    524 
    525     if(_handle_top())
    526         return;
    527 }
    528 
    529 
    530 //-----------------------------------------------------------------------------
    531 bool Parser::_handle_unk()
    532 {
    533     _c4dbgp("handle_unk");
    534 
    535     csubstr rem = m_state->line_contents.rem;
    536     const bool start_as_child = (node(m_state) == nullptr);
    537 
    538     if(C4_UNLIKELY(has_any(NDOC)))
    539     {
    540         if(rem == "---" || rem.begins_with("--- "))
    541         {
    542             _start_new_doc(rem);
    543             return true;
    544         }
    545         auto trimmed = rem.triml(' ');
    546         if(trimmed == "---" || trimmed.begins_with("--- "))
    547         {
    548             _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len >= trimmed.len);
    549             _line_progressed(rem.len - trimmed.len);
    550             _start_new_doc(trimmed);
    551             _save_indentation();
    552             return true;
    553         }
    554         else if(trimmed.begins_with("..."))
    555         {
    556             _end_stream();
    557         }
    558         else if(trimmed.first_of("#%") == csubstr::npos) // neither a doc nor a tag
    559         {
    560             _c4dbgpf("starting implicit doc to accomodate unexpected tokens: '{}'", rem);
    561             size_t indref = m_state->indref;
    562             _push_level();
    563             _start_doc();
    564             _set_indentation(indref);
    565         }
    566         _RYML_CB_ASSERT(m_stack.m_callbacks, !trimmed.empty());
    567     }
    568 
    569     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT|RSEQ|RMAP));
    570     if(m_state->indref > 0)
    571     {
    572         csubstr ws = rem.left_of(rem.first_not_of(' '));
    573         if(m_state->indref <= ws.len)
    574         {
    575             _c4dbgpf("skipping base indentation of {}", m_state->indref);
    576             _line_progressed(m_state->indref);
    577             rem = rem.sub(m_state->indref);
    578         }
    579     }
    580 
    581     if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
    582     {
    583         _c4dbgpf("it's a seq (as_child={})", start_as_child);
    584         _move_key_anchor_to_val_anchor();
    585         _move_key_tag_to_val_tag();
    586         _push_level();
    587         _start_seq(start_as_child);
    588         _save_indentation();
    589         _line_progressed(2);
    590         return true;
    591     }
    592     else if(rem == '-')
    593     {
    594         _c4dbgpf("it's a seq (as_child={})", start_as_child);
    595         _move_key_anchor_to_val_anchor();
    596         _move_key_tag_to_val_tag();
    597         _push_level();
    598         _start_seq(start_as_child);
    599         _save_indentation();
    600         _line_progressed(1);
    601         return true;
    602     }
    603     else if(rem.begins_with('['))
    604     {
    605         _c4dbgpf("it's a seq, flow (as_child={})", start_as_child);
    606         _move_key_anchor_to_val_anchor();
    607         _move_key_tag_to_val_tag();
    608         _push_level(/*explicit flow*/true);
    609         _start_seq(start_as_child);
    610         add_flags(FLOW);
    611         _line_progressed(1);
    612         return true;
    613     }
    614     else if(rem.begins_with('{'))
    615     {
    616         _c4dbgpf("it's a map, flow (as_child={})", start_as_child);
    617         _move_key_anchor_to_val_anchor();
    618         _move_key_tag_to_val_tag();
    619         _push_level(/*explicit flow*/true);
    620         _start_map(start_as_child);
    621         addrem_flags(FLOW|RKEY, RVAL);
    622         _line_progressed(1);
    623         return true;
    624     }
    625     else if(rem.begins_with("? "))
    626     {
    627         _c4dbgpf("it's a map (as_child={}) + this key is complex", start_as_child);
    628         _move_key_anchor_to_val_anchor();
    629         _move_key_tag_to_val_tag();
    630         _push_level();
    631         _start_map(start_as_child);
    632         addrem_flags(RKEY|QMRK, RVAL);
    633         _save_indentation();
    634         _line_progressed(2);
    635         return true;
    636     }
    637     else if(rem.begins_with(": ") && !has_any(SSCL))
    638     {
    639         _c4dbgp("it's a map with an empty key");
    640         _move_key_anchor_to_val_anchor();
    641         _move_key_tag_to_val_tag();
    642         _push_level();
    643         _start_map(start_as_child);
    644         _store_scalar_null(rem.str);
    645         addrem_flags(RVAL, RKEY);
    646         _save_indentation();
    647         _line_progressed(2);
    648         return true;
    649     }
    650     else if(rem == ':' && !has_any(SSCL))
    651     {
    652         _c4dbgp("it's a map with an empty key");
    653         _move_key_anchor_to_val_anchor();
    654         _move_key_tag_to_val_tag();
    655         _push_level();
    656         _start_map(start_as_child);
    657         _store_scalar_null(rem.str);
    658         addrem_flags(RVAL, RKEY);
    659         _save_indentation();
    660         _line_progressed(1);
    661         return true;
    662     }
    663     else if(_handle_types())
    664     {
    665         return true;
    666     }
    667     else if(!rem.begins_with('*') && _handle_key_anchors_and_refs())
    668     {
    669         return true;
    670     }
    671     else if(has_any(SSCL))
    672     {
    673         _c4dbgpf("there's a stored scalar: '{}'", m_state->scalar);
    674 
    675         csubstr saved_scalar;
    676         bool is_quoted = false;
    677         if(_scan_scalar_unk(&saved_scalar, &is_quoted))
    678         {
    679             rem = m_state->line_contents.rem;
    680             _c4dbgpf("... and there's also a scalar next! '{}'", saved_scalar);
    681             if(rem.begins_with_any(" \t"))
    682             {
    683                 size_t n = rem.first_not_of(" \t");
    684                 _c4dbgpf("skipping {} spaces/tabs", n);
    685                 rem = rem.sub(n);
    686                 _line_progressed(n);
    687             }
    688         }
    689 
    690         _c4dbgpf("rem='{}'", rem);
    691 
    692         if(rem.begins_with(", "))
    693         {
    694             _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
    695             _start_seq(start_as_child);
    696             add_flags(FLOW);
    697             _append_val(_consume_scalar());
    698             _line_progressed(2);
    699         }
    700         else if(rem.begins_with(','))
    701         {
    702             _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
    703             _start_seq(start_as_child);
    704             add_flags(FLOW);
    705             _append_val(_consume_scalar());
    706             _line_progressed(1);
    707         }
    708         else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
    709         {
    710             _c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child);
    711             _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
    712             _line_progressed(2);
    713         }
    714         else if(rem == ":" || rem.begins_with(":\"") || rem.begins_with(":'"))
    715         {
    716             if(rem == ":") { _c4dbgpf("got a ':' -- it's a map (as_child={})", start_as_child); }
    717             else { _c4dbgpf("got a '{}' -- it's a map (as_child={})", rem.first(2), start_as_child); }
    718             _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
    719             _line_progressed(1); // advance only 1
    720         }
    721         #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
    722         else if(rem.begins_with('}'))
    723         {
    724             if(!has_all(RMAP|FLOW))
    725             {
    726                 _c4err("invalid token: not reading a map");
    727             }
    728             if(!has_all(SSCL))
    729             {
    730                 _c4err("no scalar stored");
    731             }
    732             _append_key_val(saved_scalar, is_quoted);
    733             _stop_map();
    734             _line_progressed(1);
    735             saved_scalar.clear();
    736             is_quoted = false;
    737         }
    738         #endif
    739         else if(rem.begins_with("..."))
    740         {
    741             _c4dbgp("got stream end '...'");
    742             _end_stream();
    743             _line_progressed(3);
    744         }
    745         else if(rem.begins_with('#'))
    746         {
    747             _c4dbgpf("it's a comment: '{}'", rem);
    748             _scan_comment();
    749             return true;
    750         }
    751         else if(_handle_key_anchors_and_refs())
    752         {
    753             return true;
    754         }
    755         else if(rem.begins_with(" ") || rem.begins_with("\t"))
    756         {
    757             size_t n = rem.first_not_of(" \t");
    758             if(n == npos)
    759                 n = rem.len;
    760             _c4dbgpf("has {} spaces/tabs, skip...", n);
    761             _line_progressed(n);
    762             return true;
    763         }
    764         else if(rem.empty())
    765         {
    766             // nothing to do
    767         }
    768         else if(rem == "---" || rem.begins_with("--- "))
    769         {
    770             _c4dbgp("caught ---: starting doc");
    771             _start_new_doc(rem);
    772             return true;
    773         }
    774         else if(rem.begins_with('%'))
    775         {
    776             _c4dbgp("caught a directive: ignoring...");
    777             _line_progressed(rem.len);
    778             return true;
    779         }
    780         else
    781         {
    782             _c4err("parse error");
    783         }
    784 
    785         if(is_quoted || (! saved_scalar.empty()))
    786         {
    787             _store_scalar(saved_scalar, is_quoted);
    788         }
    789 
    790         return true;
    791     }
    792     else
    793     {
    794         _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(SSCL));
    795         csubstr scalar;
    796         size_t indentation = m_state->line_contents.indentation; // save
    797         bool is_quoted;
    798         if(_scan_scalar_unk(&scalar, &is_quoted))
    799         {
    800             _c4dbgpf("got a {} scalar", is_quoted ? "quoted" : "");
    801             rem = m_state->line_contents.rem;
    802             {
    803                 size_t first = rem.first_not_of(" \t");
    804                 if(first && first != npos)
    805                 {
    806                     _c4dbgpf("skip {} whitespace characters", first);
    807                    _line_progressed(first);
    808                    rem = rem.sub(first);
    809                 }
    810             }
    811             _store_scalar(scalar, is_quoted);
    812             if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
    813             {
    814                 _c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child);
    815                 _push_level();
    816                 _start_map(start_as_child); // wait for the val scalar to append the key-val pair
    817                 _set_indentation(indentation);
    818                 _line_progressed(2); // call this AFTER saving the indentation
    819             }
    820             else if(rem.begins_with(':'))
    821             {
    822                 _c4dbgpf("got a ':' next -- it's a map (as_child={})", start_as_child);
    823                 _push_level();
    824                 _start_map(start_as_child); // wait for the val scalar to append the key-val pair
    825                 _set_indentation(indentation);
    826                 _line_progressed(1); // call this AFTER saving the indentation
    827             }
    828             else
    829             {
    830                 // we still don't know whether it's a seq or a map
    831                 // so just store the scalar
    832             }
    833             return true;
    834         }
    835         else if(rem.begins_with_any(" \t"))
    836         {
    837             csubstr ws = rem.left_of(rem.first_not_of(" \t"));
    838             rem = rem.right_of(ws);
    839             if(has_all(RTOP) && rem.begins_with("---"))
    840             {
    841                 _c4dbgp("there's a doc starting, and it's indented");
    842                 _set_indentation(ws.len);
    843             }
    844             _c4dbgpf("skipping {} spaces/tabs", ws.len);
    845             _line_progressed(ws.len);
    846             return true;
    847         }
    848     }
    849 
    850     return false;
    851 }
    852 
    853 
    854 //-----------------------------------------------------------------------------
    855 C4_ALWAYS_INLINE void Parser::_skipchars(char c)
    856 {
    857     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with(c));
    858     size_t pos = m_state->line_contents.rem.first_not_of(c);
    859     if(pos == npos)
    860         pos = m_state->line_contents.rem.len; // maybe the line is just whitespace
    861     _c4dbgpf("skip {} '{}'", pos, c);
    862     _line_progressed(pos);
    863 }
    864 
    865 template<size_t N>
    866 C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N])
    867 {
    868     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars));
    869     size_t pos = m_state->line_contents.rem.first_not_of(chars);
    870     if(pos == npos)
    871         pos = m_state->line_contents.rem.len; // maybe the line is just whitespace
    872     _c4dbgpf("skip {} characters", pos);
    873     _line_progressed(pos);
    874 }
    875 
    876 
    877 //-----------------------------------------------------------------------------
        /** Handle the next token of a flow-style sequence (`[a, b, ...]`).
         *
         * Preconditions (asserted): RSEQ and FLOW are set, RKEY is not.
         *
         * Independently of the RVAL/RNXT state, this first deals with leading
         * spaces (and tabs, when RYML_WITH_TAB_TOKENS is defined), comments, and
         * the closing ']'. After that:
         *  - RVAL (a value is expected): a scalar is appended and the state
         *    switches to RNXT; '[' or '{' opens a nested flow container; ':' or
         *    '? ' starts an implicit map inside the seq; tags and anchors/refs
         *    are delegated to _handle_types() / _handle_val_anchors_and_refs();
         *    a ',' appends a null value; a tab is skipped.
         *  - RNXT (a separator is expected): ',' switches back to RVAL; ':'
         *    starts an implicit map; anything else is reported as an error.
         *
         * @return true on every path that returns; unrecognized input is
         *         reported through _c4err(). */
    878 bool Parser::_handle_seq_flow()
    879 {
    880     _c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level);
    881     csubstr rem = m_state->line_contents.rem;
    882 
    883     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
    884     _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
    885 
    886     if(rem.begins_with(' '))
    887     {
    888         // with explicit flow, indentation does not matter
    889         _c4dbgp("starts with spaces");
    890         _skipchars(' ');
    891         return true;
    892     }
    893     _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
    894     {
    895         _c4dbgp("starts with tabs");
    896         _skipchars('\t');
    897         return true;
    898     })
    899     else if(rem.begins_with('#'))
    900     {
    901         _c4dbgp("it's a comment");
    902         rem = _scan_comment(); // also progresses the line
    903         return true;
    904     }
    905     else if(rem.begins_with(']'))
    906     {
    907         _c4dbgp("end the sequence");
    908         _pop_level();
    909         _line_progressed(1);
                // the flags tested here are those in effect after _pop_level();
                // presumably that is the enclosing level -- TODO confirm
    910         if(has_all(RSEQIMAP))
    911         {
    912             _stop_seqimap();
    913             _pop_level();
    914         }
    915         return true;
    916     }
    917 
            // RVAL: a value is expected at the current position
    918     if(has_any(RVAL))
    919     {
    920         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
    921         bool is_quoted;
    922         if(_scan_scalar_seq_flow(&rem, &is_quoted))
    923         {
    924             _c4dbgp("it's a scalar");
    925             addrem_flags(RNXT, RVAL);
    926             _append_val(rem, is_quoted);
    927             return true;
    928         }
    929         else if(rem.begins_with('['))
    930         {
    931             _c4dbgp("val is a child seq");
    932             addrem_flags(RNXT, RVAL); // before _push_level!
    933             _push_level(/*explicit flow*/true);
    934             _start_seq();
    935             add_flags(FLOW);
    936             _line_progressed(1);
    937             return true;
    938         }
    939         else if(rem.begins_with('{'))
    940         {
    941             _c4dbgp("val is a child map");
    942             addrem_flags(RNXT, RVAL); // before _push_level!
    943             _push_level(/*explicit flow*/true);
    944             _start_map();
    945             addrem_flags(FLOW|RKEY, RVAL);
    946             _line_progressed(1);
    947             return true;
    948         }
                // here the ':' is the entire remainder of the line
    949         else if(rem == ':')
    950         {
    951             _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
    952             _start_seqimap();
    953             _line_progressed(1);
    954             return true;
    955         }
    956         else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
    957         {
    958             _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
    959             _start_seqimap();
    960             _line_progressed(2);
    961             return true;
    962         }
    963         else if(rem.begins_with("? "))
    964         {
    965             _c4dbgpf("found '? ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
    966             _start_seqimap();
    967             _line_progressed(2);
                    // after _start_seqimap() an empty stored scalar is expected;
                    // it is dropped (SSCL cleared) so the complex key can be read
    968             _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(SSCL) && m_state->scalar == "");
    969             addrem_flags(QMRK|RKEY, RVAL|SSCL);
    970             return true;
    971         }
    972         else if(_handle_types())
    973         {
    974             return true;
    975         }
    976         else if(_handle_val_anchors_and_refs())
    977         {
    978             return true;
    979         }
                // a comma where a value was expected: record a null value.
                // rem.str - 1 is the address of the character just before the comma.
    980         else if(rem.begins_with(", "))
    981         {
    982             _c4dbgp("found ',' -- the value was null");
    983             _append_val_null(rem.str - 1);
    984             _line_progressed(2);
    985             return true;
    986         }
    987         else if(rem.begins_with(','))
    988         {
    989             _c4dbgp("found ',' -- the value was null");
    990             _append_val_null(rem.str - 1);
    991             _line_progressed(1);
    992             return true;
    993         }
                // tabs are skipped here even when RYML_WITH_TAB_TOKENS is not defined
    994         else if(rem.begins_with('\t'))
    995         {
    996             _skipchars('\t');
    997             return true;
    998         }
    999         else
   1000         {
   1001             _c4err("parse error");
   1002         }
   1003     }
            // RNXT: a value was just read; a separator is expected
   1004     else if(has_any(RNXT))
   1005     {
   1006         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1007         if(rem.begins_with(", "))
   1008         {
   1009             _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
   1010             _c4dbgp("seq: expect next val");
   1011             addrem_flags(RVAL, RNXT);
   1012             _line_progressed(2);
   1013             return true;
   1014         }
   1015         else if(rem.begins_with(','))
   1016         {
   1017             _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
   1018             _c4dbgp("seq: expect next val");
   1019             addrem_flags(RVAL, RNXT);
   1020             _line_progressed(1);
   1021             return true;
   1022         }
   1023         else if(rem == ':')
   1024         {
   1025             _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
   1026             _start_seqimap();
   1027             _line_progressed(1);
   1028             return true;
   1029         }
   1030         else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
   1031         {
   1032             _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
   1033             _start_seqimap();
   1034             _line_progressed(2);
   1035             return true;
   1036         }
   1037         else
   1038         {
   1039             _c4err("was expecting a comma");
   1040         }
   1041     }
   1042     else
   1043     {
   1044         _c4err("internal error");
   1045     }
   1046 
            // only reached when one of the _c4err() calls above returns
   1047     return true;
   1048 }
   1049 
   1050 //-----------------------------------------------------------------------------
        /** Handle the next token of a block-style sequence (`- a` / `- b` ...).
         *
         * Preconditions (asserted): RSEQ is set; RKEY and FLOW are not.
         *
         * A leading '#' is scanned as a comment. After that:
         *  - RNXT (a value was read, the next entry is expected): indentation
         *    changes are handled first; "- " or a lone "-" switches to RVAL;
         *    whitespace is skipped; "..." ends the stream; "---" starts a new
         *    document; anything else is a parse error.
         *  - RVAL (a value is expected): indentation changes are handled first.
         *    A scalar is either appended as a value, or -- when followed by a
         *    ':' -- becomes the first key of a new nested map. A dash is handed
         *    to _rval_dash_start_or_continue_seq(); '[' and '{' open flow
         *    containers; "? " opens a map with a complex key; spaces are
         *    skipped; tags and anchors/refs are delegated; a ':' with no key
         *    opens a map with an empty key; anything else is a parse error.
         *
         * @return true when a token was handled; false when neither RNXT nor
         *         RVAL is set (or after _c4err() returns). */
   1051 bool Parser::_handle_seq_blck()
   1052 {
            // NOTE(review): the trace label below says "handle_seq_impl", not "handle_seq_blck"
   1053     _c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level);
   1054     csubstr rem = m_state->line_contents.rem;
   1055 
   1056     _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ));
   1057     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
   1058     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
   1059 
   1060     if(rem.begins_with('#'))
   1061     {
   1062         _c4dbgp("it's a comment");
   1063         rem = _scan_comment();
   1064         return true;
   1065     }
            // RNXT: the previous entry is complete; expect the next "- "
   1066     if(has_any(RNXT))
   1067     {
   1068         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1069 
   1070         if(_handle_indentation())
   1071             return true;
   1072 
   1073         if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
   1074         {
   1075             _c4dbgp("expect another val");
   1076             addrem_flags(RVAL, RNXT);
   1077             _line_progressed(2);
   1078             return true;
   1079         }
                // here the dash is the entire remainder of the line
   1080         else if(rem == '-')
   1081         {
   1082             _c4dbgp("expect another val");
   1083             addrem_flags(RVAL, RNXT);
   1084             _line_progressed(1);
   1085             return true;
   1086         }
   1087         else if(rem.begins_with_any(" \t"))
   1088         {
   1089             _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
   1090             _skipchars(" \t");
   1091             return true;
   1092         }
   1093         else if(rem.begins_with("..."))
   1094         {
   1095             _c4dbgp("got stream end '...'");
   1096             _end_stream();
   1097             _line_progressed(3);
   1098             return true;
   1099         }
   1100         else if(rem.begins_with("---"))
   1101         {
   1102             _c4dbgp("got document start '---'");
   1103             _start_new_doc(rem);
   1104             return true;
   1105         }
   1106         else
   1107         {
   1108             _c4err("parse error");
   1109         }
   1110     }
            // RVAL: a "- " was consumed; expect the entry's value
   1111     else if(has_any(RVAL))
   1112     {
   1113         // there can be empty values
   1114         if(_handle_indentation())
   1115             return true;
   1116 
   1117         csubstr s;
   1118         bool is_quoted;
   1119         if(_scan_scalar_seq_blck(&s, &is_quoted)) // this also progresses the line
   1120         {
   1121             _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
   1122 
                    // re-read the remainder, since scanning the scalar advanced the line
   1123             rem = m_state->line_contents.rem;
   1124             if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' ')))
   1125             {
   1126                 _c4dbgp("skipping whitespace...");
   1127                 size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   1128                 if(skip == csubstr::npos)
   1129                     skip = rem.len; // maybe the line is just whitespace
   1130                 _line_progressed(skip);
   1131                 rem = rem.sub(skip);
   1132             }
   1133 
   1134             _c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem);
                    // a ':' right after the scalar (and not inside a comment) makes it a map key
   1135             if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
   1136             {
   1137                 _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
                        // a pending val anchor/tag is moved to the key, unless a key anchor/tag is already set
   1138                 if(m_key_anchor.empty())
   1139                     _move_val_anchor_to_key_anchor();
   1140                 if(m_key_tag.empty())
   1141                     _move_val_tag_to_key_tag();
   1142                 addrem_flags(RNXT, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
   1143                 _push_level();
   1144                 _start_map();
   1145                 _store_scalar(s, is_quoted);
   1146                 if( ! _maybe_set_indentation_from_anchor_or_tag())
   1147                 {
   1148                     _c4dbgpf("set indentation from scalar: {}", m_state->scalar_col);
   1149                     _set_indentation(m_state->scalar_col); // this is the column where the scalar starts
   1150                 }
   1151                 _move_key_tag2_to_key_tag();
   1152                 addrem_flags(RVAL, RKEY);
   1153                 _line_progressed(1);
   1154             }
   1155             else
   1156             {
   1157                 _c4dbgp("appending val to current seq");
   1158                 _append_val(s, is_quoted);
   1159                 addrem_flags(RNXT, RVAL);
   1160             }
   1161             return true;
   1162         }
                // a dash where a value was expected: the line only advances past
                // it when the helper returns true
   1163         else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
   1164         {
   1165             if(_rval_dash_start_or_continue_seq())
   1166                 _line_progressed(2);
   1167             return true;
   1168         }
   1169         else if(rem == '-')
   1170         {
   1171             if(_rval_dash_start_or_continue_seq())
   1172                 _line_progressed(1);
   1173             return true;
   1174         }
   1175         else if(rem.begins_with('['))
   1176         {
   1177             _c4dbgp("val is a child seq, flow");
   1178             addrem_flags(RNXT, RVAL); // before _push_level!
   1179             _push_level(/*explicit flow*/true);
   1180             _start_seq();
   1181             add_flags(FLOW);
   1182             _line_progressed(1);
   1183             return true;
   1184         }
   1185         else if(rem.begins_with('{'))
   1186         {
   1187             _c4dbgp("val is a child map, flow");
   1188             addrem_flags(RNXT, RVAL); // before _push_level!
   1189             _push_level(/*explicit flow*/true);
   1190             _start_map();
   1191             addrem_flags(FLOW|RKEY, RVAL);
   1192             _line_progressed(1);
   1193             return true;
   1194         }
   1195         else if(rem.begins_with("? "))
   1196         {
   1197             _c4dbgp("val is a child map + this key is complex");
   1198             addrem_flags(RNXT, RVAL); // before _push_level!
   1199             _push_level();
   1200             _start_map();
   1201             addrem_flags(QMRK|RKEY, RVAL);
   1202             _save_indentation();
   1203             _line_progressed(2);
   1204             return true;
   1205         }
   1206         else if(rem.begins_with(' '))
   1207         {
   1208             csubstr spc = rem.left_of(rem.first_not_of(' '));
                    // NOTE(review): both branches below do the same thing; only the trace message differs
   1209             if(_at_line_begin())
   1210             {
   1211                 _c4dbgpf("skipping value indentation: {} spaces", spc.len);
   1212                 _line_progressed(spc.len);
   1213                 return true;
   1214             }
   1215             else
   1216             {
   1217                 _c4dbgpf("skipping {} spaces", spc.len);
   1218                 _line_progressed(spc.len);
   1219                 return true;
   1220             }
   1221         }
   1222         else if(_handle_types())
   1223         {
   1224             return true;
   1225         }
   1226         else if(_handle_val_anchors_and_refs())
   1227         {
   1228             return true;
   1229         }
   1230         /* pathological case:
   1231          * - &key : val
   1232          * - &key :
   1233          * - : val
   1234          */
                // matches ": ..." or a ':' followed only by tabs and/or a comment,
                // when no scalar is stored
   1235         else if((!has_all(SSCL)) &&
   1236                 (rem.begins_with(": ") || rem.left_of(rem.find("#")).trimr("\t") == ":"))
   1237         {
   1238             if(!m_val_anchor.empty() || !m_val_tag.empty())
   1239             {
   1240                 _c4dbgp("val is a child map + this key is empty, with anchors or tags");
   1241                 addrem_flags(RNXT, RVAL); // before _push_level!
   1242                 _move_val_tag_to_key_tag();
   1243                 _move_val_anchor_to_key_anchor();
   1244                 _push_level();
   1245                 _start_map();
   1246                 _store_scalar_null(rem.str);
   1247                 addrem_flags(RVAL, RKEY);
   1248                 RYML_CHECK(_maybe_set_indentation_from_anchor_or_tag()); // one of them must exist
   1249                 _line_progressed(rem.begins_with(": ") ? 2u : 1u);
   1250                 return true;
   1251             }
   1252             else
   1253             {
   1254                 _c4dbgp("val is a child map + this key is empty, no anchors or tags");
   1255                 addrem_flags(RNXT, RVAL); // before _push_level!
                        // read indref before _push_level(), while m_state is still the seq's level
   1256                 size_t ind = m_state->indref;
   1257                 _push_level();
   1258                 _start_map();
   1259                 _store_scalar_null(rem.str);
   1260                 addrem_flags(RVAL, RKEY);
   1261                 _c4dbgpf("set indentation from map anchor: {}", ind + 2);
   1262                 _set_indentation(ind + 2); // this is the column where the map starts
   1263                 _line_progressed(rem.begins_with(": ") ? 2u : 1u);
   1264                 return true;
   1265             }
   1266         }
   1267         else
   1268         {
   1269             _c4err("parse error");
   1270         }
   1271     }
   1272 
            // reached when neither RNXT nor RVAL is set, or after _c4err() returns
   1273     return false;
   1274 }
   1275 
   1276 //-----------------------------------------------------------------------------
   1277 
   1278 bool Parser::_rval_dash_start_or_continue_seq()
   1279 {
   1280     size_t ind = m_state->line_contents.current_col();
   1281     _RYML_CB_ASSERT(m_stack.m_callbacks, ind >= m_state->indref);
   1282     size_t delta_ind = ind - m_state->indref;
   1283     if( ! delta_ind)
   1284     {
   1285         _c4dbgp("prev val was empty");
   1286         addrem_flags(RNXT, RVAL);
   1287         _append_val_null(&m_state->line_contents.full[ind]);
   1288         return false;
   1289     }
   1290     _c4dbgp("val is a nested seq, indented");
   1291     addrem_flags(RNXT, RVAL); // before _push_level!
   1292     _push_level();
   1293     _start_seq();
   1294     _save_indentation();
   1295     return true;
   1296 }
   1297 
   1298 //-----------------------------------------------------------------------------
        /** Handle the next token of a flow-style map (`{a: b, ...}`).
         *
         * Preconditions (asserted): RMAP and FLOW are set.
         *
         * Independently of the RNXT/RKEY/RVAL state, this first deals with
         * leading spaces (and tabs, when RYML_WITH_TAB_TOKENS is defined),
         * comments, and the closing '}'. After that:
         *  - RNXT (a key-val pair was read): ',' switches to RKEY; anything
         *    else is a parse error.
         *  - RKEY (a key is expected): when no scalar is stored yet, one is
         *    scanned and stored as the key. Then ':' switches to RVAL (storing
         *    a null key if none was found); '?' marks a complex key; ',' and
         *    '}' complete the stored key with a null value; tags and
         *    anchors/refs are delegated.
         *  - RVAL (a value is expected; a stored key is asserted): a scalar is
         *    appended as key-val; '[' and '{' open nested flow containers;
         *    ',' appends a null value; ']' closes an implicit map nested in a
         *    flow sequence.
         *
         * @return true on every handled token; the trailing `return false` is
         *         only reached after _c4err() returns. */
   1299 bool Parser::_handle_map_flow()
   1300 {
   1301     // explicit flow, ie, inside {}, separated by commas
   1302     _c4dbgpf("handle_map_flow: node_id={}  level={}", m_state->node_id, m_state->level);
   1303     csubstr rem = m_state->line_contents.rem;
   1304 
   1305     _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW));
   1306 
   1307     if(rem.begins_with(' '))
   1308     {
   1309         // with explicit flow, indentation does not matter
   1310         _c4dbgp("starts with spaces");
   1311         _skipchars(' ');
   1312         return true;
   1313     }
   1314     _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
   1315     {
   1316         // with explicit flow, indentation does not matter
   1317         _c4dbgp("starts with tabs");
   1318         _skipchars('\t');
   1319         return true;
   1320     })
   1321     else if(rem.begins_with('#'))
   1322     {
   1323         _c4dbgp("it's a comment");
   1324         rem = _scan_comment(); // also progresses the line
   1325         return true;
   1326     }
   1327     else if(rem.begins_with('}'))
   1328     {
   1329         _c4dbgp("end the map");
                // a stored scalar at this point is a key still waiting for its value
   1330         if(has_all(SSCL))
   1331         {
   1332             _c4dbgp("the last val was null");
   1333             _append_key_val_null(rem.str - 1);
   1334             rem_flags(RVAL);
   1335         }
   1336         _pop_level();
   1337         _line_progressed(1);
                // the flags tested here are those in effect after _pop_level()
   1338         if(has_all(RSEQIMAP))
   1339         {
   1340             _c4dbgp("stopping implicitly nested 1x map");
   1341             _stop_seqimap();
   1342             _pop_level();
   1343         }
   1344         return true;
   1345     }
   1346 
            // RNXT: a key-val pair is complete; only a comma may follow here
   1347     if(has_any(RNXT))
   1348     {
   1349         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
   1350         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1351         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RSEQIMAP));
   1352 
                // NOTE(review): the trace messages below say "seq:" although this is the map handler
   1353         if(rem.begins_with(", "))
   1354         {
   1355             _c4dbgp("seq: expect next keyval");
   1356             addrem_flags(RKEY, RNXT);
   1357             _line_progressed(2);
   1358             return true;
   1359         }
   1360         else if(rem.begins_with(','))
   1361         {
   1362             _c4dbgp("seq: expect next keyval");
   1363             addrem_flags(RKEY, RNXT);
   1364             _line_progressed(1);
   1365             return true;
   1366         }
   1367         else
   1368         {
   1369             _c4err("parse error");
   1370         }
   1371     }
            // RKEY: a key is expected
   1372     else if(has_any(RKEY))
   1373     {
   1374         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
   1375         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1376 
   1377         bool is_quoted;
                // only scan for a key when none is stored yet; the branch does not
                // return, so the checks below run on the text following the key
   1378         if(has_none(SSCL) && _scan_scalar_map_flow(&rem, &is_quoted))
   1379         {
   1380             _c4dbgp("it's a scalar");
   1381             _store_scalar(rem, is_quoted);
   1382             rem = m_state->line_contents.rem;
   1383             csubstr trimmed = rem.triml(" \t");
                    // skip whitespace between the key and a following ':', ',' or '}'
   1384             if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
   1385             {
   1386                 _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str);
   1387                 size_t num = static_cast<size_t>(trimmed.str - rem.str);
   1388                 _c4dbgpf("trimming {} whitespace after the scalar: '{}' --> '{}'", num, rem, rem.sub(num));
   1389                 rem = rem.sub(num);
   1390                 _line_progressed(num);
   1391             }
   1392         }
   1393 
   1394         if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
   1395         {
   1396             _c4dbgp("wait for val");
   1397             addrem_flags(RVAL, RKEY|QMRK);
   1398             _line_progressed(2);
   1399             if(!has_all(SSCL))
   1400             {
   1401                 _c4dbgp("no key was found, defaulting to empty key ''");
   1402                 _store_scalar_null(rem.str);
   1403             }
   1404             return true;
   1405         }
                // here the ':' is the entire remainder of the line
   1406         else if(rem == ':')
   1407         {
   1408             _c4dbgp("wait for val");
   1409             addrem_flags(RVAL, RKEY|QMRK);
   1410             _line_progressed(1);
   1411             if(!has_all(SSCL))
   1412             {
   1413                 _c4dbgp("no key was found, defaulting to empty key ''");
   1414                 _store_scalar_null(rem.str);
   1415             }
   1416             return true;
   1417         }
   1418         else if(rem.begins_with('?'))
   1419         {
   1420             _c4dbgp("complex key");
   1421             add_flags(QMRK);
   1422             _line_progressed(1);
   1423             return true;
   1424         }
   1425         else if(rem.begins_with(','))
   1426         {
   1427             _c4dbgp("prev scalar was a key with null value");
   1428             _append_key_val_null(rem.str - 1);
   1429             _line_progressed(1);
   1430             return true;
   1431         }
                // '}' directly after a key; the '}' check at the top of the function
                // ran before the key was scanned
   1432         else if(rem.begins_with('}'))
   1433         {
   1434             _c4dbgp("map terminates after a key...");
   1435             _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
   1436             _c4dbgp("the last val was null");
   1437             _append_key_val_null(rem.str - 1);
   1438             rem_flags(RVAL);
   1439             if(has_all(RSEQIMAP))
   1440             {
   1441                 _c4dbgp("stopping implicitly nested 1x map");
   1442                 _stop_seqimap();
   1443                 _pop_level();
   1444             }
   1445             _pop_level();
   1446             _line_progressed(1);
   1447             return true;
   1448         }
   1449         else if(_handle_types())
   1450         {
   1451             return true;
   1452         }
   1453         else if(_handle_key_anchors_and_refs())
   1454         {
   1455             return true;
   1456         }
   1457         else if(rem == "")
   1458         {
   1459             return true;
   1460         }
   1461         else
   1462         {
                    // last resort: look past leading spaces/tabs for a ':' or a comment
   1463             size_t pos = rem.first_not_of(" \t");
   1464             if(pos == csubstr::npos)
   1465                pos = 0;
   1466             rem = rem.sub(pos);
   1467             if(rem.begins_with(':'))
   1468             {
   1469                 _c4dbgp("wait for val");
   1470                 addrem_flags(RVAL, RKEY|QMRK);
                        // consume the skipped whitespace plus the ':'
   1471                 _line_progressed(pos + 1);
   1472                 if(!has_all(SSCL))
   1473                 {
   1474                     _c4dbgp("no key was found, defaulting to empty key ''");
   1475                     _store_scalar_null(rem.str);
   1476                 }
   1477                 return true;
   1478             }
   1479             else if(rem.begins_with('#'))
   1480             {
   1481                 _c4dbgp("it's a comment");
   1482                 _line_progressed(pos);
   1483                 rem = _scan_comment(); // also progresses the line
   1484                 return true;
   1485             }
   1486             else
   1487             {
   1488                 _c4err("parse error");
   1489             }
   1490         }
   1491     }
            // RVAL: a key is stored; its value is expected
   1492     else if(has_any(RVAL))
   1493     {
   1494         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
   1495         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
   1496         _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
   1497         bool is_quoted;
   1498         if(_scan_scalar_map_flow(&rem, &is_quoted))
   1499         {
   1500             _c4dbgp("it's a scalar");
   1501             addrem_flags(RNXT, RVAL|RKEY);
   1502             _append_key_val(rem, is_quoted);
   1503             if(has_all(RSEQIMAP))
   1504             {
   1505                 _c4dbgp("stopping implicitly nested 1x map");
   1506                 _stop_seqimap();
   1507                 _pop_level();
   1508             }
   1509             return true;
   1510         }
   1511         else if(rem.begins_with('['))
   1512         {
   1513             _c4dbgp("val is a child seq");
   1514             addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
   1515             _push_level(/*explicit flow*/true);
   1516             _move_scalar_from_top();
   1517             _start_seq();
   1518             add_flags(FLOW);
   1519             _line_progressed(1);
   1520             return true;
   1521         }
   1522         else if(rem.begins_with('{'))
   1523         {
   1524             _c4dbgp("val is a child map");
   1525             addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
   1526             _push_level(/*explicit flow*/true);
   1527             _move_scalar_from_top();
   1528             _start_map();
   1529             addrem_flags(FLOW|RKEY, RNXT|RVAL);
   1530             _line_progressed(1);
   1531             return true;
   1532         }
   1533         else if(_handle_types())
   1534         {
   1535             return true;
   1536         }
   1537         else if(_handle_val_anchors_and_refs())
   1538         {
   1539             return true;
   1540         }
   1541         else if(rem.begins_with(','))
   1542         {
   1543             _c4dbgp("appending empty val");
   1544             _append_key_val_null(rem.str - 1);
   1545             addrem_flags(RKEY, RVAL);
   1546             _line_progressed(1);
   1547             if(has_any(RSEQIMAP))
   1548             {
   1549                 _c4dbgp("stopping implicitly nested 1x map");
   1550                 _stop_seqimap();
   1551                 _pop_level();
   1552             }
   1553             return true;
   1554         }
                // the ']' is not consumed in this branch (no _line_progressed call)
   1555         else if(has_any(RSEQIMAP) && rem.begins_with(']'))
   1556         {
   1557             _c4dbgp("stopping implicitly nested 1x map");
   1558             if(has_any(SSCL))
   1559             {
   1560                 _append_key_val_null(rem.str - 1);
   1561             }
   1562             _stop_seqimap();
   1563             _pop_level();
   1564             return true;
   1565         }
   1566         else
   1567         {
   1568             _c4err("parse error");
   1569         }
   1570     }
   1571     else
   1572     {
   1573         _c4err("internal error");
   1574     }
   1575 
            // only reached when one of the _c4err() calls above returns
   1576     return false;
   1577 }
   1578 
   1579 //-----------------------------------------------------------------------------
   1580 bool Parser::_handle_map_blck()
   1581 {
            // Handle the next token of the current line while reading a
            // block-style map: RMAP must be set and FLOW must not be set (both
            // are asserted below). The remaining text of the line is inspected
            // and dispatched on the RKEY/RVAL state. Every branch that
            // recognizes its input returns true; input that is not recognized
            // is reported with _c4err(), and the trailing `return false` is
            // reached only if _c4err() returns.
   1582     _c4dbgpf("handle_map_blck: node_id={}  level={}", m_state->node_id, m_state->level);
   1583     csubstr rem = m_state->line_contents.rem;
   1584 
   1585     _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP));
   1586     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
   1587 
   1588     if(rem.begins_with('#'))
   1589     {
   1590         _c4dbgp("it's a comment");
                // NOTE: the value stored in rem is not read again before the return
   1591         rem = _scan_comment();
   1592         return true;
   1593     }
   1594 
            // A pending RNXT is turned into RKEY right away; the RKEY branch
            // below asserts that RNXT is no longer set.
   1595     if(has_any(RNXT))
   1596     {
   1597         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
   1598         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1599         // actually, we don't need RNXT in indent-based maps.
   1600         addrem_flags(RKEY, RNXT);
   1601     }
   1602 
            // Give _handle_indentation() the first look at the line; when it
            // returns true, this call is done.
   1603     if(_handle_indentation())
   1604     {
   1605         _c4dbgp("indentation token");
   1606         return true;
   1607     }
   1608 
   1609     if(has_any(RKEY))
   1610     {
                // ---- a key is expected ----
   1611         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
   1612         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
   1613 
   1614         _c4dbgp("RMAP|RKEY read scalar?");
   1615         bool is_quoted;
   1616         if(_scan_scalar_map_blck(&rem, &is_quoted)) // this also progresses the line
   1617         {
   1618             _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
                    // A scalar is already stored (SSCL) under a '?' complex key
                    // (QMRK): emit it as a key with a null value before the
                    // newly scanned scalar is stored.
   1619             if(has_all(QMRK|SSCL))
   1620             {
   1621                 _c4dbgpf("current key is QMRK; SSCL is set. so take store scalar='{}' as key and add an empty val", m_state->scalar);
   1622                 _append_key_val_null(rem.str - 1);
   1623             }
   1624             _store_scalar(rem, is_quoted);
                    // Complex key while RSET is set (RSET is added by
                    // _handle_types() when it sees a "!!set" tag): the key is
                    // appended with a null value.
   1625             if(has_all(QMRK|RSET))
   1626             {
   1627                 _c4dbgp("it's a complex key, so use null value '~'");
   1628                 _append_key_val_null(rem.str);
   1629             }
                    // re-read the remainder: scanning the scalar progressed the line
   1630             rem = m_state->line_contents.rem;
   1631 
                    // The key is directly followed by ':': switch to reading the
                    // value, then skip the ':' and any spaces/tabs after it.
   1632             if(rem.begins_with(':'))
   1633             {
   1634                 _c4dbgp("wait for val");
   1635                 addrem_flags(RVAL, RKEY|QMRK);
   1636                 _line_progressed(1);
   1637                 rem = m_state->line_contents.rem;
   1638                 if(rem.begins_with_any(" \t"))
   1639                 {
   1640                     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
   1641                     rem = rem.left_of(rem.first_not_of(" \t"));
   1642                     _c4dbgpf("skip {} spaces/tabs", rem.len);
   1643                     _line_progressed(rem.len);
   1644                 }
   1645             }
   1646             return true;
   1647         }
                // no scalar could be scanned: skip leading spaces/tabs
                // (the whole remainder when it contains nothing else)
   1648         else if(rem.begins_with_any(" \t"))
   1649         {
   1650             size_t pos = rem.first_not_of(" \t");
   1651             if(pos == npos)
   1652                 pos = rem.len;
   1653             _c4dbgpf("skip {} spaces/tabs", pos);
   1654             _line_progressed(pos);
   1655             return true;
   1656         }
                // '?' alone on the rest of the line, or "? ": start of a complex key
   1657         else if(rem == '?' || rem.begins_with("? "))
   1658         {
   1659             _c4dbgp("it's a complex key");
   1660             _line_progressed(rem.begins_with("? ") ? 2u : 1u);
                    // rem is a local copy, so rem.str still points at the '?'
                    // even though the line was already progressed
   1661             if(has_any(SSCL))
   1662                 _append_key_val_null(rem.str - 1);
   1663             add_flags(QMRK);
   1664             return true;
   1665         }
                // ':' while a complex key is open: the key is complete. When no
                // scalar was stored for it, a null scalar is stored instead.
   1666         else if(has_all(QMRK) && rem.begins_with(':'))
   1667         {
   1668             _c4dbgp("complex key finished");
   1669             if(!has_any(SSCL))
   1670                 _store_scalar_null(rem.str);
   1671             addrem_flags(RVAL, RKEY|QMRK);
   1672             _line_progressed(1);
   1673             rem = m_state->line_contents.rem;
                    // only spaces (not tabs) are skipped after the ':' here
   1674             if(rem.begins_with(' '))
   1675             {
   1676                 _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
   1677                 _skipchars(' ');
   1678             }
   1679             return true;
   1680         }
                // ':' alone on the rest of the line, or ": " (also ":\t" when
                // RYML_WITH_TAB_TOKENS is defined): the key is finished
   1681         else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
   1682         {
   1683             _c4dbgp("key finished");
   1684             if(!has_all(SSCL))
   1685             {
   1686                 _c4dbgp("key was empty...");
   1687                 _store_scalar_null(rem.str);
   1688                 rem_flags(QMRK);
   1689             }
   1690             addrem_flags(RVAL, RKEY);
                    // consume the ':' alone, or the ':' plus the separator after it
   1691             _line_progressed(rem == ':' ? 1 : 2);
   1692             return true;
   1693         }
   1694         else if(rem.begins_with("..."))
   1695         {
   1696             _c4dbgp("end current document");
   1697             _end_stream();
   1698             _line_progressed(3);
   1699             return true;
   1700         }
                // in this branch any remainder starting with "---" starts a new
                // document (the RVAL branch below is stricter)
   1701         else if(rem.begins_with("---"))
   1702         {
   1703             _c4dbgp("start new document '---'");
   1704             _start_new_doc(rem);
   1705             return true;
   1706         }
   1707         else if(_handle_types())
   1708         {
   1709             return true;
   1710         }
   1711         else if(_handle_key_anchors_and_refs())
   1712         {
   1713             return true;
   1714         }
   1715         else
   1716         {
   1717             _c4err("parse error");
   1718         }
   1719     }
   1720     else if(has_any(RVAL))
   1721     {
                // ---- a value is expected ----
   1722         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
   1723         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
   1724 
   1725         _c4dbgp("RMAP|RVAL read scalar?");
   1726         csubstr s;
   1727         bool is_quoted;
   1728         if(_scan_scalar_map_blck(&s, &is_quoted)) // this also progresses the line
   1729         {
   1730             _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
   1731 
   1732             rem = m_state->line_contents.rem;
   1733 
                    // What follows the scanned scalar decides its role: a ':'
                    // means the scalar is not the value but the first key of a
                    // nested map; otherwise it is the value of the pending key.
   1734             if(rem.begins_with(": "))
   1735             {
   1736                 _c4dbgp("actually, the scalar is the first key of a map");
                        // NOTE: the flag set on the current level here is RKEY
                        // (the trailing comment mentions RNXT)
   1737                 addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
   1738                 _push_level();
   1739                 _move_scalar_from_top();
   1740                 _move_val_anchor_to_key_anchor();
   1741                 _start_map();
                        // the nested map's indentation is the column of the scalar
   1742                 _save_indentation(m_state->scalar_col);
                        // the new level already has its key, so it waits for a
                        // value; skip the ": "
   1743                 addrem_flags(RVAL, RKEY);
   1744                 _line_progressed(2);
   1745             }
   1746             else if(rem.begins_with(':'))
   1747             {
   1748                 _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
   1749                 addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
   1750                 _push_level();
   1751                 _move_scalar_from_top();
   1752                 _move_val_anchor_to_key_anchor();
   1753                 _start_map();
                        // here the indentation is saved with the scalar length
                        // as the "behind" argument instead of scalar_col
   1754                 _save_indentation(/*behind*/s.len);
   1755                 addrem_flags(RVAL, RKEY);
   1756                 _line_progressed(1);
   1757             }
   1758             else
   1759             {
   1760                 _c4dbgp("appending keyval to current map");
   1761                 _append_key_val(s, is_quoted);
   1762                 addrem_flags(RKEY, RVAL);
   1763             }
   1764             return true;
   1765         }
                // "- " (also "-\t" when RYML_WITH_TAB_TOKENS is defined): the
                // value is a block sequence; a new level is pushed for it
   1766         else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
   1767         {
   1768             _c4dbgp("val is a nested seq, indented");
   1769             addrem_flags(RKEY, RVAL); // before _push_level!
   1770             _push_level();
   1771             _move_scalar_from_top();
   1772             _start_seq();
   1773             _save_indentation();
   1774             _line_progressed(2);
   1775             return true;
   1776         }
                // a lone '-' is the whole remainder of the line
   1777         else if(rem == '-')
   1778         {
   1779             _c4dbgp("maybe a seq. start unknown, indented");
   1780             _start_unk();
   1781             _save_indentation();
   1782             _line_progressed(1);
   1783             return true;
   1784         }
                // '[' opens a flow sequence as the value
   1785         else if(rem.begins_with('['))
   1786         {
   1787             _c4dbgp("val is a child seq, flow");
   1788             addrem_flags(RKEY, RVAL); // before _push_level!
   1789             _push_level(/*explicit flow*/true);
   1790             _move_scalar_from_top();
   1791             _start_seq();
   1792             add_flags(FLOW);
   1793             _line_progressed(1);
   1794             return true;
   1795         }
                // '{' opens a flow map as the value; the new level starts in RKEY
   1796         else if(rem.begins_with('{'))
   1797         {
   1798             _c4dbgp("val is a child map, flow");
   1799             addrem_flags(RKEY, RVAL); // before _push_level!
   1800             _push_level(/*explicit flow*/true);
   1801             _move_scalar_from_top();
   1802             _start_map();
   1803             addrem_flags(FLOW|RKEY, RVAL);
   1804             _line_progressed(1);
   1805             return true;
   1806         }
   1807         else if(rem.begins_with(' '))
   1808         {
   1809             csubstr spc = rem.left_of(rem.first_not_of(' '));
                    // both branches skip the same spaces and return true; only
                    // the debug message differs
   1810             if(_at_line_begin())
   1811             {
   1812                 _c4dbgpf("skipping value indentation: {} spaces", spc.len);
   1813                 _line_progressed(spc.len);
   1814                 return true;
   1815             }
   1816             else
   1817             {
   1818                 _c4dbgpf("skipping {} spaces", spc.len);
   1819                 _line_progressed(spc.len);
   1820                 return true;
   1821             }
   1822         }
   1823         else if(_handle_types())
   1824         {
   1825             return true;
   1826         }
   1827         else if(_handle_val_anchors_and_refs())
   1828         {
   1829             return true;
   1830         }
                // unlike in the RKEY branch, "---" must here be the whole
                // remainder or be followed by a space or a tab
   1831         else if(rem.begins_with("--- ") || rem == "---" || rem.begins_with("---\t"))
   1832         {
   1833             _start_new_doc(rem);
   1834             return true;
   1835         }
   1836         else if(rem.begins_with("..."))
   1837         {
   1838             _c4dbgp("end current document");
   1839             _end_stream();
   1840             _line_progressed(3);
   1841             return true;
   1842         }
   1843         else
   1844         {
   1845             _c4err("parse error");
   1846         }
   1847     }
   1848     else
   1849     {
                // neither RKEY nor RVAL is set
   1850         _c4err("internal error");
   1851     }
   1852 
   1853     return false;
   1854 }
   1855 
   1856 
   1857 //-----------------------------------------------------------------------------
   1858 bool Parser::_handle_top()
   1859 {
            // Handle the remainder of the current line at the top level.
            // Recognized inputs are a '#' comment (tested on the untrimmed
            // remainder) and, after leading spaces are trimmed, a '%'
            // directive, a "---" document start and a "..." document end.
            // Each of these returns true. Anything else is reported with
            // _c4err("parse error"); `return false` is reached only if
            // _c4err() returns.
   1860     _c4dbgp("handle_top");
   1861     csubstr rem = m_state->line_contents.rem;
   1862 
   1863     if(rem.begins_with('#'))
   1864     {
   1865         _c4dbgp("a comment line");
   1866         _scan_comment();
   1867         return true;
   1868     }
   1869 
            // only leading spaces are trimmed here, not tabs
   1870     csubstr trimmed = rem.triml(' ');
   1871 
   1872     if(trimmed.begins_with('%'))
   1873     {
                // the directive takes the whole rest of the line
   1874         _handle_directive(trimmed);
   1875         _line_progressed(rem.len);
   1876         return true;
   1877     }
            // "---" must be the whole trimmed remainder or be followed by a
            // space or a tab
   1878     else if(trimmed.begins_with("--- ") || trimmed == "---" || trimmed.begins_with("---\t"))
   1879     {
                // note that the untrimmed remainder is what gets passed on
   1880         _start_new_doc(rem);
                // rem.len - trimmed.len is the number of leading spaces removed
                // by triml(); when there were any, they are skipped and the
                // indentation is saved
   1881         if(trimmed.len < rem.len)
   1882         {
   1883             _line_progressed(rem.len - trimmed.len);
   1884             _save_indentation();
   1885         }
   1886         return true;
   1887     }
   1888     else if(trimmed.begins_with("..."))
   1889     {
   1890         _c4dbgp("end current document");
   1891         _end_stream();
                // skip the leading spaces (if any), then the three dots
   1892         if(trimmed.len < rem.len)
   1893         {
   1894             _line_progressed(rem.len - trimmed.len);
   1895         }
   1896         _line_progressed(3);
   1897         return true;
   1898     }
   1899     else
   1900     {
   1901         _c4err("parse error");
   1902     }
   1903 
   1904     return false;
   1905 }
   1906 
   1907 
   1908 //-----------------------------------------------------------------------------
   1909 
   1910 bool Parser::_handle_key_anchors_and_refs()
   1911 {
            // Handle an anchor ('&name') found where a key is expected; RVAL
            // must not be set (asserted below).
            // Returns true when the remainder of the line begins with '&',
            // false when it begins with neither '&' nor '*'. A leading '*' is
            // reported with _c4err().
   1912     _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RVAL));
   1913     const csubstr rem = m_state->line_contents.rem;
   1914     if(rem.begins_with('&'))
   1915     {
   1916         _c4dbgp("found a key anchor!!!");
                // A complex key is open (QMRK) and already has a stored scalar
                // (SSCL): that key is appended with a null value first. The
                // line is not progressed in this case, so the '&' remains in
                // the line contents when this returns.
   1917         if(has_all(QMRK|SSCL))
   1918         {
   1919             _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
   1920             _c4dbgp("there is a stored key, so this anchor is for the next element");
   1921             _append_key_val_null(rem.str - 1);
   1922             rem_flags(QMRK);
   1923             return true;
   1924         }
                // the anchor token runs up to the first space in the remainder
   1925         csubstr anchor = rem.left_of(rem.first_of(' '));
   1926         _line_progressed(anchor.len);
   1927         anchor = anchor.sub(1); // skip the first character
                // an already pending key anchor is moved to the val slot before
                // the new one takes its place
   1928         _move_key_anchor_to_val_anchor();
   1929         _c4dbgpf("key anchor value: '{}'", anchor);
   1930         m_key_anchor = anchor;
                // rem is const and still begins at the '&'
   1931         m_key_anchor_indentation = m_state->line_contents.current_col(rem);
   1932         return true;
   1933     }
   1934     else if(C4_UNLIKELY(rem.begins_with('*')))
   1935     {
   1936         _c4err("not implemented - this should have been catched elsewhere");
   1937         C4_NEVER_REACH();
   1938         return false;
   1939     }
   1940     return false;
   1941 }
   1942 
   1943 bool Parser::_handle_val_anchors_and_refs()
   1944 {
            // Handle an anchor ('&name') found where a value is expected; RKEY
            // must not be set (asserted below).
            // Returns true when the remainder of the line begins with '&' (the
            // anchor token is then always consumed from the line), false when
            // it begins with neither '&' nor '*'. A leading '*' is reported
            // with _c4err().
   1945     _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RKEY));
   1946     const csubstr rem = m_state->line_contents.rem;
   1947     if(rem.begins_with('&'))
   1948     {
                // the anchor token runs up to the first space in the remainder
   1949         csubstr anchor = rem.left_of(rem.first_of(' '));
   1950         _line_progressed(anchor.len);
   1951         anchor = anchor.sub(1); // skip the first character
   1952         _c4dbgpf("val: found an anchor: '{}', indentation={}!!!", anchor, m_state->line_contents.current_col(rem));
   1953         if(m_val_anchor.empty())
   1954         {
   1955             _c4dbgpf("save val anchor: '{}'", anchor);
   1956             m_val_anchor = anchor;
   1957             m_val_anchor_indentation = m_state->line_contents.current_col(rem);
   1958         }
   1959         else
   1960         {
                    // A val anchor is already pending. Where the new anchor
                    // goes depends on the current node.
                    // NOTE(review): when the current node is not a seq, the new
                    // anchor has been consumed above but is not stored by this
                    // function -- confirm that this is intended.
   1961             _c4dbgpf("there is a pending val anchor '{}'", m_val_anchor);
   1962             if(m_tree->is_seq(m_state->node_id))
   1963             {
   1964                 if(m_tree->has_children(m_state->node_id))
   1965                 {
   1966                     _c4dbgpf("current node={} is a seq, has {} children", m_state->node_id, m_tree->num_children(m_state->node_id));
   1967                     _c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
   1968                     m_key_anchor = anchor;
   1969                     m_key_anchor_indentation = m_state->line_contents.current_col(rem);
   1970                 }
   1971                 else
   1972                 {
   1973                     _c4dbgpf("current node={} is a seq, has no children", m_state->node_id);
   1974                     if(m_tree->has_val_anchor(m_state->node_id))
   1975                     {
   1976                         _c4dbgpf("... node={} already has val anchor: '{}'", m_state->node_id, m_tree->val_anchor(m_state->node_id));
   1977                         _c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
   1978                         m_key_anchor = anchor;
   1979                         m_key_anchor_indentation = m_state->line_contents.current_col(rem);
   1980                     }
   1981                     else
   1982                     {
                                // the previously pending anchor is applied to
                                // the seq node itself, and the new anchor
                                // becomes the pending val anchor
   1983                         _c4dbgpf("... so set pending val anchor: '{}' on current node {}", m_val_anchor, m_state->node_id);
   1984                         m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
   1985                         m_val_anchor = anchor;
   1986                         m_val_anchor_indentation = m_state->line_contents.current_col(rem);
   1987                     }
   1988                 }
   1989             }
   1990         }
   1991         return true;
   1992     }
   1993     else if(C4_UNLIKELY(rem.begins_with('*')))
   1994     {
   1995         _c4err("not implemented - this should have been catched elsewhere");
   1996         C4_NEVER_REACH();
   1997         return false;
   1998     }
   1999     return false;
   2000 }
   2001 
   2002 void Parser::_move_key_anchor_to_val_anchor()
   2003 {
            // Move the pending key anchor, together with its indentation, into
            // the val anchor slot and clear the key slot. Does nothing when no
            // key anchor is pending.
   2004     if(m_key_anchor.empty())
   2005         return;
   2006     _c4dbgpf("move current key anchor to val slot: key='{}' -> val='{}'", m_key_anchor, m_val_anchor);
            // an occupied val slot is an error; if _c4err() returns, the val
            // slot is overwritten below
   2007     if(!m_val_anchor.empty())
   2008         _c4err("triple-pending anchor");
   2009     m_val_anchor = m_key_anchor;
   2010     m_val_anchor_indentation = m_key_anchor_indentation;
   2011     m_key_anchor = {};
   2012     m_key_anchor_indentation = {};
   2013 }
   2014 
   2015 void Parser::_move_val_anchor_to_key_anchor()
   2016 {
            // Move the pending val anchor, together with its indentation, into
            // the key anchor slot and clear the val slot. Does nothing when no
            // val anchor is pending, or when _token_is_from_this_line()
            // rejects the pending anchor (judging by its name, that check is
            // about the anchor text lying on the current line -- not visible
            // here).
   2017     if(m_val_anchor.empty())
   2018         return;
   2019     if(!_token_is_from_this_line(m_val_anchor))
   2020         return;
   2021     _c4dbgpf("move current val anchor to key slot: key='{}' <- val='{}'", m_key_anchor, m_val_anchor);
            // an occupied key slot is an error; if _c4err() returns, the key
            // slot is overwritten below
   2022     if(!m_key_anchor.empty())
   2023         _c4err("triple-pending anchor");
   2024     m_key_anchor = m_val_anchor;
   2025     m_key_anchor_indentation = m_val_anchor_indentation;
   2026     m_val_anchor = {};
   2027     m_val_anchor_indentation = {};
   2028 }
   2029 
   2030 void Parser::_move_key_tag_to_val_tag()
   2031 {
            // Move the pending key tag and its indentation into the val tag
            // slot, then clear the key slot. Does nothing when no key tag is
            // pending. The val slot is overwritten without checking whether it
            // was empty.
   2032     if(m_key_tag.empty())
   2033         return;
   2034     _c4dbgpf("move key tag to val tag: key='{}' -> val='{}'", m_key_tag, m_val_tag);
   2035     m_val_tag = m_key_tag;
   2036     m_val_tag_indentation = m_key_tag_indentation;
   2037     m_key_tag.clear();
   2038     m_key_tag_indentation = 0;
   2039 }
   2040 
   2041 void Parser::_move_val_tag_to_key_tag()
   2042 {
            // Move the pending val tag and its indentation into the key tag
            // slot, then clear the val slot. Does nothing when no val tag is
            // pending or when _token_is_from_this_line() rejects the pending
            // tag. The key slot is overwritten without checking whether it was
            // empty.
   2043     if(m_val_tag.empty())
   2044         return;
   2045     if(!_token_is_from_this_line(m_val_tag))
   2046         return;
   2047     _c4dbgpf("move val tag to key tag: key='{}' <- val='{}'", m_key_tag, m_val_tag);
   2048     m_key_tag = m_val_tag;
   2049     m_key_tag_indentation = m_val_tag_indentation;
   2050     m_val_tag.clear();
   2051     m_val_tag_indentation = 0;
   2052 }
   2053 
   2054 void Parser::_move_key_tag2_to_key_tag()
   2055 {
            // Move the secondary key tag (m_key_tag2) and its indentation into
            // the primary key tag slot, then clear the secondary slot. Does
            // nothing when m_key_tag2 is empty. m_key_tag2 is filled by
            // _handle_types() when a key tag arrives while m_key_tag is
            // already occupied.
   2056     if(m_key_tag2.empty())
   2057         return;
   2058     _c4dbgpf("move key tag2 to key tag: key='{}' <- key2='{}'", m_key_tag, m_key_tag2);
   2059     m_key_tag = m_key_tag2;
   2060     m_key_tag_indentation = m_key_tag2_indentation;
   2061     m_key_tag2.clear();
   2062     m_key_tag2_indentation = 0;
   2063 }
   2064 
   2065 
   2066 //-----------------------------------------------------------------------------
   2067 
   2068 bool Parser::_handle_types()
   2069 {
            // Look for a tag token ("!!...", "!<...>", "!h!..." or "!...") at
            // the start of the remaining line, ignoring leading spaces.
            // Returns false, without consuming anything, when there is none.
            // Otherwise the tag and the spaces/tabs after it are consumed, the
            // tag is saved in m_key_tag, m_val_tag or m_key_tag2 depending on
            // the parser state, and true is returned.
            // As a special case, a str-type tag on a document value makes the
            // document scalar get read and stored right here (see the end of
            // the function).
   2070     csubstr rem = m_state->line_contents.rem.triml(' ');
   2071     csubstr t;
   2072 
            // In every case below, t keeps the complete tag text including
            // its leading marker: the calls that would strip the marker are
            // commented out.
   2073     if(rem.begins_with("!!"))
   2074     {
   2075         _c4dbgp("begins with '!!'");
                // the tag ends at the first space or comma
   2076         t = rem.left_of(rem.first_of(" ,"));
   2077         _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
   2078         //t = t.sub(2);
   2079         if(t == "!!set")
   2080             add_flags(RSET);
   2081     }
   2082     else if(rem.begins_with("!<"))
   2083     {
   2084         _c4dbgp("begins with '!<'");
                // the tag ends at the first '>' (presumably the `true` argument
                // makes left_of() include that character -- confirm)
   2085         t = rem.left_of(rem.first_of('>'), true);
   2086         _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
   2087         //t = t.sub(2, t.len-1);
   2088     }
   2089     else if(rem.begins_with("!h!"))
   2090     {
   2091         _c4dbgp("begins with '!h!'");
   2092         t = rem.left_of(rem.first_of(' '));
   2093         _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 3);
   2094         //t = t.sub(3);
   2095     }
   2096     else if(rem.begins_with('!'))
   2097     {
   2098         _c4dbgp("begins with '!'");
   2099         t = rem.left_of(rem.first_of(' '));
   2100         _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1);
   2101         //t = t.sub(1);
   2102     }
   2103 
            // no tag at this position
   2104     if(t.empty())
   2105         return false;
   2106 
            // A complex key is open (QMRK) and already has a stored scalar
            // (SSCL): that key is appended with a null value before the tag
            // is processed.
   2107     if(has_all(QMRK|SSCL))
   2108     {
   2109         _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
   2110         _c4dbgp("there is a stored key, so this tag is for the next element");
   2111         _append_key_val_null(rem.str - 1);
   2112         rem_flags(QMRK);
   2113     }
   2114 
   2115     #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
   2116     const char *tag_beginning = rem.str;
   2117     #endif
   2118     size_t tag_indentation = m_state->line_contents.current_col(t);
   2119     _c4dbgpf("there was a tag: '{}', indentation={}", t, tag_indentation);
   2120     _RYML_CB_ASSERT(m_stack.m_callbacks, t.end() > m_state->line_contents.rem.begin());
            // Progress from the start of the untrimmed remainder up to the end
            // of the tag (this also covers the leading spaces trimmed above),
            // then skip the spaces/tabs that follow the tag.
   2121     _line_progressed(static_cast<size_t>(t.end() - m_state->line_contents.rem.begin()));
   2122     {
   2123         size_t pos = m_state->line_contents.rem.first_not_of(" \t");
   2124         if(pos != csubstr::npos)
   2125             _line_progressed(pos);
   2126     }
   2127 
            // Decide which slot receives the tag.
   2128     if(has_all(RMAP|RKEY))
   2129     {
   2130         _c4dbgpf("saving map key tag '{}'", t);
   2131         _RYML_CB_ASSERT(m_stack.m_callbacks, m_key_tag.empty());
   2132         m_key_tag = t;
   2133         m_key_tag_indentation = tag_indentation;
   2134     }
   2135     else if(has_all(RMAP|RVAL))
   2136     {
   2137         /* foo: !!str
   2138          * !!str : bar  */
                // rem becomes what follows the tag on this line, without any
                // '#' comment and without trailing spaces/tabs. Apart from the
                // debug print it is only used by the conditionally compiled
                // section below.
   2139         rem = m_state->line_contents.rem;
   2140         rem = rem.left_of(rem.find("#"));
   2141         rem = rem.trimr(" \t");
   2142         _c4dbgpf("rem='{}'", rem);
   2143         #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
   2144         if(rem == ':' || rem.begins_with(": "))
   2145         {
   2146             _c4dbgp("the last val was null, and this is a tag from a null key");
   2147             _append_key_val_null(tag_beginning - 1);
   2148             _store_scalar_null(rem.str - 1);
   2149             // do not change the flag to key, it is ~
   2150             _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begin() > m_state->line_contents.rem.begin());
   2151             size_t token_len = rem == ':' ? 1 : 2;
   2152             _line_progressed(static_cast<size_t>(token_len + rem.begin() - m_state->line_contents.rem.begin()));
   2153         }
   2154         #endif
   2155         _c4dbgpf("saving map val tag '{}'", t);
   2156         _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
   2157         m_val_tag = t;
   2158         m_val_tag_indentation = tag_indentation;
   2159     }
   2160     else if(has_all(RSEQ|RVAL) || has_all(RTOP|RUNK|NDOC))
   2161     {
                // the first tag goes to the val slot; a tag arriving while the
                // val slot is occupied goes to the key slot
   2162         if(m_val_tag.empty())
   2163         {
   2164             _c4dbgpf("saving seq/doc val tag '{}'", t);
   2165             m_val_tag = t;
   2166             m_val_tag_indentation = tag_indentation;
   2167         }
   2168         else
   2169         {
   2170             _c4dbgpf("saving seq/doc key tag '{}'", t);
   2171             m_key_tag = t;
   2172             m_key_tag_indentation = tag_indentation;
   2173         }
   2174     }
            // NOTE(review): the left operand looks subsumed by has_any(RUNK),
            // assuming has_all/has_any test "all bits"/"any bit" -- confirm
   2175     else if(has_all(RTOP|RUNK) || has_any(RUNK))
   2176     {
                // Nothing but an optional comment after the tag: it is a val
                // tag. Otherwise it is taken as a key tag.
   2177         rem = m_state->line_contents.rem;
   2178         rem = rem.left_of(rem.find("#"));
   2179         rem = rem.trimr(" \t");
   2180         if(rem.empty())
   2181         {
   2182             _c4dbgpf("saving val tag '{}'", t);
   2183             _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
   2184             m_val_tag = t;
   2185             m_val_tag_indentation = tag_indentation;
   2186         }
   2187         else
   2188         {
   2189             _c4dbgpf("saving key tag '{}'", t);
   2190             if(m_key_tag.empty())
   2191             {
   2192                 m_key_tag = t;
   2193                 m_key_tag_indentation = tag_indentation;
   2194             }
   2195             else
   2196             {
   2197                 /* handle this case:
   2198                  * !!str foo: !!map
   2199                  *   !!int 1: !!float 20.0
   2200                  *   !!int 3: !!float 40.0
   2201                  *
   2202                  * (m_key_tag would be !!str and m_key_tag2 would be !!int)
   2203                  */
   2204                 m_key_tag2 = t;
   2205                 m_key_tag2_indentation = tag_indentation;
   2206             }
   2207         }
   2208     }
   2209     else
   2210     {
   2211         _c4err("internal error");
   2212     }
   2213 
            // Special case: a val tag is pending, the tag just read (t) is a
            // str-type tag, and the state is RTOP|RUNK|NDOC. The document
            // scalar is then read immediately and stored on the current node
            // as a DOC value, together with the normalized val tag and any
            // pending val anchor; afterwards _end_stream() is called.
   2214     if(m_val_tag.not_empty())
   2215     {
   2216         YamlTag_e tag = to_tag(t);
   2217         if(tag == TAG_STR)
   2218         {
   2219             _c4dbgpf("tag '{}' is a str-type tag", t);
   2220             if(has_all(RTOP|RUNK|NDOC))
   2221             {
   2222                 _c4dbgpf("docval. slurping the string. pos={}", m_state->pos.offset);
   2223                 csubstr scalar = _slurp_doc_scalar();
   2224                 _c4dbgpf("docval. after slurp: {}, at node {}: '{}'", m_state->pos.offset, m_state->node_id, scalar);
   2225                 m_tree->to_val(m_state->node_id, scalar, DOC);
   2226                 _c4dbgpf("docval. val tag {} -> {}", m_val_tag, normalize_tag(m_val_tag));
   2227                 m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
   2228                 m_val_tag.clear();
   2229                 if(!m_val_anchor.empty())
   2230                 {
   2231                     _c4dbgpf("setting val anchor[{}]='{}'", m_state->node_id, m_val_anchor);
   2232                     m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
   2233                     m_val_anchor.clear();
   2234                 }
   2235                 _end_stream();
   2236             }
   2237         }
   2238     }
   2239     return true;
   2240 }
   2241 
   2242 //-----------------------------------------------------------------------------
   2243 csubstr Parser::_slurp_doc_scalar()
   2244 {
            // Read the scalar that forms the value of a document and return
            // it. The current line is asserted to contain "---". Leading
            // spaces/tabs and an optional val anchor are consumed first; the
            // scalar is then scanned as single-quoted, double-quoted, block
            // ('|' or '>') or plain, depending on its first character.
            // s mirrors the remaining line contents and pos the current
            // offset into m_buf; both are refreshed after each step that
            // progresses the line.
   2245     csubstr s = m_state->line_contents.rem;
   2246     size_t pos = m_state->pos.offset;
   2247     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.find("---") != csubstr::npos);
   2248     _c4dbgpf("slurp 0 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
            // nothing left on this line: continue with the next line
   2249     if(s.len == 0)
   2250     {
   2251         _line_ended();
   2252         _scan_line();
   2253         s = m_state->line_contents.rem;
   2254         pos = m_state->pos.offset;
   2255     }
   2256 
            // skip leading spaces/tabs (nothing is skipped when the remainder
            // consists only of spaces/tabs, since first_not_of() then yields npos)
   2257     size_t skipws = s.first_not_of(" \t");
   2258     _c4dbgpf("slurp 1 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
   2259     if(skipws != npos)
   2260     {
   2261         _line_progressed(skipws);
   2262         s = m_state->line_contents.rem;
   2263         pos = m_state->pos.offset;
   2264         _c4dbgpf("slurp 2 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
   2265     }
   2266 
            // No val anchor may be pending at this point. If the call below
            // picks one up, m_val_anchor becomes non-empty and the
            // spaces/tabs following the anchor are skipped as well.
   2267     _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_anchor.empty());
   2268     _handle_val_anchors_and_refs();
   2269     if(!m_val_anchor.empty())
   2270     {
   2271         s = m_state->line_contents.rem;
   2272         skipws = s.first_not_of(" \t");
   2273         if(skipws != npos)
   2274         {
   2275             _line_progressed(skipws);
   2276         }
   2277         s = m_state->line_contents.rem;
   2278         pos = m_state->pos.offset;
   2279         _c4dbgpf("slurp 3 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
   2280     }
   2281 
            // quoted and block scalars are handed to their dedicated scanners
   2282     if(s.begins_with('\''))
   2283     {
   2284         m_state->scalar_col = m_state->line_contents.current_col(s);
   2285         return _scan_squot_scalar();
   2286     }
   2287     else if(s.begins_with('"'))
   2288     {
   2289         m_state->scalar_col = m_state->line_contents.current_col(s);
   2290         return _scan_dquot_scalar();
   2291     }
   2292     else if(s.begins_with('|') || s.begins_with('>'))
   2293     {
   2294         return _scan_block();
   2295     }
   2296 
   2297     _c4dbgpf("slurp 4 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
   2298 
            // Plain scalar: take the rest of the line. The line is progressed
            // from offset pos up to the end of s.
   2299     m_state->scalar_col = m_state->line_contents.current_col(s);
   2300     _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() + pos);
   2301     _line_progressed(static_cast<size_t>(s.end() - (m_buf.begin() + pos)));
   2302 
   2303     _c4dbgpf("slurp 5 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset));
   2304 
            // at the end of the line, _extend_scanned_scalar() gets the chance
            // to extend the scalar
   2305     if(_at_line_end())
   2306     {
   2307         _c4dbgpf("at line end. curr='{}'", s);
   2308         s = _extend_scanned_scalar(s);
   2309     }
   2310 
   2311     _c4dbgpf("scalar was '{}'", s);
   2312 
   2313     return s;
   2314 }
   2315 
   2316 
   2317 //-----------------------------------------------------------------------------
   2318 
   2319 bool Parser::_scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
   2320 {
            // Try to scan a scalar value in a block-style sequence. The
            // asserts below require RSEQ and RVAL to be set and RKEY and FLOW
            // to be unset.
            //   scalar: receives the scanned scalar on success
            //   quoted: receives true for single-quoted, double-quoted and
            //           block ('|' / '>') scalars, false for plain scalars
            // Returns true when a scalar was scanned. Returns false, leaving
            // both outputs untouched, when the remainder of the line does not
            // hold a scalar.
   2321     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RSEQ));
   2322     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RVAL));
   2323     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(RKEY));
   2324     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(FLOW));
   2325 
   2326     csubstr s = m_state->line_contents.rem;
   2327     if(s.len == 0)
   2328         return false;
            // strip spaces/tabs on both sides; nothing left means no scalar
   2329     s = s.trim(" \t");
   2330     if(s.len == 0)
   2331         return false;
   2332 
   2333     if(s.begins_with('\''))
   2334     {
   2335         _c4dbgp("got a ': scanning single-quoted scalar");
   2336         m_state->scalar_col = m_state->line_contents.current_col(s);
   2337         *scalar = _scan_squot_scalar();
   2338         *quoted = true;
   2339         return true;
   2340     }
   2341     else if(s.begins_with('"'))
   2342     {
   2343         _c4dbgp("got a \": scanning double-quoted scalar");
   2344         m_state->scalar_col = m_state->line_contents.current_col(s);
   2345         *scalar = _scan_dquot_scalar();
   2346         *quoted = true;
   2347         return true;
   2348     }
   2349     else if(s.begins_with('|') || s.begins_with('>'))
   2350     {
                // block scalars are reported as quoted, too
   2351         *scalar = _scan_block();
   2352         *quoted = true;
   2353         return true;
   2354     }
            // at the top level, a document separator is not a scalar
   2355     else if(has_any(RTOP) && _is_doc_sep(s))
   2356     {
   2357         return false;
   2358     }
   2359 
   2360     _c4dbgp("RSEQ|RVAL");
   2361     if( ! _is_scalar_next__rseq_rval(s))
   2362         return false;
   2363     _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t"))
   2364         return false;
   2365     )
   2366 
            // Plain scalar: find where it ends. A ':' as the last character is
            // dropped; otherwise the scalar is cut at the first ": " or " #"
            // (and at ":\t" when RYML_WITH_TAB_TOKENS is defined).
   2367     if(s.ends_with(':'))
   2368     {
   2369         --s.len;
   2370     }
   2371     else
   2372     {
   2373         auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
   2374         if(first)
   2375             s.len = first.pos;
   2376     }
   2377     s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2378 
   2379     if(s.empty())
   2380         return false;
   2381 
   2382     m_state->scalar_col = m_state->line_contents.current_col(s);
   2383     _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
            // consume whatever precedes the scalar in the remainder, plus the
            // scalar itself
   2384     _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
   2385 
            // at the end of the line, _extend_scanned_scalar() gets the chance
            // to extend the scalar -- except when the scalar is exactly "~"
   2386     if(_at_line_end() && s != '~')
   2387     {
   2388         _c4dbgpf("at line end. curr='{}'", s);
   2389         s = _extend_scanned_scalar(s);
   2390     }
   2391 
   2392     _c4dbgpf("scalar was '{}'", s);
   2393 
   2394     *scalar = s;
   2395     *quoted = false;
   2396     return true;
   2397 }
   2398 
   2399 bool Parser::_scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
   2400 {
            // Scans one scalar from the unread part of the current line while reading a
            // block-style map (RMAP set, FLOW not set), in key (RKEY) or value (RVAL) position.
            // On success returns true, stores the scalar in *scalar and sets *quoted to true for
            // single-quoted, double-quoted and block ('|' / '>') scalars, and to false for plain
            // scalars. Returns false when no scalar is found here; *scalar and *quoted are then
            // left unwritten.
   2401     _c4dbgp("_scan_scalar_map_blck");
   2402     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RMAP));
   2403     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(FLOW));
   2404     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RKEY|RVAL));
   2405 
   2406     csubstr s = m_state->line_contents.rem;
   2407     #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
   2408     if(s.len == 0)
   2409         return false;
   2410     #endif
   2411     s = s.trim(" \t");
   2412     if(s.len == 0)
   2413         return false;
   2414 
            // quoted and block scalars are handed to their dedicated scanners
   2415     if(s.begins_with('\''))
   2416     {
   2417         _c4dbgp("got a ': scanning single-quoted scalar");
   2418         m_state->scalar_col = m_state->line_contents.current_col(s);
   2419         *scalar = _scan_squot_scalar();
   2420         *quoted = true;
   2421         return true;
   2422     }
   2423     else if(s.begins_with('"'))
   2424     {
   2425         _c4dbgp("got a \": scanning double-quoted scalar");
   2426         m_state->scalar_col = m_state->line_contents.current_col(s);
   2427         *scalar = _scan_dquot_scalar();
   2428         *quoted = true;
   2429         return true;
   2430     }
   2431     else if(s.begins_with('|') || s.begins_with('>'))
   2432     {
   2433         *scalar = _scan_block();
   2434         *quoted = true;
   2435         return true;
   2436     }
   2437     else if(has_any(RTOP) && _is_doc_sep(s))
   2438     {
                // in RTOP state, text recognised by _is_doc_sep() is not a scalar
   2439         return false;
   2440     }
   2441 
   2442     if( ! _is_scalar_next__rmap(s))
   2443         return false;
   2444 
            // Locate the key/value separator: the first ": " if there is one; otherwise
            // (optionally ":\t" when tab tokens are enabled, then) a ':' counts only when the
            // first ':' in s is also its last character. colon_token stays npos when none is found.
   2445     size_t colon_token = s.find(": ");
   2446     if(colon_token == npos)
   2447     {
   2448         _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
   2449             // with tab tokens
   2450             colon_token = s.find(":\t");
   2451             if(colon_token == npos)
   2452             {
   2453                 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0);
   2454                 colon_token = s.find(':');
   2455                 if(colon_token != s.len-1)
   2456                     colon_token = npos;
   2457             }
   2458             ,
   2459             // without tab tokens
   2460             colon_token = s.find(':');
   2461             _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0);
   2462             if(colon_token != s.len-1)
   2463                 colon_token = npos;
   2464         )
   2465     }
   2466 
   2467     if(has_all(RKEY))
   2468     {
                // key position: the scalar is the text to the left of the separator
   2469         _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' '));
   2470         if(has_any(QMRK))
   2471         {
   2472             _c4dbgp("RMAP|RKEY|CPLX");
   2473             _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
                    // a "? " marker (or a lone '?') is not a scalar
   2474             if(s.begins_with("? ") || s == '?')
   2475                 return false;
   2476             s = s.left_of(colon_token);
                    // with QMRK set, everything from the first '#' onwards is dropped as well
   2477             s = s.left_of(s.first_of("#"));
   2478             s = s.trimr(" \t");
   2479             if(s.begins_with("---"))
   2480                 return false;
   2481             else if(s.begins_with("..."))
   2482                 return false;
   2483         }
   2484         else
   2485         {
   2486             _c4dbgp("RMAP|RKEY");
                    // unlike the assertions above, this is a _RYML_CB_CHECK
   2487             _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{'));
   2488             if(s.begins_with("? ") || s == '?')
   2489                 return false;
   2490             s = s.left_of(colon_token);
   2491             s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2492             if(s.begins_with("---"))
   2493             {
   2494                 return false;
   2495             }
   2496             else if(s.begins_with("..."))
   2497             {
   2498                 return false;
   2499             }
   2500         }
   2501     }
   2502     else if(has_all(RVAL))
   2503     {
   2504         _c4dbgp("RMAP|RVAL");
   2505         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
   2506         if( ! _is_scalar_next__rmap_val(s))
   2507             return false;
   2508         _RYML_WITH_TAB_TOKENS(
   2509         else if(s.begins_with("-\t"))
   2510             return false;
   2511         )
   2512         _c4dbgp("RMAP|RVAL: scalar");
                // value position: only a trailing comment (" #" or "\t#") is cut off
   2513         s = s.left_of(s.find(" #")); // is there a comment?
   2514         s = s.left_of(s.find("\t#")); // is there a comment?
   2515         s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2516         if(s.begins_with("---"))
   2517             return false;
   2518         #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
   2519         else if(s.begins_with("..."))
   2520             return false;
   2521         #endif
   2522     }
   2523 
   2524     if(s.empty())
   2525         return false;
   2526 
            // remember the column where the scalar starts, then mark the line as consumed up to
            // the end of the scalar (leading skipped characters plus the scalar itself)
   2527     m_state->scalar_col = m_state->line_contents.current_col(s);
   2528     _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
   2529     _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
   2530 
            // nothing is left on this line: let _extend_scanned_scalar() look at following lines
            // (not attempted when the scalar is exactly '~')
   2531     if(_at_line_end() && s != '~')
   2532     {
   2533         _c4dbgpf("at line end. curr='{}'", s);
   2534         s = _extend_scanned_scalar(s);
   2535     }
   2536 
   2537     _c4dbgpf("scalar was '{}'", s);
   2538 
   2539     *scalar = s;
   2540     *quoted = false;
   2541     return true;
   2542 }
   2543 
   2544 bool Parser::_scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
   2545 {
            // Scans one scalar from the unread part of the current line while reading a
            // flow-style sequence (RSEQ, FLOW and RVAL set; RKEY not set).
            // On success returns true, stores the scalar in *scalar and sets *quoted to true for
            // single- or double-quoted scalars, false for plain ones. Returns false when no
            // scalar is found; the outputs are then left unwritten.
   2546     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RSEQ));
   2547     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(FLOW));
   2548     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RVAL));
   2549     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(RKEY));
   2550 
   2551     csubstr s = m_state->line_contents.rem;
   2552     if(s.len == 0)
   2553         return false;
   2554     s = s.trim(" \t");
   2555     if(s.len == 0)
   2556         return false;
   2557 
            // quoted scalars are handed to their dedicated scanners
   2558     if(s.begins_with('\''))
   2559     {
   2560         _c4dbgp("got a ': scanning single-quoted scalar");
   2561         m_state->scalar_col = m_state->line_contents.current_col(s);
   2562         *scalar = _scan_squot_scalar();
   2563         *quoted = true;
   2564         return true;
   2565     }
   2566     else if(s.begins_with('"'))
   2567     {
   2568         _c4dbgp("got a \": scanning double-quoted scalar");
   2569         m_state->scalar_col = m_state->line_contents.current_col(s);
   2570         *scalar = _scan_dquot_scalar();
   2571         *quoted = true;
   2572         return true;
   2573     }
   2574 
   2575     if(has_all(RVAL))
   2576     {
   2577         _c4dbgp("RSEQ|RVAL");
   2578         if( ! _is_scalar_next__rseq_rval(s))
   2579             return false;
   2580         _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t"))
   2581             return false;
   2582         )
   2583         _c4dbgp("RSEQ|RVAL|FLOW");
                // the scalar cannot run past the next ',' or ']'
   2584         s = s.left_of(s.first_of(",]"));
                // drop a trailing ':'; otherwise cut where first_of_any() reports ": "
                // (":\t" too when tab tokens are enabled) or " #"
   2585         if(s.ends_with(':'))
   2586         {
   2587             --s.len;
   2588         }
   2589         else
   2590         {
   2591             auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
   2592             if(first)
   2593                 s.len = first.pos;
   2594         }
   2595         s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2596     }
   2597 
   2598     if(s.empty())
   2599         return false;
   2600 
            // remember the scalar's start column and consume the line up to the scalar's end
   2601     m_state->scalar_col = m_state->line_contents.current_col(s);
   2602     _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
   2603     _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
   2604 
            // nothing is left on this line: let _extend_scanned_scalar() look at following lines
            // (not attempted when the scalar is exactly '~')
   2605     if(_at_line_end() && s != '~')
   2606     {
   2607         _c4dbgpf("at line end. curr='{}'", s);
   2608         s = _extend_scanned_scalar(s);
   2609     }
   2610 
   2611     _c4dbgpf("scalar was '{}'", s);
   2612 
   2613     *scalar = s;
   2614     *quoted = false;
   2615     return true;
   2616 }
   2617 
   2618 bool Parser::_scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
   2619 {
            // Scans one scalar from the unread part of the current line while reading a
            // flow-style map (RMAP and FLOW set), in key (RKEY) or value (RVAL) position.
            // On success returns true, stores the scalar in *scalar and sets *quoted to true for
            // single- or double-quoted scalars, false for plain ones. Returns false when no
            // scalar is found; the outputs are then left unwritten.
   2620     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RMAP));
   2621     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(FLOW));
   2622     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RKEY|RVAL));
   2623 
   2624     csubstr s = m_state->line_contents.rem;
   2625     if(s.len == 0)
   2626         return false;
   2627     s = s.trim(" \t");
   2628     if(s.len == 0)
   2629         return false;
   2630 
            // quoted scalars are handed to their dedicated scanners
   2631     if(s.begins_with('\''))
   2632     {
   2633         _c4dbgp("got a ': scanning single-quoted scalar");
   2634         m_state->scalar_col = m_state->line_contents.current_col(s);
   2635         *scalar = _scan_squot_scalar();
   2636         *quoted = true;
   2637         return true;
   2638     }
   2639     else if(s.begins_with('"'))
   2640     {
   2641         _c4dbgp("got a \": scanning double-quoted scalar");
   2642         m_state->scalar_col = m_state->line_contents.current_col(s);
   2643         *scalar = _scan_dquot_scalar();
   2644         *quoted = true;
   2645         return true;
   2646     }
   2647 
   2648     if( ! _is_scalar_next__rmap(s))
   2649         return false;
   2650 
   2651     if(has_all(RKEY))
   2652     {
   2653         _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' '));
                // Locate the key/value separator: the first ": " if there is one; otherwise
                // (optionally ":\t" when tab tokens are enabled, then) a ':' counts only when the
                // first ':' in s is also its last character. colon_token stays npos when none is found.
   2654         size_t colon_token = s.find(": ");
   2655         if(colon_token == npos)
   2656         {
   2657             _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
   2658                 // with tab tokens
   2659                 colon_token = s.find(":\t");
   2660                 if(colon_token == npos)
   2661                 {
   2662                     _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0);
   2663                     colon_token = s.find(':');
   2664                     if(colon_token != s.len-1)
   2665                         colon_token = npos;
   2666                 }
   2667                 ,
   2668                 // without tab tokens
   2669                 colon_token = s.find(':');
   2670                 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0);
   2671                 if(colon_token != s.len-1)
   2672                     colon_token = npos;
   2673             )
   2674         }
                // a "? " marker (or a lone '?') is not a scalar
   2675         if(s.begins_with("? ") || s == '?')
   2676             return false;
   2677         if(has_any(QMRK))
   2678         {
   2679             _c4dbgp("RMAP|RKEY|CPLX");
   2680             _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
                    // keep the text left of the separator, then cut at the first '#' and at the first ':'
   2681             s = s.left_of(colon_token);
   2682             s = s.left_of(s.first_of("#"));
   2683             s = s.left_of(s.first_of(':'));
   2684             s = s.trimr(" \t");
   2685             if(s.begins_with("---"))
   2686                 return false;
   2687             else if(s.begins_with("..."))
   2688                 return false;
   2689         }
   2690         else
   2691         {
   2692             _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{'));
   2693             _c4dbgp("RMAP|RKEY");
   2694             s = s.left_of(colon_token);
   2695             s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2696             _c4dbgpf("RMAP|RKEY|FLOW: '{}'", s);
                    // a key cannot run past the next ',' or '}'; a ':' left at the end is dropped
   2697             s = s.left_of(s.first_of(",}"));
   2698             if(s.ends_with(':'))
   2699                 --s.len;
   2700         }
   2701     }
   2702     else if(has_all(RVAL))
   2703     {
   2704         _c4dbgp("RMAP|RVAL");
   2705         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
   2706         if( ! _is_scalar_next__rmap_val(s))
   2707             return false;
   2708         _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t"))
   2709             return false;
   2710         )
   2711         _c4dbgp("RMAP|RVAL|FLOW");
                // the value ends at ',' or '}' -- or at ',' or ']' when RSEQIMAP is set
                // (presumably a map nested implicitly in a flow sequence -- TODO confirm)
   2712         if(has_none(RSEQIMAP))
   2713             s = s.left_of(s.first_of(",}"));
   2714         else
   2715             s = s.left_of(s.first_of(",]"));
   2716         s = s.left_of(s.find(" #")); // is there a comment?
   2717         s = s.left_of(s.find("\t#")); // is there a comment?
   2718         s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
   2719     }
   2720 
   2721     if(s.empty())
   2722         return false;
   2723 
            // remember the scalar's start column and consume the line up to the scalar's end
   2724     m_state->scalar_col = m_state->line_contents.current_col(s);
   2725     _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
   2726     _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
   2727 
            // nothing is left on this line: let _extend_scanned_scalar() look at following lines
            // (not attempted when the scalar is exactly '~')
   2728     if(_at_line_end() && s != '~')
   2729     {
   2730         _c4dbgpf("at line end. curr='{}'", s);
   2731         s = _extend_scanned_scalar(s);
   2732     }
   2733 
   2734     _c4dbgpf("scalar was '{}'", s);
   2735 
   2736     *scalar = s;
   2737     *quoted = false;
   2738     return true;
   2739 }
   2740 
   2741 bool Parser::_scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
   2742 {
            // Scans one scalar from the unread part of the current line while the parser is in
            // the RUNK state. On success returns true, stores the scalar in *scalar and sets
            // *quoted to true for single-quoted, double-quoted and block ('|' / '>') scalars,
            // false for plain ones. Returns false when no scalar is found; the outputs are then
            // left unwritten.
   2743     _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RUNK));
   2744 
   2745     csubstr s = m_state->line_contents.rem;
   2746     if(s.len == 0)
   2747         return false;
   2748     s = s.trim(" \t");
   2749     if(s.len == 0)
   2750         return false;
   2751 
            // quoted and block scalars are handed to their dedicated scanners
   2752     if(s.begins_with('\''))
   2753     {
   2754         _c4dbgp("got a ': scanning single-quoted scalar");
   2755         m_state->scalar_col = m_state->line_contents.current_col(s);
   2756         *scalar = _scan_squot_scalar();
   2757         *quoted = true;
   2758         return true;
   2759     }
   2760     else if(s.begins_with('"'))
   2761     {
   2762         _c4dbgp("got a \": scanning double-quoted scalar");
   2763         m_state->scalar_col = m_state->line_contents.current_col(s);
   2764         *scalar = _scan_dquot_scalar();
   2765         *quoted = true;
   2766         return true;
   2767     }
   2768     else if(s.begins_with('|') || s.begins_with('>'))
   2769     {
   2770         *scalar = _scan_block();
   2771         *quoted = true;
   2772         return true;
   2773     }
   2774     else if(has_any(RTOP) && _is_doc_sep(s))
   2775     {
                // in RTOP state, text recognised by _is_doc_sep() is not a scalar
   2776         return false;
   2777     }
   2778 
   2779     _c4dbgpf("RUNK '[{}]~~~{}~~~", s.len, s);
   2780     if( ! _is_scalar_next__runk(s))
   2781     {
   2782         _c4dbgp("RUNK: no scalar next");
   2783         return false;
   2784     }
            // first cut off a trailing comment introduced by " #"
   2785     size_t pos = s.find(" #");
   2786     if(pos != npos)
   2787     {
   2788         _c4dbgpf("RUNK: found ' #' at {}", pos);
   2789         s = s.left_of(pos);
   2790     }
            // Then exactly one of the following applies, tried in this order: cut at the first
            // ": "; drop a trailing ':'; (with tab tokens only) cut at the first ":\t";
            // otherwise cut at the first ','. Without tab tokens the macro below expands to
            // nothing, so the final else pairs with the ends_with(':') test.
   2791     pos = s.find(": ");
   2792     if(pos != npos)
   2793     {
   2794         _c4dbgpf("RUNK: found ': ' at {}", pos);
   2795         s = s.left_of(pos);
   2796     }
   2797     else if(s.ends_with(':'))
   2798     {
   2799         _c4dbgp("RUNK: ends with ':'");
   2800         s = s.left_of(s.len-1);
   2801     }
   2802     _RYML_WITH_TAB_TOKENS(
   2803     else if((pos = s.find(":\t")) != npos) // TABS
   2804     {
   2805         _c4dbgp("RUNK: ends with ':\\t'");
   2806         s = s.left_of(pos);
   2807     })
   2808     else
   2809     {
   2810         _c4dbgp("RUNK: trimming left of ,");
   2811         s = s.left_of(s.first_of(','));
   2812     }
   2813     s = s.trim(" \t");
   2814     _c4dbgpf("RUNK: scalar=[{}]~~~{}~~~", s.len, s);
   2815 
   2816     if(s.empty())
   2817         return false;
   2818 
            // remember the scalar's start column and consume the line up to the scalar's end
   2819     m_state->scalar_col = m_state->line_contents.current_col(s);
   2820     _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
   2821     _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
   2822 
            // nothing is left on this line: let _extend_scanned_scalar() look at following lines
            // (not attempted when the scalar is exactly '~')
   2823     if(_at_line_end() && s != '~')
   2824     {
   2825         _c4dbgpf("at line end. curr=[{}]~~~{}~~", s.len, s);
   2826         s = _extend_scanned_scalar(s);
   2827     }
   2828 
   2829     _c4dbgpf("scalar was [{}]~~~{}~~~", s.len, s);
   2830 
   2831     *scalar = s;
   2832     *quoted = false;
   2833     return true;
   2834 }
   2835 
   2836 
   2837 //-----------------------------------------------------------------------------
   2838 
   2839 csubstr Parser::_extend_scanned_scalar(csubstr s)
   2840 {
            // Called by the scalar scanners once a plain scalar `s` reached the end of its line.
            // Looks at the following lines and, if they continue the scalar, returns the extended
            // scalar after passing it through _filter_plain_scalar(); otherwise returns `s`
            // unchanged. Three cases are distinguished: an explicit key (RMAP|RKEY|QMRK all set),
            // a block-style plain scalar (FLOW not set) and a flow-style plain scalar.
   2841     if(has_all(RMAP|RKEY|QMRK))
   2842     {
                // explicit key: in flow context any indentation continues the key,
                // otherwise the continuation must reach the column where the scalar started
   2843         size_t scalar_indentation = has_any(FLOW) ? 0 : m_state->scalar_col;
   2844         _c4dbgpf("extend_scalar: explicit key! indref={} scalar_indentation={} scalar_col={}", m_state->indref, scalar_indentation, m_state->scalar_col);
   2845         csubstr n = _scan_to_next_nonempty_line(scalar_indentation);
   2846         if(!n.empty())
   2847         {
   2848             substr full = _scan_complex_key(s, n).trimr(" \t\r\n");
                    // only filter when the scan actually produced something different from s
   2849             if(full != s)
   2850                 s = _filter_plain_scalar(full, scalar_indentation);
   2851         }
   2852     }
   2853     // deal with plain (unquoted) scalars that continue to the next line
   2854     else if(!s.begins_with_any("*")) // cannot be a plain scalar if it starts with * (that's an anchor reference)
   2855     {
   2856         _c4dbgpf("extend_scalar: line ended, scalar='{}'", s);
   2857         if(has_none(FLOW))
   2858         {
                    // block context: continuation lines must be indented by at least indref + 1
                    // spaces, except in RUNK state with indref == 0, where no indentation is required
   2859             size_t scalar_indentation = m_state->indref + 1;
   2860             if(has_all(RUNK) && scalar_indentation == 1)
   2861                 scalar_indentation = 0;
   2862             csubstr n = _scan_to_next_nonempty_line(scalar_indentation);
   2863             if(!n.empty())
   2864             {
   2865                 _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, scalar_indentation);
   2866                 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(n));
   2867                 substr full = _scan_plain_scalar_blck(s, n, scalar_indentation);
   2868                 if(full.len >= s.len)
   2869                     s = _filter_plain_scalar(full, scalar_indentation);
   2870             }
   2871         }
   2872         else
   2873         {
                    // flow context: indentation is not significant, so 0 is used throughout
   2874             _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
   2875             csubstr n = _scan_to_next_nonempty_line(/*indentation*/0);
   2876             if(!n.empty())
   2877             {
   2878                 _c4dbgp("rscalar[FLOW]");
   2879                 substr full = _scan_plain_scalar_flow(s, n);
   2880                 s = _filter_plain_scalar(full, /*indentation*/0);
   2881             }
   2882         }
   2883     }
   2884 
   2885     return s;
   2886 }
   2887 
   2888 
   2889 //-----------------------------------------------------------------------------
   2890 
   2891 substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line)
   2892 {
            // Extends a plain scalar inside a flow container across following lines.
            // `currscalar` is the part already scanned; `peeked_line` is the first following
            // line, supplied by the caller. Lines are appended until one contains any of the
            // characters "[]{}?#," or, in RMAP|RKEY or RUNK state, a colon before such a
            // character. Returns the span of m_buf from the start of `currscalar` up to the
            // parser's current offset, with trailing "\n\r " characters trimmed.
            // Raises an error through _c4err() when no further line is available.
   2893     static constexpr const csubstr chars = "[]{}?#,";
   2894     size_t pos = peeked_line.first_of(chars);
   2895     bool first = true;
            // a special character at position 0 means this line adds nothing to the scalar
   2896     while(pos != 0)
   2897     {
   2898         if(has_all(RMAP|RKEY) || has_any(RUNK))
   2899         {
   2900             csubstr tpkl = peeked_line.triml(' ').trimr("\r\n");
   2901             if(tpkl.begins_with(": ") || tpkl == ':')
   2902             {
   2903                 _c4dbgpf("rscalar[FLOW]: map value starts on the peeked line: '{}'", peeked_line);
                        // nothing of this line belongs to the scalar; note that the line is
                        // not marked as progressed in this branch
   2904                 peeked_line = peeked_line.first(0);
   2905                 break;
   2906             }
   2907             else
   2908             {
   2909                 auto colon_pos = peeked_line.first_of_any(": ", ":");
                        // a colon ahead of the first special character ends the scalar there
   2910                 if(colon_pos && colon_pos.pos < pos)
   2911                 {
   2912                     peeked_line = peeked_line.first(colon_pos.pos);
   2913                     _c4dbgpf("rscalar[FLOW]: found colon at {}. peeked='{}'", colon_pos.pos, peeked_line);
   2914                     _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
   2915                     _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
   2916                     break;
   2917                 }
   2918             }
   2919         }
   2920         if(pos != npos)
   2921         {
   2922             _c4dbgpf("rscalar[FLOW]: found special character '{}' at {}, stopping: '{}'", peeked_line[pos], pos, peeked_line.left_of(pos).trimr("\r\n"));
                    // keep only the text before the special character and consume that much of the line
   2923             peeked_line = peeked_line.left_of(pos);
   2924             _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
   2925             _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
   2926             break;
   2927         }
   2928         _c4dbgpf("rscalar[FLOW]: append another line, full: '{}'", peeked_line.trimr("\r\n"));
                // the whole line belongs to the scalar; _advance_to_peeked() is called on
                // every iteration except the first before the next line is fetched
   2929         if(!first)
   2930         {
   2931             RYML_CHECK(_advance_to_peeked());
   2932         }
   2933         peeked_line = _scan_to_next_nonempty_line(/*indentation*/0);
   2934         if(peeked_line.empty())
   2935         {
   2936             _c4err("expected token or continuation");
   2937         }
   2938         pos = peeked_line.first_of(chars);
   2939         first = false;
   2940     }
            // the result runs from the start of currscalar to the current parse offset in m_buf
   2941     substr full(m_buf.str + (currscalar.str - m_buf.str), m_buf.begin() + m_state->pos.offset);
   2942     full = full.trimr("\n\r ");
   2943     return full;
   2944 }
   2945 
   2946 
   2947 //-----------------------------------------------------------------------------
   2948 
   2949 substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation)
   2950 {
            // Extends a block-context plain scalar across following lines.
            // `currscalar` is the part already scanned and must lie inside m_buf; `peeked_line`
            // is the first following line and must start with `indentation` spaces.
            // Lines are appended until a document marker ("..." / "---"), a less-indented
            // non-blank line, a " #" comment or the end of the file is reached. Returns the span
            // of m_buf starting at `currscalar` and extended by the number of bytes consumed,
            // with trailing "\r\n " characters trimmed. Raises an error through _c4err() when a
            // continuation line contains ": " or ends with ':'.
   2951     _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
   2952     // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice
   2953     // size_t offs = m_state->pos.offset;   // so we workaround by directly counting from the end of the given scalar
   2954     _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
   2955     size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin());
   2956     _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.begins_with(' ', indentation));
   2957     while(true)
   2958     {
   2959         _c4dbgpf("rscalar[IMPL]: continuing... ref_indentation={}", indentation);
   2960         if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
   2961         {
   2962             _c4dbgpf("rscalar[IMPL]: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
   2963             break;
   2964         }
   2965         else if(( ! peeked_line.begins_with(' ', indentation))) // is the line deindented?
   2966         {
   2967             if(!peeked_line.trim(" \r\n\t").empty()) // is the line not blank?
   2968             {
   2969                 _c4dbgpf("rscalar[IMPL]: deindented line, not blank -- bail now '{}'", peeked_line.trimr("\r\n"));
   2970                 break;
   2971             }
   2972             _c4dbgpf("rscalar[IMPL]: line is blank and has less indentation: ref={} line={}: '{}'", indentation, peeked_line.first_not_of(' ') == csubstr::npos ? 0 : peeked_line.first_not_of(' '), peeked_line.trimr("\r\n"));
   2973             _c4dbgpf("rscalar[IMPL]: ... searching for a line starting at indentation {}", indentation);
                    // a blank, less-indented line: keep going only if a sufficiently indented line follows
   2974             csubstr next_peeked = _scan_to_next_nonempty_line(indentation);
   2975             if(next_peeked.empty())
   2976             {
   2977                 _c4dbgp("rscalar[IMPL]: ... finished.");
   2978                 break;
   2979             }
   2980             _c4dbgp("rscalar[IMPL]: ... continuing.");
   2981             peeked_line = next_peeked;
   2982         }
   2983 
   2984         _c4dbgpf("rscalar[IMPL]: line contents: '{}'", peeked_line.right_of(indentation, true).trimr("\r\n"));
                // NOTE(review): both error texts below say "plain flow" although this is the
                // block-context scanner -- confirm whether the wording is intended
   2985         size_t token_pos;
   2986         if(peeked_line.find(": ") != npos)
   2987         {
   2988             _line_progressed(peeked_line.find(": "));
   2989             _c4err("': ' is not a valid token in plain flow (unquoted) scalars");
   2990         }
   2991         else if(peeked_line.ends_with(':'))
   2992         {
                    // NOTE(review): find(':') yields the first ':' on the line, which need not be
                    // the trailing one; this seems to affect only the position passed to _line_progressed()
   2993             _line_progressed(peeked_line.find(':'));
   2994             _c4err("lines cannot end with ':' in plain flow (unquoted) scalars");
   2995         }
   2996         else if((token_pos = peeked_line.find(" #")) != npos)
   2997         {
                    // a comment ends the scalar: consume the line up to the comment and stop
   2998             _line_progressed(token_pos);
   2999             break;
   3000             //_c4err("' #' is not a valid token in plain flow (unquoted) scalars");
   3001         }
   3002 
   3003         _c4dbgpf("rscalar[IMPL]: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
   3004         if(!_advance_to_peeked())
   3005         {
   3006             _c4dbgp("rscalar[IMPL]: file finishes after the scalar");
   3007             break;
   3008         }
   3009         peeked_line = m_state->line_contents.rem;
   3010     }
            // extend currscalar by however far the parse offset moved past its original end
   3011     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs);
   3012     substr full(m_buf.str + (currscalar.str - m_buf.str),
   3013                 currscalar.len + (m_state->pos.offset - offs));
   3014     full = full.trimr("\r\n ");
   3015     return full;
   3016 }
   3017 
   3018 substr Parser::_scan_complex_key(csubstr currscalar, csubstr peeked_line)
   3019 {
            // Extends the scalar of an explicit key across following lines.
            // `currscalar` is the part already scanned and must lie inside m_buf; `peeked_line`
            // is the first following line. Scanning stops at a document marker ("..." / "---"),
            // at a line containing one of "?:[]{}" or "- ", at a line containing ": ", or when no
            // further line is available. Returns the span of m_buf starting at `currscalar` and
            // extended by the number of bytes consumed; the result is not trimmed here.
   3020     _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
   3021     // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice
   3022     // size_t offs = m_state->pos.offset;   // so we workaround by directly counting from the end of the given scalar
   3023     _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
   3024     size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin());
   3025     while(true)
   3026     {
   3027         _c4dbgp("rcplxkey: continuing...");
   3028         if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
   3029         {
   3030             _c4dbgpf("rcplxkey: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
   3031             break;
   3032         }
   3033         else
   3034         {
                    // the key ends at the first of "?:[]{}" or, failing that, at the first "- "
   3035             size_t pos = peeked_line.first_of("?:[]{}");
   3036             if(pos == csubstr::npos)
   3037             {
   3038                 pos = peeked_line.find("- ");
   3039             }
   3040             if(pos != csubstr::npos)
   3041             {
   3042                 _c4dbgpf("rcplxkey: found special characters at pos={}: '{}'", pos, peeked_line.trimr("\r\n"));
   3043                 _line_progressed(pos);
   3044                 break;
   3045             }
   3046         }
   3047 
   3048         _c4dbgpf("rcplxkey: no special chars found '{}'", peeked_line.trimr("\r\n"));
   3049         csubstr next_peeked = _scan_to_next_nonempty_line(0);
   3050         if(next_peeked.empty())
   3051         {
   3052             _c4dbgp("rcplxkey: empty ... finished.");
   3053             break;
   3054         }
   3055         _c4dbgp("rcplxkey: ... continuing.");
   3056         peeked_line = next_peeked;
   3057 
   3058         _c4dbgpf("rcplxkey: line contents: '{}'", peeked_line.trimr("\r\n"));
   3059         size_t colpos;
   3060         if((colpos = peeked_line.find(": ")) != npos)
   3061         {
   3062             _c4dbgp("rcplxkey: found ': ', stopping.");
   3063             _line_progressed(colpos);
   3064             break;
   3065         }
   3066         #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
                // NOTE(review): in this disabled branch colpos receives the result of ends_with(),
                // which appears to be a boolean rather than a position -- check before re-enabling
   3067         else if((colpos = peeked_line.ends_with(':')))
   3068         {
   3069             _c4dbgp("rcplxkey: ends with ':', stopping.");
   3070             _line_progressed(colpos);
   3071             break;
   3072         }
   3073         #endif
   3074         _c4dbgpf("rcplxkey: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
   3075         if(!_advance_to_peeked())
   3076         {
   3077             _c4dbgp("rcplxkey: file finishes after the scalar");
   3078             break;
   3079         }
   3080         peeked_line = m_state->line_contents.rem;
   3081     }
            // extend currscalar by however far the parse offset moved past its original end
   3082     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs);
   3083     substr full(m_buf.str + (currscalar.str - m_buf.str),
   3084                 currscalar.len + (m_state->pos.offset - offs));
   3085     return full;
   3086 }
   3087 
   3088 //! Looks ahead for the next line that starts with at least `indentation` spaces, advances
   3089 //! to it and returns it. Blank lines with less indentation are skipped on the way. Returns
   3090 //! an empty string if a line's first non-space character is '#', if a less-indented line
   3091 //! is not blank, or if the file ends first.
   3092 csubstr Parser::_scan_to_next_nonempty_line(size_t indentation)
   3093 {
   3094     for(;;)
   3095     {
   3096         _c4dbgpf("rscalar: ... curr offset: {} indentation={}", m_state->pos.offset, indentation);
   3097         const csubstr candidate = _peek_next_line(m_state->pos.offset);
   3098         const csubstr unindented = candidate.triml(' ');
   3099         _c4dbgpf("rscalar: ... next peeked line='{}'", candidate.trimr("\r\n"));
   3100         if(unindented.begins_with('#'))
   3101         {
   3102             _c4dbgp("rscalar: ... first non-space character is #");
   3103             return {};
   3104         }
   3105         if(candidate.begins_with(' ', indentation))
   3106         {
   3107             _c4dbgpf("rscalar: ... begins at same indentation {}, assuming continuation", indentation);
   3108             _advance_to_peeked();
   3109             return candidate;
   3110         }
   3111         // not indented enough: tolerated only when the line is blank
   3112         const csubstr payload = unindented.trimr("\t\r\n");
   3113         _c4dbgpf("rscalar: ... deindented! trimmed='{}'", payload);
   3114         if( ! payload.empty())
   3115         {
   3116             _c4dbgp("rscalar: ... and not empty. bailing out.");
   3117             return {};
   3118         }
   3119         if( ! _advance_to_peeked()) // step onto the blank line, then examine the one after it
   3120         {
   3121             _c4dbgp("rscalar: file finished");
   3122             return {};
   3123         }
   3124     }
   3125     return {}; // not reached: the loop above only leaves through a return
   3126 }
   3127 
   3128 //! Consumes the rest of the current line and loads the next one. Returns false at end of file.
   3129 bool Parser::_advance_to_peeked()
   3130 {
   3131     _line_progressed(m_state->line_contents.rem.len); // use up what is left of this line
   3132     _line_ended(); // step over the line ending onto the peeked-at line
   3133     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.first_of("\r\n") == csubstr::npos);
   3134     _c4dbgpf("advance to peeked: scan more... pos={} len={}", m_state->pos.offset, m_buf.len);
   3135     _scan_line();  // make the peeked-at line the current line
   3136     const bool more_input = ! _finished_file();
   3137     if( ! more_input)
   3138     {
   3139         _c4dbgp("rscalar: finished file!");
   3140     }
   3141     return more_input;
   3142 }
   3143 
   3144 //-----------------------------------------------------------------------------
   3145 
   3146 C4_ALWAYS_INLINE size_t _extend_from_combined_newline(char nl, char following)
   3147 {   // 1 when (nl, following) form a two-character newline, "\r\n" or "\n\r"; 0 otherwise
   3148     return static_cast<size_t>((nl == '\r') ? (following == '\n') : (nl == '\n' && following == '\r'));
   3149 }
   3150 
   3151 //! Returns the part of `rem` after its first line break (a "\r\n" or "\n\r" pair
   3152 //! counts as one break); empty when there is no break or nothing follows it.
   3153 csubstr from_next_line(csubstr rem)
   3154 {
   3155     const size_t break_pos = rem.first_of("\r\n");
   3156     if(break_pos == csubstr::npos)
   3157         return {};
   3158     const char break_char = rem[break_pos];
   3159     const csubstr after = rem.right_of(break_pos);
   3160     if(after.empty())
   3161         return {};
   3162     const bool paired = _extend_from_combined_newline(break_char, after.front()) != 0;
   3163     return paired ? after.sub(1) : after;
   3164 }
   3165 
   3166 csubstr Parser::_peek_next_line(size_t pos) const
   3167 {
   3168     // npos is shorthand for "the current parse offset"
   3169     if(pos == npos)
   3170         pos = m_state->pos.offset;
   3171 
   3172     // the text after the first line break found at or after `pos`; this stays
   3173     // empty when `pos` is beyond the buffer or nothing follows that line break
   3174     csubstr next_line{};
   3175     if(pos < m_buf.len)
   3176         next_line = from_next_line(m_buf.sub(pos));
   3177     if(next_line.empty())
   3178     {
   3179         _c4dbgpf("peek next line @ {}: (len=0)''", pos);
   3180         return {};
   3181     }
   3182 
   3183     // keep one line only, including its line ending (a two-character newline stays whole)
   3184     size_t last = next_line.first_of("\r\n");
   3185     if(last != csubstr::npos && last + 1 < next_line.len)
   3186         last += _extend_from_combined_newline(next_line[last], next_line[last+1]);
   3187     next_line = next_line.left_of(last, /*include_pos*/true);
   3188 
   3189     _c4dbgpf("peek next line @ {}: (len={})'{}'", pos, next_line.len, next_line.trimr("\r\n"));
   3190     return next_line;
   3191 }
   3192 
   3193 
   3194 //-----------------------------------------------------------------------------
   3195 void Parser::LineContents::reset_with_next_line(csubstr buf, size_t offset)
   3196 {
   3197     RYML_ASSERT(offset <= buf.len);
   3198     char const* C4_RESTRICT b = &buf[offset];
   3199     char const* C4_RESTRICT e = b;
   3200     // get the current line stripped of newline chars
   3201     while(e < buf.end() && (*e != '\n' && *e != '\r'))
   3202         ++e;
   3203     RYML_ASSERT(e >= b);
   3204     const csubstr stripped_ = buf.sub(offset, static_cast<size_t>(e - b));
   3205     // advance pos to include the first line ending
   3206     if(e != buf.end() && *e == '\r')
   3207         ++e;
   3208     if(e != buf.end() && *e == '\n')
   3209         ++e;
   3210     RYML_ASSERT(e >= b);
   3211     const csubstr full_ = buf.sub(offset, static_cast<size_t>(e - b));
   3212     reset(full_, stripped_);
   3213 }
   3214 
   3215 void Parser::_scan_line()
   3216 {
   3217     if(m_state->pos.offset >= m_buf.len)
   3218     {
   3219         m_state->line_contents.reset(m_buf.last(0), m_buf.last(0));
   3220         return;
   3221     }
   3222     m_state->line_contents.reset_with_next_line(m_buf, m_state->pos.offset);
   3223 }
   3224 
   3225 
   3226 //-----------------------------------------------------------------------------
   3227 void Parser::_line_progressed(size_t ahead)
   3228 {
   3229     _c4dbgpf("line[{}] ({} cols) progressed by {}:  col {}-->{}   offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, ahead, m_state->pos.col, m_state->pos.col+ahead, m_state->pos.offset, m_state->pos.offset+ahead);
   3230     m_state->pos.offset += ahead;
   3231     m_state->pos.col += ahead;
   3232     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col <= m_state->line_contents.stripped.len+1);
   3233     m_state->line_contents.rem = m_state->line_contents.rem.sub(ahead);
   3234 }
   3235 
   3236 void Parser::_line_ended()
   3237 {
   3238     _c4dbgpf("line[{}] ({} cols) ended! offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, m_state->pos.offset, m_state->pos.offset+m_state->line_contents.full.len - m_state->line_contents.stripped.len);
   3239     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == m_state->line_contents.stripped.len+1);
   3240     m_state->pos.offset += m_state->line_contents.full.len - m_state->line_contents.stripped.len;
   3241     ++m_state->pos.line;
   3242     m_state->pos.col = 1;
   3243 }
   3244 
   3245 void Parser::_line_ended_undo()
   3246 {
   3247     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == 1u);
   3248     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line > 0u);
   3249     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_state->line_contents.full.len - m_state->line_contents.stripped.len);
   3250     size_t delta = m_state->line_contents.full.len - m_state->line_contents.stripped.len;
   3251     _c4dbgpf("line[{}] undo ended! line {}-->{}, offset {}-->{}", m_state->pos.line, m_state->pos.line, m_state->pos.line - 1, m_state->pos.offset, m_state->pos.offset - delta);
   3252     m_state->pos.offset -= delta;
   3253     --m_state->pos.line;
   3254     m_state->pos.col = m_state->line_contents.stripped.len + 1u;
   3255     // don't forget to undo also the changes to the remainder of the line
   3256     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_buf.len || m_buf[m_state->pos.offset] == '\n' || m_buf[m_state->pos.offset] == '\r');
   3257     m_state->line_contents.rem = m_buf.sub(m_state->pos.offset, 0);
   3258 }
   3259 
   3260 
   3261 //-----------------------------------------------------------------------------
//! Set the reference indentation (indref) of the current state.
//! @param indentation the value to store
void Parser::_set_indentation(size_t indentation)
{
    m_state->indref = indentation;
    _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref);
}
   3267 
   3268 void Parser::_save_indentation(size_t behind)
   3269 {
   3270     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begin() >= m_state->line_contents.full.begin());
   3271     m_state->indref = static_cast<size_t>(m_state->line_contents.rem.begin() - m_state->line_contents.full.begin());
   3272     _RYML_CB_ASSERT(m_stack.m_callbacks, behind <= m_state->indref);
   3273     m_state->indref -= behind;
   3274     _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref);
   3275 }
   3276 
   3277 bool Parser::_maybe_set_indentation_from_anchor_or_tag()
   3278 {
   3279     if(m_key_anchor.not_empty())
   3280     {
   3281         _c4dbgpf("set indentation from key anchor: {}", m_key_anchor_indentation);
   3282         _set_indentation(m_key_anchor_indentation); // this is the column where the anchor starts
   3283         return true;
   3284     }
   3285     else if(m_key_tag.not_empty())
   3286     {
   3287         _c4dbgpf("set indentation from key tag: {}", m_key_tag_indentation);
   3288         _set_indentation(m_key_tag_indentation); // this is the column where the tag starts
   3289         return true;
   3290     }
   3291     return false;
   3292 }
   3293 
   3294 
   3295 //-----------------------------------------------------------------------------
   3296 void Parser::_write_key_anchor(size_t node_id)
   3297 {
   3298     _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_key(node_id));
   3299     if( ! m_key_anchor.empty())
   3300     {
   3301         _c4dbgpf("node={}: set key anchor to '{}'", node_id, m_key_anchor);
   3302         m_tree->set_key_anchor(node_id, m_key_anchor);
   3303         m_key_anchor.clear();
   3304         m_key_anchor_was_before = false;
   3305         m_key_anchor_indentation = 0;
   3306     }
   3307     else if( ! m_tree->is_key_quoted(node_id))
   3308     {
   3309         csubstr r = m_tree->key(node_id);
   3310         if(r.begins_with('*'))
   3311         {
   3312             _c4dbgpf("node={}: set key reference: '{}'", node_id, r);
   3313             m_tree->set_key_ref(node_id, r.sub(1));
   3314         }
   3315         else if(r == "<<")
   3316         {
   3317             m_tree->set_key_ref(node_id, r);
   3318             _c4dbgpf("node={}: it's an inheriting reference", node_id);
   3319             if(m_tree->is_seq(node_id))
   3320             {
   3321                 _c4dbgpf("node={}: inheriting from seq of {}", node_id, m_tree->num_children(node_id));
   3322                 for(size_t i = m_tree->first_child(node_id); i != NONE; i = m_tree->next_sibling(i))
   3323                 {
   3324                     if( ! (m_tree->val(i).begins_with('*')))
   3325                         _c4err("malformed reference: '{}'", m_tree->val(i));
   3326                 }
   3327             }
   3328             else if( ! m_tree->val(node_id).begins_with('*'))
   3329             {
   3330                  _c4err("malformed reference: '{}'", m_tree->val(node_id));
   3331             }
   3332             //m_tree->set_key_ref(node_id, r);
   3333         }
   3334     }
   3335 }
   3336 
   3337 //-----------------------------------------------------------------------------
   3338 void Parser::_write_val_anchor(size_t node_id)
   3339 {
   3340     if( ! m_val_anchor.empty())
   3341     {
   3342         _c4dbgpf("node={}: set val anchor to '{}'", node_id, m_val_anchor);
   3343         m_tree->set_val_anchor(node_id, m_val_anchor);
   3344         m_val_anchor.clear();
   3345     }
   3346     csubstr r = m_tree->has_val(node_id) ? m_tree->val(node_id) : "";
   3347     if(!m_tree->is_val_quoted(node_id) && r.begins_with('*'))
   3348     {
   3349         _c4dbgpf("node={}: set val reference: '{}'", node_id, r);
   3350         RYML_CHECK(!m_tree->has_val_anchor(node_id));
   3351         m_tree->set_val_ref(node_id, r.sub(1));
   3352     }
   3353 }
   3354 
   3355 //-----------------------------------------------------------------------------
   3356 void Parser::_push_level(bool explicit_flow_chars)
   3357 {
   3358     _c4dbgpf("pushing level! currnode={}  currlevel={} stacksize={} stackcap={}", m_state->node_id, m_state->level, m_stack.size(), m_stack.capacity());
   3359     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
   3360     if(node(m_state) == nullptr)
   3361     {
   3362         _c4dbgp("pushing level! actually no, current node is null");
   3363         //_RYML_CB_ASSERT(m_stack.m_callbacks,  ! explicit_flow_chars);
   3364         return;
   3365     }
   3366     flag_t st = RUNK;
   3367     if(explicit_flow_chars || has_all(FLOW))
   3368     {
   3369         st |= FLOW;
   3370     }
   3371     m_stack.push_top();
   3372     m_state = &m_stack.top();
   3373     set_flags(st);
   3374     m_state->node_id = (size_t)NONE;
   3375     m_state->indref = (size_t)NONE;
   3376     ++m_state->level;
   3377     _c4dbgpf("pushing level: now, currlevel={}", m_state->level);
   3378 }
   3379 
   3380 void Parser::_pop_level()
   3381 {
   3382     _c4dbgpf("popping level! currnode={} currlevel={}", m_state->node_id, m_state->level);
   3383     if(has_any(RMAP) || m_tree->is_map(m_state->node_id))
   3384     {
   3385         _stop_map();
   3386     }
   3387     if(has_any(RSEQ) || m_tree->is_seq(m_state->node_id))
   3388     {
   3389         _stop_seq();
   3390     }
   3391     if(m_tree->is_doc(m_state->node_id))
   3392     {
   3393         _stop_doc();
   3394     }
   3395     _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() > 1);
   3396     _prepare_pop();
   3397     m_stack.pop();
   3398     m_state = &m_stack.top();
   3399     /*if(has_any(RMAP))
   3400     {
   3401         _toggle_key_val();
   3402     }*/
   3403     if(m_state->line_contents.indentation == 0)
   3404     {
   3405         //_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RTOP));
   3406         add_flags(RTOP);
   3407     }
   3408     _c4dbgpf("popping level: now, currnode={} currlevel={}", m_state->node_id, m_state->level);
   3409 }
   3410 
   3411 //-----------------------------------------------------------------------------
//! Start a level of still unknown type: push a new level and move to it
//! the scalar stored in the previous state, if any. The parameter is
//! not used.
void Parser::_start_unk(bool /*as_child*/)
{
    _c4dbgp("start_unk");
    _push_level();
    _move_scalar_from_top();
}
   3418 
   3419 //-----------------------------------------------------------------------------
/** Start a document.
 * The parent is the node of the state just below the top of the stack,
 * or the root node when the stack holds fewer than two states; it must
 * be the tree's root.
 * @param as_child when true, the document is appended as a new child of
 *        the parent, which is first rearranged into a stream if it is
 *        not one already. The handling for false is only compiled when
 *        RYML_NO_COVERAGE__TO_BE_DELETED is defined. */
void Parser::_start_doc(bool as_child)
{
    _c4dbgpf("start_doc (as child={})", as_child);
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_root(parent_id));
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
    if(as_child)
    {
        _c4dbgpf("start_doc: parent={}", parent_id);
        if( ! m_tree->is_stream(parent_id))
        {
            _c4dbgp("start_doc: rearranging with root as STREAM");
            m_tree->set_root_as_stream();
        }
        // the new document becomes the node of the current state
        m_state->node_id = m_tree->append_child(parent_id);
        m_tree->to_doc(m_state->node_id);
    }
    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
    else
    {
        // reuse the parent itself as the document
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(parent_id) || m_tree->empty(parent_id));
        m_state->node_id = parent_id;
        if( ! m_tree->is_doc(parent_id))
        {
            m_tree->to_doc(parent_id, DOC);
        }
    }
    #endif
    _c4dbgpf("start_doc: id={}", m_state->node_id);
    // NDOC is only kept while _handle_types() runs
    add_flags(RUNK|RTOP|NDOC);
    _handle_types();
    rem_flags(NDOC);
}
   3455 
   3456 void Parser::_stop_doc()
   3457 {
   3458     size_t doc_node = m_state->node_id;
   3459     _c4dbgpf("stop_doc[{}]", doc_node);
   3460     _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(doc_node));
   3461     if(!m_tree->is_seq(doc_node) && !m_tree->is_map(doc_node) && !m_tree->is_val(doc_node))
   3462     {
   3463         _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
   3464         _c4dbgpf("stop_doc[{}]: there was nothing; adding null val", doc_node);
   3465         m_tree->to_val(doc_node, {}, DOC);
   3466     }
   3467 }
   3468 
/** Finish what is currently being parsed.
 * This runs in three steps:
 *  1. a value that is still pending is added to the tree: the stored
 *     scalar (when SSCL is set), a null val for a block seq waiting for
 *     a val, or a null doc val when only a val tag is pending;
 *  2. if a node was added in step 1, pending anchors and tags are
 *     attached to it;
 *  3. every level above the bottom of the stack is popped, and NDOC is
 *     added to the flags of the remaining state. */
void Parser::_end_stream()
{
    _c4dbgpf("end_stream, level={} node_id={}", m_state->level, m_state->node_id);
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! m_stack.empty());
    // the node created below, if any
    NodeData *added = nullptr;
    if(has_any(SSCL))
    {
        // there is a stored scalar; where it goes depends on the current node
        if(m_tree->is_seq(m_state->node_id))
        {
            _c4dbgp("append val...");
            added = _append_val(_consume_scalar());
        }
        else if(m_tree->is_map(m_state->node_id))
        {
            _c4dbgp("append null key val...");
            added = _append_key_val_null(m_state->line_contents.rem.str);
            #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
            if(has_any(RSEQIMAP))
            {
                _stop_seqimap();
                _pop_level();
            }
            #endif
        }
        else if(m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE)
        {
            NodeType_e quoted = has_any(QSCL) ? VALQUO : NOTYPE; // do this before consuming the scalar
            csubstr scalar = _consume_scalar();
            _c4dbgpf("node[{}]: to docval '{}'{}", m_state->node_id, scalar, quoted == VALQUO ? ", quoted" : "");
            m_tree->to_val(m_state->node_id, scalar, DOC|quoted);
            added = m_tree->get(m_state->node_id);
        }
        else
        {
            _c4err("internal error");
        }
    }
    else if(has_all(RSEQ|RVAL) && has_none(FLOW))
    {
        // a seq outside of flow mode was still expecting a val
        _c4dbgp("add last...");
        added = _append_val_null(m_state->line_contents.rem.str);
    }
    else if(!m_val_tag.empty() && (m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE))
    {
        // only a val tag is pending: give it an empty scalar to attach to
        csubstr scalar = m_state->line_contents.rem.first(0);
        _c4dbgpf("node[{}]: add null scalar as docval", m_state->node_id);
        m_tree->to_val(m_state->node_id, scalar, DOC);
        added = m_tree->get(m_state->node_id);
    }

    if(added)
    {
        size_t added_id = m_tree->id(added);
        // in a seq or doc, a pending key anchor/tag is turned into a val anchor/tag
        if(m_tree->is_seq(m_state->node_id) || m_tree->is_doc(m_state->node_id))
        {
            if(!m_key_anchor.empty())
            {
                _c4dbgpf("node[{}]: move key to val anchor: '{}'", added_id, m_key_anchor);
                m_val_anchor = m_key_anchor;
                m_key_anchor = {};
            }
            if(!m_key_tag.empty())
            {
                _c4dbgpf("node[{}]: move key to val tag: '{}'", added_id, m_key_tag);
                m_val_tag = m_key_tag;
                m_key_tag = {};
            }
        }
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
        if(!m_key_anchor.empty())
        {
            _c4dbgpf("node[{}]: set key anchor='{}'", added_id, m_key_anchor);
            m_tree->set_key_anchor(added_id, m_key_anchor);
            m_key_anchor = {};
        }
        #endif
        if(!m_val_anchor.empty())
        {
            _c4dbgpf("node[{}]: set val anchor='{}'", added_id, m_val_anchor);
            m_tree->set_val_anchor(added_id, m_val_anchor);
            m_val_anchor = {};
        }
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
        if(!m_key_tag.empty())
        {
            _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", added_id, m_key_tag, normalize_tag(m_key_tag));
            m_tree->set_key_tag(added_id, normalize_tag(m_key_tag));
            m_key_tag = {};
        }
        #endif
        if(!m_val_tag.empty())
        {
            _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", added_id, m_val_tag, normalize_tag(m_val_tag));
            m_tree->set_val_tag(added_id, normalize_tag(m_val_tag));
            m_val_tag = {};
        }
    }

    // unwind the stack down to its bottom state
    while(m_stack.size() > 1)
    {
        _c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size());
        _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(SSCL, &m_stack.top()));
        // a flow seq that is still open here was never closed
        if(has_all(RSEQ|FLOW))
            _err("closing ] not found");
        _pop_level();
    }
    add_flags(NDOC);
}
   3577 
   3578 void Parser::_start_new_doc(csubstr rem)
   3579 {
   3580     _c4dbgp("_start_new_doc");
   3581     _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begins_with("---"));
   3582     C4_UNUSED(rem);
   3583 
   3584     _end_stream();
   3585 
   3586     size_t indref = m_state->indref;
   3587     _c4dbgpf("start a document, indentation={}", indref);
   3588     _line_progressed(3);
   3589     _push_level();
   3590     _start_doc();
   3591     _set_indentation(indref);
   3592 }
   3593 
   3594 
   3595 //-----------------------------------------------------------------------------
/** Start a map at the current state.
 * The parent is the node of the state just below the top of the stack,
 * or the root node when the stack holds fewer than two states.
 * @param as_child when true, a new node is appended to the parent and
 *        made a map, keyed with the stored scalar if there is one; when
 *        false, the parent node itself is used as the map.
 * In both cases a pending val tag is set on the map node and cleared. */
void Parser::_start_map(bool as_child)
{
    _c4dbgpf("start_map (as child={})", as_child);
    addrem_flags(RMAP|RVAL, RKEY|RUNK);
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
    if(as_child)
    {
        m_state->node_id = m_tree->append_child(parent_id);
        if(has_all(SSCL))
        {
            // the stored scalar is the key of the new map
            type_bits key_quoted = NOTYPE;
            if(m_state->flags & QSCL) // before consuming the scalar
                key_quoted |= KEYQUO;
            csubstr key = _consume_scalar();
            m_tree->to_map(m_state->node_id, key, key_quoted);
            _c4dbgpf("start_map: id={} key='{}'", m_state->node_id, m_tree->key(m_state->node_id));
            _write_key_anchor(m_state->node_id);
            if( ! m_key_tag.empty())
            {
                _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
                m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
                m_key_tag.clear();
            }
        }
        else
        {
            m_tree->to_map(m_state->node_id);
            _c4dbgpf("start_map: id={}", m_state->node_id);
        }
        // point the val scalar of the map node at the current position in the line
        m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
        _write_val_anchor(m_state->node_id);
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
        m_state->node_id = parent_id;
        _c4dbgpf("start_map: id={}", m_state->node_id);
        // keep the DOC flag when the parent is a document
        type_bits as_doc = 0;
        if(m_tree->is_doc(m_state->node_id))
            as_doc |= DOC;
        if(!m_tree->is_map(parent_id))
        {
            RYML_CHECK(!m_tree->has_children(parent_id));
            m_tree->to_map(parent_id, as_doc);
        }
        else
        {
            m_tree->_add_flags(parent_id, as_doc);
        }
        _move_scalar_from_top();
        if(m_key_anchor.not_empty())
            m_key_anchor_was_before = true;
        _write_val_anchor(parent_id);
        // RSET is carried over from the state below the top of the stack
        if(m_stack.size() >= 2)
        {
            State const& parent_state = m_stack.top(1);
            if(parent_state.flags & RSET)
                add_flags(RSET);
        }
        // point the val scalar of the map node at the current position in the line
        m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
    }
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
}
   3667 
   3668 void Parser::_start_map_unk(bool as_child)
   3669 {
   3670     _c4dbgpf("start_map_unk (as child={})", as_child);
   3671     if(!m_key_anchor_was_before)
   3672     {
   3673         _c4dbgpf("stash key anchor before starting map... '{}'", m_key_anchor);
   3674         csubstr ka = m_key_anchor;
   3675         m_key_anchor = {};
   3676         _start_map(as_child);
   3677         m_key_anchor = ka;
   3678     }
   3679     else
   3680     {
   3681         _start_map(as_child);
   3682         m_key_anchor_was_before = false;
   3683     }
   3684     if(m_key_tag2.not_empty())
   3685     {
   3686         m_key_tag = m_key_tag2;
   3687         m_key_tag_indentation = m_key_tag2_indentation;
   3688         m_key_tag2.clear();
   3689         m_key_tag2_indentation = 0;
   3690     }
   3691 }
   3692 
   3693 void Parser::_stop_map()
   3694 {
   3695     _c4dbgpf("stop_map[{}]", m_state->node_id);
   3696     _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
   3697     if(has_all(QMRK|RKEY) && !has_all(SSCL))
   3698     {
   3699         _c4dbgpf("stop_map[{}]: RKEY", m_state->node_id);
   3700         _store_scalar_null(m_state->line_contents.rem.str);
   3701         _append_key_val_null(m_state->line_contents.rem.str);
   3702     }
   3703 }
   3704 
   3705 
   3706 //-----------------------------------------------------------------------------
/** Start a seq at the current state.
 * The parent is the node of the state just below the top of the stack,
 * or the root node when the stack holds fewer than two states.
 * @param as_child when true, a new node is appended to the parent and
 *        made a seq, keyed with the stored scalar if there is one (the
 *        parent must then be a map); when false, the parent node itself
 *        is used as the seq.
 * In both cases a pending val tag is set on the seq node and cleared. */
void Parser::_start_seq(bool as_child)
{
    _c4dbgpf("start_seq (as child={})", as_child);
    // with both RTOP and RUNK set, a pending key tag becomes the val tag
    if(has_all(RTOP|RUNK))
    {
        _c4dbgpf("start_seq: moving key tag to val tag: '{}'", m_key_tag);
        m_val_tag = m_key_tag;
        m_key_tag.clear();
    }
    addrem_flags(RSEQ|RVAL, RUNK);
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
    if(as_child)
    {
        m_state->node_id = m_tree->append_child(parent_id);
        if(has_all(SSCL))
        {
            // the stored scalar is the key of the new seq
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(parent_id));
            type_bits key_quoted = 0;
            if(m_state->flags & QSCL) // before consuming the scalar
                key_quoted |= KEYQUO;
            csubstr key = _consume_scalar();
            m_tree->to_seq(m_state->node_id, key, key_quoted);
            _c4dbgpf("start_seq: id={} name='{}'", m_state->node_id, m_tree->key(m_state->node_id));
            _write_key_anchor(m_state->node_id);
            if( ! m_key_tag.empty())
            {
                _c4dbgpf("start_seq[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
                m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
                m_key_tag.clear();
            }
        }
        else
        {
            // as_doc is never changed from 0 in this branch
            type_bits as_doc = 0;
            _RYML_CB_ASSERT(m_stack.m_callbacks, !m_tree->is_doc(m_state->node_id));
            m_tree->to_seq(m_state->node_id, as_doc);
            _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as doc" : "");
        }
        _write_val_anchor(m_state->node_id);
        // point the val scalar of the seq node at the current position in the line
        m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
    }
    else
    {
        m_state->node_id = parent_id;
        // keep the DOC flag when the parent is a document
        type_bits as_doc = 0;
        if(m_tree->is_doc(m_state->node_id))
            as_doc |= DOC;
        if(!m_tree->is_seq(parent_id))
        {
            RYML_CHECK(!m_tree->has_children(parent_id));
            m_tree->to_seq(parent_id, as_doc);
        }
        else
        {
            m_tree->_add_flags(parent_id, as_doc);
        }
        _move_scalar_from_top();
        _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as_doc" : "");
        _write_val_anchor(parent_id);
        // point the val scalar of the seq node at the current position in the line
        m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
    }
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("start_seq[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
}
   3778 
//! Finish the seq at the current node. Nothing is changed; this only
//! checks that the current node is a seq.
void Parser::_stop_seq()
{
    _c4dbgp("stop_seq");
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
}
   3784 
   3785 
   3786 //-----------------------------------------------------------------------------
   3787 void Parser::_start_seqimap()
   3788 {
   3789     _c4dbgpf("start_seqimap at node={}. has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id));
   3790     _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
   3791     // create a map, and turn the last scalar of this sequence
   3792     // into the key of the map's first child. This scalar was
   3793     // understood to be a value in the sequence, but it is
   3794     // actually a key of a map, implicitly opened here.
   3795     // Eg [val, key: val]
   3796     //
   3797     // Yep, YAML is crazy.
   3798     if(m_tree->has_children(m_state->node_id) && m_tree->has_val(m_tree->last_child(m_state->node_id)))
   3799     {
   3800         size_t prev = m_tree->last_child(m_state->node_id);
   3801         NodeType ty = m_tree->_p(prev)->m_type; // don't use type() because it masks out the quotes
   3802         NodeScalar tmp = m_tree->valsc(prev);
   3803         _c4dbgpf("has children and last child={} has val. saving the scalars, val='{}' quoted={}", prev, tmp.scalar, ty.is_val_quoted());
   3804         m_tree->remove(prev);
   3805         _push_level();
   3806         _start_map();
   3807         _store_scalar(tmp.scalar, ty.is_val_quoted());
   3808         m_key_anchor = tmp.anchor;
   3809         m_key_tag = tmp.tag;
   3810     }
   3811     else
   3812     {
   3813         _c4dbgpf("node {} has no children yet, using empty key", m_state->node_id);
   3814         _push_level();
   3815         _start_map();
   3816         _store_scalar_null(m_state->line_contents.rem.str);
   3817     }
   3818     add_flags(RSEQIMAP|FLOW);
   3819 }
   3820 
//! Finish an implicit map inside a seq. Nothing is changed; this only
//! checks that the current state has RSEQIMAP set.
void Parser::_stop_seqimap()
{
    _c4dbgp("stop_seqimap");
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQIMAP));
}
   3826 
   3827 
   3828 //-----------------------------------------------------------------------------
   3829 NodeData* Parser::_append_val(csubstr val, flag_t quoted)
   3830 {
   3831     _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_all(SSCL));
   3832     _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) != nullptr);
   3833     _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
   3834     type_bits additional_flags = quoted ? VALQUO : NOTYPE;
   3835     _c4dbgpf("append val: '{}' to parent id={} (level={}){}", val, m_state->node_id, m_state->level, quoted ? " VALQUO!" : "");
   3836     size_t nid = m_tree->append_child(m_state->node_id);
   3837     m_tree->to_val(nid, val, additional_flags);
   3838     _c4dbgpf("append val: id={} val='{}'", nid, m_tree->get(nid)->m_val.scalar);
   3839     if( ! m_val_tag.empty())
   3840     {
   3841         _c4dbgpf("append val[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag));
   3842         m_tree->set_val_tag(nid, normalize_tag(m_val_tag));
   3843         m_val_tag.clear();
   3844     }
   3845     _write_val_anchor(nid);
   3846     return m_tree->get(nid);
   3847 }
   3848 
   3849 NodeData* Parser::_append_key_val(csubstr val, flag_t val_quoted)
   3850 {
   3851     _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
   3852     type_bits additional_flags = 0;
   3853     if(m_state->flags & QSCL)
   3854         additional_flags |= KEYQUO;
   3855     if(val_quoted)
   3856         additional_flags |= VALQUO;
   3857     csubstr key = _consume_scalar();
   3858     _c4dbgpf("append keyval: '{}' '{}' to parent id={} (level={}){}{}", key, val, m_state->node_id, m_state->level, (additional_flags & KEYQUO) ? " KEYQUO!" : "", (additional_flags & VALQUO) ? " VALQUO!" : "");
   3859     size_t nid = m_tree->append_child(m_state->node_id);
   3860     m_tree->to_keyval(nid, key, val, additional_flags);
   3861     _c4dbgpf("append keyval: id={} key='{}' val='{}'", nid, m_tree->key(nid), m_tree->val(nid));
   3862     if( ! m_key_tag.empty())
   3863     {
   3864         _c4dbgpf("append keyval[{}]: set key tag='{}' -> '{}'", nid, m_key_tag, normalize_tag(m_key_tag));
   3865         m_tree->set_key_tag(nid, normalize_tag(m_key_tag));
   3866         m_key_tag.clear();
   3867     }
   3868     if( ! m_val_tag.empty())
   3869     {
   3870         _c4dbgpf("append keyval[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag));
   3871         m_tree->set_val_tag(nid, normalize_tag(m_val_tag));
   3872         m_val_tag.clear();
   3873     }
   3874     _write_key_anchor(nid);
   3875     _write_val_anchor(nid);
   3876     rem_flags(QMRK);
   3877     return m_tree->get(nid);
   3878 }
   3879 
   3880 
   3881 //-----------------------------------------------------------------------------
   3882 void Parser::_store_scalar(csubstr s, flag_t is_quoted)
   3883 {
   3884     _c4dbgpf("state[{}]: storing scalar '{}' (flag: {}) (old scalar='{}')",
   3885              m_state-m_stack.begin(), s, m_state->flags & SSCL, m_state->scalar);
   3886     RYML_CHECK(has_none(SSCL));
   3887     add_flags(SSCL | (is_quoted * QSCL));
   3888     m_state->scalar = s;
   3889 }
   3890 
   3891 csubstr Parser::_consume_scalar()
   3892 {
   3893     _c4dbgpf("state[{}]: consuming scalar '{}' (flag: {}))", m_state-m_stack.begin(), m_state->scalar, m_state->flags & SSCL);
   3894     RYML_CHECK(m_state->flags & SSCL);
   3895     csubstr s = m_state->scalar;
   3896     rem_flags(SSCL | QSCL);
   3897     m_state->scalar.clear();
   3898     return s;
   3899 }
   3900 
   3901 void Parser::_move_scalar_from_top()
   3902 {
   3903     if(m_stack.size() < 2) return;
   3904     State &prev = m_stack.top(1);
   3905     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
   3906     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state != &prev);
   3907     if(prev.flags & SSCL)
   3908     {
   3909         _c4dbgpf("moving scalar '{}' from state[{}] to state[{}] (overwriting '{}')", prev.scalar, &prev-m_stack.begin(), m_state-m_stack.begin(), m_state->scalar);
   3910         add_flags(prev.flags & (SSCL | QSCL));
   3911         m_state->scalar = prev.scalar;
   3912         rem_flags(SSCL | QSCL, &prev);
   3913         prev.scalar.clear();
   3914     }
   3915 }
   3916 
   3917 //-----------------------------------------------------------------------------
   3918 /** @todo this function is a monster and needs love. Likely, it needs
   3919  * to be split like _scan_scalar_*() */
        /** Compare the indentation of the current line with the indentation
         * reference (indref) of the current state and adjust the parser
         * state accordingly. Must not be called in flow mode (asserted).
         *
         * Outcomes, as implemented below:
         *  - not at the beginning of a line: returns false without doing anything.
         *  - blank line or comment-only line: the whole line is consumed; returns true.
         *  - same indentation as indref: when the line does not start with '-',
         *    a pending map key gets a null val, or (with more than two states
         *    on the stack) a sequence is ended by popping one level, which
         *    returns true. Otherwise the indentation is consumed and the
         *    result is (ind > 0).
         *  - smaller indentation: a pending empty val is appended as null, then
         *    levels are popped down to the state whose indref equals the line
         *    indentation (an error is raised when there is none); returns true.
         *  - larger indentation: while reading a map val, either the line is left
         *    to be parsed as a value (returns false), or a new level is started
         *    with _start_unk() (returns true). While reading a seq val nothing is
         *    done (returns false). Anything else is a parse error.
         *
         * @return see the list above. NOTE(review): presumably true signals to the
         * caller that the position/state changed here - confirm against callers. */
   3920 bool Parser::_handle_indentation()
   3921 {
   3922     _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
            // indentation is only evaluated at the start of a line
   3923     if( ! _at_line_begin())
   3924         return false;
   3925 
   3926     size_t ind = m_state->line_contents.indentation;
   3927     csubstr rem = m_state->line_contents.rem;
   3928     /** @todo instead of trimming, we should use the indentation index from above */
   3929     csubstr remt = rem.triml(' ');
   3930 
   3931     if(remt.empty() || remt.begins_with('#')) // this is a blank or comment line
   3932     {
                // consume everything that is left on the line
   3933         _line_progressed(rem.size());
   3934         return true;
   3935     }
   3936 
   3937     _c4dbgpf("indentation? ind={} indref={}", ind, m_state->indref);
            // ---- case 1: the line is at the reference indentation
   3938     if(ind == m_state->indref)
   3939     {
   3940         _c4dbgpf("same indentation: {}", ind);
   3941         if(!rem.sub(ind).begins_with('-'))
   3942         {
   3943             _c4dbgp("does not begin with -");
   3944             if(has_any(RMAP))
   3945             {
                        // a key scalar is stored and a val was expected, but the
                        // line is back at the map's indentation: the val is null
   3946                 if(has_all(SSCL|RVAL))
   3947                 {
   3948                     _c4dbgp("add with null val");
   3949                     _append_key_val_null(rem.str + ind - 1);
   3950                     addrem_flags(RKEY, RVAL);
   3951                 }
   3952             }
   3953             else if(has_any(RSEQ))
   3954             {
   3955                 if(m_stack.size() > 2) // do not pop to root level
   3956                 {
   3957                     if(has_any(RNXT))
   3958                     {
   3959                         _c4dbgp("end the indentless seq");
   3960                         _pop_level();
   3961                         return true;
   3962                     }
   3963                     else if(has_any(RVAL))
   3964                     {
   3965                         _c4dbgp("add with null val");
   3966                         _append_val_null(rem.str);
   3967                         _c4dbgp("end the indentless seq");
   3968                         _pop_level();
   3969                         return true;
   3970                     }
   3971                 }
   3972             }
   3973         }
                // consume the indentation; the result is false when ind is zero
   3974         _line_progressed(ind);
   3975         return ind > 0;
   3976     }
            // ---- case 2: the line is less indented than the reference
   3977     else if(ind < m_state->indref)
   3978     {
   3979         _c4dbgpf("smaller indentation ({} < {})!!!", ind, m_state->indref);
   3980         if(has_all(RVAL))
   3981         {
   3982             _c4dbgp("there was an empty val -- appending");
   3983             if(has_all(RMAP))
   3984             {
   3985                 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
   3986                 _append_key_val_null(rem.sub(ind).str - 1);
   3987             }
   3988             else if(has_all(RSEQ))
   3989             {
   3990                 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
   3991                 _append_val_null(rem.sub(ind).str - 1);
   3992             }
   3993         }
   3994         // search the stack frame to jump to based on its indentation
   3995         State const* popto = nullptr;
   3996         _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.is_contiguous()); // this search relies on the stack being contiguous
                // walk from the state just below the current one down to the stack bottom
   3997         for(State const* s = m_state-1; s >= m_stack.begin(); --s)
   3998         {
   3999             _c4dbgpf("searching for state with indentation {}. curr={} (level={},node={})", ind, s->indref, s->level, s->node_id);
   4000             if(s->indref == ind)
   4001             {
   4002                 _c4dbgpf("gotit!!! level={} node={}", s->level, s->node_id);
   4003                 popto = s;
   4004                 // while it may be tempting to think we're done at this
   4005                 // point, we must still determine whether we're jumping to a
   4006                 // parent with the same indentation. Consider this case with
   4007                 // an indentless sequence:
   4008                 //
   4009                 // product:
   4010                 // - sku: BL394D
   4011                 //   quantity: 4
   4012                 //   description: Basketball
   4013                 //   price: 450.00
   4014                 // - sku: BL4438H
   4015                 //   quantity: 1
   4016                 //   description: Super Hoop
   4017                 //   price: 2392.00  # jumping one level here would be wrong.
   4018                 // tax: 1234.5       # we must jump two levels
   4019                 if(popto > m_stack.begin())
   4020                 {
   4021                     auto parent = popto - 1;
   4022                     if(parent->indref == popto->indref)
   4023                     {
   4024                         _c4dbgpf("the parent (level={},node={}) has the same indentation ({}). is this in an indentless sequence?", parent->level, parent->node_id, popto->indref);
   4025                         _c4dbgpf("isseq(popto)={} ismap(parent)={}", m_tree->is_seq(popto->node_id), m_tree->is_map(parent->node_id));
   4026                         if(m_tree->is_seq(popto->node_id) && m_tree->is_map(parent->node_id))
   4027                         {
                                    // a line not starting with '-' cannot continue the
                                    // sequence, so the target becomes the parent map
   4028                             if( ! remt.begins_with('-'))
   4029                             {
   4030                                 _c4dbgp("this is an indentless sequence");
   4031                                 popto = parent;
   4032                             }
   4033                             else
   4034                             {
   4035                                 _c4dbgp("not an indentless sequence");
   4036                             }
   4037                         }
   4038                     }
   4039                 }
   4040                 break;
   4041             }
   4042         }
                // error when no state matched, or when the match is not strictly
                // below the current state (by address or by level)
   4043         if(!popto || popto >= m_state || popto->level >= m_state->level)
   4044         {
   4045             _c4err("parse error: incorrect indentation?");
   4046         }
   4047         _c4dbgpf("popping {} levels: from level {} to level {}", m_state->level-popto->level, m_state->level, popto->level);
   4048         while(m_state != popto)
   4049         {
   4050             _c4dbgpf("popping level {} (indentation={})", m_state->level, m_state->indref);
   4051             _pop_level();
   4052         }
   4053         _RYML_CB_ASSERT(m_stack.m_callbacks, ind == m_state->indref);
   4054         _line_progressed(ind);
   4055         return true;
   4056     }
            // ---- case 3: the line is more indented than the reference
   4057     else
   4058     {
   4059         _c4dbgpf("larger indentation ({} > {})!!!", ind, m_state->indref);
   4060         _RYML_CB_ASSERT(m_stack.m_callbacks, ind > m_state->indref);
   4061         if(has_all(RMAP|RVAL))
   4062         {
                    // treated as a value when _is_scalar_next__rmap_val() accepts it
                    // and it neither contains ": " or "? " nor ends with ':';
                    // in that case control falls through to the final return false
   4063             if(_is_scalar_next__rmap_val(remt) && (!remt.first_of_any(": ", "? ")) && (!remt.ends_with(":")))
   4064             {
   4065                 _c4dbgpf("actually it seems a value: '{}'", remt);
   4066             }
   4067             else
   4068             {
                        // otherwise start a new level, consume the indentation and
                        // save the indentation for the new state
   4069                 addrem_flags(RKEY, RVAL);
   4070                 _start_unk();
   4071                 //_move_scalar_from_top();
   4072                 _line_progressed(ind);
   4073                 _save_indentation();
   4074                 return true;
   4075             }
   4076         }
   4077         else if(has_all(RSEQ|RVAL))
   4078         {
   4079             // nothing to do here
   4080         }
   4081         else
   4082         {
   4083             _c4err("parse error - indentation should not increase at this point");
   4084         }
   4085     }
   4086 
            // reached only from the larger-indentation case (value-like map val, or seq val)
   4087     return false;
   4088 }
   4089 
   4090 //-----------------------------------------------------------------------------
   4091 csubstr Parser::_scan_comment()
   4092 {
   4093     csubstr s = m_state->line_contents.rem;
   4094     _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('#'));
   4095     _line_progressed(s.len);
   4096     // skip the # character
   4097     s = s.sub(1);
   4098     // skip leading whitespace
   4099     s = s.right_of(s.first_not_of(' '), /*include_pos*/true);
   4100     _c4dbgpf("comment was '{}'", s);
   4101     return s;
   4102 }
   4103 
   4104 //-----------------------------------------------------------------------------
   4105 csubstr Parser::_scan_squot_scalar()
   4106 {
   4107     // quoted scalars can spread over multiple lines!
   4108     // nice explanation here: http://yaml-multiline.info/
   4109 
   4110     // a span to the end of the file
   4111     size_t b = m_state->pos.offset;
   4112     substr s = m_buf.sub(b);
   4113     if(s.begins_with(' '))
   4114     {
   4115         s = s.triml(' ');
   4116         _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s));
   4117         _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin());
   4118         _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin()));
   4119     }
   4120     b = m_state->pos.offset; // take this into account
   4121     _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('\''));
   4122 
   4123     // skip the opening quote
   4124     _line_progressed(1);
   4125     s = s.sub(1);
   4126 
   4127     bool needs_filter = false;
   4128 
   4129     size_t numlines = 1; // we already have one line
   4130     size_t pos = npos; // find the pos of the matching quote
   4131     while( ! _finished_file())
   4132     {
   4133         const csubstr line = m_state->line_contents.rem;
   4134         bool line_is_blank = true;
   4135         _c4dbgpf("scanning single quoted scalar @ line[{}]: ~~~{}~~~", m_state->pos.line, line);
   4136         for(size_t i = 0; i < line.len; ++i)
   4137         {
   4138             const char curr = line.str[i];
   4139             if(curr == '\'') // single quotes are escaped with two single quotes
   4140             {
   4141                 const char next = i+1 < line.len ? line.str[i+1] : '~';
   4142                 if(next != '\'') // so just look for the first quote
   4143                 {                // without another after it
   4144                     pos = i;
   4145                     break;
   4146                 }
   4147                 else
   4148                 {
   4149                     needs_filter = true; // needs filter to remove escaped quotes
   4150                     ++i; // skip the escaped quote
   4151                 }
   4152             }
   4153             else if(curr != ' ')
   4154             {
   4155                 line_is_blank = false;
   4156             }
   4157         }
   4158 
   4159         // leading whitespace also needs filtering
   4160         needs_filter = needs_filter
   4161             || (numlines > 1)
   4162             || line_is_blank
   4163             || (_at_line_begin() && line.begins_with(' '));
   4164 
   4165         if(pos == npos)
   4166         {
   4167             _line_progressed(line.len);
   4168             ++numlines;
   4169         }
   4170         else
   4171         {
   4172             _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len);
   4173             _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '\'');
   4174             _line_progressed(pos + 1); // progress beyond the quote
   4175             pos = m_state->pos.offset - b - 1; // but we stop before it
   4176             break;
   4177         }
   4178 
   4179         _line_ended();
   4180         _scan_line();
   4181     }
   4182 
   4183     if(pos == npos)
   4184     {
   4185         _c4err("reached end of file while looking for closing quote");
   4186     }
   4187     else
   4188     {
   4189         _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0);
   4190         _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end());
   4191         _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '\'');
   4192         s = s.sub(0, pos-1);
   4193     }
   4194 
   4195     if(needs_filter)
   4196     {
   4197         csubstr ret = _filter_squot_scalar(s);
   4198         _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty());
   4199         _c4dbgpf("final scalar: \"{}\"", ret);
   4200         return ret;
   4201     }
   4202 
   4203     _c4dbgpf("final scalar: \"{}\"", s);
   4204 
   4205     return s;
   4206 }
   4207 
   4208 //-----------------------------------------------------------------------------
   4209 csubstr Parser::_scan_dquot_scalar()
   4210 {
   4211     // quoted scalars can spread over multiple lines!
   4212     // nice explanation here: http://yaml-multiline.info/
   4213 
   4214     // a span to the end of the file
   4215     size_t b = m_state->pos.offset;
   4216     substr s = m_buf.sub(b);
   4217     if(s.begins_with(' '))
   4218     {
   4219         s = s.triml(' ');
   4220         _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s));
   4221         _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin());
   4222         _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin()));
   4223     }
   4224     b = m_state->pos.offset; // take this into account
   4225     _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('"'));
   4226 
   4227     // skip the opening quote
   4228     _line_progressed(1);
   4229     s = s.sub(1);
   4230 
   4231     bool needs_filter = false;
   4232 
   4233     size_t numlines = 1; // we already have one line
   4234     size_t pos = npos; // find the pos of the matching quote
   4235     while( ! _finished_file())
   4236     {
   4237         const csubstr line = m_state->line_contents.rem;
   4238         bool line_is_blank = true;
   4239         _c4dbgpf("scanning double quoted scalar @ line[{}]:  line='{}'", m_state->pos.line, line);
   4240         for(size_t i = 0; i < line.len; ++i)
   4241         {
   4242             const char curr = line.str[i];
   4243             if(curr != ' ')
   4244                 line_is_blank = false;
   4245             // every \ is an escape
   4246             if(curr == '\\')
   4247             {
   4248                 const char next = i+1 < line.len ? line.str[i+1] : '~';
   4249                 needs_filter = true;
   4250                 if(next == '"' || next == '\\')
   4251                     ++i;
   4252             }
   4253             else if(curr == '"')
   4254             {
   4255                 pos = i;
   4256                 break;
   4257             }
   4258         }
   4259 
   4260         // leading whitespace also needs filtering
   4261         needs_filter = needs_filter
   4262             || (numlines > 1)
   4263             || line_is_blank
   4264             || (_at_line_begin() && line.begins_with(' '));
   4265 
   4266         if(pos == npos)
   4267         {
   4268             _line_progressed(line.len);
   4269             ++numlines;
   4270         }
   4271         else
   4272         {
   4273             _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len);
   4274             _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '"');
   4275             _line_progressed(pos + 1); // progress beyond the quote
   4276             pos = m_state->pos.offset - b - 1; // but we stop before it
   4277             break;
   4278         }
   4279 
   4280         _line_ended();
   4281         _scan_line();
   4282     }
   4283 
   4284     if(pos == npos)
   4285     {
   4286         _c4err("reached end of file looking for closing quote");
   4287     }
   4288     else
   4289     {
   4290         _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0);
   4291         _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '"');
   4292         _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end());
   4293         s = s.sub(0, pos-1);
   4294     }
   4295 
   4296     if(needs_filter)
   4297     {
   4298         csubstr ret = _filter_dquot_scalar(s);
   4299         _c4dbgpf("final scalar: [{}]\"{}\"", ret.len, ret);
   4300         _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty());
   4301         return ret;
   4302     }
   4303 
   4304     _c4dbgpf("final scalar: \"{}\"", s);
   4305 
   4306     return s;
   4307 }
   4308 
   4309 //-----------------------------------------------------------------------------
        /** Scan a block scalar whose header ('|' or '>') is on the current line,
         * and return the filtered scalar.
         *
         * Steps, in the order implemented below:
         *  1. skip leading spaces and parse the header: '>' selects BLOCK_FOLD,
         *     anything else BLOCK_LITERAL; a '-' selects CHOMP_STRIP and a '+'
         *     CHOMP_KEEP (default CHOMP_CLIP); decimal digits give an explicit
         *     indentation, to which the indref of the current state is added.
         *  2. finish the header line, then peek at each following line and
         *     append it to raw_block until a termination condition is met.
         *     Without an explicit indentation, the reference is taken from the
         *     first non-empty line, with help from a provisional value gathered
         *     from leading empty lines.
         *  3. pass raw_block to _filter_block_scalar() with the style, chomp
         *     and indentation found, and return its result. */
   4310 csubstr Parser::_scan_block()
   4311 {
   4312     // nice explanation here: http://yaml-multiline.info/
   4313     csubstr s = m_state->line_contents.rem;
   4314     csubstr trimmed = s.triml(' ');
   4315     if(trimmed.str > s.str)
   4316     {
   4317         _c4dbgp("skipping whitespace");
   4318         _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= s.str);
   4319         _line_progressed(static_cast<size_t>(trimmed.str - s.str));
   4320         s = trimmed;
   4321     }
   4322     _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('|') || s.begins_with('>'));
   4323 
   4324     _c4dbgpf("scanning block: specs=\"{}\"", s);
   4325 
   4326     // parse the spec
   4327     BlockStyle_e newline = s.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL;
   4328     BlockChomp_e chomp = CHOMP_CLIP; // default to clip unless + or - are used
   4329     size_t indentation = npos; // have to find out if no spec is given
   4330     csubstr digits;
   4331     if(s.len > 1)
   4332     {
   4333         _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with_any("|>"));
   4334         csubstr t = s.sub(1);
   4335         _c4dbgpf("scanning block: spec is multichar: '{}'", t);
   4336         _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1);
   4337         size_t pos = t.first_of("-+");
   4338         _c4dbgpf("scanning block: spec chomp char at {}", pos);
   4339         if(pos != npos)
   4340         {
   4341             if(t[pos] == '-')
   4342                 chomp = CHOMP_STRIP;
   4343             else if(t[pos] == '+')
   4344                 chomp = CHOMP_KEEP;
                    // chomp char first: keep what follows it;
                    // otherwise keep what precedes it
   4345             if(pos == 0)
   4346                 t = t.sub(1);
   4347             else
   4348                 t = t.first(pos);
   4349         }
   4350         // from here to the end, only digits are considered
   4351         digits = t.left_of(t.first_not_of("0123456789"));
   4352         if( ! digits.empty())
   4353         {
   4354             if( ! c4::atou(digits, &indentation))
   4355                 _c4err("parse error: could not read decimal");
   4356             _c4dbgpf("scanning block: indentation specified: {}. add {} from curr state -> {}", indentation, m_state->indref, indentation+m_state->indref);
                    // the number read is added to the indref of the current state
   4357             indentation += m_state->indref;
   4358         }
   4359     }
   4360 
   4361     // finish the current line
   4362     _line_progressed(s.len);
   4363     _line_ended();
   4364     _scan_line();
   4365 
   4366     _c4dbgpf("scanning block: style={}  chomp={}  indentation={}", newline==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation);
   4367 
   4368     // start with a zero-length block, already pointing at the right place
   4369     substr raw_block(m_buf.data() + m_state->pos.offset, size_t(0));// m_state->line_contents.full.sub(0, 0);
   4370     _RYML_CB_ASSERT(m_stack.m_callbacks, raw_block.begin() == m_state->line_contents.full.begin());
   4371 
   4372     // read every full line into a raw block,
   4373     // from which newlines are to be stripped as needed.
   4374     //
   4375     // If no explicit indentation was given, pick it from the first
   4376     // non-empty line. See
   4377     // https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator
   4378     size_t num_lines = 0, first = m_state->pos.line, provisional_indentation = npos;
   4379     LineContents lc;
   4380     while(( ! _finished_file()))
   4381     {
   4382         // peek next line, but do not advance immediately
   4383         lc.reset_with_next_line(m_buf, m_state->pos.offset);
   4384         _c4dbgpf("scanning block: peeking at '{}'", lc.stripped);
   4385         // evaluate termination conditions
   4386         if(indentation != npos)
   4387         {
   4388             // stop when the line is deindented and not empty
   4389             if(lc.indentation < indentation && ( ! lc.rem.trim(" \t\r\n").empty()))
   4390             {
   4391                 if(raw_block.len)
   4392                 {
   4393                     _c4dbgpf("scanning block: indentation decreased ref={} thisline={}", indentation, lc.indentation);
   4394                 }
   4395                 else
   4396                 {
   4397                     _c4err("indentation decreased without any scalar");
   4398                 }
   4399                 break;
   4400             }
   4401             else if(indentation == 0)
   4402             {
                        // with a zero indentation reference, a line that is "..." or
                        // "---", or that starts with "... " or "--- ", stops the scan
   4403                 if((lc.rem == "..." || lc.rem.begins_with("... "))
   4404                     ||
   4405                    (lc.rem == "---" || lc.rem.begins_with("--- ")))
   4406                 {
   4407                     _c4dbgp("scanning block: stop. indentation=0 and stream ended");
   4408                     break;
   4409                 }
   4410             }
   4411         }
   4412         else
   4413         {
                    // no indentation reference yet: try to establish it from this line
   4414             _c4dbgpf("scanning block: indentation ref not set. firstnonws={}", lc.stripped.first_not_of(' '));
   4415             if(lc.stripped.first_not_of(' ') != npos) // non-empty line
   4416             {
   4417                 _c4dbgpf("scanning block: line not empty. indref={} indprov={} indentation={}", m_state->indref, provisional_indentation, lc.indentation);
   4418                 if(provisional_indentation == npos)
   4419                 {
                            // no empty line was seen before this one
   4420                     if(lc.indentation < m_state->indref)
   4421                     {
   4422                         _c4dbgpf("scanning block: block terminated indentation={} < indref={}", lc.indentation, m_state->indref);
   4423                         if(raw_block.len == 0)
   4424                         {
   4425                             _c4dbgp("scanning block: was empty, undo next line");
   4426                             _line_ended_undo();
   4427                         }
   4428                         break;
   4429                     }
   4430                     else if(lc.indentation == m_state->indref)
   4431                     {
   4432                         if(has_any(RSEQ|RMAP))
   4433                         {
   4434                             _c4dbgpf("scanning block: block terminated. reading container and indentation={}==indref={}", lc.indentation, m_state->indref);
   4435                             break;
   4436                         }
   4437                     }
   4438                     _c4dbgpf("scanning block: set indentation ref from this line: ref={}", lc.indentation);
   4439                     indentation = lc.indentation;
   4440                 }
   4441                 else
   4442                 {
                            // empty lines came first: accept this line only when it is
                            // indented at least as much as the provisional value
   4443                     if(lc.indentation >= provisional_indentation)
   4444                     {
   4445                         _c4dbgpf("scanning block: set indentation ref from provisional indentation: provisional_ref={}, thisline={}", provisional_indentation, lc.indentation);
   4446                         //indentation = provisional_indentation ? provisional_indentation : lc.indentation;
   4447                         indentation = lc.indentation;
   4448                     }
   4449                     else
   4450                     {
   4451                         break;
   4452                         //_c4err("parse error: first non-empty block line should have at least the original indentation");
   4453                     }
   4454                 }
   4455             }
   4456             else // empty line
   4457             {
   4458                 _c4dbgpf("scanning block: line empty or {} spaces. line_indentation={} prov_indentation={}", lc.stripped.len, lc.indentation, provisional_indentation);
   4459                 if(provisional_indentation != npos)
   4460                 {
                            // the provisional value only ever grows here
   4461                     if(lc.stripped.len >= provisional_indentation)
   4462                     {
   4463                         _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.stripped.len);
   4464                         provisional_indentation = lc.stripped.len;
   4465                     }
   4466                     #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
   4467                     else if(lc.indentation >= provisional_indentation && lc.indentation != npos)
   4468                     {
   4469                         _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.indentation);
   4470                         provisional_indentation = lc.indentation;
   4471                     }
   4472                     #endif
   4473                 }
   4474                 else
   4475                 {
                            // when lc.indentation is zero, the boolean result of
                            // has_any(RSEQ|RVAL) (converted to 0 or 1) is used instead
   4476                     provisional_indentation = lc.indentation ? lc.indentation : has_any(RSEQ|RVAL);
   4477                     _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
   4478                     if(provisional_indentation == npos)
   4479                     {
   4480                         provisional_indentation = lc.stripped.len ? lc.stripped.len : has_any(RSEQ|RVAL);
   4481                         _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
   4482                     }
   4483                 }
   4484             }
   4485         }
   4486         // advance now that we know the folded scalar continues
   4487         m_state->line_contents = lc;
   4488         _c4dbgpf("scanning block: append '{}'", m_state->line_contents.rem);
                // grow the raw block by the full length of the accepted line
   4489         raw_block.len += m_state->line_contents.full.len;
   4490         _line_progressed(m_state->line_contents.rem.len);
   4491         _line_ended();
   4492         ++num_lines;
   4493     }
   4494     _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line == (first + num_lines) || (raw_block.len == 0));
   4495     C4_UNUSED(num_lines);
   4496     C4_UNUSED(first);
   4497 
            // no non-empty line fixed the indentation: fall back to the provisional value
   4498     if(indentation == npos)
   4499     {
   4500         _c4dbgpf("scanning block: set indentation from provisional: {}", provisional_indentation);
   4501         indentation = provisional_indentation;
   4502     }
   4503 
            // NOTE(review): presumably reverts the _line_ended() of the last accepted
            // line so that parsing resumes from it - confirm in _line_ended_undo()
   4504     if(num_lines)
   4505         _line_ended_undo();
   4506 
   4507     _c4dbgpf("scanning block: raw=~~~{}~~~", raw_block);
   4508 
   4509     // ok! now we strip the newlines and spaces according to the specs
   4510     s = _filter_block_scalar(raw_block, newline, chomp, indentation);
   4511 
   4512     _c4dbgpf("scanning block: final=~~~{}~~~", s);
   4513 
   4514     return s;
   4515 }
   4516 
   4517 
   4518 //-----------------------------------------------------------------------------
   4519 
   4520 template<bool backslash_is_escape, bool keep_trailing_whitespace>
   4521 bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation)
   4522 {
   4523     // a debugging scaffold:
   4524     #if 0
   4525     #define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__)
   4526     #else
   4527     #define _c4dbgfnl(...)
   4528     #endif
   4529 
   4530     const char curr = r[*i];
   4531     bool replaced = false;
   4532 
   4533     _RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos);
   4534     _RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n');
   4535 
   4536     _c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
   4537     size_t ii = *i;
   4538     size_t numnl_following = count_following_newlines(r, &ii, indentation);
   4539     if(numnl_following)
   4540     {
   4541         _c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i);
   4542         for(size_t j = 0; j < numnl_following; ++j)
   4543             m_filter_arena.str[(*pos)++] = '\n';
   4544     }
   4545     else
   4546     {
   4547         if(r.first_not_of(" \t", *i+1) != npos)
   4548         {
   4549             m_filter_arena.str[(*pos)++] = ' ';
   4550             _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
   4551             replaced = true;
   4552         }
   4553         else
   4554         {
   4555             if C4_IF_CONSTEXPR (keep_trailing_whitespace)
   4556             {
   4557                 m_filter_arena.str[(*pos)++] = ' ';
   4558                 _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
   4559                 replaced = true;
   4560             }
   4561             else
   4562             {
   4563                 _c4dbgfnl("last newline, everything else is whitespace. ii={}/{}", ii, r.len);
   4564                 *i = r.len;
   4565             }
   4566         }
   4567         if C4_IF_CONSTEXPR (backslash_is_escape)
   4568         {
   4569             if(ii < r.len && r.str[ii] == '\\')
   4570             {
   4571                 const char next = ii+1 < r.len ? r.str[ii+1] : '\0';
   4572                 if(next == ' ' || next == '\t')
   4573                 {
   4574                     _c4dbgfnl("extend skip to backslash{}", "");
   4575                     ++ii;
   4576                 }
   4577             }
   4578         }
   4579     }
   4580     *i = ii - 1; // correct for the loop increment
   4581 
   4582     #undef _c4dbgfnl
   4583 
   4584     return replaced;
   4585 }
   4586 
   4587 
   4588 //-----------------------------------------------------------------------------
   4589 
   4590 template<bool keep_trailing_whitespace>
   4591 void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos)
   4592 {
   4593     // a debugging scaffold:
   4594     #if 0
   4595     #define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__)
   4596     #else
   4597     #define _c4dbgfws(...)
   4598     #endif
   4599 
   4600     const char curr = r[*i];
   4601     _c4dbgfws("found whitespace '{}'", _c4prc(curr));
   4602     _RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t');
   4603 
   4604     size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i);
   4605     if(first != npos)
   4606     {
   4607         if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace
   4608         {
   4609             _c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first);
   4610             *i = first - 1; // correct for the loop increment
   4611         }
   4612         else // a legit whitespace
   4613         {
   4614             m_filter_arena.str[(*pos)++] = curr;
   4615             _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
   4616         }
   4617     }
   4618     else
   4619     {
   4620         _c4dbgfws("... everything else is trailing whitespace{}", "");
   4621         if C4_IF_CONSTEXPR (keep_trailing_whitespace)
   4622             for(size_t j = *i; j < r.len; ++j)
   4623                 m_filter_arena.str[(*pos)++] = r[j];
   4624         *i = r.len;
   4625     }
   4626 
   4627     #undef _c4dbgfws
   4628 }
   4629 
   4630 
   4631 //-----------------------------------------------------------------------------
   4632 csubstr Parser::_filter_plain_scalar(substr s, size_t indentation)
   4633 {
   4634     // a debugging scaffold:
   4635     #if 0
   4636     #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__)
   4637     #else
   4638     #define _c4dbgfps(...)
   4639     #endif
   4640 
   4641     _c4dbgfps("before=~~~{}~~~", s);
   4642 
   4643     substr r = s.triml(" \t");
   4644     _grow_filter_arena(r.len);
   4645     size_t pos = 0; // the filtered size
   4646     bool filtered_chars = false;
   4647     for(size_t i = 0; i < r.len; ++i)
   4648     {
   4649         const char curr = r.str[i];
   4650         _c4dbgfps("[{}]: '{}'", i, _c4prc(curr));
   4651         if(curr == ' ' || curr == '\t')
   4652         {
   4653             _filter_ws</*keep_trailing_ws*/false>(r, &i, &pos);
   4654         }
   4655         else if(curr == '\n')
   4656         {
   4657             filtered_chars = _filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/false>(r, &i, &pos, indentation);
   4658         }
   4659         else if(curr == '\r')  // skip \r --- https://stackoverflow.com/questions/1885900
   4660         {
   4661             ;
   4662         }
   4663         else
   4664         {
   4665             m_filter_arena.str[pos++] = r[i];
   4666         }
   4667     }
   4668 
   4669     _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   4670     if(pos < r.len || filtered_chars)
   4671     {
   4672         r = _finish_filter_arena(r, pos);
   4673     }
   4674 
   4675     _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
   4676     _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r);
   4677 
   4678     #undef _c4dbgfps
   4679     return r;
   4680 }
   4681 
   4682 
   4683 //-----------------------------------------------------------------------------
   4684 csubstr Parser::_filter_squot_scalar(substr s)
   4685 {
   4686     // a debugging scaffold:
   4687     #if 0
   4688     #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar")
   4689     #else
   4690     #define _c4dbgfsq(...)
   4691     #endif
   4692 
   4693     // from the YAML spec for double-quoted scalars:
   4694     // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted
   4695 
   4696     _c4dbgfsq(": before=~~~{}~~~", s);
   4697 
   4698     _grow_filter_arena(s.len);
   4699     substr r = s;
   4700     size_t pos = 0; // the filtered size
   4701     bool filtered_chars = false;
   4702     for(size_t i = 0; i < r.len; ++i)
   4703     {
   4704         const char curr = r[i];
   4705         _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr));
   4706         if(curr == ' ' || curr == '\t')
   4707         {
   4708             _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
   4709         }
   4710         else if(curr == '\n')
   4711         {
   4712             filtered_chars = _filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0);
   4713         }
   4714         else if(curr == '\r')  // skip \r --- https://stackoverflow.com/questions/1885900
   4715         {
   4716             ;
   4717         }
   4718         else if(curr == '\'')
   4719         {
   4720             char next = i+1 < r.len ? r[i+1] : '\0';
   4721             if(next == '\'')
   4722             {
   4723                 _c4dbgfsq("[{}]: two consecutive quotes", i);
   4724                 filtered_chars = true;
   4725                 m_filter_arena.str[pos++] = '\'';
   4726                 ++i;
   4727             }
   4728         }
   4729         else
   4730         {
   4731             m_filter_arena.str[pos++] = curr;
   4732         }
   4733     }
   4734 
   4735     _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   4736     if(pos < r.len || filtered_chars)
   4737     {
   4738         r = _finish_filter_arena(r, pos);
   4739     }
   4740 
   4741     _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
   4742     _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
   4743 
   4744     #undef _c4dbgfsq
   4745     return r;
   4746 }
   4747 
   4748 
   4749 //-----------------------------------------------------------------------------
   4750 csubstr Parser::_filter_dquot_scalar(substr s)
   4751 {
   4752     // a debugging scaffold:
   4753     #if 0
   4754     #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__)
   4755     #else
   4756     #define _c4dbgfdq(...)
   4757     #endif
   4758 
   4759     _c4dbgfdq(": before=~~~{}~~~", s);
   4760 
   4761     // from the YAML spec for double-quoted scalars:
   4762     // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted
   4763     //
   4764     // All leading and trailing white space characters are excluded
   4765     // from the content. Each continuation line must therefore contain
   4766     // at least one non-space character. Empty lines, if any, are
   4767     // consumed as part of the line folding.
   4768 
   4769     _grow_filter_arena(s.len + 2u * s.count('\\'));
   4770     substr r = s;
   4771     size_t pos = 0; // the filtered size
   4772     bool filtered_chars = false;
   4773     for(size_t i = 0; i < r.len; ++i)
   4774     {
   4775         const char curr = r[i];
   4776         _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr));
   4777         if(curr == ' ' || curr == '\t')
   4778         {
   4779             _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
   4780         }
   4781         else if(curr == '\n')
   4782         {
   4783             filtered_chars = _filter_nl</*backslash_is_escape*/true, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0);
   4784         }
   4785         else if(curr == '\r')  // skip \r --- https://stackoverflow.com/questions/1885900
   4786         {
   4787             ;
   4788         }
   4789         else if(curr == '\\')
   4790         {
   4791             char next = i+1 < r.len ? r[i+1] : '\0';
   4792             _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next));
   4793             filtered_chars = true;
   4794             if(next == '\r')
   4795             {
   4796                 if(i+2 < r.len && r[i+2] == '\n')
   4797                 {
   4798                     ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented)
   4799                     next = '\n';
   4800                     _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i);
   4801                 }
   4802             }
   4803             // remember the loop will also increment i
   4804             if(next == '\n')
   4805             {
   4806                 size_t ii = i + 2;
   4807                 for( ; ii < r.len; ++ii)
   4808                 {
   4809                     if(r.str[ii] == ' ' || r.str[ii] == '\t')  // skip leading whitespace
   4810                         ;
   4811                     else
   4812                         break;
   4813                 }
   4814                 i += ii - i - 1;
   4815             }
   4816             else if(next == '"' || next == '/'  || next == ' ' || next == '\t') // escapes for json compatibility
   4817             {
   4818                 m_filter_arena.str[pos++] = next;
   4819                 ++i;
   4820             }
   4821             else if(next == '\r')
   4822             {
   4823                 //++i;
   4824             }
   4825             else if(next == 'n')
   4826             {
   4827                 m_filter_arena.str[pos++] = '\n';
   4828                 ++i;
   4829             }
   4830             else if(next == 'r')
   4831             {
   4832                 m_filter_arena.str[pos++] = '\r';
   4833                 ++i; // skip
   4834             }
   4835             else if(next == 't')
   4836             {
   4837                 m_filter_arena.str[pos++] = '\t';
   4838                 ++i;
   4839             }
   4840             else if(next == '\\')
   4841             {
   4842                 m_filter_arena.str[pos++] = '\\';
   4843                 ++i;
   4844             }
   4845             else if(next == 'x') // UTF8
   4846             {
   4847                 if(i + 1u + 2u >= r.len)
   4848                     _c4err("\\x requires 2 hex digits");
   4849                 uint8_t byteval = {};
   4850                 if(!read_hex(r.sub(i + 2u, 2u), &byteval))
   4851                     _c4err("failed to read \\x codepoint");
   4852                 m_filter_arena.str[pos++] = *(char*)&byteval;
   4853                 i += 1u + 2u;
   4854             }
   4855             else if(next == 'u') // UTF16
   4856             {
   4857                 if(i + 1u + 4u >= r.len)
   4858                     _c4err("\\u requires 4 hex digits");
   4859                 char readbuf[8];
   4860                 csubstr codepoint = r.sub(i + 2u, 4u);
   4861                 uint32_t codepoint_val = {};
   4862                 if(!read_hex(codepoint, &codepoint_val))
   4863                     _c4err("failed to parse \\u codepoint");
   4864                 size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
   4865                 C4_ASSERT(numbytes <= 4);
   4866                 memcpy(m_filter_arena.str + pos, readbuf, numbytes);
   4867                 pos += numbytes;
   4868                 i += 1u + 4u;
   4869             }
   4870             else if(next == 'U') // UTF32
   4871             {
   4872                 if(i + 1u + 8u >= r.len)
   4873                     _c4err("\\U requires 8 hex digits");
   4874                 char readbuf[8];
   4875                 csubstr codepoint = r.sub(i + 2u, 8u);
   4876                 uint32_t codepoint_val = {};
   4877                 if(!read_hex(codepoint, &codepoint_val))
   4878                     _c4err("failed to parse \\U codepoint");
   4879                 size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
   4880                 C4_ASSERT(numbytes <= 4);
   4881                 memcpy(m_filter_arena.str + pos, readbuf, numbytes);
   4882                 pos += numbytes;
   4883                 i += 1u + 8u;
   4884             }
   4885             // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char
   4886             else if(next == '0')
   4887             {
   4888                 m_filter_arena.str[pos++] = '\0';
   4889                 ++i;
   4890             }
   4891             else if(next == 'b') // backspace
   4892             {
   4893                 m_filter_arena.str[pos++] = '\b';
   4894                 ++i;
   4895             }
   4896             else if(next == 'f') // form feed
   4897             {
   4898                 m_filter_arena.str[pos++] = '\f';
   4899                 ++i;
   4900             }
   4901             else if(next == 'a') // bell character
   4902             {
   4903                 m_filter_arena.str[pos++] = '\a';
   4904                 ++i;
   4905             }
   4906             else if(next == 'v') // vertical tab
   4907             {
   4908                 m_filter_arena.str[pos++] = '\v';
   4909                 ++i;
   4910             }
   4911             else if(next == 'e') // escape character
   4912             {
   4913                 m_filter_arena.str[pos++] = '\x1b';
   4914                 ++i;
   4915             }
   4916             else if(next == '_') // unicode non breaking space \u00a0
   4917             {
   4918                 // https://www.compart.com/en/unicode/U+00a0
   4919                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
   4920                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0);
   4921                 ++i;
   4922             }
   4923             else if(next == 'N') // unicode next line \u0085
   4924             {
   4925                 // https://www.compart.com/en/unicode/U+0085
   4926                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
   4927                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85);
   4928                 ++i;
   4929             }
   4930             else if(next == 'L') // unicode line separator \u2028
   4931             {
   4932                 // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
   4933                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
   4934                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
   4935                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8);
   4936                 ++i;
   4937             }
   4938             else if(next == 'P') // unicode paragraph separator \u2029
   4939             {
   4940                 // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
   4941                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
   4942                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
   4943                 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9);
   4944                 ++i;
   4945             }
   4946             _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos));
   4947         }
   4948         else
   4949         {
   4950             m_filter_arena.str[pos++] = curr;
   4951         }
   4952     }
   4953 
   4954     _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   4955     if(pos < r.len || filtered_chars)
   4956     {
   4957         r = _finish_filter_arena(r, pos);
   4958     }
   4959 
   4960     _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
   4961     _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
   4962 
   4963     #undef _c4dbgfdq
   4964 
   4965     return r;
   4966 }
   4967 
   4968 
   4969 //-----------------------------------------------------------------------------
   4970 bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp)
   4971 {
   4972     substr trimmed = buf.first(*pos).trimr('\n');
   4973     bool added_newline = false;
   4974     switch(chomp)
   4975     {
   4976     case CHOMP_KEEP:
   4977         if(trimmed.len == *pos)
   4978         {
   4979             _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos);
   4980             //m_filter_arena.str[(*pos)++] = '\n';
   4981             added_newline = true;
   4982         }
   4983         break;
   4984     case CHOMP_CLIP:
   4985         if(trimmed.len == *pos)
   4986         {
   4987             _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos);
   4988             m_filter_arena.str[(*pos)++] = '\n';
   4989             added_newline = true;
   4990         }
   4991         else
   4992         {
   4993             _c4dbgpf("chomp=CLIP: include single trailing newline @{}", trimmed.len+1);
   4994             *pos = trimmed.len + 1;
   4995         }
   4996         break;
   4997     case CHOMP_STRIP:
   4998         _c4dbgpf("chomp=STRIP: strip {}-{}-{} newlines", *pos, trimmed.len, *pos-trimmed.len);
   4999         *pos = trimmed.len;
   5000         break;
   5001     default:
   5002         _c4err("unknown chomp style");
   5003     }
   5004     return added_newline;
   5005 }
   5006 
   5007 
   5008 //-----------------------------------------------------------------------------
   5009 csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation)
   5010 {
   5011     // a debugging scaffold:
   5012     #if 0
   5013     #define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block" fmt, __VA_ARGS__)
   5014     #else
   5015     #define _c4dbgfbl(...)
   5016     #endif
   5017 
   5018     _c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s);
   5019 
   5020     if(chomp != CHOMP_KEEP && s.trim(" \n\r").len == 0u)
   5021     {
   5022         _c4dbgp("filt_block: empty scalar");
   5023         return s.first(0);
   5024     }
   5025 
   5026     substr r = s;
   5027 
   5028     switch(style)
   5029     {
   5030     case BLOCK_LITERAL:
   5031         {
   5032             _c4dbgp("filt_block: style=literal");
   5033             // trim leading whitespace up to indentation
   5034             {
   5035                 size_t numws = r.first_not_of(' ');
   5036                 if(numws != npos)
   5037                 {
   5038                     if(numws > indentation)
   5039                         r = r.sub(indentation);
   5040                     else
   5041                         r = r.sub(numws);
   5042                     _c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r);
   5043                 }
   5044                 else
   5045                 {
   5046                     if(chomp != CHOMP_KEEP || r.len == 0)
   5047                     {
   5048                         _c4dbgfbl(": all spaces {}, return empty", r.len);
   5049                         return r.first(0);
   5050                     }
   5051                     else
   5052                     {
   5053                         r[0] = '\n';
   5054                         return r.first(1);
   5055                     }
   5056                 }
   5057             }
   5058             _grow_filter_arena(s.len + 2u);  // use s.len! because we may need to add a newline at the end, so the leading indentation will allow space for that newline
   5059             size_t pos = 0; // the filtered size
   5060             for(size_t i = 0; i < r.len; ++i)
   5061             {
   5062                 const char curr = r.str[i];
   5063                 _c4dbgfbl("[{}]='{}'  pos={}", i, _c4prc(curr), pos);
   5064                 if(curr == '\r')
   5065                     continue;
   5066                 m_filter_arena.str[pos++] = curr;
   5067                 if(curr == '\n')
   5068                 {
   5069                     _c4dbgfbl("[{}]: found newline", i);
   5070                     // skip indentation on the next line
   5071                     csubstr rem = r.sub(i+1);
   5072                     size_t first = rem.first_not_of(' ');
   5073                     if(first != npos)
   5074                     {
   5075                         _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
   5076                         _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
   5077                         _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]);
   5078                         if(first < indentation)
   5079                         {
   5080                             _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
   5081                             i += first;
   5082                         }
   5083                         else
   5084                         {
   5085                             _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
   5086                             i += indentation;
   5087                         }
   5088                     }
   5089                     else
   5090                     {
   5091                         _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
   5092                         first = rem.len;
   5093                         _c4dbgfbl("[{}]: {} spaces to the end", i, first);
   5094                         if(first)
   5095                         {
   5096                             if(first < indentation)
   5097                             {
   5098                                 _c4dbgfbl("[{}]: skip everything", i);
   5099                                 --pos;
   5100                                 break;
   5101                             }
   5102                             else
   5103                             {
   5104                                 _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
   5105                                 i += indentation;
   5106                             }
   5107                         }
   5108                         else if(i+1 == r.len)
   5109                         {
   5110                             if(chomp == CHOMP_STRIP)
   5111                                 --pos;
   5112                             break;
   5113                         }
   5114                     }
   5115                 }
   5116             }
   5117             _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos);
   5118             _c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
   5119             bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
   5120             _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   5121             _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len);
   5122             if(pos < r.len || changed)
   5123             {
   5124                 r = _finish_filter_arena(s, pos); // write into s
   5125             }
   5126             break;
   5127         }
   5128     case BLOCK_FOLD:
   5129         {
   5130             _c4dbgp("filt_block: style=fold");
   5131             _grow_filter_arena(r.len + 2);
   5132             size_t pos = 0; // the filtered size
   5133             bool filtered_chars = false;
   5134             bool started = false;
   5135             bool is_indented = false;
   5136             size_t i = r.first_not_of(' ');
   5137             _c4dbgfbl(": first non space at {}", i);
   5138             if(i > indentation)
   5139             {
   5140                 is_indented = true;
   5141                 i = indentation;
   5142             }
   5143             _c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented);
   5144             auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){
   5145                 _c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following);
   5146                 for(size_t j = 0; j < 1 + numnl_following; ++j)
   5147                     m_filter_arena.str[pos++] = '\n';
   5148                 for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i)
   5149                 {
   5150                     if(r.str[i] == '\r')
   5151                         continue;
   5152                     _c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i]));
   5153                     m_filter_arena.str[pos++] = r.str[i];
   5154                 }
   5155                 --i;
   5156             };
   5157             for( ; i < r.len; ++i)
   5158             {
   5159                 const char curr = r.str[i];
   5160                 _c4dbgfbl("[{}]='{}'", i, _c4prc(curr));
   5161                 if(curr == '\n')
   5162                 {
   5163                     filtered_chars = true;
   5164                     // skip indentation on the next line, and advance over the next non-indented blank lines as well
   5165                     size_t first_non_whitespace;
   5166                     size_t numnl_following = (size_t)-1;
   5167                     while(r[i] == '\n')
   5168                     {
   5169                         ++numnl_following;
   5170                         csubstr rem = r.sub(i+1);
   5171                         size_t first = rem.first_not_of(' ');
   5172                         _c4dbgfbl("[{}]: found newline. first={} rem.len={}", i, first, rem.len);
   5173                         if(first != npos)
   5174                         {
   5175                             first_non_whitespace = first + i+1;
   5176                             while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r')
   5177                                 ++first_non_whitespace;
   5178                             _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
   5179                             _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
   5180                             _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first]));
   5181                             if(first < indentation)
   5182                             {
   5183                                 _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
   5184                                 i += first;
   5185                             }
   5186                             else
   5187                             {
   5188                                 _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
   5189                                 i += indentation;
   5190                                 if(first > indentation)
   5191                                 {
   5192                                     _c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation);
   5193                                     goto finished_counting_newlines;
   5194                                 }
   5195                             }
   5196                             // prepare the next while loop iteration
   5197                             // by setting i at the next newline after
   5198                             // an empty line
   5199                             if(r[first_non_whitespace] == '\n')
   5200                                 i = first_non_whitespace;
   5201                             else
   5202                                 goto finished_counting_newlines;
   5203                         }
   5204                         else
   5205                         {
   5206                             _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
   5207                             first = rem.len;
   5208                             first_non_whitespace = first + i+1;
   5209                             if(first)
   5210                             {
   5211                                 _c4dbgfbl("[{}]: {} spaces to the end", i, first);
   5212                                 if(first < indentation)
   5213                                 {
   5214                                     _c4dbgfbl("[{}]: skip everything", i);
   5215                                     i += first;
   5216                                 }
   5217                                 else
   5218                                 {
   5219                                     _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
   5220                                     i += indentation;
   5221                                     if(first > indentation)
   5222                                     {
   5223                                         _c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first);
   5224                                         goto finished_counting_newlines;
   5225                                     }
   5226                                 }
   5227                             }
   5228                             else // if(i+1 == r.len)
   5229                             {
   5230                                 _c4dbgfbl("[{}]: it's the final newline", i);
   5231                                 _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len);
   5232                                 _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0);
   5233                             }
   5234                             goto end_of_scalar;
   5235                         }
   5236                     }
   5237                 end_of_scalar:
   5238                     // Write all the trailing newlines. Since we're
   5239                     // at the end no folding is needed, so write every
   5240                     // newline (add 1).
   5241                     _c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following);
   5242                     for(size_t j = 0; j < 1 + numnl_following; ++j)
   5243                         m_filter_arena.str[pos++] = '\n';
   5244                     break;
   5245                 finished_counting_newlines:
   5246                     _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
   5247                     while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t')
   5248                         ++first_non_whitespace;
   5249                     _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
   5250                     _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len);
   5251                     size_t last_newl = r.last_of('\n', first_non_whitespace);
   5252                     size_t this_indentation = first_non_whitespace - last_newl - 1;
   5253                     _c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation);
   5254                     _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1);
   5255                     _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation);
   5256                     if(!started)
   5257                     {
   5258                         _c4dbgfbl("[{}]: #newlines={}. write all leading newlines", i, numnl_following);
   5259                         for(size_t j = 0; j < 1 + numnl_following; ++j)
   5260                             m_filter_arena.str[pos++] = '\n';
   5261                         if(this_indentation > indentation)
   5262                         {
   5263                             is_indented = true;
   5264                             _c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation);
   5265                             i = last_newl + indentation;
   5266                         }
   5267                         else
   5268                         {
   5269                             i = first_non_whitespace - 1;
   5270                             _c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace);
   5271                         }
   5272                     }
   5273                     else if(this_indentation == indentation)
   5274                     {
   5275                         _c4dbgfbl("[{}]: same indentation", i);
   5276                         if(!is_indented)
   5277                         {
   5278                             if(numnl_following == 0)
   5279                             {
   5280                                 _c4dbgfbl("[{}]: fold!", i);
   5281                                 m_filter_arena.str[pos++] = ' ';
   5282                             }
   5283                             else
   5284                             {
   5285                                 _c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following);
   5286                                 for(size_t j = 0; j < numnl_following; ++j)
   5287                                     m_filter_arena.str[pos++] = '\n';
   5288                             }
   5289                             i = first_non_whitespace - 1;
   5290                             _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
   5291                         }
   5292                         else
   5293                         {
   5294                             _c4dbgfbl("[{}]: back to ref indentation", i);
   5295                             is_indented = false;
   5296                             on_change_indentation(numnl_following, last_newl, first_non_whitespace);
   5297                             _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
   5298                         }
   5299                     }
   5300                     else
   5301                     {
   5302                         _c4dbgfbl("[{}]: increased indentation.", i);
   5303                         is_indented = true;
   5304                         _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation);
   5305                         on_change_indentation(numnl_following, last_newl, first_non_whitespace);
   5306                         _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
   5307                     }
   5308                 }
   5309                 else if(curr != '\r')
   5310                 {
   5311                     if(curr != '\t')
   5312                         started = true;
   5313                     m_filter_arena.str[pos++] = curr;
   5314                 }
   5315             }
   5316             _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   5317             _c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos));
   5318             bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
   5319             if(pos < r.len || filtered_chars || changed)
   5320             {
   5321                 r = _finish_filter_arena(s, pos); // write into s
   5322             }
   5323         }
   5324         break;
   5325     default:
   5326         _c4err("unknown block style");
   5327     }
   5328 
   5329     _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r);
   5330 
   5331     #undef _c4dbgfbl
   5332 
   5333     return r;
   5334 }
   5335 
   5336 //-----------------------------------------------------------------------------
   5337 size_t Parser::_count_nlines(csubstr src)
   5338 {
   5339     return 1 + src.count('\n');
   5340 }
   5341 
   5342 //-----------------------------------------------------------------------------
   5343 void Parser::_handle_directive(csubstr directive_)
   5344 {
   5345     csubstr directive = directive_;
   5346     if(directive.begins_with("%TAG"))
   5347     {
   5348         TagDirective td;
   5349         _c4dbgpf("%TAG directive: {}", directive_);
   5350         directive = directive.sub(4);
   5351         if(!directive.begins_with(' '))
   5352             _c4err("malformed tag directive: {}", directive_);
   5353         directive = directive.triml(' ');
   5354         size_t pos = directive.find(' ');
   5355         if(pos == npos)
   5356             _c4err("malformed tag directive: {}", directive_);
   5357         td.handle = directive.first(pos);
   5358         directive = directive.sub(td.handle.len).triml(' ');
   5359         pos = directive.find(' ');
   5360         if(pos != npos)
   5361             directive = directive.first(pos);
   5362         td.prefix = directive;
   5363         td.next_node_id = m_tree->size();
   5364         if(m_tree->size() > 0)
   5365         {
   5366             size_t prev = m_tree->size() - 1;
   5367             if(m_tree->is_root(prev) && m_tree->type(prev) != NOTYPE && !m_tree->is_stream(prev))
   5368                 ++td.next_node_id;
   5369         }
   5370         _c4dbgpf("%TAG: handle={} prefix={} next_node={}", td.handle, td.prefix, td.next_node_id);
   5371         m_tree->add_tag_directive(td);
   5372     }
   5373     else if(directive.begins_with("%YAML"))
   5374     {
   5375         _c4dbgpf("%YAML directive! ignoring...: {}", directive);
   5376     }
   5377 }
   5378 
   5379 //-----------------------------------------------------------------------------
   5380 void Parser::set_flags(flag_t f, State * s)
   5381 {
   5382 #ifdef RYML_DBG
   5383     char buf1_[64], buf2_[64];
   5384     csubstr buf1 = _prfl(buf1_, f);
   5385     csubstr buf2 = _prfl(buf2_, s->flags);
   5386     _c4dbgpf("state[{}]: setting flags to {}: before={}", s-m_stack.begin(), buf1, buf2);
   5387 #endif
   5388     s->flags = f;
   5389 }
   5390 
   5391 void Parser::add_flags(flag_t on, State * s)
   5392 {
   5393 #ifdef RYML_DBG
   5394     char buf1_[64], buf2_[64], buf3_[64];
   5395     csubstr buf1 = _prfl(buf1_, on);
   5396     csubstr buf2 = _prfl(buf2_, s->flags);
   5397     csubstr buf3 = _prfl(buf3_, s->flags|on);
   5398     _c4dbgpf("state[{}]: adding flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3);
   5399 #endif
   5400     s->flags |= on;
   5401 }
   5402 
   5403 void Parser::addrem_flags(flag_t on, flag_t off, State * s)
   5404 {
   5405 #ifdef RYML_DBG
   5406     char buf1_[64], buf2_[64], buf3_[64], buf4_[64];
   5407     csubstr buf1 = _prfl(buf1_, on);
   5408     csubstr buf2 = _prfl(buf2_, off);
   5409     csubstr buf3 = _prfl(buf3_, s->flags);
   5410     csubstr buf4 = _prfl(buf4_, ((s->flags|on)&(~off)));
   5411     _c4dbgpf("state[{}]: adding flags {} / removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3, buf4);
   5412 #endif
   5413     s->flags |= on;
   5414     s->flags &= ~off;
   5415 }
   5416 
   5417 void Parser::rem_flags(flag_t off, State * s)
   5418 {
   5419 #ifdef RYML_DBG
   5420     char buf1_[64], buf2_[64], buf3_[64];
   5421     csubstr buf1 = _prfl(buf1_, off);
   5422     csubstr buf2 = _prfl(buf2_, s->flags);
   5423     csubstr buf3 = _prfl(buf3_, s->flags&(~off));
   5424     _c4dbgpf("state[{}]: removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3);
   5425 #endif
   5426     s->flags &= ~off;
   5427 }
   5428 
   5429 //-----------------------------------------------------------------------------
   5430 
   5431 csubstr Parser::_prfl(substr buf, flag_t flags)
   5432 {
   5433     size_t pos = 0;
   5434     bool gotone = false;
   5435 
   5436     #define _prflag(fl)                                     \
   5437     if((flags & fl) == (fl))                                \
   5438     {                                                       \
   5439         if(gotone)                                          \
   5440         {                                                   \
   5441             if(pos + 1 < buf.len)                           \
   5442                 buf[pos] = '|';                             \
   5443             ++pos;                                          \
   5444         }                                                   \
   5445         csubstr fltxt = #fl;                                \
   5446         if(pos + fltxt.len <= buf.len)                      \
   5447             memcpy(buf.str + pos, fltxt.str, fltxt.len);    \
   5448         pos += fltxt.len;                                   \
   5449         gotone = true;                                      \
   5450     }
   5451 
   5452     _prflag(RTOP);
   5453     _prflag(RUNK);
   5454     _prflag(RMAP);
   5455     _prflag(RSEQ);
   5456     _prflag(FLOW);
   5457     _prflag(QMRK);
   5458     _prflag(RKEY);
   5459     _prflag(RVAL);
   5460     _prflag(RNXT);
   5461     _prflag(SSCL);
   5462     _prflag(QSCL);
   5463     _prflag(RSET);
   5464     _prflag(NDOC);
   5465     _prflag(RSEQIMAP);
   5466 
   5467     #undef _prflag
   5468 
   5469     RYML_ASSERT(pos <= buf.len);
   5470 
   5471     return buf.first(pos);
   5472 }
   5473 
   5474 
   5475 //-----------------------------------------------------------------------------
   5476 //-----------------------------------------------------------------------------
   5477 //-----------------------------------------------------------------------------
   5478 
   5479 void Parser::_grow_filter_arena(size_t num_characters_needed)
   5480 {
   5481     _c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed);
   5482     if(num_characters_needed <= m_filter_arena.len)
   5483         return;
   5484     size_t sz = m_filter_arena.len << 1;
   5485     _c4dbgpf("grow: sz={}", sz);
   5486     sz = num_characters_needed > sz ? num_characters_needed : sz;
   5487     _c4dbgpf("grow: sz={}", sz);
   5488     sz = sz < 128u ? 128u : sz;
   5489     _c4dbgpf("grow: sz={}", sz);
   5490     _RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed);
   5491     _resize_filter_arena(sz);
   5492 }
   5493 
   5494 void Parser::_resize_filter_arena(size_t num_characters)
   5495 {
   5496     if(num_characters > m_filter_arena.len)
   5497     {
   5498         _c4dbgpf("resize: sz={}", num_characters);
   5499         char *prev = m_filter_arena.str;
   5500         if(m_filter_arena.str)
   5501         {
   5502             _RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0);
   5503             _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
   5504         }
   5505         m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev);
   5506         m_filter_arena.len = num_characters;
   5507     }
   5508 }
   5509 
   5510 substr Parser::_finish_filter_arena(substr dst, size_t pos)
   5511 {
   5512     _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
   5513     _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len);
   5514     memcpy(dst.str, m_filter_arena.str, pos);
   5515     return dst.first(pos);
   5516 }
   5517 
   5518 
   5519 //-----------------------------------------------------------------------------
   5520 //-----------------------------------------------------------------------------
   5521 //-----------------------------------------------------------------------------
   5522 
   5523 csubstr Parser::location_contents(Location const& loc) const
   5524 {
   5525     _RYML_CB_ASSERT(m_stack.m_callbacks, loc.offset < m_buf.len);
   5526     return m_buf.sub(loc.offset);
   5527 }
   5528 
   5529 Location Parser::location(ConstNodeRef node) const
   5530 {
   5531     _RYML_CB_ASSERT(m_stack.m_callbacks, node.valid());
   5532     return location(*node.tree(), node.id());
   5533 }
   5534 
   5535 Location Parser::location(Tree const& tree, size_t node) const
   5536 {
   5537     // try hard to avoid getting the location from a null string.
   5538     Location loc;
   5539     if(_location_from_node(tree, node, &loc, 0))
   5540         return loc;
   5541     return val_location(m_buf.str);
   5542 }
   5543 
   5544 bool Parser::_location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const
   5545 {
   5546     if(tree.has_key(node))
   5547     {
   5548         csubstr k = tree.key(node);
   5549         if(C4_LIKELY(k.str != nullptr))
   5550         {
   5551             _RYML_CB_ASSERT(m_stack.m_callbacks, k.is_sub(m_buf));
   5552             _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(k));
   5553             *loc = val_location(k.str);
   5554             return true;
   5555         }
   5556     }
   5557 
   5558     if(tree.has_val(node))
   5559     {
   5560         csubstr v = tree.val(node);
   5561         if(C4_LIKELY(v.str != nullptr))
   5562         {
   5563             _RYML_CB_ASSERT(m_stack.m_callbacks, v.is_sub(m_buf));
   5564             _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(v));
   5565             *loc = val_location(v.str);
   5566             return true;
   5567         }
   5568     }
   5569 
   5570     if(tree.is_container(node))
   5571     {
   5572         if(_location_from_cont(tree, node, loc))
   5573             return true;
   5574     }
   5575 
   5576     if(tree.type(node) != NOTYPE && level == 0)
   5577     {
   5578         // try the prev sibling
   5579         {
   5580             const size_t prev = tree.prev_sibling(node);
   5581             if(prev != NONE)
   5582             {
   5583                 if(_location_from_node(tree, prev, loc, level+1))
   5584                     return true;
   5585             }
   5586         }
   5587         // try the next sibling
   5588         {
   5589             const size_t next = tree.next_sibling(node);
   5590             if(next != NONE)
   5591             {
   5592                 if(_location_from_node(tree, next, loc, level+1))
   5593                     return true;
   5594             }
   5595         }
   5596         // try the parent
   5597         {
   5598             const size_t parent = tree.parent(node);
   5599             if(parent != NONE)
   5600             {
   5601                 if(_location_from_node(tree, parent, loc, level+1))
   5602                     return true;
   5603             }
   5604         }
   5605     }
   5606 
   5607     return false;
   5608 }
   5609 
   5610 bool Parser::_location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const
   5611 {
   5612     _RYML_CB_ASSERT(m_stack.m_callbacks, tree.is_container(node));
   5613     if(!tree.is_stream(node))
   5614     {
   5615         const char *node_start = tree._p(node)->m_val.scalar.str;  // this was stored in the container
   5616         if(tree.has_children(node))
   5617         {
   5618             size_t child = tree.first_child(node);
   5619             if(tree.has_key(child))
   5620             {
   5621                 // when a map starts, the container was set after the key
   5622                 csubstr k = tree.key(child);
   5623                 if(k.str && node_start > k.str)
   5624                     node_start = k.str;
   5625             }
   5626         }
   5627         *loc = val_location(node_start);
   5628         return true;
   5629     }
   5630     else // it's a stream
   5631     {
   5632         *loc = val_location(m_buf.str); // just return the front of the buffer
   5633     }
   5634     return true;
   5635 }
   5636 
   5637 
   5638 Location Parser::val_location(const char *val) const
   5639 {
   5640     if(C4_UNLIKELY(val == nullptr))
   5641         return {m_file, 0, 0, 0};
   5642 
   5643     _RYML_CB_CHECK(m_stack.m_callbacks, m_options.locations());
   5644     // NOTE: if any of these checks fails, the parser needs to be
   5645     // instantiated with locations enabled.
   5646     _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.str == m_newline_offsets_buf.str);
   5647     _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.len == m_newline_offsets_buf.len);
   5648     _RYML_CB_ASSERT(m_stack.m_callbacks, m_options.locations());
   5649     _RYML_CB_ASSERT(m_stack.m_callbacks, !_locations_dirty());
   5650     _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets != nullptr);
   5651     _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size > 0);
   5652     // NOTE: the pointer needs to belong to the buffer that was used to parse.
   5653     csubstr src = m_buf;
   5654     _RYML_CB_CHECK(m_stack.m_callbacks, val != nullptr || src.str == nullptr);
   5655     _RYML_CB_CHECK(m_stack.m_callbacks, (val >= src.begin() && val <= src.end()) || (src.str == nullptr && val == nullptr));
   5656     // ok. search the first stored newline after the given ptr
   5657     using lineptr_type = size_t const* C4_RESTRICT;
   5658     lineptr_type lineptr = nullptr;
   5659     size_t offset = (size_t)(val - src.begin());
   5660     if(m_newline_offsets_size < 30) // TODO magic number
   5661     {
   5662         // just do a linear search if the size is small.
   5663         for(lineptr_type curr = m_newline_offsets, last = m_newline_offsets + m_newline_offsets_size; curr < last; ++curr)
   5664         {
   5665             if(*curr > offset)
   5666             {
   5667                 lineptr = curr;
   5668                 break;
   5669             }
   5670         }
   5671     }
   5672     else
   5673     {
   5674         // do a bisection search if the size is not small.
   5675         //
   5676         // We could use std::lower_bound but this is simple enough and
   5677         // spares the include of <algorithm>.
   5678         size_t count = m_newline_offsets_size;
   5679         size_t step;
   5680         lineptr_type it;
   5681         lineptr = m_newline_offsets;
   5682         while(count)
   5683         {
   5684             step = count >> 1;
   5685             it = lineptr + step;
   5686             if(*it < offset)
   5687             {
   5688                 lineptr = ++it;
   5689                 count -= step + 1;
   5690             }
   5691             else
   5692             {
   5693                 count = step;
   5694             }
   5695         }
   5696     }
   5697     _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr >= m_newline_offsets);
   5698     _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr <= m_newline_offsets + m_newline_offsets_size);
   5699     _RYML_CB_ASSERT(m_stack.m_callbacks, *lineptr > offset);
   5700     Location loc;
   5701     loc.name = m_file;
   5702     loc.offset = offset;
   5703     loc.line = (size_t)(lineptr - m_newline_offsets);
   5704     if(lineptr > m_newline_offsets)
   5705         loc.col = (offset - *(lineptr-1) - 1u);
   5706     else
   5707         loc.col = offset;
   5708     return loc;
   5709 }
   5710 
   5711 void Parser::_prepare_locations()
   5712 {
   5713     m_newline_offsets_buf = m_buf;
   5714     size_t numnewlines = 1u + m_buf.count('\n');
   5715     _resize_locations(numnewlines);
   5716     m_newline_offsets_size = 0;
   5717     for(size_t i = 0; i < m_buf.len; i++)
   5718         if(m_buf[i] == '\n')
   5719             m_newline_offsets[m_newline_offsets_size++] = i;
   5720     m_newline_offsets[m_newline_offsets_size++] = m_buf.len;
   5721     _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size == numnewlines);
   5722 }
   5723 
   5724 void Parser::_resize_locations(size_t numnewlines)
   5725 {
   5726     if(numnewlines > m_newline_offsets_capacity)
   5727     {
   5728         if(m_newline_offsets)
   5729             _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
   5730         m_newline_offsets = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, size_t, numnewlines, m_newline_offsets);
   5731         m_newline_offsets_capacity = numnewlines;
   5732     }
   5733 }
   5734 
   5735 bool Parser::_locations_dirty() const
   5736 {
   5737     return !m_newline_offsets_size;
   5738 }
   5739 
   5740 } // namespace yml
   5741 } // namespace c4
   5742 
   5743 
   5744 #if defined(_MSC_VER)
   5745 #   pragma warning(pop)
   5746 #elif defined(__clang__)
   5747 #   pragma clang diagnostic pop
   5748 #elif defined(__GNUC__)
   5749 #   pragma GCC diagnostic pop
   5750 #endif