parse.cpp (198070B)
1 #include "c4/yml/parse.hpp" 2 #include "c4/error.hpp" 3 #include "c4/utf.hpp" 4 #include <c4/dump.hpp> 5 6 #include <ctype.h> 7 #include <stdarg.h> 8 #include <stdio.h> 9 10 #include "c4/yml/detail/parser_dbg.hpp" 11 #ifdef RYML_DBG 12 #include "c4/yml/detail/print.hpp" 13 #endif 14 15 #ifndef RYML_ERRMSG_SIZE 16 #define RYML_ERRMSG_SIZE 1024 17 #endif 18 19 //#define RYML_WITH_TAB_TOKENS 20 #ifdef RYML_WITH_TAB_TOKENS 21 #define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__ 22 #define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with 23 #else 24 #define _RYML_WITH_TAB_TOKENS(...) 25 #define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without 26 #endif 27 28 29 #if defined(_MSC_VER) 30 # pragma warning(push) 31 # pragma warning(disable: 4296/*expression is always 'boolean_value'*/) 32 #elif defined(__clang__) 33 # pragma clang diagnostic push 34 # pragma clang diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0. 35 # pragma clang diagnostic ignored "-Wformat-nonliteral" 36 # pragma clang diagnostic ignored "-Wold-style-cast" 37 #elif defined(__GNUC__) 38 # pragma GCC diagnostic push 39 # pragma GCC diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0. 
40 # pragma GCC diagnostic ignored "-Wformat-nonliteral" 41 # pragma GCC diagnostic ignored "-Wold-style-cast" 42 # if __GNUC__ >= 7 43 # pragma GCC diagnostic ignored "-Wduplicated-branches" 44 # endif 45 #endif 46 47 namespace c4 { 48 namespace yml { 49 50 namespace { 51 52 template<class DumpFn, class ...Args> 53 void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args) 54 { 55 char writebuf[256]; 56 auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward<Args>(args)...); 57 // resume writing if the results failed to fit the buffer 58 if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte. 59 { 60 results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...); 61 if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) 62 { 63 results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...); 64 } 65 } 66 } 67 68 bool _is_scalar_next__runk(csubstr s) 69 { 70 return !(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? ") || s == "-" || s.begins_with("- ") || s.begins_with(":\"") || s.begins_with(":'")); 71 } 72 73 bool _is_scalar_next__rseq_rval(csubstr s) 74 { 75 return !(s.begins_with_any("[{!&") || s.begins_with("? ") || s.begins_with("- ") || s == "-"); 76 } 77 78 bool _is_scalar_next__rmap(csubstr s) 79 { 80 return !(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? 
") _RYML_WITH_TAB_TOKENS(|| s.begins_with(":\t"))); 81 } 82 83 bool _is_scalar_next__rmap_val(csubstr s) 84 { 85 return !(s.begins_with("- ") || s.begins_with_any("{[") || s == "-"); 86 } 87 88 bool _is_doc_sep(csubstr s) 89 { 90 constexpr const csubstr dashes = "---"; 91 constexpr const csubstr ellipsis = "..."; 92 constexpr const csubstr whitesp = " \t"; 93 if(s.begins_with(dashes)) 94 return s == dashes || s.sub(3).begins_with_any(whitesp); 95 else if(s.begins_with(ellipsis)) 96 return s == ellipsis || s.sub(3).begins_with_any(whitesp); 97 return false; 98 } 99 100 /** @p i is set to the first non whitespace character after the line 101 * @return the number of empty lines after the initial position */ 102 size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation) 103 { 104 RYML_ASSERT(r[*i] == '\n'); 105 size_t numnl_following = 0; 106 ++(*i); 107 for( ; *i < r.len; ++(*i)) 108 { 109 if(r.str[*i] == '\n') 110 { 111 ++numnl_following; 112 if(indentation) // skip the indentation after the newline 113 { 114 size_t stop = *i + indentation; 115 for( ; *i < r.len; ++(*i)) 116 { 117 if(r.str[*i] != ' ' && r.str[*i] != '\r') 118 break; 119 RYML_ASSERT(*i < stop); 120 } 121 C4_UNUSED(stop); 122 } 123 } 124 else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') // skip leading whitespace 125 ; 126 else 127 break; 128 } 129 return numnl_following; 130 } 131 132 } // anon namespace 133 134 135 //----------------------------------------------------------------------------- 136 137 Parser::~Parser() 138 { 139 _free(); 140 _clr(); 141 } 142 143 Parser::Parser(Callbacks const& cb, ParserOptions opts) 144 : m_options(opts) 145 , m_file() 146 , m_buf() 147 , m_root_id(NONE) 148 , m_tree() 149 , m_stack(cb) 150 , m_state() 151 , m_key_tag_indentation(0) 152 , m_key_tag2_indentation(0) 153 , m_key_tag() 154 , m_key_tag2() 155 , m_val_tag_indentation(0) 156 , m_val_tag() 157 , m_key_anchor_was_before(false) 158 , m_key_anchor_indentation(0) 
159 , m_key_anchor() 160 , m_val_anchor_indentation(0) 161 , m_val_anchor() 162 , m_filter_arena() 163 , m_newline_offsets() 164 , m_newline_offsets_size(0) 165 , m_newline_offsets_capacity(0) 166 , m_newline_offsets_buf() 167 { 168 m_stack.push(State{}); 169 m_state = &m_stack.top(); 170 } 171 172 Parser::Parser(Parser &&that) 173 : m_options(that.m_options) 174 , m_file(that.m_file) 175 , m_buf(that.m_buf) 176 , m_root_id(that.m_root_id) 177 , m_tree(that.m_tree) 178 , m_stack(std::move(that.m_stack)) 179 , m_state(&m_stack.top()) 180 , m_key_tag_indentation(that.m_key_tag_indentation) 181 , m_key_tag2_indentation(that.m_key_tag2_indentation) 182 , m_key_tag(that.m_key_tag) 183 , m_key_tag2(that.m_key_tag2) 184 , m_val_tag_indentation(that.m_val_tag_indentation) 185 , m_val_tag(that.m_val_tag) 186 , m_key_anchor_was_before(that.m_key_anchor_was_before) 187 , m_key_anchor_indentation(that.m_key_anchor_indentation) 188 , m_key_anchor(that.m_key_anchor) 189 , m_val_anchor_indentation(that.m_val_anchor_indentation) 190 , m_val_anchor(that.m_val_anchor) 191 , m_filter_arena(that.m_filter_arena) 192 , m_newline_offsets(that.m_newline_offsets) 193 , m_newline_offsets_size(that.m_newline_offsets_size) 194 , m_newline_offsets_capacity(that.m_newline_offsets_capacity) 195 , m_newline_offsets_buf(that.m_newline_offsets_buf) 196 { 197 that._clr(); 198 } 199 200 Parser::Parser(Parser const& that) 201 : m_options(that.m_options) 202 , m_file(that.m_file) 203 , m_buf(that.m_buf) 204 , m_root_id(that.m_root_id) 205 , m_tree(that.m_tree) 206 , m_stack(that.m_stack) 207 , m_state(&m_stack.top()) 208 , m_key_tag_indentation(that.m_key_tag_indentation) 209 , m_key_tag2_indentation(that.m_key_tag2_indentation) 210 , m_key_tag(that.m_key_tag) 211 , m_key_tag2(that.m_key_tag2) 212 , m_val_tag_indentation(that.m_val_tag_indentation) 213 , m_val_tag(that.m_val_tag) 214 , m_key_anchor_was_before(that.m_key_anchor_was_before) 215 , m_key_anchor_indentation(that.m_key_anchor_indentation) 
216 , m_key_anchor(that.m_key_anchor) 217 , m_val_anchor_indentation(that.m_val_anchor_indentation) 218 , m_val_anchor(that.m_val_anchor) 219 , m_filter_arena() 220 , m_newline_offsets() 221 , m_newline_offsets_size() 222 , m_newline_offsets_capacity() 223 , m_newline_offsets_buf() 224 { 225 if(that.m_newline_offsets_capacity) 226 { 227 _resize_locations(that.m_newline_offsets_capacity); 228 _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity == that.m_newline_offsets_capacity); 229 memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t)); 230 m_newline_offsets_size = that.m_newline_offsets_size; 231 } 232 if(that.m_filter_arena.len) 233 { 234 _resize_filter_arena(that.m_filter_arena.len); 235 } 236 } 237 238 Parser& Parser::operator=(Parser &&that) 239 { 240 _free(); 241 m_options = (that.m_options); 242 m_file = (that.m_file); 243 m_buf = (that.m_buf); 244 m_root_id = (that.m_root_id); 245 m_tree = (that.m_tree); 246 m_stack = std::move(that.m_stack); 247 m_state = (&m_stack.top()); 248 m_key_tag_indentation = (that.m_key_tag_indentation); 249 m_key_tag2_indentation = (that.m_key_tag2_indentation); 250 m_key_tag = (that.m_key_tag); 251 m_key_tag2 = (that.m_key_tag2); 252 m_val_tag_indentation = (that.m_val_tag_indentation); 253 m_val_tag = (that.m_val_tag); 254 m_key_anchor_was_before = (that.m_key_anchor_was_before); 255 m_key_anchor_indentation = (that.m_key_anchor_indentation); 256 m_key_anchor = (that.m_key_anchor); 257 m_val_anchor_indentation = (that.m_val_anchor_indentation); 258 m_val_anchor = (that.m_val_anchor); 259 m_filter_arena = that.m_filter_arena; 260 m_newline_offsets = (that.m_newline_offsets); 261 m_newline_offsets_size = (that.m_newline_offsets_size); 262 m_newline_offsets_capacity = (that.m_newline_offsets_capacity); 263 m_newline_offsets_buf = (that.m_newline_offsets_buf); 264 that._clr(); 265 return *this; 266 } 267 268 Parser& Parser::operator=(Parser const& that) 269 { 270 _free(); 271 
m_options = (that.m_options); 272 m_file = (that.m_file); 273 m_buf = (that.m_buf); 274 m_root_id = (that.m_root_id); 275 m_tree = (that.m_tree); 276 m_stack = that.m_stack; 277 m_state = &m_stack.top(); 278 m_key_tag_indentation = (that.m_key_tag_indentation); 279 m_key_tag2_indentation = (that.m_key_tag2_indentation); 280 m_key_tag = (that.m_key_tag); 281 m_key_tag2 = (that.m_key_tag2); 282 m_val_tag_indentation = (that.m_val_tag_indentation); 283 m_val_tag = (that.m_val_tag); 284 m_key_anchor_was_before = (that.m_key_anchor_was_before); 285 m_key_anchor_indentation = (that.m_key_anchor_indentation); 286 m_key_anchor = (that.m_key_anchor); 287 m_val_anchor_indentation = (that.m_val_anchor_indentation); 288 m_val_anchor = (that.m_val_anchor); 289 if(that.m_filter_arena.len > 0) 290 _resize_filter_arena(that.m_filter_arena.len); 291 if(that.m_newline_offsets_capacity > m_newline_offsets_capacity) 292 _resize_locations(that.m_newline_offsets_capacity); 293 _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity); 294 _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_size); 295 memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t)); 296 m_newline_offsets_size = that.m_newline_offsets_size; 297 m_newline_offsets_buf = that.m_newline_offsets_buf; 298 return *this; 299 } 300 301 void Parser::_clr() 302 { 303 m_options = {}; 304 m_file = {}; 305 m_buf = {}; 306 m_root_id = {}; 307 m_tree = {}; 308 m_stack.clear(); 309 m_state = {}; 310 m_key_tag_indentation = {}; 311 m_key_tag2_indentation = {}; 312 m_key_tag = {}; 313 m_key_tag2 = {}; 314 m_val_tag_indentation = {}; 315 m_val_tag = {}; 316 m_key_anchor_was_before = {}; 317 m_key_anchor_indentation = {}; 318 m_key_anchor = {}; 319 m_val_anchor_indentation = {}; 320 m_val_anchor = {}; 321 m_filter_arena = {}; 322 m_newline_offsets = {}; 323 m_newline_offsets_size = {}; 324 m_newline_offsets_capacity = 
{}; 325 m_newline_offsets_buf = {}; 326 } 327 328 void Parser::_free() 329 { 330 if(m_newline_offsets) 331 { 332 _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity); 333 m_newline_offsets = nullptr; 334 m_newline_offsets_size = 0u; 335 m_newline_offsets_capacity = 0u; 336 m_newline_offsets_buf = 0u; 337 } 338 if(m_filter_arena.len) 339 { 340 _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); 341 m_filter_arena = {}; 342 } 343 m_stack._free(); 344 } 345 346 347 //----------------------------------------------------------------------------- 348 void Parser::_reset() 349 { 350 _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() == 1); 351 m_stack.clear(); 352 m_stack.push({}); 353 m_state = &m_stack.top(); 354 m_state->reset(m_file.str, m_root_id); 355 356 m_key_tag_indentation = 0; 357 m_key_tag2_indentation = 0; 358 m_key_tag.clear(); 359 m_key_tag2.clear(); 360 m_val_tag_indentation = 0; 361 m_val_tag.clear(); 362 m_key_anchor_was_before = false; 363 m_key_anchor_indentation = 0; 364 m_key_anchor.clear(); 365 m_val_anchor_indentation = 0; 366 m_val_anchor.clear(); 367 368 if(m_options.locations()) 369 { 370 _prepare_locations(); 371 } 372 } 373 374 //----------------------------------------------------------------------------- 375 template<class DumpFn> 376 void Parser::_fmt_msg(DumpFn &&dumpfn) const 377 { 378 auto const& lc = m_state->line_contents; 379 csubstr contents = lc.stripped; 380 if(contents.len) 381 { 382 // print the yaml src line 383 size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col); 384 if(m_file.len) 385 { 386 _parse_dump(dumpfn, "{}:", m_file); 387 offs += m_file.len + 1; 388 } 389 _parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col); 390 csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u)); 391 csubstr maybe_ellipsis = (contents.len < 80u ? 
csubstr{} : csubstr("...")); 392 _parse_dump(dumpfn, "{}{} (size={})\n", maybe_full_content, maybe_ellipsis, contents.len); 393 // highlight the remaining portion of the previous line 394 size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin()); 395 size_t lastcol = firstcol + lc.rem.len; 396 for(size_t i = 0; i < offs + firstcol; ++i) 397 dumpfn(" "); 398 dumpfn("^"); 399 for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i) 400 dumpfn("~"); 401 _parse_dump(dumpfn, "{} (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1); 402 } 403 else 404 { 405 dumpfn("\n"); 406 } 407 408 #ifdef RYML_DBG 409 // next line: print the state flags 410 { 411 char flagbuf_[64]; 412 _parse_dump(dumpfn, "top state: {}\n", _prfl(flagbuf_, m_state->flags)); 413 } 414 #endif 415 } 416 417 418 //----------------------------------------------------------------------------- 419 template<class ...Args> 420 void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const 421 { 422 char errmsg[RYML_ERRMSG_SIZE]; 423 detail::_SubstrWriter writer(errmsg); 424 auto dumpfn = [&writer](csubstr s){ writer.append(s); }; 425 _parse_dump(dumpfn, fmt, args...); 426 writer.append('\n'); 427 _fmt_msg(dumpfn); 428 size_t len = writer.pos < RYML_ERRMSG_SIZE ? 
writer.pos : RYML_ERRMSG_SIZE; 429 m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data); 430 } 431 432 //----------------------------------------------------------------------------- 433 #ifdef RYML_DBG 434 template<class ...Args> 435 void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const 436 { 437 auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); }; 438 _parse_dump(dumpfn, fmt, args...); 439 dumpfn("\n"); 440 _fmt_msg(dumpfn); 441 } 442 #endif 443 444 //----------------------------------------------------------------------------- 445 bool Parser::_finished_file() const 446 { 447 bool ret = m_state->pos.offset >= m_buf.len; 448 if(ret) 449 { 450 _c4dbgp("finished file!!!"); 451 } 452 return ret; 453 } 454 455 //----------------------------------------------------------------------------- 456 bool Parser::_finished_line() const 457 { 458 return m_state->line_contents.rem.empty(); 459 } 460 461 //----------------------------------------------------------------------------- 462 void Parser::parse_in_place(csubstr file, substr buf, Tree *t, size_t node_id) 463 { 464 m_file = file; 465 m_buf = buf; 466 m_root_id = node_id; 467 m_tree = t; 468 _reset(); 469 while( ! _finished_file()) 470 { 471 _scan_line(); 472 while( ! _finished_line()) 473 _handle_line(); 474 if(_finished_file()) 475 break; // it may have finished because of multiline blocks 476 _line_ended(); 477 } 478 _handle_finished_file(); 479 } 480 481 //----------------------------------------------------------------------------- 482 void Parser::_handle_finished_file() 483 { 484 _end_stream(); 485 } 486 487 //----------------------------------------------------------------------------- 488 void Parser::_handle_line() 489 { 490 _c4dbgq("\n-----------"); 491 _c4dbgt("handling line={}, offset={}B", m_state->pos.line, m_state->pos.offset); 492 _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
m_state->line_contents.rem.empty()); 493 if(has_any(RSEQ)) 494 { 495 if(has_any(FLOW)) 496 { 497 if(_handle_seq_flow()) 498 return; 499 } 500 else 501 { 502 if(_handle_seq_blck()) 503 return; 504 } 505 } 506 else if(has_any(RMAP)) 507 { 508 if(has_any(FLOW)) 509 { 510 if(_handle_map_flow()) 511 return; 512 } 513 else 514 { 515 if(_handle_map_blck()) 516 return; 517 } 518 } 519 else if(has_any(RUNK)) 520 { 521 if(_handle_unk()) 522 return; 523 } 524 525 if(_handle_top()) 526 return; 527 } 528 529 530 //----------------------------------------------------------------------------- 531 bool Parser::_handle_unk() 532 { 533 _c4dbgp("handle_unk"); 534 535 csubstr rem = m_state->line_contents.rem; 536 const bool start_as_child = (node(m_state) == nullptr); 537 538 if(C4_UNLIKELY(has_any(NDOC))) 539 { 540 if(rem == "---" || rem.begins_with("--- ")) 541 { 542 _start_new_doc(rem); 543 return true; 544 } 545 auto trimmed = rem.triml(' '); 546 if(trimmed == "---" || trimmed.begins_with("--- ")) 547 { 548 _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len >= trimmed.len); 549 _line_progressed(rem.len - trimmed.len); 550 _start_new_doc(trimmed); 551 _save_indentation(); 552 return true; 553 } 554 else if(trimmed.begins_with("...")) 555 { 556 _end_stream(); 557 } 558 else if(trimmed.first_of("#%") == csubstr::npos) // neither a doc nor a tag 559 { 560 _c4dbgpf("starting implicit doc to accomodate unexpected tokens: '{}'", rem); 561 size_t indref = m_state->indref; 562 _push_level(); 563 _start_doc(); 564 _set_indentation(indref); 565 } 566 _RYML_CB_ASSERT(m_stack.m_callbacks, !trimmed.empty()); 567 } 568 569 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT|RSEQ|RMAP)); 570 if(m_state->indref > 0) 571 { 572 csubstr ws = rem.left_of(rem.first_not_of(' ')); 573 if(m_state->indref <= ws.len) 574 { 575 _c4dbgpf("skipping base indentation of {}", m_state->indref); 576 _line_progressed(m_state->indref); 577 rem = rem.sub(m_state->indref); 578 } 579 } 580 581 if(rem.begins_with("- ") 
_RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) 582 { 583 _c4dbgpf("it's a seq (as_child={})", start_as_child); 584 _move_key_anchor_to_val_anchor(); 585 _move_key_tag_to_val_tag(); 586 _push_level(); 587 _start_seq(start_as_child); 588 _save_indentation(); 589 _line_progressed(2); 590 return true; 591 } 592 else if(rem == '-') 593 { 594 _c4dbgpf("it's a seq (as_child={})", start_as_child); 595 _move_key_anchor_to_val_anchor(); 596 _move_key_tag_to_val_tag(); 597 _push_level(); 598 _start_seq(start_as_child); 599 _save_indentation(); 600 _line_progressed(1); 601 return true; 602 } 603 else if(rem.begins_with('[')) 604 { 605 _c4dbgpf("it's a seq, flow (as_child={})", start_as_child); 606 _move_key_anchor_to_val_anchor(); 607 _move_key_tag_to_val_tag(); 608 _push_level(/*explicit flow*/true); 609 _start_seq(start_as_child); 610 add_flags(FLOW); 611 _line_progressed(1); 612 return true; 613 } 614 else if(rem.begins_with('{')) 615 { 616 _c4dbgpf("it's a map, flow (as_child={})", start_as_child); 617 _move_key_anchor_to_val_anchor(); 618 _move_key_tag_to_val_tag(); 619 _push_level(/*explicit flow*/true); 620 _start_map(start_as_child); 621 addrem_flags(FLOW|RKEY, RVAL); 622 _line_progressed(1); 623 return true; 624 } 625 else if(rem.begins_with("? 
")) 626 { 627 _c4dbgpf("it's a map (as_child={}) + this key is complex", start_as_child); 628 _move_key_anchor_to_val_anchor(); 629 _move_key_tag_to_val_tag(); 630 _push_level(); 631 _start_map(start_as_child); 632 addrem_flags(RKEY|QMRK, RVAL); 633 _save_indentation(); 634 _line_progressed(2); 635 return true; 636 } 637 else if(rem.begins_with(": ") && !has_any(SSCL)) 638 { 639 _c4dbgp("it's a map with an empty key"); 640 _move_key_anchor_to_val_anchor(); 641 _move_key_tag_to_val_tag(); 642 _push_level(); 643 _start_map(start_as_child); 644 _store_scalar_null(rem.str); 645 addrem_flags(RVAL, RKEY); 646 _save_indentation(); 647 _line_progressed(2); 648 return true; 649 } 650 else if(rem == ':' && !has_any(SSCL)) 651 { 652 _c4dbgp("it's a map with an empty key"); 653 _move_key_anchor_to_val_anchor(); 654 _move_key_tag_to_val_tag(); 655 _push_level(); 656 _start_map(start_as_child); 657 _store_scalar_null(rem.str); 658 addrem_flags(RVAL, RKEY); 659 _save_indentation(); 660 _line_progressed(1); 661 return true; 662 } 663 else if(_handle_types()) 664 { 665 return true; 666 } 667 else if(!rem.begins_with('*') && _handle_key_anchors_and_refs()) 668 { 669 return true; 670 } 671 else if(has_any(SSCL)) 672 { 673 _c4dbgpf("there's a stored scalar: '{}'", m_state->scalar); 674 675 csubstr saved_scalar; 676 bool is_quoted = false; 677 if(_scan_scalar_unk(&saved_scalar, &is_quoted)) 678 { 679 rem = m_state->line_contents.rem; 680 _c4dbgpf("... and there's also a scalar next! 
'{}'", saved_scalar); 681 if(rem.begins_with_any(" \t")) 682 { 683 size_t n = rem.first_not_of(" \t"); 684 _c4dbgpf("skipping {} spaces/tabs", n); 685 rem = rem.sub(n); 686 _line_progressed(n); 687 } 688 } 689 690 _c4dbgpf("rem='{}'", rem); 691 692 if(rem.begins_with(", ")) 693 { 694 _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); 695 _start_seq(start_as_child); 696 add_flags(FLOW); 697 _append_val(_consume_scalar()); 698 _line_progressed(2); 699 } 700 else if(rem.begins_with(',')) 701 { 702 _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child); 703 _start_seq(start_as_child); 704 add_flags(FLOW); 705 _append_val(_consume_scalar()); 706 _line_progressed(1); 707 } 708 else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 709 { 710 _c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child); 711 _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair 712 _line_progressed(2); 713 } 714 else if(rem == ":" || rem.begins_with(":\"") || rem.begins_with(":'")) 715 { 716 if(rem == ":") { _c4dbgpf("got a ':' -- it's a map (as_child={})", start_as_child); } 717 else { _c4dbgpf("got a '{}' -- it's a map (as_child={})", rem.first(2), start_as_child); } 718 _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair 719 _line_progressed(1); // advance only 1 720 } 721 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 722 else if(rem.begins_with('}')) 723 { 724 if(!has_all(RMAP|FLOW)) 725 { 726 _c4err("invalid token: not reading a map"); 727 } 728 if(!has_all(SSCL)) 729 { 730 _c4err("no scalar stored"); 731 } 732 _append_key_val(saved_scalar, is_quoted); 733 _stop_map(); 734 _line_progressed(1); 735 saved_scalar.clear(); 736 is_quoted = false; 737 } 738 #endif 739 else if(rem.begins_with("...")) 740 { 741 _c4dbgp("got stream end '...'"); 742 _end_stream(); 743 _line_progressed(3); 744 } 745 else if(rem.begins_with('#')) 746 { 747 _c4dbgpf("it's a comment: '{}'", rem); 748 
_scan_comment(); 749 return true; 750 } 751 else if(_handle_key_anchors_and_refs()) 752 { 753 return true; 754 } 755 else if(rem.begins_with(" ") || rem.begins_with("\t")) 756 { 757 size_t n = rem.first_not_of(" \t"); 758 if(n == npos) 759 n = rem.len; 760 _c4dbgpf("has {} spaces/tabs, skip...", n); 761 _line_progressed(n); 762 return true; 763 } 764 else if(rem.empty()) 765 { 766 // nothing to do 767 } 768 else if(rem == "---" || rem.begins_with("--- ")) 769 { 770 _c4dbgp("caught ---: starting doc"); 771 _start_new_doc(rem); 772 return true; 773 } 774 else if(rem.begins_with('%')) 775 { 776 _c4dbgp("caught a directive: ignoring..."); 777 _line_progressed(rem.len); 778 return true; 779 } 780 else 781 { 782 _c4err("parse error"); 783 } 784 785 if(is_quoted || (! saved_scalar.empty())) 786 { 787 _store_scalar(saved_scalar, is_quoted); 788 } 789 790 return true; 791 } 792 else 793 { 794 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL)); 795 csubstr scalar; 796 size_t indentation = m_state->line_contents.indentation; // save 797 bool is_quoted; 798 if(_scan_scalar_unk(&scalar, &is_quoted)) 799 { 800 _c4dbgpf("got a {} scalar", is_quoted ? 
"quoted" : ""); 801 rem = m_state->line_contents.rem; 802 { 803 size_t first = rem.first_not_of(" \t"); 804 if(first && first != npos) 805 { 806 _c4dbgpf("skip {} whitespace characters", first); 807 _line_progressed(first); 808 rem = rem.sub(first); 809 } 810 } 811 _store_scalar(scalar, is_quoted); 812 if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 813 { 814 _c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child); 815 _push_level(); 816 _start_map(start_as_child); // wait for the val scalar to append the key-val pair 817 _set_indentation(indentation); 818 _line_progressed(2); // call this AFTER saving the indentation 819 } 820 else if(rem.begins_with(':')) 821 { 822 _c4dbgpf("got a ':' next -- it's a map (as_child={})", start_as_child); 823 _push_level(); 824 _start_map(start_as_child); // wait for the val scalar to append the key-val pair 825 _set_indentation(indentation); 826 _line_progressed(1); // call this AFTER saving the indentation 827 } 828 else 829 { 830 // we still don't know whether it's a seq or a map 831 // so just store the scalar 832 } 833 return true; 834 } 835 else if(rem.begins_with_any(" \t")) 836 { 837 csubstr ws = rem.left_of(rem.first_not_of(" \t")); 838 rem = rem.right_of(ws); 839 if(has_all(RTOP) && rem.begins_with("---")) 840 { 841 _c4dbgp("there's a doc starting, and it's indented"); 842 _set_indentation(ws.len); 843 } 844 _c4dbgpf("skipping {} spaces/tabs", ws.len); 845 _line_progressed(ws.len); 846 return true; 847 } 848 } 849 850 return false; 851 } 852 853 854 //----------------------------------------------------------------------------- 855 C4_ALWAYS_INLINE void Parser::_skipchars(char c) 856 { 857 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with(c)); 858 size_t pos = m_state->line_contents.rem.first_not_of(c); 859 if(pos == npos) 860 pos = m_state->line_contents.rem.len; // maybe the line is just whitespace 861 _c4dbgpf("skip {} '{}'", pos, c); 862 
_line_progressed(pos); 863 } 864 865 template<size_t N> 866 C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N]) 867 { 868 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars)); 869 size_t pos = m_state->line_contents.rem.first_not_of(chars); 870 if(pos == npos) 871 pos = m_state->line_contents.rem.len; // maybe the line is just whitespace 872 _c4dbgpf("skip {} characters", pos); 873 _line_progressed(pos); 874 } 875 876 877 //----------------------------------------------------------------------------- 878 bool Parser::_handle_seq_flow() 879 { 880 _c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level); 881 csubstr rem = m_state->line_contents.rem; 882 883 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 884 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); 885 886 if(rem.begins_with(' ')) 887 { 888 // with explicit flow, indentation does not matter 889 _c4dbgp("starts with spaces"); 890 _skipchars(' '); 891 return true; 892 } 893 _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) 894 { 895 _c4dbgp("starts with tabs"); 896 _skipchars('\t'); 897 return true; 898 }) 899 else if(rem.begins_with('#')) 900 { 901 _c4dbgp("it's a comment"); 902 rem = _scan_comment(); // also progresses the line 903 return true; 904 } 905 else if(rem.begins_with(']')) 906 { 907 _c4dbgp("end the sequence"); 908 _pop_level(); 909 _line_progressed(1); 910 if(has_all(RSEQIMAP)) 911 { 912 _stop_seqimap(); 913 _pop_level(); 914 } 915 return true; 916 } 917 918 if(has_any(RVAL)) 919 { 920 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); 921 bool is_quoted; 922 if(_scan_scalar_seq_flow(&rem, &is_quoted)) 923 { 924 _c4dbgp("it's a scalar"); 925 addrem_flags(RNXT, RVAL); 926 _append_val(rem, is_quoted); 927 return true; 928 } 929 else if(rem.begins_with('[')) 930 { 931 _c4dbgp("val is a child seq"); 932 addrem_flags(RNXT, RVAL); // before _push_level! 
933 _push_level(/*explicit flow*/true); 934 _start_seq(); 935 add_flags(FLOW); 936 _line_progressed(1); 937 return true; 938 } 939 else if(rem.begins_with('{')) 940 { 941 _c4dbgp("val is a child map"); 942 addrem_flags(RNXT, RVAL); // before _push_level! 943 _push_level(/*explicit flow*/true); 944 _start_map(); 945 addrem_flags(FLOW|RKEY, RVAL); 946 _line_progressed(1); 947 return true; 948 } 949 else if(rem == ':') 950 { 951 _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id); 952 _start_seqimap(); 953 _line_progressed(1); 954 return true; 955 } 956 else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 957 { 958 _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); 959 _start_seqimap(); 960 _line_progressed(2); 961 return true; 962 } 963 else if(rem.begins_with("? ")) 964 { 965 _c4dbgpf("found '? ' -- there's an implicit map in the seq node[{}]", m_state->node_id); 966 _start_seqimap(); 967 _line_progressed(2); 968 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(SSCL) && m_state->scalar == ""); 969 addrem_flags(QMRK|RKEY, RVAL|SSCL); 970 return true; 971 } 972 else if(_handle_types()) 973 { 974 return true; 975 } 976 else if(_handle_val_anchors_and_refs()) 977 { 978 return true; 979 } 980 else if(rem.begins_with(", ")) 981 { 982 _c4dbgp("found ',' -- the value was null"); 983 _append_val_null(rem.str - 1); 984 _line_progressed(2); 985 return true; 986 } 987 else if(rem.begins_with(',')) 988 { 989 _c4dbgp("found ',' -- the value was null"); 990 _append_val_null(rem.str - 1); 991 _line_progressed(1); 992 return true; 993 } 994 else if(rem.begins_with('\t')) 995 { 996 _skipchars('\t'); 997 return true; 998 } 999 else 1000 { 1001 _c4err("parse error"); 1002 } 1003 } 1004 else if(has_any(RNXT)) 1005 { 1006 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1007 if(rem.begins_with(", ")) 1008 { 1009 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); 1010 _c4dbgp("seq: 
expect next val"); 1011 addrem_flags(RVAL, RNXT); 1012 _line_progressed(2); 1013 return true; 1014 } 1015 else if(rem.begins_with(',')) 1016 { 1017 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); 1018 _c4dbgp("seq: expect next val"); 1019 addrem_flags(RVAL, RNXT); 1020 _line_progressed(1); 1021 return true; 1022 } 1023 else if(rem == ':') 1024 { 1025 _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id); 1026 _start_seqimap(); 1027 _line_progressed(1); 1028 return true; 1029 } 1030 else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 1031 { 1032 _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id); 1033 _start_seqimap(); 1034 _line_progressed(2); 1035 return true; 1036 } 1037 else 1038 { 1039 _c4err("was expecting a comma"); 1040 } 1041 } 1042 else 1043 { 1044 _c4err("internal error"); 1045 } 1046 1047 return true; 1048 } 1049 1050 //----------------------------------------------------------------------------- 1051 bool Parser::_handle_seq_blck() 1052 { 1053 _c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level); 1054 csubstr rem = m_state->line_contents.rem; 1055 1056 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ)); 1057 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 1058 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); 1059 1060 if(rem.begins_with('#')) 1061 { 1062 _c4dbgp("it's a comment"); 1063 rem = _scan_comment(); 1064 return true; 1065 } 1066 if(has_any(RNXT)) 1067 { 1068 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1069 1070 if(_handle_indentation()) 1071 return true; 1072 1073 if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) 1074 { 1075 _c4dbgp("expect another val"); 1076 addrem_flags(RVAL, RNXT); 1077 _line_progressed(2); 1078 return true; 1079 } 1080 else if(rem == '-') 1081 { 1082 _c4dbgp("expect another val"); 1083 addrem_flags(RVAL, RNXT); 1084 _line_progressed(1); 1085 
return true; 1086 } 1087 else if(rem.begins_with_any(" \t")) 1088 { 1089 _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); 1090 _skipchars(" \t"); 1091 return true; 1092 } 1093 else if(rem.begins_with("...")) 1094 { 1095 _c4dbgp("got stream end '...'"); 1096 _end_stream(); 1097 _line_progressed(3); 1098 return true; 1099 } 1100 else if(rem.begins_with("---")) 1101 { 1102 _c4dbgp("got document start '---'"); 1103 _start_new_doc(rem); 1104 return true; 1105 } 1106 else 1107 { 1108 _c4err("parse error"); 1109 } 1110 } 1111 else if(has_any(RVAL)) 1112 { 1113 // there can be empty values 1114 if(_handle_indentation()) 1115 return true; 1116 1117 csubstr s; 1118 bool is_quoted; 1119 if(_scan_scalar_seq_blck(&s, &is_quoted)) // this also progresses the line 1120 { 1121 _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); 1122 1123 rem = m_state->line_contents.rem; 1124 if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' '))) 1125 { 1126 _c4dbgp("skipping whitespace..."); 1127 size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 1128 if(skip == csubstr::npos) 1129 skip = rem.len; // maybe the line is just whitespace 1130 _line_progressed(skip); 1131 rem = rem.sub(skip); 1132 } 1133 1134 _c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem); 1135 if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) 1136 { 1137 _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope"); 1138 if(m_key_anchor.empty()) 1139 _move_val_anchor_to_key_anchor(); 1140 if(m_key_tag.empty()) 1141 _move_val_tag_to_key_tag(); 1142 addrem_flags(RNXT, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT 1143 _push_level(); 1144 _start_map(); 1145 _store_scalar(s, is_quoted); 1146 if( ! 
_maybe_set_indentation_from_anchor_or_tag()) 1147 { 1148 _c4dbgpf("set indentation from scalar: {}", m_state->scalar_col); 1149 _set_indentation(m_state->scalar_col); // this is the column where the scalar starts 1150 } 1151 _move_key_tag2_to_key_tag(); 1152 addrem_flags(RVAL, RKEY); 1153 _line_progressed(1); 1154 } 1155 else 1156 { 1157 _c4dbgp("appending val to current seq"); 1158 _append_val(s, is_quoted); 1159 addrem_flags(RNXT, RVAL); 1160 } 1161 return true; 1162 } 1163 else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) 1164 { 1165 if(_rval_dash_start_or_continue_seq()) 1166 _line_progressed(2); 1167 return true; 1168 } 1169 else if(rem == '-') 1170 { 1171 if(_rval_dash_start_or_continue_seq()) 1172 _line_progressed(1); 1173 return true; 1174 } 1175 else if(rem.begins_with('[')) 1176 { 1177 _c4dbgp("val is a child seq, flow"); 1178 addrem_flags(RNXT, RVAL); // before _push_level! 1179 _push_level(/*explicit flow*/true); 1180 _start_seq(); 1181 add_flags(FLOW); 1182 _line_progressed(1); 1183 return true; 1184 } 1185 else if(rem.begins_with('{')) 1186 { 1187 _c4dbgp("val is a child map, flow"); 1188 addrem_flags(RNXT, RVAL); // before _push_level! 1189 _push_level(/*explicit flow*/true); 1190 _start_map(); 1191 addrem_flags(FLOW|RKEY, RVAL); 1192 _line_progressed(1); 1193 return true; 1194 } 1195 else if(rem.begins_with("? ")) 1196 { 1197 _c4dbgp("val is a child map + this key is complex"); 1198 addrem_flags(RNXT, RVAL); // before _push_level! 
1199 _push_level(); 1200 _start_map(); 1201 addrem_flags(QMRK|RKEY, RVAL); 1202 _save_indentation(); 1203 _line_progressed(2); 1204 return true; 1205 } 1206 else if(rem.begins_with(' ')) 1207 { 1208 csubstr spc = rem.left_of(rem.first_not_of(' ')); 1209 if(_at_line_begin()) 1210 { 1211 _c4dbgpf("skipping value indentation: {} spaces", spc.len); 1212 _line_progressed(spc.len); 1213 return true; 1214 } 1215 else 1216 { 1217 _c4dbgpf("skipping {} spaces", spc.len); 1218 _line_progressed(spc.len); 1219 return true; 1220 } 1221 } 1222 else if(_handle_types()) 1223 { 1224 return true; 1225 } 1226 else if(_handle_val_anchors_and_refs()) 1227 { 1228 return true; 1229 } 1230 /* pathological case: 1231 * - &key : val 1232 * - &key : 1233 * - : val 1234 */ 1235 else if((!has_all(SSCL)) && 1236 (rem.begins_with(": ") || rem.left_of(rem.find("#")).trimr("\t") == ":")) 1237 { 1238 if(!m_val_anchor.empty() || !m_val_tag.empty()) 1239 { 1240 _c4dbgp("val is a child map + this key is empty, with anchors or tags"); 1241 addrem_flags(RNXT, RVAL); // before _push_level! 1242 _move_val_tag_to_key_tag(); 1243 _move_val_anchor_to_key_anchor(); 1244 _push_level(); 1245 _start_map(); 1246 _store_scalar_null(rem.str); 1247 addrem_flags(RVAL, RKEY); 1248 RYML_CHECK(_maybe_set_indentation_from_anchor_or_tag()); // one of them must exist 1249 _line_progressed(rem.begins_with(": ") ? 2u : 1u); 1250 return true; 1251 } 1252 else 1253 { 1254 _c4dbgp("val is a child map + this key is empty, no anchors or tags"); 1255 addrem_flags(RNXT, RVAL); // before _push_level! 1256 size_t ind = m_state->indref; 1257 _push_level(); 1258 _start_map(); 1259 _store_scalar_null(rem.str); 1260 addrem_flags(RVAL, RKEY); 1261 _c4dbgpf("set indentation from map anchor: {}", ind + 2); 1262 _set_indentation(ind + 2); // this is the column where the map starts 1263 _line_progressed(rem.begins_with(": ") ? 
2u : 1u); 1264 return true; 1265 } 1266 } 1267 else 1268 { 1269 _c4err("parse error"); 1270 } 1271 } 1272 1273 return false; 1274 } 1275 1276 //----------------------------------------------------------------------------- 1277 1278 bool Parser::_rval_dash_start_or_continue_seq() 1279 { 1280 size_t ind = m_state->line_contents.current_col(); 1281 _RYML_CB_ASSERT(m_stack.m_callbacks, ind >= m_state->indref); 1282 size_t delta_ind = ind - m_state->indref; 1283 if( ! delta_ind) 1284 { 1285 _c4dbgp("prev val was empty"); 1286 addrem_flags(RNXT, RVAL); 1287 _append_val_null(&m_state->line_contents.full[ind]); 1288 return false; 1289 } 1290 _c4dbgp("val is a nested seq, indented"); 1291 addrem_flags(RNXT, RVAL); // before _push_level! 1292 _push_level(); 1293 _start_seq(); 1294 _save_indentation(); 1295 return true; 1296 } 1297 1298 //----------------------------------------------------------------------------- 1299 bool Parser::_handle_map_flow() 1300 { 1301 // explicit flow, ie, inside {}, separated by commas 1302 _c4dbgpf("handle_map_flow: node_id={} level={}", m_state->node_id, m_state->level); 1303 csubstr rem = m_state->line_contents.rem; 1304 1305 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW)); 1306 1307 if(rem.begins_with(' ')) 1308 { 1309 // with explicit flow, indentation does not matter 1310 _c4dbgp("starts with spaces"); 1311 _skipchars(' '); 1312 return true; 1313 } 1314 _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t')) 1315 { 1316 // with explicit flow, indentation does not matter 1317 _c4dbgp("starts with tabs"); 1318 _skipchars('\t'); 1319 return true; 1320 }) 1321 else if(rem.begins_with('#')) 1322 { 1323 _c4dbgp("it's a comment"); 1324 rem = _scan_comment(); // also progresses the line 1325 return true; 1326 } 1327 else if(rem.begins_with('}')) 1328 { 1329 _c4dbgp("end the map"); 1330 if(has_all(SSCL)) 1331 { 1332 _c4dbgp("the last val was null"); 1333 _append_key_val_null(rem.str - 1); 1334 rem_flags(RVAL); 1335 } 1336 _pop_level(); 1337 
_line_progressed(1); 1338 if(has_all(RSEQIMAP)) 1339 { 1340 _c4dbgp("stopping implicitly nested 1x map"); 1341 _stop_seqimap(); 1342 _pop_level(); 1343 } 1344 return true; 1345 } 1346 1347 if(has_any(RNXT)) 1348 { 1349 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 1350 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1351 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RSEQIMAP)); 1352 1353 if(rem.begins_with(", ")) 1354 { 1355 _c4dbgp("seq: expect next keyval"); 1356 addrem_flags(RKEY, RNXT); 1357 _line_progressed(2); 1358 return true; 1359 } 1360 else if(rem.begins_with(',')) 1361 { 1362 _c4dbgp("seq: expect next keyval"); 1363 addrem_flags(RKEY, RNXT); 1364 _line_progressed(1); 1365 return true; 1366 } 1367 else 1368 { 1369 _c4err("parse error"); 1370 } 1371 } 1372 else if(has_any(RKEY)) 1373 { 1374 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); 1375 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1376 1377 bool is_quoted; 1378 if(has_none(SSCL) && _scan_scalar_map_flow(&rem, &is_quoted)) 1379 { 1380 _c4dbgp("it's a scalar"); 1381 _store_scalar(rem, is_quoted); 1382 rem = m_state->line_contents.rem; 1383 csubstr trimmed = rem.triml(" \t"); 1384 if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))) 1385 { 1386 _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str); 1387 size_t num = static_cast<size_t>(trimmed.str - rem.str); 1388 _c4dbgpf("trimming {} whitespace after the scalar: '{}' --> '{}'", num, rem, rem.sub(num)); 1389 rem = rem.sub(num); 1390 _line_progressed(num); 1391 } 1392 } 1393 1394 if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 1395 { 1396 _c4dbgp("wait for val"); 1397 addrem_flags(RVAL, RKEY|QMRK); 1398 _line_progressed(2); 1399 if(!has_all(SSCL)) 1400 { 1401 _c4dbgp("no key was found, defaulting to empty key ''"); 1402 _store_scalar_null(rem.str); 1403 } 1404 return true; 1405 } 1406 else if(rem == ':') 1407 { 
1408 _c4dbgp("wait for val"); 1409 addrem_flags(RVAL, RKEY|QMRK); 1410 _line_progressed(1); 1411 if(!has_all(SSCL)) 1412 { 1413 _c4dbgp("no key was found, defaulting to empty key ''"); 1414 _store_scalar_null(rem.str); 1415 } 1416 return true; 1417 } 1418 else if(rem.begins_with('?')) 1419 { 1420 _c4dbgp("complex key"); 1421 add_flags(QMRK); 1422 _line_progressed(1); 1423 return true; 1424 } 1425 else if(rem.begins_with(',')) 1426 { 1427 _c4dbgp("prev scalar was a key with null value"); 1428 _append_key_val_null(rem.str - 1); 1429 _line_progressed(1); 1430 return true; 1431 } 1432 else if(rem.begins_with('}')) 1433 { 1434 _c4dbgp("map terminates after a key..."); 1435 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); 1436 _c4dbgp("the last val was null"); 1437 _append_key_val_null(rem.str - 1); 1438 rem_flags(RVAL); 1439 if(has_all(RSEQIMAP)) 1440 { 1441 _c4dbgp("stopping implicitly nested 1x map"); 1442 _stop_seqimap(); 1443 _pop_level(); 1444 } 1445 _pop_level(); 1446 _line_progressed(1); 1447 return true; 1448 } 1449 else if(_handle_types()) 1450 { 1451 return true; 1452 } 1453 else if(_handle_key_anchors_and_refs()) 1454 { 1455 return true; 1456 } 1457 else if(rem == "") 1458 { 1459 return true; 1460 } 1461 else 1462 { 1463 size_t pos = rem.first_not_of(" \t"); 1464 if(pos == csubstr::npos) 1465 pos = 0; 1466 rem = rem.sub(pos); 1467 if(rem.begins_with(':')) 1468 { 1469 _c4dbgp("wait for val"); 1470 addrem_flags(RVAL, RKEY|QMRK); 1471 _line_progressed(pos + 1); 1472 if(!has_all(SSCL)) 1473 { 1474 _c4dbgp("no key was found, defaulting to empty key ''"); 1475 _store_scalar_null(rem.str); 1476 } 1477 return true; 1478 } 1479 else if(rem.begins_with('#')) 1480 { 1481 _c4dbgp("it's a comment"); 1482 _line_progressed(pos); 1483 rem = _scan_comment(); // also progresses the line 1484 return true; 1485 } 1486 else 1487 { 1488 _c4err("parse error"); 1489 } 1490 } 1491 } 1492 else if(has_any(RVAL)) 1493 { 1494 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); 1495 
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 1496 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); 1497 bool is_quoted; 1498 if(_scan_scalar_map_flow(&rem, &is_quoted)) 1499 { 1500 _c4dbgp("it's a scalar"); 1501 addrem_flags(RNXT, RVAL|RKEY); 1502 _append_key_val(rem, is_quoted); 1503 if(has_all(RSEQIMAP)) 1504 { 1505 _c4dbgp("stopping implicitly nested 1x map"); 1506 _stop_seqimap(); 1507 _pop_level(); 1508 } 1509 return true; 1510 } 1511 else if(rem.begins_with('[')) 1512 { 1513 _c4dbgp("val is a child seq"); 1514 addrem_flags(RNXT, RVAL|RKEY); // before _push_level! 1515 _push_level(/*explicit flow*/true); 1516 _move_scalar_from_top(); 1517 _start_seq(); 1518 add_flags(FLOW); 1519 _line_progressed(1); 1520 return true; 1521 } 1522 else if(rem.begins_with('{')) 1523 { 1524 _c4dbgp("val is a child map"); 1525 addrem_flags(RNXT, RVAL|RKEY); // before _push_level! 1526 _push_level(/*explicit flow*/true); 1527 _move_scalar_from_top(); 1528 _start_map(); 1529 addrem_flags(FLOW|RKEY, RNXT|RVAL); 1530 _line_progressed(1); 1531 return true; 1532 } 1533 else if(_handle_types()) 1534 { 1535 return true; 1536 } 1537 else if(_handle_val_anchors_and_refs()) 1538 { 1539 return true; 1540 } 1541 else if(rem.begins_with(',')) 1542 { 1543 _c4dbgp("appending empty val"); 1544 _append_key_val_null(rem.str - 1); 1545 addrem_flags(RKEY, RVAL); 1546 _line_progressed(1); 1547 if(has_any(RSEQIMAP)) 1548 { 1549 _c4dbgp("stopping implicitly nested 1x map"); 1550 _stop_seqimap(); 1551 _pop_level(); 1552 } 1553 return true; 1554 } 1555 else if(has_any(RSEQIMAP) && rem.begins_with(']')) 1556 { 1557 _c4dbgp("stopping implicitly nested 1x map"); 1558 if(has_any(SSCL)) 1559 { 1560 _append_key_val_null(rem.str - 1); 1561 } 1562 _stop_seqimap(); 1563 _pop_level(); 1564 return true; 1565 } 1566 else 1567 { 1568 _c4err("parse error"); 1569 } 1570 } 1571 else 1572 { 1573 _c4err("internal error"); 1574 } 1575 1576 return false; 1577 } 1578 1579 
//----------------------------------------------------------------------------- 1580 bool Parser::_handle_map_blck() 1581 { 1582 _c4dbgpf("handle_map_blck: node_id={} level={}", m_state->node_id, m_state->level); 1583 csubstr rem = m_state->line_contents.rem; 1584 1585 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP)); 1586 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); 1587 1588 if(rem.begins_with('#')) 1589 { 1590 _c4dbgp("it's a comment"); 1591 rem = _scan_comment(); 1592 return true; 1593 } 1594 1595 if(has_any(RNXT)) 1596 { 1597 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 1598 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1599 // actually, we don't need RNXT in indent-based maps. 1600 addrem_flags(RKEY, RNXT); 1601 } 1602 1603 if(_handle_indentation()) 1604 { 1605 _c4dbgp("indentation token"); 1606 return true; 1607 } 1608 1609 if(has_any(RKEY)) 1610 { 1611 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); 1612 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL)); 1613 1614 _c4dbgp("RMAP|RKEY read scalar?"); 1615 bool is_quoted; 1616 if(_scan_scalar_map_blck(&rem, &is_quoted)) // this also progresses the line 1617 { 1618 _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); 1619 if(has_all(QMRK|SSCL)) 1620 { 1621 _c4dbgpf("current key is QMRK; SSCL is set. so take store scalar='{}' as key and add an empty val", m_state->scalar); 1622 _append_key_val_null(rem.str - 1); 1623 } 1624 _store_scalar(rem, is_quoted); 1625 if(has_all(QMRK|RSET)) 1626 { 1627 _c4dbgp("it's a complex key, so use null value '~'"); 1628 _append_key_val_null(rem.str); 1629 } 1630 rem = m_state->line_contents.rem; 1631 1632 if(rem.begins_with(':')) 1633 { 1634 _c4dbgp("wait for val"); 1635 addrem_flags(RVAL, RKEY|QMRK); 1636 _line_progressed(1); 1637 rem = m_state->line_contents.rem; 1638 if(rem.begins_with_any(" \t")) 1639 { 1640 _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
_at_line_begin()); 1641 rem = rem.left_of(rem.first_not_of(" \t")); 1642 _c4dbgpf("skip {} spaces/tabs", rem.len); 1643 _line_progressed(rem.len); 1644 } 1645 } 1646 return true; 1647 } 1648 else if(rem.begins_with_any(" \t")) 1649 { 1650 size_t pos = rem.first_not_of(" \t"); 1651 if(pos == npos) 1652 pos = rem.len; 1653 _c4dbgpf("skip {} spaces/tabs", pos); 1654 _line_progressed(pos); 1655 return true; 1656 } 1657 else if(rem == '?' || rem.begins_with("? ")) 1658 { 1659 _c4dbgp("it's a complex key"); 1660 _line_progressed(rem.begins_with("? ") ? 2u : 1u); 1661 if(has_any(SSCL)) 1662 _append_key_val_null(rem.str - 1); 1663 add_flags(QMRK); 1664 return true; 1665 } 1666 else if(has_all(QMRK) && rem.begins_with(':')) 1667 { 1668 _c4dbgp("complex key finished"); 1669 if(!has_any(SSCL)) 1670 _store_scalar_null(rem.str); 1671 addrem_flags(RVAL, RKEY|QMRK); 1672 _line_progressed(1); 1673 rem = m_state->line_contents.rem; 1674 if(rem.begins_with(' ')) 1675 { 1676 _RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin()); 1677 _skipchars(' '); 1678 } 1679 return true; 1680 } 1681 else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))) 1682 { 1683 _c4dbgp("key finished"); 1684 if(!has_all(SSCL)) 1685 { 1686 _c4dbgp("key was empty..."); 1687 _store_scalar_null(rem.str); 1688 rem_flags(QMRK); 1689 } 1690 addrem_flags(RVAL, RKEY); 1691 _line_progressed(rem == ':' ? 
1 : 2); 1692 return true; 1693 } 1694 else if(rem.begins_with("...")) 1695 { 1696 _c4dbgp("end current document"); 1697 _end_stream(); 1698 _line_progressed(3); 1699 return true; 1700 } 1701 else if(rem.begins_with("---")) 1702 { 1703 _c4dbgp("start new document '---'"); 1704 _start_new_doc(rem); 1705 return true; 1706 } 1707 else if(_handle_types()) 1708 { 1709 return true; 1710 } 1711 else if(_handle_key_anchors_and_refs()) 1712 { 1713 return true; 1714 } 1715 else 1716 { 1717 _c4err("parse error"); 1718 } 1719 } 1720 else if(has_any(RVAL)) 1721 { 1722 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT)); 1723 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY)); 1724 1725 _c4dbgp("RMAP|RVAL read scalar?"); 1726 csubstr s; 1727 bool is_quoted; 1728 if(_scan_scalar_map_blck(&s, &is_quoted)) // this also progresses the line 1729 { 1730 _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : ""); 1731 1732 rem = m_state->line_contents.rem; 1733 1734 if(rem.begins_with(": ")) 1735 { 1736 _c4dbgp("actually, the scalar is the first key of a map"); 1737 addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT 1738 _push_level(); 1739 _move_scalar_from_top(); 1740 _move_val_anchor_to_key_anchor(); 1741 _start_map(); 1742 _save_indentation(m_state->scalar_col); 1743 addrem_flags(RVAL, RKEY); 1744 _line_progressed(2); 1745 } 1746 else if(rem.begins_with(':')) 1747 { 1748 _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope"); 1749 addrem_flags(RKEY, RVAL); // before _push_level! 
This prepares the current level for popping by setting it to RNXT 1750 _push_level(); 1751 _move_scalar_from_top(); 1752 _move_val_anchor_to_key_anchor(); 1753 _start_map(); 1754 _save_indentation(/*behind*/s.len); 1755 addrem_flags(RVAL, RKEY); 1756 _line_progressed(1); 1757 } 1758 else 1759 { 1760 _c4dbgp("appending keyval to current map"); 1761 _append_key_val(s, is_quoted); 1762 addrem_flags(RKEY, RVAL); 1763 } 1764 return true; 1765 } 1766 else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t"))) 1767 { 1768 _c4dbgp("val is a nested seq, indented"); 1769 addrem_flags(RKEY, RVAL); // before _push_level! 1770 _push_level(); 1771 _move_scalar_from_top(); 1772 _start_seq(); 1773 _save_indentation(); 1774 _line_progressed(2); 1775 return true; 1776 } 1777 else if(rem == '-') 1778 { 1779 _c4dbgp("maybe a seq. start unknown, indented"); 1780 _start_unk(); 1781 _save_indentation(); 1782 _line_progressed(1); 1783 return true; 1784 } 1785 else if(rem.begins_with('[')) 1786 { 1787 _c4dbgp("val is a child seq, flow"); 1788 addrem_flags(RKEY, RVAL); // before _push_level! 1789 _push_level(/*explicit flow*/true); 1790 _move_scalar_from_top(); 1791 _start_seq(); 1792 add_flags(FLOW); 1793 _line_progressed(1); 1794 return true; 1795 } 1796 else if(rem.begins_with('{')) 1797 { 1798 _c4dbgp("val is a child map, flow"); 1799 addrem_flags(RKEY, RVAL); // before _push_level! 
1800 _push_level(/*explicit flow*/true); 1801 _move_scalar_from_top(); 1802 _start_map(); 1803 addrem_flags(FLOW|RKEY, RVAL); 1804 _line_progressed(1); 1805 return true; 1806 } 1807 else if(rem.begins_with(' ')) 1808 { 1809 csubstr spc = rem.left_of(rem.first_not_of(' ')); 1810 if(_at_line_begin()) 1811 { 1812 _c4dbgpf("skipping value indentation: {} spaces", spc.len); 1813 _line_progressed(spc.len); 1814 return true; 1815 } 1816 else 1817 { 1818 _c4dbgpf("skipping {} spaces", spc.len); 1819 _line_progressed(spc.len); 1820 return true; 1821 } 1822 } 1823 else if(_handle_types()) 1824 { 1825 return true; 1826 } 1827 else if(_handle_val_anchors_and_refs()) 1828 { 1829 return true; 1830 } 1831 else if(rem.begins_with("--- ") || rem == "---" || rem.begins_with("---\t")) 1832 { 1833 _start_new_doc(rem); 1834 return true; 1835 } 1836 else if(rem.begins_with("...")) 1837 { 1838 _c4dbgp("end current document"); 1839 _end_stream(); 1840 _line_progressed(3); 1841 return true; 1842 } 1843 else 1844 { 1845 _c4err("parse error"); 1846 } 1847 } 1848 else 1849 { 1850 _c4err("internal error"); 1851 } 1852 1853 return false; 1854 } 1855 1856 1857 //----------------------------------------------------------------------------- 1858 bool Parser::_handle_top() 1859 { 1860 _c4dbgp("handle_top"); 1861 csubstr rem = m_state->line_contents.rem; 1862 1863 if(rem.begins_with('#')) 1864 { 1865 _c4dbgp("a comment line"); 1866 _scan_comment(); 1867 return true; 1868 } 1869 1870 csubstr trimmed = rem.triml(' '); 1871 1872 if(trimmed.begins_with('%')) 1873 { 1874 _handle_directive(trimmed); 1875 _line_progressed(rem.len); 1876 return true; 1877 } 1878 else if(trimmed.begins_with("--- ") || trimmed == "---" || trimmed.begins_with("---\t")) 1879 { 1880 _start_new_doc(rem); 1881 if(trimmed.len < rem.len) 1882 { 1883 _line_progressed(rem.len - trimmed.len); 1884 _save_indentation(); 1885 } 1886 return true; 1887 } 1888 else if(trimmed.begins_with("...")) 1889 { 1890 _c4dbgp("end current document"); 
1891 _end_stream(); 1892 if(trimmed.len < rem.len) 1893 { 1894 _line_progressed(rem.len - trimmed.len); 1895 } 1896 _line_progressed(3); 1897 return true; 1898 } 1899 else 1900 { 1901 _c4err("parse error"); 1902 } 1903 1904 return false; 1905 } 1906 1907 1908 //----------------------------------------------------------------------------- 1909 1910 bool Parser::_handle_key_anchors_and_refs() 1911 { 1912 _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RVAL)); 1913 const csubstr rem = m_state->line_contents.rem; 1914 if(rem.begins_with('&')) 1915 { 1916 _c4dbgp("found a key anchor!!!"); 1917 if(has_all(QMRK|SSCL)) 1918 { 1919 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY)); 1920 _c4dbgp("there is a stored key, so this anchor is for the next element"); 1921 _append_key_val_null(rem.str - 1); 1922 rem_flags(QMRK); 1923 return true; 1924 } 1925 csubstr anchor = rem.left_of(rem.first_of(' ')); 1926 _line_progressed(anchor.len); 1927 anchor = anchor.sub(1); // skip the first character 1928 _move_key_anchor_to_val_anchor(); 1929 _c4dbgpf("key anchor value: '{}'", anchor); 1930 m_key_anchor = anchor; 1931 m_key_anchor_indentation = m_state->line_contents.current_col(rem); 1932 return true; 1933 } 1934 else if(C4_UNLIKELY(rem.begins_with('*'))) 1935 { 1936 _c4err("not implemented - this should have been catched elsewhere"); 1937 C4_NEVER_REACH(); 1938 return false; 1939 } 1940 return false; 1941 } 1942 1943 bool Parser::_handle_val_anchors_and_refs() 1944 { 1945 _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RKEY)); 1946 const csubstr rem = m_state->line_contents.rem; 1947 if(rem.begins_with('&')) 1948 { 1949 csubstr anchor = rem.left_of(rem.first_of(' ')); 1950 _line_progressed(anchor.len); 1951 anchor = anchor.sub(1); // skip the first character 1952 _c4dbgpf("val: found an anchor: '{}', indentation={}!!!", anchor, m_state->line_contents.current_col(rem)); 1953 if(m_val_anchor.empty()) 1954 { 1955 _c4dbgpf("save val anchor: '{}'", anchor); 1956 m_val_anchor = anchor; 1957 
m_val_anchor_indentation = m_state->line_contents.current_col(rem); 1958 } 1959 else 1960 { 1961 _c4dbgpf("there is a pending val anchor '{}'", m_val_anchor); 1962 if(m_tree->is_seq(m_state->node_id)) 1963 { 1964 if(m_tree->has_children(m_state->node_id)) 1965 { 1966 _c4dbgpf("current node={} is a seq, has {} children", m_state->node_id, m_tree->num_children(m_state->node_id)); 1967 _c4dbgpf("... so take the new one as a key anchor '{}'", anchor); 1968 m_key_anchor = anchor; 1969 m_key_anchor_indentation = m_state->line_contents.current_col(rem); 1970 } 1971 else 1972 { 1973 _c4dbgpf("current node={} is a seq, has no children", m_state->node_id); 1974 if(m_tree->has_val_anchor(m_state->node_id)) 1975 { 1976 _c4dbgpf("... node={} already has val anchor: '{}'", m_state->node_id, m_tree->val_anchor(m_state->node_id)); 1977 _c4dbgpf("... so take the new one as a key anchor '{}'", anchor); 1978 m_key_anchor = anchor; 1979 m_key_anchor_indentation = m_state->line_contents.current_col(rem); 1980 } 1981 else 1982 { 1983 _c4dbgpf("... 
so set pending val anchor: '{}' on current node {}", m_val_anchor, m_state->node_id); 1984 m_tree->set_val_anchor(m_state->node_id, m_val_anchor); 1985 m_val_anchor = anchor; 1986 m_val_anchor_indentation = m_state->line_contents.current_col(rem); 1987 } 1988 } 1989 } 1990 } 1991 return true; 1992 } 1993 else if(C4_UNLIKELY(rem.begins_with('*'))) 1994 { 1995 _c4err("not implemented - this should have been catched elsewhere"); 1996 C4_NEVER_REACH(); 1997 return false; 1998 } 1999 return false; 2000 } 2001 2002 void Parser::_move_key_anchor_to_val_anchor() 2003 { 2004 if(m_key_anchor.empty()) 2005 return; 2006 _c4dbgpf("move current key anchor to val slot: key='{}' -> val='{}'", m_key_anchor, m_val_anchor); 2007 if(!m_val_anchor.empty()) 2008 _c4err("triple-pending anchor"); 2009 m_val_anchor = m_key_anchor; 2010 m_val_anchor_indentation = m_key_anchor_indentation; 2011 m_key_anchor = {}; 2012 m_key_anchor_indentation = {}; 2013 } 2014 2015 void Parser::_move_val_anchor_to_key_anchor() 2016 { 2017 if(m_val_anchor.empty()) 2018 return; 2019 if(!_token_is_from_this_line(m_val_anchor)) 2020 return; 2021 _c4dbgpf("move current val anchor to key slot: key='{}' <- val='{}'", m_key_anchor, m_val_anchor); 2022 if(!m_key_anchor.empty()) 2023 _c4err("triple-pending anchor"); 2024 m_key_anchor = m_val_anchor; 2025 m_key_anchor_indentation = m_val_anchor_indentation; 2026 m_val_anchor = {}; 2027 m_val_anchor_indentation = {}; 2028 } 2029 2030 void Parser::_move_key_tag_to_val_tag() 2031 { 2032 if(m_key_tag.empty()) 2033 return; 2034 _c4dbgpf("move key tag to val tag: key='{}' -> val='{}'", m_key_tag, m_val_tag); 2035 m_val_tag = m_key_tag; 2036 m_val_tag_indentation = m_key_tag_indentation; 2037 m_key_tag.clear(); 2038 m_key_tag_indentation = 0; 2039 } 2040 2041 void Parser::_move_val_tag_to_key_tag() 2042 { 2043 if(m_val_tag.empty()) 2044 return; 2045 if(!_token_is_from_this_line(m_val_tag)) 2046 return; 2047 _c4dbgpf("move val tag to key tag: key='{}' <- val='{}'", m_key_tag, 
m_val_tag); 2048 m_key_tag = m_val_tag; 2049 m_key_tag_indentation = m_val_tag_indentation; 2050 m_val_tag.clear(); 2051 m_val_tag_indentation = 0; 2052 } 2053 2054 void Parser::_move_key_tag2_to_key_tag() 2055 { 2056 if(m_key_tag2.empty()) 2057 return; 2058 _c4dbgpf("move key tag2 to key tag: key='{}' <- key2='{}'", m_key_tag, m_key_tag2); 2059 m_key_tag = m_key_tag2; 2060 m_key_tag_indentation = m_key_tag2_indentation; 2061 m_key_tag2.clear(); 2062 m_key_tag2_indentation = 0; 2063 } 2064 2065 2066 //----------------------------------------------------------------------------- 2067 2068 bool Parser::_handle_types() 2069 { 2070 csubstr rem = m_state->line_contents.rem.triml(' '); 2071 csubstr t; 2072 2073 if(rem.begins_with("!!")) 2074 { 2075 _c4dbgp("begins with '!!'"); 2076 t = rem.left_of(rem.first_of(" ,")); 2077 _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2); 2078 //t = t.sub(2); 2079 if(t == "!!set") 2080 add_flags(RSET); 2081 } 2082 else if(rem.begins_with("!<")) 2083 { 2084 _c4dbgp("begins with '!<'"); 2085 t = rem.left_of(rem.first_of('>'), true); 2086 _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2); 2087 //t = t.sub(2, t.len-1); 2088 } 2089 else if(rem.begins_with("!h!")) 2090 { 2091 _c4dbgp("begins with '!h!'"); 2092 t = rem.left_of(rem.first_of(' ')); 2093 _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 3); 2094 //t = t.sub(3); 2095 } 2096 else if(rem.begins_with('!')) 2097 { 2098 _c4dbgp("begins with '!'"); 2099 t = rem.left_of(rem.first_of(' ')); 2100 _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1); 2101 //t = t.sub(1); 2102 } 2103 2104 if(t.empty()) 2105 return false; 2106 2107 if(has_all(QMRK|SSCL)) 2108 { 2109 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY)); 2110 _c4dbgp("there is a stored key, so this tag is for the next element"); 2111 _append_key_val_null(rem.str - 1); 2112 rem_flags(QMRK); 2113 } 2114 2115 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 2116 const char *tag_beginning = rem.str; 2117 #endif 2118 size_t tag_indentation = 
m_state->line_contents.current_col(t); 2119 _c4dbgpf("there was a tag: '{}', indentation={}", t, tag_indentation); 2120 _RYML_CB_ASSERT(m_stack.m_callbacks, t.end() > m_state->line_contents.rem.begin()); 2121 _line_progressed(static_cast<size_t>(t.end() - m_state->line_contents.rem.begin())); 2122 { 2123 size_t pos = m_state->line_contents.rem.first_not_of(" \t"); 2124 if(pos != csubstr::npos) 2125 _line_progressed(pos); 2126 } 2127 2128 if(has_all(RMAP|RKEY)) 2129 { 2130 _c4dbgpf("saving map key tag '{}'", t); 2131 _RYML_CB_ASSERT(m_stack.m_callbacks, m_key_tag.empty()); 2132 m_key_tag = t; 2133 m_key_tag_indentation = tag_indentation; 2134 } 2135 else if(has_all(RMAP|RVAL)) 2136 { 2137 /* foo: !!str 2138 * !!str : bar */ 2139 rem = m_state->line_contents.rem; 2140 rem = rem.left_of(rem.find("#")); 2141 rem = rem.trimr(" \t"); 2142 _c4dbgpf("rem='{}'", rem); 2143 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 2144 if(rem == ':' || rem.begins_with(": ")) 2145 { 2146 _c4dbgp("the last val was null, and this is a tag from a null key"); 2147 _append_key_val_null(tag_beginning - 1); 2148 _store_scalar_null(rem.str - 1); 2149 // do not change the flag to key, it is ~ 2150 _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begin() > m_state->line_contents.rem.begin()); 2151 size_t token_len = rem == ':' ? 
1 : 2; 2152 _line_progressed(static_cast<size_t>(token_len + rem.begin() - m_state->line_contents.rem.begin())); 2153 } 2154 #endif 2155 _c4dbgpf("saving map val tag '{}'", t); 2156 _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty()); 2157 m_val_tag = t; 2158 m_val_tag_indentation = tag_indentation; 2159 } 2160 else if(has_all(RSEQ|RVAL) || has_all(RTOP|RUNK|NDOC)) 2161 { 2162 if(m_val_tag.empty()) 2163 { 2164 _c4dbgpf("saving seq/doc val tag '{}'", t); 2165 m_val_tag = t; 2166 m_val_tag_indentation = tag_indentation; 2167 } 2168 else 2169 { 2170 _c4dbgpf("saving seq/doc key tag '{}'", t); 2171 m_key_tag = t; 2172 m_key_tag_indentation = tag_indentation; 2173 } 2174 } 2175 else if(has_all(RTOP|RUNK) || has_any(RUNK)) 2176 { 2177 rem = m_state->line_contents.rem; 2178 rem = rem.left_of(rem.find("#")); 2179 rem = rem.trimr(" \t"); 2180 if(rem.empty()) 2181 { 2182 _c4dbgpf("saving val tag '{}'", t); 2183 _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty()); 2184 m_val_tag = t; 2185 m_val_tag_indentation = tag_indentation; 2186 } 2187 else 2188 { 2189 _c4dbgpf("saving key tag '{}'", t); 2190 if(m_key_tag.empty()) 2191 { 2192 m_key_tag = t; 2193 m_key_tag_indentation = tag_indentation; 2194 } 2195 else 2196 { 2197 /* handle this case: 2198 * !!str foo: !!map 2199 * !!int 1: !!float 20.0 2200 * !!int 3: !!float 40.0 2201 * 2202 * (m_key_tag would be !!str and m_key_tag2 would be !!int) 2203 */ 2204 m_key_tag2 = t; 2205 m_key_tag2_indentation = tag_indentation; 2206 } 2207 } 2208 } 2209 else 2210 { 2211 _c4err("internal error"); 2212 } 2213 2214 if(m_val_tag.not_empty()) 2215 { 2216 YamlTag_e tag = to_tag(t); 2217 if(tag == TAG_STR) 2218 { 2219 _c4dbgpf("tag '{}' is a str-type tag", t); 2220 if(has_all(RTOP|RUNK|NDOC)) 2221 { 2222 _c4dbgpf("docval. slurping the string. pos={}", m_state->pos.offset); 2223 csubstr scalar = _slurp_doc_scalar(); 2224 _c4dbgpf("docval. 
after slurp: {}, at node {}: '{}'", m_state->pos.offset, m_state->node_id, scalar); 2225 m_tree->to_val(m_state->node_id, scalar, DOC); 2226 _c4dbgpf("docval. val tag {} -> {}", m_val_tag, normalize_tag(m_val_tag)); 2227 m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); 2228 m_val_tag.clear(); 2229 if(!m_val_anchor.empty()) 2230 { 2231 _c4dbgpf("setting val anchor[{}]='{}'", m_state->node_id, m_val_anchor); 2232 m_tree->set_val_anchor(m_state->node_id, m_val_anchor); 2233 m_val_anchor.clear(); 2234 } 2235 _end_stream(); 2236 } 2237 } 2238 } 2239 return true; 2240 } 2241 2242 //----------------------------------------------------------------------------- 2243 csubstr Parser::_slurp_doc_scalar() 2244 { 2245 csubstr s = m_state->line_contents.rem; 2246 size_t pos = m_state->pos.offset; 2247 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.find("---") != csubstr::npos); 2248 _c4dbgpf("slurp 0 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2249 if(s.len == 0) 2250 { 2251 _line_ended(); 2252 _scan_line(); 2253 s = m_state->line_contents.rem; 2254 pos = m_state->pos.offset; 2255 } 2256 2257 size_t skipws = s.first_not_of(" \t"); 2258 _c4dbgpf("slurp 1 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2259 if(skipws != npos) 2260 { 2261 _line_progressed(skipws); 2262 s = m_state->line_contents.rem; 2263 pos = m_state->pos.offset; 2264 _c4dbgpf("slurp 2 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2265 } 2266 2267 _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_anchor.empty()); 2268 _handle_val_anchors_and_refs(); 2269 if(!m_val_anchor.empty()) 2270 { 2271 s = m_state->line_contents.rem; 2272 skipws = s.first_not_of(" \t"); 2273 if(skipws != npos) 2274 { 2275 _line_progressed(skipws); 2276 } 2277 s = m_state->line_contents.rem; 2278 pos = m_state->pos.offset; 2279 _c4dbgpf("slurp 3 '{}'. 
REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2280 } 2281 2282 if(s.begins_with('\'')) 2283 { 2284 m_state->scalar_col = m_state->line_contents.current_col(s); 2285 return _scan_squot_scalar(); 2286 } 2287 else if(s.begins_with('"')) 2288 { 2289 m_state->scalar_col = m_state->line_contents.current_col(s); 2290 return _scan_dquot_scalar(); 2291 } 2292 else if(s.begins_with('|') || s.begins_with('>')) 2293 { 2294 return _scan_block(); 2295 } 2296 2297 _c4dbgpf("slurp 4 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2298 2299 m_state->scalar_col = m_state->line_contents.current_col(s); 2300 _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() + pos); 2301 _line_progressed(static_cast<size_t>(s.end() - (m_buf.begin() + pos))); 2302 2303 _c4dbgpf("slurp 5 '{}'. REM='{}'", s, m_buf.sub(m_state->pos.offset)); 2304 2305 if(_at_line_end()) 2306 { 2307 _c4dbgpf("at line end. curr='{}'", s); 2308 s = _extend_scanned_scalar(s); 2309 } 2310 2311 _c4dbgpf("scalar was '{}'", s); 2312 2313 return s; 2314 } 2315 2316 2317 //----------------------------------------------------------------------------- 2318 2319 bool Parser::_scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) 2320 { 2321 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ)); 2322 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL)); 2323 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY)); 2324 _RYML_CB_ASSERT(m_stack.m_callbacks, ! 
has_any(FLOW)); 2325 2326 csubstr s = m_state->line_contents.rem; 2327 if(s.len == 0) 2328 return false; 2329 s = s.trim(" \t"); 2330 if(s.len == 0) 2331 return false; 2332 2333 if(s.begins_with('\'')) 2334 { 2335 _c4dbgp("got a ': scanning single-quoted scalar"); 2336 m_state->scalar_col = m_state->line_contents.current_col(s); 2337 *scalar = _scan_squot_scalar(); 2338 *quoted = true; 2339 return true; 2340 } 2341 else if(s.begins_with('"')) 2342 { 2343 _c4dbgp("got a \": scanning double-quoted scalar"); 2344 m_state->scalar_col = m_state->line_contents.current_col(s); 2345 *scalar = _scan_dquot_scalar(); 2346 *quoted = true; 2347 return true; 2348 } 2349 else if(s.begins_with('|') || s.begins_with('>')) 2350 { 2351 *scalar = _scan_block(); 2352 *quoted = true; 2353 return true; 2354 } 2355 else if(has_any(RTOP) && _is_doc_sep(s)) 2356 { 2357 return false; 2358 } 2359 2360 _c4dbgp("RSEQ|RVAL"); 2361 if( ! _is_scalar_next__rseq_rval(s)) 2362 return false; 2363 _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) 2364 return false; 2365 ) 2366 2367 if(s.ends_with(':')) 2368 { 2369 --s.len; 2370 } 2371 else 2372 { 2373 auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #"); 2374 if(first) 2375 s.len = first.pos; 2376 } 2377 s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2378 2379 if(s.empty()) 2380 return false; 2381 2382 m_state->scalar_col = m_state->line_contents.current_col(s); 2383 _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); 2384 _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len); 2385 2386 if(_at_line_end() && s != '~') 2387 { 2388 _c4dbgpf("at line end. 
curr='{}'", s); 2389 s = _extend_scanned_scalar(s); 2390 } 2391 2392 _c4dbgpf("scalar was '{}'", s); 2393 2394 *scalar = s; 2395 *quoted = false; 2396 return true; 2397 } 2398 2399 bool Parser::_scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) 2400 { 2401 _c4dbgp("_scan_scalar_map_blck"); 2402 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); 2403 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(FLOW)); 2404 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL)); 2405 2406 csubstr s = m_state->line_contents.rem; 2407 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED 2408 if(s.len == 0) 2409 return false; 2410 #endif 2411 s = s.trim(" \t"); 2412 if(s.len == 0) 2413 return false; 2414 2415 if(s.begins_with('\'')) 2416 { 2417 _c4dbgp("got a ': scanning single-quoted scalar"); 2418 m_state->scalar_col = m_state->line_contents.current_col(s); 2419 *scalar = _scan_squot_scalar(); 2420 *quoted = true; 2421 return true; 2422 } 2423 else if(s.begins_with('"')) 2424 { 2425 _c4dbgp("got a \": scanning double-quoted scalar"); 2426 m_state->scalar_col = m_state->line_contents.current_col(s); 2427 *scalar = _scan_dquot_scalar(); 2428 *quoted = true; 2429 return true; 2430 } 2431 else if(s.begins_with('|') || s.begins_with('>')) 2432 { 2433 *scalar = _scan_block(); 2434 *quoted = true; 2435 return true; 2436 } 2437 else if(has_any(RTOP) && _is_doc_sep(s)) 2438 { 2439 return false; 2440 } 2441 2442 if( ! 
_is_scalar_next__rmap(s)) 2443 return false; 2444 2445 size_t colon_token = s.find(": "); 2446 if(colon_token == npos) 2447 { 2448 _RYML_WITH_OR_WITHOUT_TAB_TOKENS( 2449 // with tab tokens 2450 colon_token = s.find(":\t"); 2451 if(colon_token == npos) 2452 { 2453 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); 2454 colon_token = s.find(':'); 2455 if(colon_token != s.len-1) 2456 colon_token = npos; 2457 } 2458 , 2459 // without tab tokens 2460 colon_token = s.find(':'); 2461 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); 2462 if(colon_token != s.len-1) 2463 colon_token = npos; 2464 ) 2465 } 2466 2467 if(has_all(RKEY)) 2468 { 2469 _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' ')); 2470 if(has_any(QMRK)) 2471 { 2472 _c4dbgp("RMAP|RKEY|CPLX"); 2473 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); 2474 if(s.begins_with("? ") || s == '?') 2475 return false; 2476 s = s.left_of(colon_token); 2477 s = s.left_of(s.first_of("#")); 2478 s = s.trimr(" \t"); 2479 if(s.begins_with("---")) 2480 return false; 2481 else if(s.begins_with("...")) 2482 return false; 2483 } 2484 else 2485 { 2486 _c4dbgp("RMAP|RKEY"); 2487 _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{')); 2488 if(s.begins_with("? ") || s == '?') 2489 return false; 2490 s = s.left_of(colon_token); 2491 s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2492 if(s.begins_with("---")) 2493 { 2494 return false; 2495 } 2496 else if(s.begins_with("...")) 2497 { 2498 return false; 2499 } 2500 } 2501 } 2502 else if(has_all(RVAL)) 2503 { 2504 _c4dbgp("RMAP|RVAL"); 2505 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK)); 2506 if( ! _is_scalar_next__rmap_val(s)) 2507 return false; 2508 _RYML_WITH_TAB_TOKENS( 2509 else if(s.begins_with("-\t")) 2510 return false; 2511 ) 2512 _c4dbgp("RMAP|RVAL: scalar"); 2513 s = s.left_of(s.find(" #")); // is there a comment? 2514 s = s.left_of(s.find("\t#")); // is there a comment? 
2515 s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2516 if(s.begins_with("---")) 2517 return false; 2518 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED 2519 else if(s.begins_with("...")) 2520 return false; 2521 #endif 2522 } 2523 2524 if(s.empty()) 2525 return false; 2526 2527 m_state->scalar_col = m_state->line_contents.current_col(s); 2528 _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); 2529 _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len); 2530 2531 if(_at_line_end() && s != '~') 2532 { 2533 _c4dbgpf("at line end. curr='{}'", s); 2534 s = _extend_scanned_scalar(s); 2535 } 2536 2537 _c4dbgpf("scalar was '{}'", s); 2538 2539 *scalar = s; 2540 *quoted = false; 2541 return true; 2542 } 2543 2544 bool Parser::_scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) 2545 { 2546 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ)); 2547 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW)); 2548 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL)); 2549 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY)); 2550 2551 csubstr s = m_state->line_contents.rem; 2552 if(s.len == 0) 2553 return false; 2554 s = s.trim(" \t"); 2555 if(s.len == 0) 2556 return false; 2557 2558 if(s.begins_with('\'')) 2559 { 2560 _c4dbgp("got a ': scanning single-quoted scalar"); 2561 m_state->scalar_col = m_state->line_contents.current_col(s); 2562 *scalar = _scan_squot_scalar(); 2563 *quoted = true; 2564 return true; 2565 } 2566 else if(s.begins_with('"')) 2567 { 2568 _c4dbgp("got a \": scanning double-quoted scalar"); 2569 m_state->scalar_col = m_state->line_contents.current_col(s); 2570 *scalar = _scan_dquot_scalar(); 2571 *quoted = true; 2572 return true; 2573 } 2574 2575 if(has_all(RVAL)) 2576 { 2577 _c4dbgp("RSEQ|RVAL"); 2578 if( ! 
_is_scalar_next__rseq_rval(s)) 2579 return false; 2580 _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) 2581 return false; 2582 ) 2583 _c4dbgp("RSEQ|RVAL|FLOW"); 2584 s = s.left_of(s.first_of(",]")); 2585 if(s.ends_with(':')) 2586 { 2587 --s.len; 2588 } 2589 else 2590 { 2591 auto first = s.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #"); 2592 if(first) 2593 s.len = first.pos; 2594 } 2595 s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2596 } 2597 2598 if(s.empty()) 2599 return false; 2600 2601 m_state->scalar_col = m_state->line_contents.current_col(s); 2602 _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); 2603 _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len); 2604 2605 if(_at_line_end() && s != '~') 2606 { 2607 _c4dbgpf("at line end. curr='{}'", s); 2608 s = _extend_scanned_scalar(s); 2609 } 2610 2611 _c4dbgpf("scalar was '{}'", s); 2612 2613 *scalar = s; 2614 *quoted = false; 2615 return true; 2616 } 2617 2618 bool Parser::_scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) 2619 { 2620 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); 2621 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW)); 2622 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL)); 2623 2624 csubstr s = m_state->line_contents.rem; 2625 if(s.len == 0) 2626 return false; 2627 s = s.trim(" \t"); 2628 if(s.len == 0) 2629 return false; 2630 2631 if(s.begins_with('\'')) 2632 { 2633 _c4dbgp("got a ': scanning single-quoted scalar"); 2634 m_state->scalar_col = m_state->line_contents.current_col(s); 2635 *scalar = _scan_squot_scalar(); 2636 *quoted = true; 2637 return true; 2638 } 2639 else if(s.begins_with('"')) 2640 { 2641 _c4dbgp("got a \": scanning double-quoted scalar"); 2642 m_state->scalar_col = m_state->line_contents.current_col(s); 2643 *scalar = _scan_dquot_scalar(); 2644 *quoted = true; 2645 return true; 2646 } 2647 2648 if( ! 
_is_scalar_next__rmap(s)) 2649 return false; 2650 2651 if(has_all(RKEY)) 2652 { 2653 _RYML_CB_ASSERT(m_stack.m_callbacks, !s.begins_with(' ')); 2654 size_t colon_token = s.find(": "); 2655 if(colon_token == npos) 2656 { 2657 _RYML_WITH_OR_WITHOUT_TAB_TOKENS( 2658 // with tab tokens 2659 colon_token = s.find(":\t"); 2660 if(colon_token == npos) 2661 { 2662 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); 2663 colon_token = s.find(':'); 2664 if(colon_token != s.len-1) 2665 colon_token = npos; 2666 } 2667 , 2668 // without tab tokens 2669 colon_token = s.find(':'); 2670 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len > 0); 2671 if(colon_token != s.len-1) 2672 colon_token = npos; 2673 ) 2674 } 2675 if(s.begins_with("? ") || s == '?') 2676 return false; 2677 if(has_any(QMRK)) 2678 { 2679 _c4dbgp("RMAP|RKEY|CPLX"); 2680 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP)); 2681 s = s.left_of(colon_token); 2682 s = s.left_of(s.first_of("#")); 2683 s = s.left_of(s.first_of(':')); 2684 s = s.trimr(" \t"); 2685 if(s.begins_with("---")) 2686 return false; 2687 else if(s.begins_with("...")) 2688 return false; 2689 } 2690 else 2691 { 2692 _RYML_CB_CHECK(m_stack.m_callbacks, !s.begins_with('{')); 2693 _c4dbgp("RMAP|RKEY"); 2694 s = s.left_of(colon_token); 2695 s = s.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2696 _c4dbgpf("RMAP|RKEY|FLOW: '{}'", s); 2697 s = s.left_of(s.first_of(",}")); 2698 if(s.ends_with(':')) 2699 --s.len; 2700 } 2701 } 2702 else if(has_all(RVAL)) 2703 { 2704 _c4dbgp("RMAP|RVAL"); 2705 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK)); 2706 if( ! _is_scalar_next__rmap_val(s)) 2707 return false; 2708 _RYML_WITH_TAB_TOKENS(else if(s.begins_with("-\t")) 2709 return false; 2710 ) 2711 _c4dbgp("RMAP|RVAL|FLOW"); 2712 if(has_none(RSEQIMAP)) 2713 s = s.left_of(s.first_of(",}")); 2714 else 2715 s = s.left_of(s.first_of(",]")); 2716 s = s.left_of(s.find(" #")); // is there a comment? 2717 s = s.left_of(s.find("\t#")); // is there a comment? 
2718 s = s.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' ')); 2719 } 2720 2721 if(s.empty()) 2722 return false; 2723 2724 m_state->scalar_col = m_state->line_contents.current_col(s); 2725 _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); 2726 _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len); 2727 2728 if(_at_line_end() && s != '~') 2729 { 2730 _c4dbgpf("at line end. curr='{}'", s); 2731 s = _extend_scanned_scalar(s); 2732 } 2733 2734 _c4dbgpf("scalar was '{}'", s); 2735 2736 *scalar = s; 2737 *quoted = false; 2738 return true; 2739 } 2740 2741 bool Parser::_scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted) 2742 { 2743 _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RUNK)); 2744 2745 csubstr s = m_state->line_contents.rem; 2746 if(s.len == 0) 2747 return false; 2748 s = s.trim(" \t"); 2749 if(s.len == 0) 2750 return false; 2751 2752 if(s.begins_with('\'')) 2753 { 2754 _c4dbgp("got a ': scanning single-quoted scalar"); 2755 m_state->scalar_col = m_state->line_contents.current_col(s); 2756 *scalar = _scan_squot_scalar(); 2757 *quoted = true; 2758 return true; 2759 } 2760 else if(s.begins_with('"')) 2761 { 2762 _c4dbgp("got a \": scanning double-quoted scalar"); 2763 m_state->scalar_col = m_state->line_contents.current_col(s); 2764 *scalar = _scan_dquot_scalar(); 2765 *quoted = true; 2766 return true; 2767 } 2768 else if(s.begins_with('|') || s.begins_with('>')) 2769 { 2770 *scalar = _scan_block(); 2771 *quoted = true; 2772 return true; 2773 } 2774 else if(has_any(RTOP) && _is_doc_sep(s)) 2775 { 2776 return false; 2777 } 2778 2779 _c4dbgpf("RUNK '[{}]~~~{}~~~", s.len, s); 2780 if( ! 
_is_scalar_next__runk(s)) 2781 { 2782 _c4dbgp("RUNK: no scalar next"); 2783 return false; 2784 } 2785 size_t pos = s.find(" #"); 2786 if(pos != npos) 2787 { 2788 _c4dbgpf("RUNK: found ' #' at {}", pos); 2789 s = s.left_of(pos); 2790 } 2791 pos = s.find(": "); 2792 if(pos != npos) 2793 { 2794 _c4dbgpf("RUNK: found ': ' at {}", pos); 2795 s = s.left_of(pos); 2796 } 2797 else if(s.ends_with(':')) 2798 { 2799 _c4dbgp("RUNK: ends with ':'"); 2800 s = s.left_of(s.len-1); 2801 } 2802 _RYML_WITH_TAB_TOKENS( 2803 else if((pos = s.find(":\t")) != npos) // TABS 2804 { 2805 _c4dbgp("RUNK: ends with ':\\t'"); 2806 s = s.left_of(pos); 2807 }) 2808 else 2809 { 2810 _c4dbgp("RUNK: trimming left of ,"); 2811 s = s.left_of(s.first_of(',')); 2812 } 2813 s = s.trim(" \t"); 2814 _c4dbgpf("RUNK: scalar=[{}]~~~{}~~~", s.len, s); 2815 2816 if(s.empty()) 2817 return false; 2818 2819 m_state->scalar_col = m_state->line_contents.current_col(s); 2820 _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str); 2821 _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len); 2822 2823 if(_at_line_end() && s != '~') 2824 { 2825 _c4dbgpf("at line end. curr=[{}]~~~{}~~", s.len, s); 2826 s = _extend_scanned_scalar(s); 2827 } 2828 2829 _c4dbgpf("scalar was [{}]~~~{}~~~", s.len, s); 2830 2831 *scalar = s; 2832 *quoted = false; 2833 return true; 2834 } 2835 2836 2837 //----------------------------------------------------------------------------- 2838 2839 csubstr Parser::_extend_scanned_scalar(csubstr s) 2840 { 2841 if(has_all(RMAP|RKEY|QMRK)) 2842 { 2843 size_t scalar_indentation = has_any(FLOW) ? 0 : m_state->scalar_col; 2844 _c4dbgpf("extend_scalar: explicit key! 
indref={} scalar_indentation={} scalar_col={}", m_state->indref, scalar_indentation, m_state->scalar_col); 2845 csubstr n = _scan_to_next_nonempty_line(scalar_indentation); 2846 if(!n.empty()) 2847 { 2848 substr full = _scan_complex_key(s, n).trimr(" \t\r\n"); 2849 if(full != s) 2850 s = _filter_plain_scalar(full, scalar_indentation); 2851 } 2852 } 2853 // deal with plain (unquoted) scalars that continue to the next line 2854 else if(!s.begins_with_any("*")) // cannot be a plain scalar if it starts with * (that's an anchor reference) 2855 { 2856 _c4dbgpf("extend_scalar: line ended, scalar='{}'", s); 2857 if(has_none(FLOW)) 2858 { 2859 size_t scalar_indentation = m_state->indref + 1; 2860 if(has_all(RUNK) && scalar_indentation == 1) 2861 scalar_indentation = 0; 2862 csubstr n = _scan_to_next_nonempty_line(scalar_indentation); 2863 if(!n.empty()) 2864 { 2865 _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, scalar_indentation); 2866 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(n)); 2867 substr full = _scan_plain_scalar_blck(s, n, scalar_indentation); 2868 if(full.len >= s.len) 2869 s = _filter_plain_scalar(full, scalar_indentation); 2870 } 2871 } 2872 else 2873 { 2874 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW)); 2875 csubstr n = _scan_to_next_nonempty_line(/*indentation*/0); 2876 if(!n.empty()) 2877 { 2878 _c4dbgp("rscalar[FLOW]"); 2879 substr full = _scan_plain_scalar_flow(s, n); 2880 s = _filter_plain_scalar(full, /*indentation*/0); 2881 } 2882 } 2883 } 2884 2885 return s; 2886 } 2887 2888 2889 //----------------------------------------------------------------------------- 2890 2891 substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line) 2892 { 2893 static constexpr const csubstr chars = "[]{}?#,"; 2894 size_t pos = peeked_line.first_of(chars); 2895 bool first = true; 2896 while(pos != 0) 2897 { 2898 
if(has_all(RMAP|RKEY) || has_any(RUNK)) 2899 { 2900 csubstr tpkl = peeked_line.triml(' ').trimr("\r\n"); 2901 if(tpkl.begins_with(": ") || tpkl == ':') 2902 { 2903 _c4dbgpf("rscalar[FLOW]: map value starts on the peeked line: '{}'", peeked_line); 2904 peeked_line = peeked_line.first(0); 2905 break; 2906 } 2907 else 2908 { 2909 auto colon_pos = peeked_line.first_of_any(": ", ":"); 2910 if(colon_pos && colon_pos.pos < pos) 2911 { 2912 peeked_line = peeked_line.first(colon_pos.pos); 2913 _c4dbgpf("rscalar[FLOW]: found colon at {}. peeked='{}'", colon_pos.pos, peeked_line); 2914 _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin()); 2915 _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin())); 2916 break; 2917 } 2918 } 2919 } 2920 if(pos != npos) 2921 { 2922 _c4dbgpf("rscalar[FLOW]: found special character '{}' at {}, stopping: '{}'", peeked_line[pos], pos, peeked_line.left_of(pos).trimr("\r\n")); 2923 peeked_line = peeked_line.left_of(pos); 2924 _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin()); 2925 _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin())); 2926 break; 2927 } 2928 _c4dbgpf("rscalar[FLOW]: append another line, full: '{}'", peeked_line.trimr("\r\n")); 2929 if(!first) 2930 { 2931 RYML_CHECK(_advance_to_peeked()); 2932 } 2933 peeked_line = _scan_to_next_nonempty_line(/*indentation*/0); 2934 if(peeked_line.empty()) 2935 { 2936 _c4err("expected token or continuation"); 2937 } 2938 pos = peeked_line.first_of(chars); 2939 first = false; 2940 } 2941 substr full(m_buf.str + (currscalar.str - m_buf.str), m_buf.begin() + m_state->pos.offset); 2942 full = full.trimr("\n\r "); 2943 return full; 2944 } 2945 2946 2947 //----------------------------------------------------------------------------- 2948 2949 substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation) 2950 { 
2951 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar)); 2952 // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice 2953 // size_t offs = m_state->pos.offset; // so we workaround by directly counting from the end of the given scalar 2954 _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin()); 2955 size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin()); 2956 _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.begins_with(' ', indentation)); 2957 while(true) 2958 { 2959 _c4dbgpf("rscalar[IMPL]: continuing... ref_indentation={}", indentation); 2960 if(peeked_line.begins_with("...") || peeked_line.begins_with("---")) 2961 { 2962 _c4dbgpf("rscalar[IMPL]: document termination next -- bail now '{}'", peeked_line.trimr("\r\n")); 2963 break; 2964 } 2965 else if(( ! peeked_line.begins_with(' ', indentation))) // is the line deindented? 2966 { 2967 if(!peeked_line.trim(" \r\n\t").empty()) // is the line not blank? 2968 { 2969 _c4dbgpf("rscalar[IMPL]: deindented line, not blank -- bail now '{}'", peeked_line.trimr("\r\n")); 2970 break; 2971 } 2972 _c4dbgpf("rscalar[IMPL]: line is blank and has less indentation: ref={} line={}: '{}'", indentation, peeked_line.first_not_of(' ') == csubstr::npos ? 0 : peeked_line.first_not_of(' '), peeked_line.trimr("\r\n")); 2973 _c4dbgpf("rscalar[IMPL]: ... searching for a line starting at indentation {}", indentation); 2974 csubstr next_peeked = _scan_to_next_nonempty_line(indentation); 2975 if(next_peeked.empty()) 2976 { 2977 _c4dbgp("rscalar[IMPL]: ... finished."); 2978 break; 2979 } 2980 _c4dbgp("rscalar[IMPL]: ... 
continuing."); 2981 peeked_line = next_peeked; 2982 } 2983 2984 _c4dbgpf("rscalar[IMPL]: line contents: '{}'", peeked_line.right_of(indentation, true).trimr("\r\n")); 2985 size_t token_pos; 2986 if(peeked_line.find(": ") != npos) 2987 { 2988 _line_progressed(peeked_line.find(": ")); 2989 _c4err("': ' is not a valid token in plain flow (unquoted) scalars"); 2990 } 2991 else if(peeked_line.ends_with(':')) 2992 { 2993 _line_progressed(peeked_line.find(':')); 2994 _c4err("lines cannot end with ':' in plain flow (unquoted) scalars"); 2995 } 2996 else if((token_pos = peeked_line.find(" #")) != npos) 2997 { 2998 _line_progressed(token_pos); 2999 break; 3000 //_c4err("' #' is not a valid token in plain flow (unquoted) scalars"); 3001 } 3002 3003 _c4dbgpf("rscalar[IMPL]: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n")); 3004 if(!_advance_to_peeked()) 3005 { 3006 _c4dbgp("rscalar[IMPL]: file finishes after the scalar"); 3007 break; 3008 } 3009 peeked_line = m_state->line_contents.rem; 3010 } 3011 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs); 3012 substr full(m_buf.str + (currscalar.str - m_buf.str), 3013 currscalar.len + (m_state->pos.offset - offs)); 3014 full = full.trimr("\r\n "); 3015 return full; 3016 } 3017 3018 substr Parser::_scan_complex_key(csubstr currscalar, csubstr peeked_line) 3019 { 3020 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar)); 3021 // NOTE. 
there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice 3022 // size_t offs = m_state->pos.offset; // so we workaround by directly counting from the end of the given scalar 3023 _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin()); 3024 size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin()); 3025 while(true) 3026 { 3027 _c4dbgp("rcplxkey: continuing..."); 3028 if(peeked_line.begins_with("...") || peeked_line.begins_with("---")) 3029 { 3030 _c4dbgpf("rcplxkey: document termination next -- bail now '{}'", peeked_line.trimr("\r\n")); 3031 break; 3032 } 3033 else 3034 { 3035 size_t pos = peeked_line.first_of("?:[]{}"); 3036 if(pos == csubstr::npos) 3037 { 3038 pos = peeked_line.find("- "); 3039 } 3040 if(pos != csubstr::npos) 3041 { 3042 _c4dbgpf("rcplxkey: found special characters at pos={}: '{}'", pos, peeked_line.trimr("\r\n")); 3043 _line_progressed(pos); 3044 break; 3045 } 3046 } 3047 3048 _c4dbgpf("rcplxkey: no special chars found '{}'", peeked_line.trimr("\r\n")); 3049 csubstr next_peeked = _scan_to_next_nonempty_line(0); 3050 if(next_peeked.empty()) 3051 { 3052 _c4dbgp("rcplxkey: empty ... finished."); 3053 break; 3054 } 3055 _c4dbgp("rcplxkey: ... 
continuing."); 3056 peeked_line = next_peeked; 3057 3058 _c4dbgpf("rcplxkey: line contents: '{}'", peeked_line.trimr("\r\n")); 3059 size_t colpos; 3060 if((colpos = peeked_line.find(": ")) != npos) 3061 { 3062 _c4dbgp("rcplxkey: found ': ', stopping."); 3063 _line_progressed(colpos); 3064 break; 3065 } 3066 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 3067 else if((colpos = peeked_line.ends_with(':'))) 3068 { 3069 _c4dbgp("rcplxkey: ends with ':', stopping."); 3070 _line_progressed(colpos); 3071 break; 3072 } 3073 #endif 3074 _c4dbgpf("rcplxkey: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n")); 3075 if(!_advance_to_peeked()) 3076 { 3077 _c4dbgp("rcplxkey: file finishes after the scalar"); 3078 break; 3079 } 3080 peeked_line = m_state->line_contents.rem; 3081 } 3082 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs); 3083 substr full(m_buf.str + (currscalar.str - m_buf.str), 3084 currscalar.len + (m_state->pos.offset - offs)); 3085 return full; 3086 } 3087 3088 //! scans to the next non-blank line starting with the given indentation 3089 csubstr Parser::_scan_to_next_nonempty_line(size_t indentation) 3090 { 3091 csubstr next_peeked; 3092 while(true) 3093 { 3094 _c4dbgpf("rscalar: ... curr offset: {} indentation={}", m_state->pos.offset, indentation); 3095 next_peeked = _peek_next_line(m_state->pos.offset); 3096 csubstr next_peeked_triml = next_peeked.triml(' '); 3097 _c4dbgpf("rscalar: ... next peeked line='{}'", next_peeked.trimr("\r\n")); 3098 if(next_peeked_triml.begins_with('#')) 3099 { 3100 _c4dbgp("rscalar: ... first non-space character is #"); 3101 return {}; 3102 } 3103 else if(next_peeked.begins_with(' ', indentation)) 3104 { 3105 _c4dbgpf("rscalar: ... begins at same indentation {}, assuming continuation", indentation); 3106 _advance_to_peeked(); 3107 return next_peeked; 3108 } 3109 else // check for de-indentation 3110 { 3111 csubstr trimmed = next_peeked_triml.trimr("\t\r\n"); 3112 _c4dbgpf("rscalar: ... 
deindented! trimmed='{}'", trimmed); 3113 if(!trimmed.empty()) 3114 { 3115 _c4dbgp("rscalar: ... and not empty. bailing out."); 3116 return {}; 3117 } 3118 } 3119 if(!_advance_to_peeked()) 3120 { 3121 _c4dbgp("rscalar: file finished"); 3122 return {}; 3123 } 3124 } 3125 return {}; 3126 } 3127 3128 // returns false when the file finished 3129 bool Parser::_advance_to_peeked() 3130 { 3131 _line_progressed(m_state->line_contents.rem.len); 3132 _line_ended(); // advances to the peeked-at line, consuming all remaining (probably newline) characters on the current line 3133 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.first_of("\r\n") == csubstr::npos); 3134 _c4dbgpf("advance to peeked: scan more... pos={} len={}", m_state->pos.offset, m_buf.len); 3135 _scan_line(); // puts the peeked-at line in the buffer 3136 if(_finished_file()) 3137 { 3138 _c4dbgp("rscalar: finished file!"); 3139 return false; 3140 } 3141 return true; 3142 } 3143 3144 //----------------------------------------------------------------------------- 3145 3146 C4_ALWAYS_INLINE size_t _extend_from_combined_newline(char nl, char following) 3147 { 3148 return (nl == '\n' && following == '\r') || (nl == '\r' && following == '\n'); 3149 } 3150 3151 //! look for the next newline chars, and jump to the right of those 3152 csubstr from_next_line(csubstr rem) 3153 { 3154 size_t nlpos = rem.first_of("\r\n"); 3155 if(nlpos == csubstr::npos) 3156 return {}; 3157 const char nl = rem[nlpos]; 3158 rem = rem.right_of(nlpos); 3159 if(rem.empty()) 3160 return {}; 3161 if(_extend_from_combined_newline(nl, rem.front())) 3162 rem = rem.sub(1); 3163 return rem; 3164 } 3165 3166 csubstr Parser::_peek_next_line(size_t pos) const 3167 { 3168 csubstr rem{}; // declare here because of the goto 3169 size_t nlpos{}; // declare here because of the goto 3170 pos = pos == npos ? 
m_state->pos.offset : pos; 3171 if(pos >= m_buf.len) 3172 goto next_is_empty; 3173 3174 // look for the next newline chars, and jump to the right of those 3175 rem = from_next_line(m_buf.sub(pos)); 3176 if(rem.empty()) 3177 goto next_is_empty; 3178 3179 // now get everything up to and including the following newline chars 3180 nlpos = rem.first_of("\r\n"); 3181 if((nlpos != csubstr::npos) && (nlpos + 1 < rem.len)) 3182 nlpos += _extend_from_combined_newline(rem[nlpos], rem[nlpos+1]); 3183 rem = rem.left_of(nlpos, /*include_pos*/true); 3184 3185 _c4dbgpf("peek next line @ {}: (len={})'{}'", pos, rem.len, rem.trimr("\r\n")); 3186 return rem; 3187 3188 next_is_empty: 3189 _c4dbgpf("peek next line @ {}: (len=0)''", pos); 3190 return {}; 3191 } 3192 3193 3194 //----------------------------------------------------------------------------- 3195 void Parser::LineContents::reset_with_next_line(csubstr buf, size_t offset) 3196 { 3197 RYML_ASSERT(offset <= buf.len); 3198 char const* C4_RESTRICT b = &buf[offset]; 3199 char const* C4_RESTRICT e = b; 3200 // get the current line stripped of newline chars 3201 while(e < buf.end() && (*e != '\n' && *e != '\r')) 3202 ++e; 3203 RYML_ASSERT(e >= b); 3204 const csubstr stripped_ = buf.sub(offset, static_cast<size_t>(e - b)); 3205 // advance pos to include the first line ending 3206 if(e != buf.end() && *e == '\r') 3207 ++e; 3208 if(e != buf.end() && *e == '\n') 3209 ++e; 3210 RYML_ASSERT(e >= b); 3211 const csubstr full_ = buf.sub(offset, static_cast<size_t>(e - b)); 3212 reset(full_, stripped_); 3213 } 3214 3215 void Parser::_scan_line() 3216 { 3217 if(m_state->pos.offset >= m_buf.len) 3218 { 3219 m_state->line_contents.reset(m_buf.last(0), m_buf.last(0)); 3220 return; 3221 } 3222 m_state->line_contents.reset_with_next_line(m_buf, m_state->pos.offset); 3223 } 3224 3225 3226 //----------------------------------------------------------------------------- 3227 void Parser::_line_progressed(size_t ahead) 3228 { 3229 _c4dbgpf("line[{}] 
({} cols) progressed by {}: col {}-->{} offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, ahead, m_state->pos.col, m_state->pos.col+ahead, m_state->pos.offset, m_state->pos.offset+ahead); 3230 m_state->pos.offset += ahead; 3231 m_state->pos.col += ahead; 3232 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col <= m_state->line_contents.stripped.len+1); 3233 m_state->line_contents.rem = m_state->line_contents.rem.sub(ahead); 3234 } 3235 3236 void Parser::_line_ended() 3237 { 3238 _c4dbgpf("line[{}] ({} cols) ended! offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, m_state->pos.offset, m_state->pos.offset+m_state->line_contents.full.len - m_state->line_contents.stripped.len); 3239 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == m_state->line_contents.stripped.len+1); 3240 m_state->pos.offset += m_state->line_contents.full.len - m_state->line_contents.stripped.len; 3241 ++m_state->pos.line; 3242 m_state->pos.col = 1; 3243 } 3244 3245 void Parser::_line_ended_undo() 3246 { 3247 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == 1u); 3248 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line > 0u); 3249 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_state->line_contents.full.len - m_state->line_contents.stripped.len); 3250 size_t delta = m_state->line_contents.full.len - m_state->line_contents.stripped.len; 3251 _c4dbgpf("line[{}] undo ended! 
line {}-->{}, offset {}-->{}", m_state->pos.line, m_state->pos.line, m_state->pos.line - 1, m_state->pos.offset, m_state->pos.offset - delta); 3252 m_state->pos.offset -= delta; 3253 --m_state->pos.line; 3254 m_state->pos.col = m_state->line_contents.stripped.len + 1u; 3255 // don't forget to undo also the changes to the remainder of the line 3256 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_buf.len || m_buf[m_state->pos.offset] == '\n' || m_buf[m_state->pos.offset] == '\r'); 3257 m_state->line_contents.rem = m_buf.sub(m_state->pos.offset, 0); 3258 } 3259 3260 3261 //----------------------------------------------------------------------------- 3262 void Parser::_set_indentation(size_t indentation) 3263 { 3264 m_state->indref = indentation; 3265 _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref); 3266 } 3267 3268 void Parser::_save_indentation(size_t behind) 3269 { 3270 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begin() >= m_state->line_contents.full.begin()); 3271 m_state->indref = static_cast<size_t>(m_state->line_contents.rem.begin() - m_state->line_contents.full.begin()); 3272 _RYML_CB_ASSERT(m_stack.m_callbacks, behind <= m_state->indref); 3273 m_state->indref -= behind; 3274 _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref); 3275 } 3276 3277 bool Parser::_maybe_set_indentation_from_anchor_or_tag() 3278 { 3279 if(m_key_anchor.not_empty()) 3280 { 3281 _c4dbgpf("set indentation from key anchor: {}", m_key_anchor_indentation); 3282 _set_indentation(m_key_anchor_indentation); // this is the column where the anchor starts 3283 return true; 3284 } 3285 else if(m_key_tag.not_empty()) 3286 { 3287 _c4dbgpf("set indentation from key tag: {}", m_key_tag_indentation); 3288 _set_indentation(m_key_tag_indentation); // this is the column where the tag starts 3289 return true; 3290 } 3291 return false; 3292 } 3293 3294 3295 
//----------------------------------------------------------------------------- 3296 void Parser::_write_key_anchor(size_t node_id) 3297 { 3298 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_key(node_id)); 3299 if( ! m_key_anchor.empty()) 3300 { 3301 _c4dbgpf("node={}: set key anchor to '{}'", node_id, m_key_anchor); 3302 m_tree->set_key_anchor(node_id, m_key_anchor); 3303 m_key_anchor.clear(); 3304 m_key_anchor_was_before = false; 3305 m_key_anchor_indentation = 0; 3306 } 3307 else if( ! m_tree->is_key_quoted(node_id)) 3308 { 3309 csubstr r = m_tree->key(node_id); 3310 if(r.begins_with('*')) 3311 { 3312 _c4dbgpf("node={}: set key reference: '{}'", node_id, r); 3313 m_tree->set_key_ref(node_id, r.sub(1)); 3314 } 3315 else if(r == "<<") 3316 { 3317 m_tree->set_key_ref(node_id, r); 3318 _c4dbgpf("node={}: it's an inheriting reference", node_id); 3319 if(m_tree->is_seq(node_id)) 3320 { 3321 _c4dbgpf("node={}: inheriting from seq of {}", node_id, m_tree->num_children(node_id)); 3322 for(size_t i = m_tree->first_child(node_id); i != NONE; i = m_tree->next_sibling(i)) 3323 { 3324 if( ! (m_tree->val(i).begins_with('*'))) 3325 _c4err("malformed reference: '{}'", m_tree->val(i)); 3326 } 3327 } 3328 else if( ! m_tree->val(node_id).begins_with('*')) 3329 { 3330 _c4err("malformed reference: '{}'", m_tree->val(node_id)); 3331 } 3332 //m_tree->set_key_ref(node_id, r); 3333 } 3334 } 3335 } 3336 3337 //----------------------------------------------------------------------------- 3338 void Parser::_write_val_anchor(size_t node_id) 3339 { 3340 if( ! m_val_anchor.empty()) 3341 { 3342 _c4dbgpf("node={}: set val anchor to '{}'", node_id, m_val_anchor); 3343 m_tree->set_val_anchor(node_id, m_val_anchor); 3344 m_val_anchor.clear(); 3345 } 3346 csubstr r = m_tree->has_val(node_id) ? 
m_tree->val(node_id) : ""; 3347 if(!m_tree->is_val_quoted(node_id) && r.begins_with('*')) 3348 { 3349 _c4dbgpf("node={}: set val reference: '{}'", node_id, r); 3350 RYML_CHECK(!m_tree->has_val_anchor(node_id)); 3351 m_tree->set_val_ref(node_id, r.sub(1)); 3352 } 3353 } 3354 3355 //----------------------------------------------------------------------------- 3356 void Parser::_push_level(bool explicit_flow_chars) 3357 { 3358 _c4dbgpf("pushing level! currnode={} currlevel={} stacksize={} stackcap={}", m_state->node_id, m_state->level, m_stack.size(), m_stack.capacity()); 3359 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top()); 3360 if(node(m_state) == nullptr) 3361 { 3362 _c4dbgp("pushing level! actually no, current node is null"); 3363 //_RYML_CB_ASSERT(m_stack.m_callbacks, ! explicit_flow_chars); 3364 return; 3365 } 3366 flag_t st = RUNK; 3367 if(explicit_flow_chars || has_all(FLOW)) 3368 { 3369 st |= FLOW; 3370 } 3371 m_stack.push_top(); 3372 m_state = &m_stack.top(); 3373 set_flags(st); 3374 m_state->node_id = (size_t)NONE; 3375 m_state->indref = (size_t)NONE; 3376 ++m_state->level; 3377 _c4dbgpf("pushing level: now, currlevel={}", m_state->level); 3378 } 3379 3380 void Parser::_pop_level() 3381 { 3382 _c4dbgpf("popping level! 
currnode={} currlevel={}", m_state->node_id, m_state->level); 3383 if(has_any(RMAP) || m_tree->is_map(m_state->node_id)) 3384 { 3385 _stop_map(); 3386 } 3387 if(has_any(RSEQ) || m_tree->is_seq(m_state->node_id)) 3388 { 3389 _stop_seq(); 3390 } 3391 if(m_tree->is_doc(m_state->node_id)) 3392 { 3393 _stop_doc(); 3394 } 3395 _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() > 1); 3396 _prepare_pop(); 3397 m_stack.pop(); 3398 m_state = &m_stack.top(); 3399 /*if(has_any(RMAP)) 3400 { 3401 _toggle_key_val(); 3402 }*/ 3403 if(m_state->line_contents.indentation == 0) 3404 { 3405 //_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RTOP)); 3406 add_flags(RTOP); 3407 } 3408 _c4dbgpf("popping level: now, currnode={} currlevel={}", m_state->node_id, m_state->level); 3409 } 3410 3411 //----------------------------------------------------------------------------- 3412 void Parser::_start_unk(bool /*as_child*/) 3413 { 3414 _c4dbgp("start_unk"); 3415 _push_level(); 3416 _move_scalar_from_top(); 3417 } 3418 3419 //----------------------------------------------------------------------------- 3420 void Parser::_start_doc(bool as_child) 3421 { 3422 _c4dbgpf("start_doc (as child={})", as_child); 3423 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); 3424 size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id; 3425 _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); 3426 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_root(parent_id)); 3427 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); 3428 if(as_child) 3429 { 3430 _c4dbgpf("start_doc: parent={}", parent_id); 3431 if( ! 
m_tree->is_stream(parent_id)) 3432 { 3433 _c4dbgp("start_doc: rearranging with root as STREAM"); 3434 m_tree->set_root_as_stream(); 3435 } 3436 m_state->node_id = m_tree->append_child(parent_id); 3437 m_tree->to_doc(m_state->node_id); 3438 } 3439 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 3440 else 3441 { 3442 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(parent_id) || m_tree->empty(parent_id)); 3443 m_state->node_id = parent_id; 3444 if( ! m_tree->is_doc(parent_id)) 3445 { 3446 m_tree->to_doc(parent_id, DOC); 3447 } 3448 } 3449 #endif 3450 _c4dbgpf("start_doc: id={}", m_state->node_id); 3451 add_flags(RUNK|RTOP|NDOC); 3452 _handle_types(); 3453 rem_flags(NDOC); 3454 } 3455 3456 void Parser::_stop_doc() 3457 { 3458 size_t doc_node = m_state->node_id; 3459 _c4dbgpf("stop_doc[{}]", doc_node); 3460 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(doc_node)); 3461 if(!m_tree->is_seq(doc_node) && !m_tree->is_map(doc_node) && !m_tree->is_val(doc_node)) 3462 { 3463 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL)); 3464 _c4dbgpf("stop_doc[{}]: there was nothing; adding null val", doc_node); 3465 m_tree->to_val(doc_node, {}, DOC); 3466 } 3467 } 3468 3469 void Parser::_end_stream() 3470 { 3471 _c4dbgpf("end_stream, level={} node_id={}", m_state->level, m_state->node_id); 3472 _RYML_CB_ASSERT(m_stack.m_callbacks, ! m_stack.empty()); 3473 NodeData *added = nullptr; 3474 if(has_any(SSCL)) 3475 { 3476 if(m_tree->is_seq(m_state->node_id)) 3477 { 3478 _c4dbgp("append val..."); 3479 added = _append_val(_consume_scalar()); 3480 } 3481 else if(m_tree->is_map(m_state->node_id)) 3482 { 3483 _c4dbgp("append null key val..."); 3484 added = _append_key_val_null(m_state->line_contents.rem.str); 3485 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 3486 if(has_any(RSEQIMAP)) 3487 { 3488 _stop_seqimap(); 3489 _pop_level(); 3490 } 3491 #endif 3492 } 3493 else if(m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE) 3494 { 3495 NodeType_e quoted = has_any(QSCL) ? 
VALQUO : NOTYPE; // do this before consuming the scalar 3496 csubstr scalar = _consume_scalar(); 3497 _c4dbgpf("node[{}]: to docval '{}'{}", m_state->node_id, scalar, quoted == VALQUO ? ", quoted" : ""); 3498 m_tree->to_val(m_state->node_id, scalar, DOC|quoted); 3499 added = m_tree->get(m_state->node_id); 3500 } 3501 else 3502 { 3503 _c4err("internal error"); 3504 } 3505 } 3506 else if(has_all(RSEQ|RVAL) && has_none(FLOW)) 3507 { 3508 _c4dbgp("add last..."); 3509 added = _append_val_null(m_state->line_contents.rem.str); 3510 } 3511 else if(!m_val_tag.empty() && (m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE)) 3512 { 3513 csubstr scalar = m_state->line_contents.rem.first(0); 3514 _c4dbgpf("node[{}]: add null scalar as docval", m_state->node_id); 3515 m_tree->to_val(m_state->node_id, scalar, DOC); 3516 added = m_tree->get(m_state->node_id); 3517 } 3518 3519 if(added) 3520 { 3521 size_t added_id = m_tree->id(added); 3522 if(m_tree->is_seq(m_state->node_id) || m_tree->is_doc(m_state->node_id)) 3523 { 3524 if(!m_key_anchor.empty()) 3525 { 3526 _c4dbgpf("node[{}]: move key to val anchor: '{}'", added_id, m_key_anchor); 3527 m_val_anchor = m_key_anchor; 3528 m_key_anchor = {}; 3529 } 3530 if(!m_key_tag.empty()) 3531 { 3532 _c4dbgpf("node[{}]: move key to val tag: '{}'", added_id, m_key_tag); 3533 m_val_tag = m_key_tag; 3534 m_key_tag = {}; 3535 } 3536 } 3537 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 3538 if(!m_key_anchor.empty()) 3539 { 3540 _c4dbgpf("node[{}]: set key anchor='{}'", added_id, m_key_anchor); 3541 m_tree->set_key_anchor(added_id, m_key_anchor); 3542 m_key_anchor = {}; 3543 } 3544 #endif 3545 if(!m_val_anchor.empty()) 3546 { 3547 _c4dbgpf("node[{}]: set val anchor='{}'", added_id, m_val_anchor); 3548 m_tree->set_val_anchor(added_id, m_val_anchor); 3549 m_val_anchor = {}; 3550 } 3551 #ifdef RYML_NO_COVERAGE__TO_BE_DELETED 3552 if(!m_key_tag.empty()) 3553 { 3554 _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", added_id, m_key_tag, 
normalize_tag(m_key_tag)); 3555 m_tree->set_key_tag(added_id, normalize_tag(m_key_tag)); 3556 m_key_tag = {}; 3557 } 3558 #endif 3559 if(!m_val_tag.empty()) 3560 { 3561 _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", added_id, m_val_tag, normalize_tag(m_val_tag)); 3562 m_tree->set_val_tag(added_id, normalize_tag(m_val_tag)); 3563 m_val_tag = {}; 3564 } 3565 } 3566 3567 while(m_stack.size() > 1) 3568 { 3569 _c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size()); 3570 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL, &m_stack.top())); 3571 if(has_all(RSEQ|FLOW)) 3572 _err("closing ] not found"); 3573 _pop_level(); 3574 } 3575 add_flags(NDOC); 3576 } 3577 3578 void Parser::_start_new_doc(csubstr rem) 3579 { 3580 _c4dbgp("_start_new_doc"); 3581 _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begins_with("---")); 3582 C4_UNUSED(rem); 3583 3584 _end_stream(); 3585 3586 size_t indref = m_state->indref; 3587 _c4dbgpf("start a document, indentation={}", indref); 3588 _line_progressed(3); 3589 _push_level(); 3590 _start_doc(); 3591 _set_indentation(indref); 3592 } 3593 3594 3595 //----------------------------------------------------------------------------- 3596 void Parser::_start_map(bool as_child) 3597 { 3598 _c4dbgpf("start_map (as child={})", as_child); 3599 addrem_flags(RMAP|RVAL, RKEY|RUNK); 3600 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); 3601 size_t parent_id = m_stack.size() < 2 ? 
m_root_id : m_stack.top(1).node_id; 3602 _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); 3603 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); 3604 if(as_child) 3605 { 3606 m_state->node_id = m_tree->append_child(parent_id); 3607 if(has_all(SSCL)) 3608 { 3609 type_bits key_quoted = NOTYPE; 3610 if(m_state->flags & QSCL) // before consuming the scalar 3611 key_quoted |= KEYQUO; 3612 csubstr key = _consume_scalar(); 3613 m_tree->to_map(m_state->node_id, key, key_quoted); 3614 _c4dbgpf("start_map: id={} key='{}'", m_state->node_id, m_tree->key(m_state->node_id)); 3615 _write_key_anchor(m_state->node_id); 3616 if( ! m_key_tag.empty()) 3617 { 3618 _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag)); 3619 m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag)); 3620 m_key_tag.clear(); 3621 } 3622 } 3623 else 3624 { 3625 m_tree->to_map(m_state->node_id); 3626 _c4dbgpf("start_map: id={}", m_state->node_id); 3627 } 3628 m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str; 3629 _write_val_anchor(m_state->node_id); 3630 } 3631 else 3632 { 3633 _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); 3634 m_state->node_id = parent_id; 3635 _c4dbgpf("start_map: id={}", m_state->node_id); 3636 type_bits as_doc = 0; 3637 if(m_tree->is_doc(m_state->node_id)) 3638 as_doc |= DOC; 3639 if(!m_tree->is_map(parent_id)) 3640 { 3641 RYML_CHECK(!m_tree->has_children(parent_id)); 3642 m_tree->to_map(parent_id, as_doc); 3643 } 3644 else 3645 { 3646 m_tree->_add_flags(parent_id, as_doc); 3647 } 3648 _move_scalar_from_top(); 3649 if(m_key_anchor.not_empty()) 3650 m_key_anchor_was_before = true; 3651 _write_val_anchor(parent_id); 3652 if(m_stack.size() >= 2) 3653 { 3654 State const& parent_state = m_stack.top(1); 3655 if(parent_state.flags & RSET) 3656 add_flags(RSET); 3657 } 3658 m_tree->_p(parent_id)->m_val.scalar.str = 
m_state->line_contents.rem.str; 3659 } 3660 if( ! m_val_tag.empty()) 3661 { 3662 _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag)); 3663 m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); 3664 m_val_tag.clear(); 3665 } 3666 } 3667 3668 void Parser::_start_map_unk(bool as_child) 3669 { 3670 _c4dbgpf("start_map_unk (as child={})", as_child); 3671 if(!m_key_anchor_was_before) 3672 { 3673 _c4dbgpf("stash key anchor before starting map... '{}'", m_key_anchor); 3674 csubstr ka = m_key_anchor; 3675 m_key_anchor = {}; 3676 _start_map(as_child); 3677 m_key_anchor = ka; 3678 } 3679 else 3680 { 3681 _start_map(as_child); 3682 m_key_anchor_was_before = false; 3683 } 3684 if(m_key_tag2.not_empty()) 3685 { 3686 m_key_tag = m_key_tag2; 3687 m_key_tag_indentation = m_key_tag2_indentation; 3688 m_key_tag2.clear(); 3689 m_key_tag2_indentation = 0; 3690 } 3691 } 3692 3693 void Parser::_stop_map() 3694 { 3695 _c4dbgpf("stop_map[{}]", m_state->node_id); 3696 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id)); 3697 if(has_all(QMRK|RKEY) && !has_all(SSCL)) 3698 { 3699 _c4dbgpf("stop_map[{}]: RKEY", m_state->node_id); 3700 _store_scalar_null(m_state->line_contents.rem.str); 3701 _append_key_val_null(m_state->line_contents.rem.str); 3702 } 3703 } 3704 3705 3706 //----------------------------------------------------------------------------- 3707 void Parser::_start_seq(bool as_child) 3708 { 3709 _c4dbgpf("start_seq (as child={})", as_child); 3710 if(has_all(RTOP|RUNK)) 3711 { 3712 _c4dbgpf("start_seq: moving key tag to val tag: '{}'", m_key_tag); 3713 m_val_tag = m_key_tag; 3714 m_key_tag.clear(); 3715 } 3716 addrem_flags(RSEQ|RVAL, RUNK); 3717 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id)); 3718 size_t parent_id = m_stack.size() < 2 ? 
m_root_id : m_stack.top(1).node_id; 3719 _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE); 3720 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id)); 3721 if(as_child) 3722 { 3723 m_state->node_id = m_tree->append_child(parent_id); 3724 if(has_all(SSCL)) 3725 { 3726 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(parent_id)); 3727 type_bits key_quoted = 0; 3728 if(m_state->flags & QSCL) // before consuming the scalar 3729 key_quoted |= KEYQUO; 3730 csubstr key = _consume_scalar(); 3731 m_tree->to_seq(m_state->node_id, key, key_quoted); 3732 _c4dbgpf("start_seq: id={} name='{}'", m_state->node_id, m_tree->key(m_state->node_id)); 3733 _write_key_anchor(m_state->node_id); 3734 if( ! m_key_tag.empty()) 3735 { 3736 _c4dbgpf("start_seq[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag)); 3737 m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag)); 3738 m_key_tag.clear(); 3739 } 3740 } 3741 else 3742 { 3743 type_bits as_doc = 0; 3744 _RYML_CB_ASSERT(m_stack.m_callbacks, !m_tree->is_doc(m_state->node_id)); 3745 m_tree->to_seq(m_state->node_id, as_doc); 3746 _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as doc" : ""); 3747 } 3748 _write_val_anchor(m_state->node_id); 3749 m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str; 3750 } 3751 else 3752 { 3753 m_state->node_id = parent_id; 3754 type_bits as_doc = 0; 3755 if(m_tree->is_doc(m_state->node_id)) 3756 as_doc |= DOC; 3757 if(!m_tree->is_seq(parent_id)) 3758 { 3759 RYML_CHECK(!m_tree->has_children(parent_id)); 3760 m_tree->to_seq(parent_id, as_doc); 3761 } 3762 else 3763 { 3764 m_tree->_add_flags(parent_id, as_doc); 3765 } 3766 _move_scalar_from_top(); 3767 _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as_doc" : ""); 3768 _write_val_anchor(parent_id); 3769 m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str; 3770 } 3771 if( ! 
m_val_tag.empty()) 3772 { 3773 _c4dbgpf("start_seq[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag)); 3774 m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag)); 3775 m_val_tag.clear(); 3776 } 3777 } 3778 3779 void Parser::_stop_seq() 3780 { 3781 _c4dbgp("stop_seq"); 3782 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id)); 3783 } 3784 3785 3786 //----------------------------------------------------------------------------- 3787 void Parser::_start_seqimap() 3788 { 3789 _c4dbgpf("start_seqimap at node={}. has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id)); 3790 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW)); 3791 // create a map, and turn the last scalar of this sequence 3792 // into the key of the map's first child. This scalar was 3793 // understood to be a value in the sequence, but it is 3794 // actually a key of a map, implicitly opened here. 3795 // Eg [val, key: val] 3796 // 3797 // Yep, YAML is crazy. 3798 if(m_tree->has_children(m_state->node_id) && m_tree->has_val(m_tree->last_child(m_state->node_id))) 3799 { 3800 size_t prev = m_tree->last_child(m_state->node_id); 3801 NodeType ty = m_tree->_p(prev)->m_type; // don't use type() because it masks out the quotes 3802 NodeScalar tmp = m_tree->valsc(prev); 3803 _c4dbgpf("has children and last child={} has val. 
saving the scalars, val='{}' quoted={}", prev, tmp.scalar, ty.is_val_quoted()); 3804 m_tree->remove(prev); 3805 _push_level(); 3806 _start_map(); 3807 _store_scalar(tmp.scalar, ty.is_val_quoted()); 3808 m_key_anchor = tmp.anchor; 3809 m_key_tag = tmp.tag; 3810 } 3811 else 3812 { 3813 _c4dbgpf("node {} has no children yet, using empty key", m_state->node_id); 3814 _push_level(); 3815 _start_map(); 3816 _store_scalar_null(m_state->line_contents.rem.str); 3817 } 3818 add_flags(RSEQIMAP|FLOW); 3819 } 3820 3821 void Parser::_stop_seqimap() 3822 { 3823 _c4dbgp("stop_seqimap"); 3824 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQIMAP)); 3825 } 3826 3827 3828 //----------------------------------------------------------------------------- 3829 NodeData* Parser::_append_val(csubstr val, flag_t quoted) 3830 { 3831 _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_all(SSCL)); 3832 _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) != nullptr); 3833 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id)); 3834 type_bits additional_flags = quoted ? VALQUO : NOTYPE; 3835 _c4dbgpf("append val: '{}' to parent id={} (level={}){}", val, m_state->node_id, m_state->level, quoted ? " VALQUO!" : ""); 3836 size_t nid = m_tree->append_child(m_state->node_id); 3837 m_tree->to_val(nid, val, additional_flags); 3838 _c4dbgpf("append val: id={} val='{}'", nid, m_tree->get(nid)->m_val.scalar); 3839 if( ! 
m_val_tag.empty()) 3840 { 3841 _c4dbgpf("append val[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag)); 3842 m_tree->set_val_tag(nid, normalize_tag(m_val_tag)); 3843 m_val_tag.clear(); 3844 } 3845 _write_val_anchor(nid); 3846 return m_tree->get(nid); 3847 } 3848 3849 NodeData* Parser::_append_key_val(csubstr val, flag_t val_quoted) 3850 { 3851 _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id)); 3852 type_bits additional_flags = 0; 3853 if(m_state->flags & QSCL) 3854 additional_flags |= KEYQUO; 3855 if(val_quoted) 3856 additional_flags |= VALQUO; 3857 csubstr key = _consume_scalar(); 3858 _c4dbgpf("append keyval: '{}' '{}' to parent id={} (level={}){}{}", key, val, m_state->node_id, m_state->level, (additional_flags & KEYQUO) ? " KEYQUO!" : "", (additional_flags & VALQUO) ? " VALQUO!" : ""); 3859 size_t nid = m_tree->append_child(m_state->node_id); 3860 m_tree->to_keyval(nid, key, val, additional_flags); 3861 _c4dbgpf("append keyval: id={} key='{}' val='{}'", nid, m_tree->key(nid), m_tree->val(nid)); 3862 if( ! m_key_tag.empty()) 3863 { 3864 _c4dbgpf("append keyval[{}]: set key tag='{}' -> '{}'", nid, m_key_tag, normalize_tag(m_key_tag)); 3865 m_tree->set_key_tag(nid, normalize_tag(m_key_tag)); 3866 m_key_tag.clear(); 3867 } 3868 if( ! 
m_val_tag.empty()) 3869 { 3870 _c4dbgpf("append keyval[{}]: set val tag='{}' -> '{}'", nid, m_val_tag, normalize_tag(m_val_tag)); 3871 m_tree->set_val_tag(nid, normalize_tag(m_val_tag)); 3872 m_val_tag.clear(); 3873 } 3874 _write_key_anchor(nid); 3875 _write_val_anchor(nid); 3876 rem_flags(QMRK); 3877 return m_tree->get(nid); 3878 } 3879 3880 3881 //----------------------------------------------------------------------------- 3882 void Parser::_store_scalar(csubstr s, flag_t is_quoted) 3883 { 3884 _c4dbgpf("state[{}]: storing scalar '{}' (flag: {}) (old scalar='{}')", 3885 m_state-m_stack.begin(), s, m_state->flags & SSCL, m_state->scalar); 3886 RYML_CHECK(has_none(SSCL)); 3887 add_flags(SSCL | (is_quoted * QSCL)); 3888 m_state->scalar = s; 3889 } 3890 3891 csubstr Parser::_consume_scalar() 3892 { 3893 _c4dbgpf("state[{}]: consuming scalar '{}' (flag: {}))", m_state-m_stack.begin(), m_state->scalar, m_state->flags & SSCL); 3894 RYML_CHECK(m_state->flags & SSCL); 3895 csubstr s = m_state->scalar; 3896 rem_flags(SSCL | QSCL); 3897 m_state->scalar.clear(); 3898 return s; 3899 } 3900 3901 void Parser::_move_scalar_from_top() 3902 { 3903 if(m_stack.size() < 2) return; 3904 State &prev = m_stack.top(1); 3905 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top()); 3906 _RYML_CB_ASSERT(m_stack.m_callbacks, m_state != &prev); 3907 if(prev.flags & SSCL) 3908 { 3909 _c4dbgpf("moving scalar '{}' from state[{}] to state[{}] (overwriting '{}')", prev.scalar, &prev-m_stack.begin(), m_state-m_stack.begin(), m_state->scalar); 3910 add_flags(prev.flags & (SSCL | QSCL)); 3911 m_state->scalar = prev.scalar; 3912 rem_flags(SSCL | QSCL, &prev); 3913 prev.scalar.clear(); 3914 } 3915 } 3916 3917 //----------------------------------------------------------------------------- 3918 /** @todo this function is a monster and needs love. 
Likely, it needs 3919 * to be split like _scan_scalar_*() */ 3920 bool Parser::_handle_indentation() 3921 { 3922 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW)); 3923 if( ! _at_line_begin()) 3924 return false; 3925 3926 size_t ind = m_state->line_contents.indentation; 3927 csubstr rem = m_state->line_contents.rem; 3928 /** @todo instead of trimming, we should use the indentation index from above */ 3929 csubstr remt = rem.triml(' '); 3930 3931 if(remt.empty() || remt.begins_with('#')) // this is a blank or comment line 3932 { 3933 _line_progressed(rem.size()); 3934 return true; 3935 } 3936 3937 _c4dbgpf("indentation? ind={} indref={}", ind, m_state->indref); 3938 if(ind == m_state->indref) 3939 { 3940 _c4dbgpf("same indentation: {}", ind); 3941 if(!rem.sub(ind).begins_with('-')) 3942 { 3943 _c4dbgp("does not begin with -"); 3944 if(has_any(RMAP)) 3945 { 3946 if(has_all(SSCL|RVAL)) 3947 { 3948 _c4dbgp("add with null val"); 3949 _append_key_val_null(rem.str + ind - 1); 3950 addrem_flags(RKEY, RVAL); 3951 } 3952 } 3953 else if(has_any(RSEQ)) 3954 { 3955 if(m_stack.size() > 2) // do not pop to root level 3956 { 3957 if(has_any(RNXT)) 3958 { 3959 _c4dbgp("end the indentless seq"); 3960 _pop_level(); 3961 return true; 3962 } 3963 else if(has_any(RVAL)) 3964 { 3965 _c4dbgp("add with null val"); 3966 _append_val_null(rem.str); 3967 _c4dbgp("end the indentless seq"); 3968 _pop_level(); 3969 return true; 3970 } 3971 } 3972 } 3973 } 3974 _line_progressed(ind); 3975 return ind > 0; 3976 } 3977 else if(ind < m_state->indref) 3978 { 3979 _c4dbgpf("smaller indentation ({} < {})!!!", ind, m_state->indref); 3980 if(has_all(RVAL)) 3981 { 3982 _c4dbgp("there was an empty val -- appending"); 3983 if(has_all(RMAP)) 3984 { 3985 _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL)); 3986 _append_key_val_null(rem.sub(ind).str - 1); 3987 } 3988 else if(has_all(RSEQ)) 3989 { 3990 _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL)); 3991 _append_val_null(rem.sub(ind).str - 1); 3992 } 
3993 } 3994 // search the stack frame to jump to based on its indentation 3995 State const* popto = nullptr; 3996 _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.is_contiguous()); // this search relies on the stack being contiguous 3997 for(State const* s = m_state-1; s >= m_stack.begin(); --s) 3998 { 3999 _c4dbgpf("searching for state with indentation {}. curr={} (level={},node={})", ind, s->indref, s->level, s->node_id); 4000 if(s->indref == ind) 4001 { 4002 _c4dbgpf("gotit!!! level={} node={}", s->level, s->node_id); 4003 popto = s; 4004 // while it may be tempting to think we're done at this 4005 // point, we must still determine whether we're jumping to a 4006 // parent with the same indentation. Consider this case with 4007 // an indentless sequence: 4008 // 4009 // product: 4010 // - sku: BL394D 4011 // quantity: 4 4012 // description: Basketball 4013 // price: 450.00 4014 // - sku: BL4438H 4015 // quantity: 1 4016 // description: Super Hoop 4017 // price: 2392.00 # jumping one level here would be wrong. 4018 // tax: 1234.5 # we must jump two levels 4019 if(popto > m_stack.begin()) 4020 { 4021 auto parent = popto - 1; 4022 if(parent->indref == popto->indref) 4023 { 4024 _c4dbgpf("the parent (level={},node={}) has the same indentation ({}). is this in an indentless sequence?", parent->level, parent->node_id, popto->indref); 4025 _c4dbgpf("isseq(popto)={} ismap(parent)={}", m_tree->is_seq(popto->node_id), m_tree->is_map(parent->node_id)); 4026 if(m_tree->is_seq(popto->node_id) && m_tree->is_map(parent->node_id)) 4027 { 4028 if( ! 
remt.begins_with('-')) 4029 { 4030 _c4dbgp("this is an indentless sequence"); 4031 popto = parent; 4032 } 4033 else 4034 { 4035 _c4dbgp("not an indentless sequence"); 4036 } 4037 } 4038 } 4039 } 4040 break; 4041 } 4042 } 4043 if(!popto || popto >= m_state || popto->level >= m_state->level) 4044 { 4045 _c4err("parse error: incorrect indentation?"); 4046 } 4047 _c4dbgpf("popping {} levels: from level {} to level {}", m_state->level-popto->level, m_state->level, popto->level); 4048 while(m_state != popto) 4049 { 4050 _c4dbgpf("popping level {} (indentation={})", m_state->level, m_state->indref); 4051 _pop_level(); 4052 } 4053 _RYML_CB_ASSERT(m_stack.m_callbacks, ind == m_state->indref); 4054 _line_progressed(ind); 4055 return true; 4056 } 4057 else 4058 { 4059 _c4dbgpf("larger indentation ({} > {})!!!", ind, m_state->indref); 4060 _RYML_CB_ASSERT(m_stack.m_callbacks, ind > m_state->indref); 4061 if(has_all(RMAP|RVAL)) 4062 { 4063 if(_is_scalar_next__rmap_val(remt) && (!remt.first_of_any(": ", "? 
")) && (!remt.ends_with(":"))) 4064 { 4065 _c4dbgpf("actually it seems a value: '{}'", remt); 4066 } 4067 else 4068 { 4069 addrem_flags(RKEY, RVAL); 4070 _start_unk(); 4071 //_move_scalar_from_top(); 4072 _line_progressed(ind); 4073 _save_indentation(); 4074 return true; 4075 } 4076 } 4077 else if(has_all(RSEQ|RVAL)) 4078 { 4079 // nothing to do here 4080 } 4081 else 4082 { 4083 _c4err("parse error - indentation should not increase at this point"); 4084 } 4085 } 4086 4087 return false; 4088 } 4089 4090 //----------------------------------------------------------------------------- 4091 csubstr Parser::_scan_comment() 4092 { 4093 csubstr s = m_state->line_contents.rem; 4094 _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('#')); 4095 _line_progressed(s.len); 4096 // skip the # character 4097 s = s.sub(1); 4098 // skip leading whitespace 4099 s = s.right_of(s.first_not_of(' '), /*include_pos*/true); 4100 _c4dbgpf("comment was '{}'", s); 4101 return s; 4102 } 4103 4104 //----------------------------------------------------------------------------- 4105 csubstr Parser::_scan_squot_scalar() 4106 { 4107 // quoted scalars can spread over multiple lines! 4108 // nice explanation here: http://yaml-multiline.info/ 4109 4110 // a span to the end of the file 4111 size_t b = m_state->pos.offset; 4112 substr s = m_buf.sub(b); 4113 if(s.begins_with(' ')) 4114 { 4115 s = s.triml(' '); 4116 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s)); 4117 _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin()); 4118 _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin())); 4119 } 4120 b = m_state->pos.offset; // take this into account 4121 _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('\'')); 4122 4123 // skip the opening quote 4124 _line_progressed(1); 4125 s = s.sub(1); 4126 4127 bool needs_filter = false; 4128 4129 size_t numlines = 1; // we already have one line 4130 size_t pos = npos; // find the pos of the matching quote 4131 while( ! 
_finished_file()) 4132 { 4133 const csubstr line = m_state->line_contents.rem; 4134 bool line_is_blank = true; 4135 _c4dbgpf("scanning single quoted scalar @ line[{}]: ~~~{}~~~", m_state->pos.line, line); 4136 for(size_t i = 0; i < line.len; ++i) 4137 { 4138 const char curr = line.str[i]; 4139 if(curr == '\'') // single quotes are escaped with two single quotes 4140 { 4141 const char next = i+1 < line.len ? line.str[i+1] : '~'; 4142 if(next != '\'') // so just look for the first quote 4143 { // without another after it 4144 pos = i; 4145 break; 4146 } 4147 else 4148 { 4149 needs_filter = true; // needs filter to remove escaped quotes 4150 ++i; // skip the escaped quote 4151 } 4152 } 4153 else if(curr != ' ') 4154 { 4155 line_is_blank = false; 4156 } 4157 } 4158 4159 // leading whitespace also needs filtering 4160 needs_filter = needs_filter 4161 || (numlines > 1) 4162 || line_is_blank 4163 || (_at_line_begin() && line.begins_with(' ')); 4164 4165 if(pos == npos) 4166 { 4167 _line_progressed(line.len); 4168 ++numlines; 4169 } 4170 else 4171 { 4172 _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len); 4173 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '\''); 4174 _line_progressed(pos + 1); // progress beyond the quote 4175 pos = m_state->pos.offset - b - 1; // but we stop before it 4176 break; 4177 } 4178 4179 _line_ended(); 4180 _scan_line(); 4181 } 4182 4183 if(pos == npos) 4184 { 4185 _c4err("reached end of file while looking for closing quote"); 4186 } 4187 else 4188 { 4189 _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0); 4190 _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end()); 4191 _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '\''); 4192 s = s.sub(0, pos-1); 4193 } 4194 4195 if(needs_filter) 4196 { 4197 csubstr ret = _filter_squot_scalar(s); 4198 _RYML_CB_ASSERT(m_stack.m_callbacks, ret.len <= s.len || s.empty() || s.trim(' ').empty()); 4199 _c4dbgpf("final 
scalar: \"{}\"", ret); 4200 return ret; 4201 } 4202 4203 _c4dbgpf("final scalar: \"{}\"", s); 4204 4205 return s; 4206 } 4207 4208 //----------------------------------------------------------------------------- 4209 csubstr Parser::_scan_dquot_scalar() 4210 { 4211 // quoted scalars can spread over multiple lines! 4212 // nice explanation here: http://yaml-multiline.info/ 4213 4214 // a span to the end of the file 4215 size_t b = m_state->pos.offset; 4216 substr s = m_buf.sub(b); 4217 if(s.begins_with(' ')) 4218 { 4219 s = s.triml(' '); 4220 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(b).is_super(s)); 4221 _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(b).begin()); 4222 _line_progressed((size_t)(s.begin() - m_buf.sub(b).begin())); 4223 } 4224 b = m_state->pos.offset; // take this into account 4225 _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('"')); 4226 4227 // skip the opening quote 4228 _line_progressed(1); 4229 s = s.sub(1); 4230 4231 bool needs_filter = false; 4232 4233 size_t numlines = 1; // we already have one line 4234 size_t pos = npos; // find the pos of the matching quote 4235 while( ! _finished_file()) 4236 { 4237 const csubstr line = m_state->line_contents.rem; 4238 bool line_is_blank = true; 4239 _c4dbgpf("scanning double quoted scalar @ line[{}]: line='{}'", m_state->pos.line, line); 4240 for(size_t i = 0; i < line.len; ++i) 4241 { 4242 const char curr = line.str[i]; 4243 if(curr != ' ') 4244 line_is_blank = false; 4245 // every \ is an escape 4246 if(curr == '\\') 4247 { 4248 const char next = i+1 < line.len ? 

//-----------------------------------------------------------------------------
/** Scan a block scalar (literal `|` or folded `>`), starting at its
 * header on the current line.
 *
 * The header is parsed for the chomping indicator (`-` or `+`) and
 * an optional explicit indentation, which is taken relative to
 * m_state->indref. The following lines are then gathered into a raw
 * span of the source buffer until a termination condition is met,
 * and the raw span is handed to _filter_block_scalar().
 *
 * @return the filtered scalar, as returned by _filter_block_scalar() */
csubstr Parser::_scan_block()
{
    // nice explanation here: http://yaml-multiline.info/
    csubstr s = m_state->line_contents.rem;
    csubstr trimmed = s.triml(' ');
    if(trimmed.str > s.str)
    {
        _c4dbgp("skipping whitespace");
        _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= s.str);
        _line_progressed(static_cast<size_t>(trimmed.str - s.str));
        s = trimmed;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('|') || s.begins_with('>'));

    _c4dbgpf("scanning block: specs=\"{}\"", s);

    // parse the spec
    BlockStyle_e newline = s.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL;
    BlockChomp_e chomp = CHOMP_CLIP; // default to clip unless + or - are used
    size_t indentation = npos; // have to find out if no spec is given
    csubstr digits;
    if(s.len > 1)
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with_any("|>"));
        csubstr t = s.sub(1); // everything after the style character
        _c4dbgpf("scanning block: spec is multichar: '{}'", t);
        _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1);
        size_t pos = t.first_of("-+");
        _c4dbgpf("scanning block: spec chomp char at {}", pos);
        if(pos != npos)
        {
            if(t[pos] == '-')
                chomp = CHOMP_STRIP;
            else if(t[pos] == '+')
                chomp = CHOMP_KEEP;
            // keep only the side of the chomp character where the
            // digits may be: after it when it comes first, before it
            // otherwise
            if(pos == 0)
                t = t.sub(1);
            else
                t = t.first(pos);
        }
        // from here to the end, only digits are considered
        digits = t.left_of(t.first_not_of("0123456789"));
        if( ! digits.empty())
        {
            if( ! c4::atou(digits, &indentation))
                _c4err("parse error: could not read decimal");
            _c4dbgpf("scanning block: indentation specified: {}. add {} from curr state -> {}", indentation, m_state->indref, indentation+m_state->indref);
            // the explicit indentation is relative to the current reference
            indentation += m_state->indref;
        }
    }

    // finish the current line
    _line_progressed(s.len);
    _line_ended();
    _scan_line();

    _c4dbgpf("scanning block: style={} chomp={} indentation={}", newline==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation);

    // start with a zero-length block, already pointing at the right place
    substr raw_block(m_buf.data() + m_state->pos.offset, size_t(0));// m_state->line_contents.full.sub(0, 0);
    _RYML_CB_ASSERT(m_stack.m_callbacks, raw_block.begin() == m_state->line_contents.full.begin());

    // read every full line into a raw block,
    // from which newlines are to be stripped as needed.
    //
    // If no explicit indentation was given, pick it from the first
    // non-empty line. See
    // https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator
    size_t num_lines = 0, first = m_state->pos.line, provisional_indentation = npos;
    LineContents lc;
    while(( ! _finished_file()))
    {
        // peek next line, but do not advance immediately
        lc.reset_with_next_line(m_buf, m_state->pos.offset);
        _c4dbgpf("scanning block: peeking at '{}'", lc.stripped);
        // evaluate termination conditions
        if(indentation != npos)
        {
            // the indentation is already known (explicit, or taken
            // from an earlier line)

            // stop when the line is deindented and not empty
            if(lc.indentation < indentation && ( ! lc.rem.trim(" \t\r\n").empty()))
            {
                if(raw_block.len)
                {
                    _c4dbgpf("scanning block: indentation decreased ref={} thisline={}", indentation, lc.indentation);
                }
                else
                {
                    _c4err("indentation decreased without any scalar");
                }
                break;
            }
            else if(indentation == 0)
            {
                // at zero indentation, a document marker also ends the block
                if((lc.rem == "..." || lc.rem.begins_with("... "))
                    ||
                   (lc.rem == "---" || lc.rem.begins_with("--- ")))
                {
                    _c4dbgp("scanning block: stop. indentation=0 and stream ended");
                    break;
                }
            }
        }
        else
        {
            // the indentation is still to be determined
            _c4dbgpf("scanning block: indentation ref not set. firstnonws={}", lc.stripped.first_not_of(' '));
            if(lc.stripped.first_not_of(' ') != npos) // non-empty line
            {
                _c4dbgpf("scanning block: line not empty. indref={} indprov={} indentation={}", m_state->indref, provisional_indentation, lc.indentation);
                if(provisional_indentation == npos)
                {
                    // no empty lines were seen before this one
                    if(lc.indentation < m_state->indref)
                    {
                        _c4dbgpf("scanning block: block terminated indentation={} < indref={}", lc.indentation, m_state->indref);
                        if(raw_block.len == 0)
                        {
                            _c4dbgp("scanning block: was empty, undo next line");
                            _line_ended_undo();
                        }
                        break;
                    }
                    else if(lc.indentation == m_state->indref)
                    {
                        if(has_any(RSEQ|RMAP))
                        {
                            _c4dbgpf("scanning block: block terminated. reading container and indentation={}==indref={}", lc.indentation, m_state->indref);
                            break;
                        }
                    }
                    _c4dbgpf("scanning block: set indentation ref from this line: ref={}", lc.indentation);
                    indentation = lc.indentation;
                }
                else
                {
                    // empty lines came first: this line must be
                    // indented at least as much as they were
                    if(lc.indentation >= provisional_indentation)
                    {
                        _c4dbgpf("scanning block: set indentation ref from provisional indentation: provisional_ref={}, thisline={}", provisional_indentation, lc.indentation);
                        //indentation = provisional_indentation ? provisional_indentation : lc.indentation;
                        indentation = lc.indentation;
                    }
                    else
                    {
                        break;
                        //_c4err("parse error: first non-empty block line should have at least the original indentation");
                    }
                }
            }
            else // empty line
            {
                // track a provisional indentation from the empty lines
                // seen so far; it is used after the loop when no
                // non-empty line settles the indentation
                _c4dbgpf("scanning block: line empty or {} spaces. line_indentation={} prov_indentation={}", lc.stripped.len, lc.indentation, provisional_indentation);
                if(provisional_indentation != npos)
                {
                    if(lc.stripped.len >= provisional_indentation)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.stripped.len);
                        provisional_indentation = lc.stripped.len;
                    }
                    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
                    else if(lc.indentation >= provisional_indentation && lc.indentation != npos)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, lc.indentation);
                        provisional_indentation = lc.indentation;
                    }
                    #endif
                }
                else
                {
                    // note: when the line indentation is zero, the
                    // boolean result of has_any() is converted to size_t
                    provisional_indentation = lc.indentation ? lc.indentation : has_any(RSEQ|RVAL);
                    _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
                    if(provisional_indentation == npos)
                    {
                        provisional_indentation = lc.stripped.len ? lc.stripped.len : has_any(RSEQ|RVAL);
                        _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
                    }
                }
            }
        }
        // advance now that we know the folded scalar continues
        m_state->line_contents = lc;
        _c4dbgpf("scanning block: append '{}'", m_state->line_contents.rem);
        raw_block.len += m_state->line_contents.full.len;
        _line_progressed(m_state->line_contents.rem.len);
        _line_ended();
        ++num_lines;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line == (first + num_lines) || (raw_block.len == 0));
    C4_UNUSED(num_lines);
    C4_UNUSED(first);

    if(indentation == npos)
    {
        // no non-empty line settled the indentation
        _c4dbgpf("scanning block: set indentation from provisional: {}", provisional_indentation);
        indentation = provisional_indentation;
    }

    // undo the line end of the last line taken into the block
    if(num_lines)
        _line_ended_undo();

    _c4dbgpf("scanning block: raw=~~~{}~~~", raw_block);

    // ok! now we strip the newlines and spaces according to the specs
    s = _filter_block_scalar(raw_block, newline, chomp, indentation);

    _c4dbgpf("scanning block: final=~~~{}~~~", s);

    return s;
}
now we strip the newlines and spaces according to the specs 4510 s = _filter_block_scalar(raw_block, newline, chomp, indentation); 4511 4512 _c4dbgpf("scanning block: final=~~~{}~~~", s); 4513 4514 return s; 4515 } 4516 4517 4518 //----------------------------------------------------------------------------- 4519 4520 template<bool backslash_is_escape, bool keep_trailing_whitespace> 4521 bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation) 4522 { 4523 // a debugging scaffold: 4524 #if 0 4525 #define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__) 4526 #else 4527 #define _c4dbgfnl(...) 4528 #endif 4529 4530 const char curr = r[*i]; 4531 bool replaced = false; 4532 4533 _RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos); 4534 _RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n'); 4535 4536 _c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); 4537 size_t ii = *i; 4538 size_t numnl_following = count_following_newlines(r, &ii, indentation); 4539 if(numnl_following) 4540 { 4541 _c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i); 4542 for(size_t j = 0; j < numnl_following; ++j) 4543 m_filter_arena.str[(*pos)++] = '\n'; 4544 } 4545 else 4546 { 4547 if(r.first_not_of(" \t", *i+1) != npos) 4548 { 4549 m_filter_arena.str[(*pos)++] = ' '; 4550 _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); 4551 replaced = true; 4552 } 4553 else 4554 { 4555 if C4_IF_CONSTEXPR (keep_trailing_whitespace) 4556 { 4557 m_filter_arena.str[(*pos)++] = ' '; 4558 _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos)); 4559 replaced = true; 4560 } 4561 else 4562 { 4563 _c4dbgfnl("last newline, everything else is whitespace. 
ii={}/{}", ii, r.len); 4564 *i = r.len; 4565 } 4566 } 4567 if C4_IF_CONSTEXPR (backslash_is_escape) 4568 { 4569 if(ii < r.len && r.str[ii] == '\\') 4570 { 4571 const char next = ii+1 < r.len ? r.str[ii+1] : '\0'; 4572 if(next == ' ' || next == '\t') 4573 { 4574 _c4dbgfnl("extend skip to backslash{}", ""); 4575 ++ii; 4576 } 4577 } 4578 } 4579 } 4580 *i = ii - 1; // correct for the loop increment 4581 4582 #undef _c4dbgfnl 4583 4584 return replaced; 4585 } 4586 4587 4588 //----------------------------------------------------------------------------- 4589 4590 template<bool keep_trailing_whitespace> 4591 void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos) 4592 { 4593 // a debugging scaffold: 4594 #if 0 4595 #define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__) 4596 #else 4597 #define _c4dbgfws(...) 4598 #endif 4599 4600 const char curr = r[*i]; 4601 _c4dbgfws("found whitespace '{}'", _c4prc(curr)); 4602 _RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t'); 4603 4604 size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i); 4605 if(first != npos) 4606 { 4607 if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace 4608 { 4609 _c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first); 4610 *i = first - 1; // correct for the loop increment 4611 } 4612 else // a legit whitespace 4613 { 4614 m_filter_arena.str[(*pos)++] = curr; 4615 _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos)); 4616 } 4617 } 4618 else 4619 { 4620 _c4dbgfws("... 
everything else is trailing whitespace{}", ""); 4621 if C4_IF_CONSTEXPR (keep_trailing_whitespace) 4622 for(size_t j = *i; j < r.len; ++j) 4623 m_filter_arena.str[(*pos)++] = r[j]; 4624 *i = r.len; 4625 } 4626 4627 #undef _c4dbgfws 4628 } 4629 4630 4631 //----------------------------------------------------------------------------- 4632 csubstr Parser::_filter_plain_scalar(substr s, size_t indentation) 4633 { 4634 // a debugging scaffold: 4635 #if 0 4636 #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__) 4637 #else 4638 #define _c4dbgfps(...) 4639 #endif 4640 4641 _c4dbgfps("before=~~~{}~~~", s); 4642 4643 substr r = s.triml(" \t"); 4644 _grow_filter_arena(r.len); 4645 size_t pos = 0; // the filtered size 4646 bool filtered_chars = false; 4647 for(size_t i = 0; i < r.len; ++i) 4648 { 4649 const char curr = r.str[i]; 4650 _c4dbgfps("[{}]: '{}'", i, _c4prc(curr)); 4651 if(curr == ' ' || curr == '\t') 4652 { 4653 _filter_ws</*keep_trailing_ws*/false>(r, &i, &pos); 4654 } 4655 else if(curr == '\n') 4656 { 4657 filtered_chars = _filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/false>(r, &i, &pos, indentation); 4658 } 4659 else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 4660 { 4661 ; 4662 } 4663 else 4664 { 4665 m_filter_arena.str[pos++] = r[i]; 4666 } 4667 } 4668 4669 _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); 4670 if(pos < r.len || filtered_chars) 4671 { 4672 r = _finish_filter_arena(r, pos); 4673 } 4674 4675 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); 4676 _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r); 4677 4678 #undef _c4dbgfps 4679 return r; 4680 } 4681 4682 4683 //----------------------------------------------------------------------------- 4684 csubstr Parser::_filter_squot_scalar(substr s) 4685 { 4686 // a debugging scaffold: 4687 #if 0 4688 #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar") 4689 #else 4690 #define _c4dbgfsq(...) 
4691 #endif 4692 4693 // from the YAML spec for double-quoted scalars: 4694 // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted 4695 4696 _c4dbgfsq(": before=~~~{}~~~", s); 4697 4698 _grow_filter_arena(s.len); 4699 substr r = s; 4700 size_t pos = 0; // the filtered size 4701 bool filtered_chars = false; 4702 for(size_t i = 0; i < r.len; ++i) 4703 { 4704 const char curr = r[i]; 4705 _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr)); 4706 if(curr == ' ' || curr == '\t') 4707 { 4708 _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos); 4709 } 4710 else if(curr == '\n') 4711 { 4712 filtered_chars = _filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0); 4713 } 4714 else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 4715 { 4716 ; 4717 } 4718 else if(curr == '\'') 4719 { 4720 char next = i+1 < r.len ? r[i+1] : '\0'; 4721 if(next == '\'') 4722 { 4723 _c4dbgfsq("[{}]: two consecutive quotes", i); 4724 filtered_chars = true; 4725 m_filter_arena.str[pos++] = '\''; 4726 ++i; 4727 } 4728 } 4729 else 4730 { 4731 m_filter_arena.str[pos++] = curr; 4732 } 4733 } 4734 4735 _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); 4736 if(pos < r.len || filtered_chars) 4737 { 4738 r = _finish_filter_arena(r, pos); 4739 } 4740 4741 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); 4742 _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); 4743 4744 #undef _c4dbgfsq 4745 return r; 4746 } 4747 4748 4749 //----------------------------------------------------------------------------- 4750 csubstr Parser::_filter_dquot_scalar(substr s) 4751 { 4752 // a debugging scaffold: 4753 #if 0 4754 #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__) 4755 #else 4756 #define _c4dbgfdq(...) 
4757 #endif 4758 4759 _c4dbgfdq(": before=~~~{}~~~", s); 4760 4761 // from the YAML spec for double-quoted scalars: 4762 // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted 4763 // 4764 // All leading and trailing white space characters are excluded 4765 // from the content. Each continuation line must therefore contain 4766 // at least one non-space character. Empty lines, if any, are 4767 // consumed as part of the line folding. 4768 4769 _grow_filter_arena(s.len + 2u * s.count('\\')); 4770 substr r = s; 4771 size_t pos = 0; // the filtered size 4772 bool filtered_chars = false; 4773 for(size_t i = 0; i < r.len; ++i) 4774 { 4775 const char curr = r[i]; 4776 _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr)); 4777 if(curr == ' ' || curr == '\t') 4778 { 4779 _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos); 4780 } 4781 else if(curr == '\n') 4782 { 4783 filtered_chars = _filter_nl</*backslash_is_escape*/true, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0); 4784 } 4785 else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900 4786 { 4787 ; 4788 } 4789 else if(curr == '\\') 4790 { 4791 char next = i+1 < r.len ? 
r[i+1] : '\0'; 4792 _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next)); 4793 filtered_chars = true; 4794 if(next == '\r') 4795 { 4796 if(i+2 < r.len && r[i+2] == '\n') 4797 { 4798 ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented) 4799 next = '\n'; 4800 _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i); 4801 } 4802 } 4803 // remember the loop will also increment i 4804 if(next == '\n') 4805 { 4806 size_t ii = i + 2; 4807 for( ; ii < r.len; ++ii) 4808 { 4809 if(r.str[ii] == ' ' || r.str[ii] == '\t') // skip leading whitespace 4810 ; 4811 else 4812 break; 4813 } 4814 i += ii - i - 1; 4815 } 4816 else if(next == '"' || next == '/' || next == ' ' || next == '\t') // escapes for json compatibility 4817 { 4818 m_filter_arena.str[pos++] = next; 4819 ++i; 4820 } 4821 else if(next == '\r') 4822 { 4823 //++i; 4824 } 4825 else if(next == 'n') 4826 { 4827 m_filter_arena.str[pos++] = '\n'; 4828 ++i; 4829 } 4830 else if(next == 'r') 4831 { 4832 m_filter_arena.str[pos++] = '\r'; 4833 ++i; // skip 4834 } 4835 else if(next == 't') 4836 { 4837 m_filter_arena.str[pos++] = '\t'; 4838 ++i; 4839 } 4840 else if(next == '\\') 4841 { 4842 m_filter_arena.str[pos++] = '\\'; 4843 ++i; 4844 } 4845 else if(next == 'x') // UTF8 4846 { 4847 if(i + 1u + 2u >= r.len) 4848 _c4err("\\x requires 2 hex digits"); 4849 uint8_t byteval = {}; 4850 if(!read_hex(r.sub(i + 2u, 2u), &byteval)) 4851 _c4err("failed to read \\x codepoint"); 4852 m_filter_arena.str[pos++] = *(char*)&byteval; 4853 i += 1u + 2u; 4854 } 4855 else if(next == 'u') // UTF16 4856 { 4857 if(i + 1u + 4u >= r.len) 4858 _c4err("\\u requires 4 hex digits"); 4859 char readbuf[8]; 4860 csubstr codepoint = r.sub(i + 2u, 4u); 4861 uint32_t codepoint_val = {}; 4862 if(!read_hex(codepoint, &codepoint_val)) 4863 _c4err("failed to parse \\u codepoint"); 4864 size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); 4865 C4_ASSERT(numbytes <= 4); 4866 memcpy(m_filter_arena.str + pos, 
readbuf, numbytes); 4867 pos += numbytes; 4868 i += 1u + 4u; 4869 } 4870 else if(next == 'U') // UTF32 4871 { 4872 if(i + 1u + 8u >= r.len) 4873 _c4err("\\U requires 8 hex digits"); 4874 char readbuf[8]; 4875 csubstr codepoint = r.sub(i + 2u, 8u); 4876 uint32_t codepoint_val = {}; 4877 if(!read_hex(codepoint, &codepoint_val)) 4878 _c4err("failed to parse \\U codepoint"); 4879 size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); 4880 C4_ASSERT(numbytes <= 4); 4881 memcpy(m_filter_arena.str + pos, readbuf, numbytes); 4882 pos += numbytes; 4883 i += 1u + 8u; 4884 } 4885 // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char 4886 else if(next == '0') 4887 { 4888 m_filter_arena.str[pos++] = '\0'; 4889 ++i; 4890 } 4891 else if(next == 'b') // backspace 4892 { 4893 m_filter_arena.str[pos++] = '\b'; 4894 ++i; 4895 } 4896 else if(next == 'f') // form feed 4897 { 4898 m_filter_arena.str[pos++] = '\f'; 4899 ++i; 4900 } 4901 else if(next == 'a') // bell character 4902 { 4903 m_filter_arena.str[pos++] = '\a'; 4904 ++i; 4905 } 4906 else if(next == 'v') // vertical tab 4907 { 4908 m_filter_arena.str[pos++] = '\v'; 4909 ++i; 4910 } 4911 else if(next == 'e') // escape character 4912 { 4913 m_filter_arena.str[pos++] = '\x1b'; 4914 ++i; 4915 } 4916 else if(next == '_') // unicode non breaking space \u00a0 4917 { 4918 // https://www.compart.com/en/unicode/U+00a0 4919 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); 4920 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0); 4921 ++i; 4922 } 4923 else if(next == 'N') // unicode next line \u0085 4924 { 4925 // https://www.compart.com/en/unicode/U+0085 4926 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2); 4927 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85); 4928 ++i; 4929 } 4930 else if(next == 'L') // unicode line separator \u2028 4931 { 4932 // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex 4933 m_filter_arena.str[pos++] 
= _RYML_CHCONST(-0x1e, 0xe2); 4934 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); 4935 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8); 4936 ++i; 4937 } 4938 else if(next == 'P') // unicode paragraph separator \u2029 4939 { 4940 // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex 4941 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2); 4942 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80); 4943 m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9); 4944 ++i; 4945 } 4946 _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos)); 4947 } 4948 else 4949 { 4950 m_filter_arena.str[pos++] = curr; 4951 } 4952 } 4953 4954 _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); 4955 if(pos < r.len || filtered_chars) 4956 { 4957 r = _finish_filter_arena(r, pos); 4958 } 4959 4960 _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len); 4961 _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r); 4962 4963 #undef _c4dbgfdq 4964 4965 return r; 4966 } 4967 4968 4969 //----------------------------------------------------------------------------- 4970 bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp) 4971 { 4972 substr trimmed = buf.first(*pos).trimr('\n'); 4973 bool added_newline = false; 4974 switch(chomp) 4975 { 4976 case CHOMP_KEEP: 4977 if(trimmed.len == *pos) 4978 { 4979 _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos); 4980 //m_filter_arena.str[(*pos)++] = '\n'; 4981 added_newline = true; 4982 } 4983 break; 4984 case CHOMP_CLIP: 4985 if(trimmed.len == *pos) 4986 { 4987 _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos); 4988 m_filter_arena.str[(*pos)++] = '\n'; 4989 added_newline = true; 4990 } 4991 else 4992 { 4993 _c4dbgpf("chomp=CLIP: include single trailing newline @{}", trimmed.len+1); 4994 *pos = trimmed.len + 1; 4995 } 4996 break; 4997 case CHOMP_STRIP: 4998 _c4dbgpf("chomp=STRIP: strip {}-{}-{} 


//-----------------------------------------------------------------------------
/** Filter the raw contents of a block scalar.
 *
 * For BLOCK_LITERAL, the indentation is removed from each line and
 * the newlines are kept. For BLOCK_FOLD, the indentation is removed
 * and line breaks between lines at the reference indentation are
 * folded. In both cases carriage returns are dropped, and the trailing
 * newlines are then settled by _apply_chomp().
 *
 * @param s the raw lines of the block
 * @param style BLOCK_LITERAL or BLOCK_FOLD
 * @param chomp the chomping mode
 * @param indentation the indentation to remove from each line
 * @return the filtered scalar */
csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block" fmt, __VA_ARGS__)
    #else
    #define _c4dbgfbl(...)
    #endif

    _c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s);

    // unless the trailing newlines are to be kept, a block with only
    // spaces and line breaks is empty
    if(chomp != CHOMP_KEEP && s.trim(" \n\r").len == 0u)
    {
        _c4dbgp("filt_block: empty scalar");
        return s.first(0);
    }

    substr r = s;

    switch(style)
    {
    case BLOCK_LITERAL:
        {
            _c4dbgp("filt_block: style=literal");
            // trim leading whitespace up to indentation
            {
                size_t numws = r.first_not_of(' ');
                if(numws != npos)
                {
                    if(numws > indentation)
                        r = r.sub(indentation);
                    else
                        r = r.sub(numws);
                    _c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r);
                }
                else
                {
                    // the block has nothing but spaces
                    if(chomp != CHOMP_KEEP || r.len == 0)
                    {
                        _c4dbgfbl(": all spaces {}, return empty", r.len);
                        return r.first(0);
                    }
                    else
                    {
                        // the result is a single newline, written in place
                        r[0] = '\n';
                        return r.first(1);
                    }
                }
            }
            _grow_filter_arena(s.len + 2u);  // use s.len! because we may need to add a newline at the end, so the leading indentation will allow space for that newline
            size_t pos = 0; // the filtered size
            for(size_t i = 0; i < r.len; ++i)
            {
                const char curr = r.str[i];
                _c4dbgfbl("[{}]='{}'  pos={}", i, _c4prc(curr), pos);
                if(curr == '\r')
                    continue;
                m_filter_arena.str[pos++] = curr;
                if(curr == '\n')
                {
                    _c4dbgfbl("[{}]: found newline", i);
                    // skip indentation on the next line
                    csubstr rem = r.sub(i+1);
                    size_t first = rem.first_not_of(' ');
                    if(first != npos)
                    {
                        _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
                        _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
                        _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]);
                        if(first < indentation)
                        {
                            _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
                            i += first;
                        }
                        else
                        {
                            _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
                            i += indentation;
                        }
                    }
                    else
                    {
                        // only spaces (or nothing) up to the end of the block
                        _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
                        first = rem.len;
                        _c4dbgfbl("[{}]: {} spaces to the end", i, first);
                        if(first)
                        {
                            if(first < indentation)
                            {
                                _c4dbgfbl("[{}]: skip everything", i);
                                --pos; // take back the newline just written
                                break;
                            }
                            else
                            {
                                _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
                                i += indentation;
                            }
                        }
                        else if(i+1 == r.len)
                        {
                            // this was the last character of the block
                            if(chomp == CHOMP_STRIP)
                                --pos;
                            break;
                        }
                    }
                }
            }
            _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos);
            _c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
            bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len);
            if(pos < r.len || changed)
            {
                r = _finish_filter_arena(s, pos); // write into s
            }
            break;
        }
    case BLOCK_FOLD:
        {
            _c4dbgp("filt_block: style=fold");
            _grow_filter_arena(r.len + 2);
            size_t pos = 0; // the filtered size
            bool filtered_chars = false;
            bool started = false;     // set when a character other than a line break, \r or tab is copied
            bool is_indented = false; // set while the lines are indented beyond the reference indentation
            size_t i = r.first_not_of(' ');
            _c4dbgfbl(": first non space at {}", i);
            if(i > indentation)
            {
                is_indented = true;
                i = indentation;
            }
            _c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented);
            // used when the indentation changes between lines: writes
            // every newline (no folding), then copies the whitespace
            // which lies beyond the reference indentation, and leaves
            // i right before the first non-whitespace character
            auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){
                _c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following);
                for(size_t j = 0; j < 1 + numnl_following; ++j)
                    m_filter_arena.str[pos++] = '\n';
                for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i)
                {
                    if(r.str[i] == '\r')
                        continue;
                    _c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i]));
                    m_filter_arena.str[pos++] = r.str[i];
                }
                --i;
            };
            for( ; i < r.len; ++i)
            {
                const char curr = r.str[i];
                _c4dbgfbl("[{}]='{}'", i, _c4prc(curr));
                if(curr == '\n')
                {
                    filtered_chars = true;
                    // skip indentation on the next line, and advance over the next non-indented blank lines as well
                    size_t first_non_whitespace;
                    // starts at -1 so that the first newline is not counted
                    size_t numnl_following = (size_t)-1;
                    while(r[i] == '\n')
                    {
                        ++numnl_following;
                        csubstr rem = r.sub(i+1);
                        size_t first = rem.first_not_of(' ');
                        _c4dbgfbl("[{}]: found newline. first={} rem.len={}", i, first, rem.len);
                        if(first != npos)
                        {
                            first_non_whitespace = first + i+1;
                            while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r')
                                ++first_non_whitespace;
                            _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
                            _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
                            _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first]));
                            if(first < indentation)
                            {
                                _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
                                i += first;
                            }
                            else
                            {
                                _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
                                i += indentation;
                                if(first > indentation)
                                {
                                    _c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation);
                                    goto finished_counting_newlines;
                                }
                            }
                            // prepare the next while loop iteration
                            // by setting i at the next newline after
                            // an empty line
                            if(r[first_non_whitespace] == '\n')
                                i = first_non_whitespace;
                            else
                                goto finished_counting_newlines;
                        }
                        else
                        {
                            // only spaces (or nothing) up to the end of the block
                            _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
                            first = rem.len;
                            first_non_whitespace = first + i+1;
                            if(first)
                            {
                                _c4dbgfbl("[{}]: {} spaces to the end", i, first);
                                if(first < indentation)
                                {
                                    _c4dbgfbl("[{}]: skip everything", i);
                                    i += first;
                                }
                                else
                                {
                                    _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
                                    i += indentation;
                                    if(first > indentation)
                                    {
                                        _c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first);
                                        goto finished_counting_newlines;
                                    }
                                }
                            }
                            else // if(i+1 == r.len)
                            {
                                _c4dbgfbl("[{}]: it's the final newline", i);
                                _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len);
                                _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0);
                            }
                            goto end_of_scalar;
                        }
                    }
                end_of_scalar:
                    // Write all the trailing newlines. Since we're
                    // at the end no folding is needed, so write every
                    // newline (add 1).
                    _c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following);
                    for(size_t j = 0; j < 1 + numnl_following; ++j)
                        m_filter_arena.str[pos++] = '\n';
                    break;
                finished_counting_newlines:
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
                    // tabs after the spaces are taken as part of the leading whitespace
                    while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t')
                        ++first_non_whitespace;
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
                    _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len);
                    size_t last_newl = r.last_of('\n', first_non_whitespace);
                    size_t this_indentation = first_non_whitespace - last_newl - 1;
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation);
                    _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1);
                    _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation);
                    if(!started)
                    {
                        // nothing was written yet: leading newlines are not folded
                        _c4dbgfbl("[{}]: #newlines={}. write all leading newlines", i, numnl_following);
                        for(size_t j = 0; j < 1 + numnl_following; ++j)
                            m_filter_arena.str[pos++] = '\n';
                        if(this_indentation > indentation)
                        {
                            is_indented = true;
                            _c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation);
                            i = last_newl + indentation;
                        }
                        else
                        {
                            i = first_non_whitespace - 1;
                            _c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace);
                        }
                    }
                    else if(this_indentation == indentation)
                    {
                        _c4dbgfbl("[{}]: same indentation", i);
                        if(!is_indented)
                        {
                            // a single line break folds into a space;
                            // with several, the first one is dropped
                            if(numnl_following == 0)
                            {
                                _c4dbgfbl("[{}]: fold!", i);
                                m_filter_arena.str[pos++] = ' ';
                            }
                            else
                            {
                                _c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following);
                                for(size_t j = 0; j < numnl_following; ++j)
                                    m_filter_arena.str[pos++] = '\n';
                            }
                            i = first_non_whitespace - 1;
                            _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
                        }
                        else
                        {
                            _c4dbgfbl("[{}]: back to ref indentation", i);
                            is_indented = false;
                            on_change_indentation(numnl_following, last_newl, first_non_whitespace);
                            _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
                        }
                    }
                    else
                    {
                        _c4dbgfbl("[{}]: increased indentation.", i);
                        is_indented = true;
                        _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation);
                        on_change_indentation(numnl_following, last_newl, first_non_whitespace);
                        _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
                    }
                }
                else if(curr != '\r')
                {
                    if(curr != '\t')
                        started = true;
                    m_filter_arena.str[pos++] = curr;
                }
            }
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
            _c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos));
            bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
            if(pos < r.len || filtered_chars || changed)
            {
                r = _finish_filter_arena(s, pos); // write into s
            }
        }
        break;
    default:
        _c4err("unknown block style");
    }

    _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r);

    #undef _c4dbgfbl

    return r;
}
5321 r = _finish_filter_arena(s, pos); // write into s 5322 } 5323 } 5324 break; 5325 default: 5326 _c4err("unknown block style"); 5327 } 5328 5329 _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r); 5330 5331 #undef _c4dbgfbl 5332 5333 return r; 5334 } 5335 5336 //----------------------------------------------------------------------------- 5337 size_t Parser::_count_nlines(csubstr src) 5338 { 5339 return 1 + src.count('\n'); 5340 } 5341 5342 //----------------------------------------------------------------------------- 5343 void Parser::_handle_directive(csubstr directive_) 5344 { 5345 csubstr directive = directive_; 5346 if(directive.begins_with("%TAG")) 5347 { 5348 TagDirective td; 5349 _c4dbgpf("%TAG directive: {}", directive_); 5350 directive = directive.sub(4); 5351 if(!directive.begins_with(' ')) 5352 _c4err("malformed tag directive: {}", directive_); 5353 directive = directive.triml(' '); 5354 size_t pos = directive.find(' '); 5355 if(pos == npos) 5356 _c4err("malformed tag directive: {}", directive_); 5357 td.handle = directive.first(pos); 5358 directive = directive.sub(td.handle.len).triml(' '); 5359 pos = directive.find(' '); 5360 if(pos != npos) 5361 directive = directive.first(pos); 5362 td.prefix = directive; 5363 td.next_node_id = m_tree->size(); 5364 if(m_tree->size() > 0) 5365 { 5366 size_t prev = m_tree->size() - 1; 5367 if(m_tree->is_root(prev) && m_tree->type(prev) != NOTYPE && !m_tree->is_stream(prev)) 5368 ++td.next_node_id; 5369 } 5370 _c4dbgpf("%TAG: handle={} prefix={} next_node={}", td.handle, td.prefix, td.next_node_id); 5371 m_tree->add_tag_directive(td); 5372 } 5373 else if(directive.begins_with("%YAML")) 5374 { 5375 _c4dbgpf("%YAML directive! 
ignoring...: {}", directive); 5376 } 5377 } 5378 5379 //----------------------------------------------------------------------------- 5380 void Parser::set_flags(flag_t f, State * s) 5381 { 5382 #ifdef RYML_DBG 5383 char buf1_[64], buf2_[64]; 5384 csubstr buf1 = _prfl(buf1_, f); 5385 csubstr buf2 = _prfl(buf2_, s->flags); 5386 _c4dbgpf("state[{}]: setting flags to {}: before={}", s-m_stack.begin(), buf1, buf2); 5387 #endif 5388 s->flags = f; 5389 } 5390 5391 void Parser::add_flags(flag_t on, State * s) 5392 { 5393 #ifdef RYML_DBG 5394 char buf1_[64], buf2_[64], buf3_[64]; 5395 csubstr buf1 = _prfl(buf1_, on); 5396 csubstr buf2 = _prfl(buf2_, s->flags); 5397 csubstr buf3 = _prfl(buf3_, s->flags|on); 5398 _c4dbgpf("state[{}]: adding flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3); 5399 #endif 5400 s->flags |= on; 5401 } 5402 5403 void Parser::addrem_flags(flag_t on, flag_t off, State * s) 5404 { 5405 #ifdef RYML_DBG 5406 char buf1_[64], buf2_[64], buf3_[64], buf4_[64]; 5407 csubstr buf1 = _prfl(buf1_, on); 5408 csubstr buf2 = _prfl(buf2_, off); 5409 csubstr buf3 = _prfl(buf3_, s->flags); 5410 csubstr buf4 = _prfl(buf4_, ((s->flags|on)&(~off))); 5411 _c4dbgpf("state[{}]: adding flags {} / removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3, buf4); 5412 #endif 5413 s->flags |= on; 5414 s->flags &= ~off; 5415 } 5416 5417 void Parser::rem_flags(flag_t off, State * s) 5418 { 5419 #ifdef RYML_DBG 5420 char buf1_[64], buf2_[64], buf3_[64]; 5421 csubstr buf1 = _prfl(buf1_, off); 5422 csubstr buf2 = _prfl(buf2_, s->flags); 5423 csubstr buf3 = _prfl(buf3_, s->flags&(~off)); 5424 _c4dbgpf("state[{}]: removing flags {}: before={} after={}", s-m_stack.begin(), buf1, buf2, buf3); 5425 #endif 5426 s->flags &= ~off; 5427 } 5428 5429 //----------------------------------------------------------------------------- 5430 5431 csubstr Parser::_prfl(substr buf, flag_t flags) 5432 { 5433 size_t pos = 0; 5434 bool gotone = false; 5435 5436 #define 
_prflag(fl) \ 5437 if((flags & fl) == (fl)) \ 5438 { \ 5439 if(gotone) \ 5440 { \ 5441 if(pos + 1 < buf.len) \ 5442 buf[pos] = '|'; \ 5443 ++pos; \ 5444 } \ 5445 csubstr fltxt = #fl; \ 5446 if(pos + fltxt.len <= buf.len) \ 5447 memcpy(buf.str + pos, fltxt.str, fltxt.len); \ 5448 pos += fltxt.len; \ 5449 gotone = true; \ 5450 } 5451 5452 _prflag(RTOP); 5453 _prflag(RUNK); 5454 _prflag(RMAP); 5455 _prflag(RSEQ); 5456 _prflag(FLOW); 5457 _prflag(QMRK); 5458 _prflag(RKEY); 5459 _prflag(RVAL); 5460 _prflag(RNXT); 5461 _prflag(SSCL); 5462 _prflag(QSCL); 5463 _prflag(RSET); 5464 _prflag(NDOC); 5465 _prflag(RSEQIMAP); 5466 5467 #undef _prflag 5468 5469 RYML_ASSERT(pos <= buf.len); 5470 5471 return buf.first(pos); 5472 } 5473 5474 5475 //----------------------------------------------------------------------------- 5476 //----------------------------------------------------------------------------- 5477 //----------------------------------------------------------------------------- 5478 5479 void Parser::_grow_filter_arena(size_t num_characters_needed) 5480 { 5481 _c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed); 5482 if(num_characters_needed <= m_filter_arena.len) 5483 return; 5484 size_t sz = m_filter_arena.len << 1; 5485 _c4dbgpf("grow: sz={}", sz); 5486 sz = num_characters_needed > sz ? num_characters_needed : sz; 5487 _c4dbgpf("grow: sz={}", sz); 5488 sz = sz < 128u ? 
128u : sz; 5489 _c4dbgpf("grow: sz={}", sz); 5490 _RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed); 5491 _resize_filter_arena(sz); 5492 } 5493 5494 void Parser::_resize_filter_arena(size_t num_characters) 5495 { 5496 if(num_characters > m_filter_arena.len) 5497 { 5498 _c4dbgpf("resize: sz={}", num_characters); 5499 char *prev = m_filter_arena.str; 5500 if(m_filter_arena.str) 5501 { 5502 _RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0); 5503 _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len); 5504 } 5505 m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev); 5506 m_filter_arena.len = num_characters; 5507 } 5508 } 5509 5510 substr Parser::_finish_filter_arena(substr dst, size_t pos) 5511 { 5512 _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len); 5513 _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len); 5514 memcpy(dst.str, m_filter_arena.str, pos); 5515 return dst.first(pos); 5516 } 5517 5518 5519 //----------------------------------------------------------------------------- 5520 //----------------------------------------------------------------------------- 5521 //----------------------------------------------------------------------------- 5522 5523 csubstr Parser::location_contents(Location const& loc) const 5524 { 5525 _RYML_CB_ASSERT(m_stack.m_callbacks, loc.offset < m_buf.len); 5526 return m_buf.sub(loc.offset); 5527 } 5528 5529 Location Parser::location(ConstNodeRef node) const 5530 { 5531 _RYML_CB_ASSERT(m_stack.m_callbacks, node.valid()); 5532 return location(*node.tree(), node.id()); 5533 } 5534 5535 Location Parser::location(Tree const& tree, size_t node) const 5536 { 5537 // try hard to avoid getting the location from a null string. 
5538 Location loc; 5539 if(_location_from_node(tree, node, &loc, 0)) 5540 return loc; 5541 return val_location(m_buf.str); 5542 } 5543 5544 bool Parser::_location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const 5545 { 5546 if(tree.has_key(node)) 5547 { 5548 csubstr k = tree.key(node); 5549 if(C4_LIKELY(k.str != nullptr)) 5550 { 5551 _RYML_CB_ASSERT(m_stack.m_callbacks, k.is_sub(m_buf)); 5552 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(k)); 5553 *loc = val_location(k.str); 5554 return true; 5555 } 5556 } 5557 5558 if(tree.has_val(node)) 5559 { 5560 csubstr v = tree.val(node); 5561 if(C4_LIKELY(v.str != nullptr)) 5562 { 5563 _RYML_CB_ASSERT(m_stack.m_callbacks, v.is_sub(m_buf)); 5564 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(v)); 5565 *loc = val_location(v.str); 5566 return true; 5567 } 5568 } 5569 5570 if(tree.is_container(node)) 5571 { 5572 if(_location_from_cont(tree, node, loc)) 5573 return true; 5574 } 5575 5576 if(tree.type(node) != NOTYPE && level == 0) 5577 { 5578 // try the prev sibling 5579 { 5580 const size_t prev = tree.prev_sibling(node); 5581 if(prev != NONE) 5582 { 5583 if(_location_from_node(tree, prev, loc, level+1)) 5584 return true; 5585 } 5586 } 5587 // try the next sibling 5588 { 5589 const size_t next = tree.next_sibling(node); 5590 if(next != NONE) 5591 { 5592 if(_location_from_node(tree, next, loc, level+1)) 5593 return true; 5594 } 5595 } 5596 // try the parent 5597 { 5598 const size_t parent = tree.parent(node); 5599 if(parent != NONE) 5600 { 5601 if(_location_from_node(tree, parent, loc, level+1)) 5602 return true; 5603 } 5604 } 5605 } 5606 5607 return false; 5608 } 5609 5610 bool Parser::_location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const 5611 { 5612 _RYML_CB_ASSERT(m_stack.m_callbacks, tree.is_container(node)); 5613 if(!tree.is_stream(node)) 5614 { 5615 const char *node_start = tree._p(node)->m_val.scalar.str; // this was stored in the container 5616 
if(tree.has_children(node)) 5617 { 5618 size_t child = tree.first_child(node); 5619 if(tree.has_key(child)) 5620 { 5621 // when a map starts, the container was set after the key 5622 csubstr k = tree.key(child); 5623 if(k.str && node_start > k.str) 5624 node_start = k.str; 5625 } 5626 } 5627 *loc = val_location(node_start); 5628 return true; 5629 } 5630 else // it's a stream 5631 { 5632 *loc = val_location(m_buf.str); // just return the front of the buffer 5633 } 5634 return true; 5635 } 5636 5637 5638 Location Parser::val_location(const char *val) const 5639 { 5640 if(C4_UNLIKELY(val == nullptr)) 5641 return {m_file, 0, 0, 0}; 5642 5643 _RYML_CB_CHECK(m_stack.m_callbacks, m_options.locations()); 5644 // NOTE: if any of these checks fails, the parser needs to be 5645 // instantiated with locations enabled. 5646 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.str == m_newline_offsets_buf.str); 5647 _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.len == m_newline_offsets_buf.len); 5648 _RYML_CB_ASSERT(m_stack.m_callbacks, m_options.locations()); 5649 _RYML_CB_ASSERT(m_stack.m_callbacks, !_locations_dirty()); 5650 _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets != nullptr); 5651 _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size > 0); 5652 // NOTE: the pointer needs to belong to the buffer that was used to parse. 5653 csubstr src = m_buf; 5654 _RYML_CB_CHECK(m_stack.m_callbacks, val != nullptr || src.str == nullptr); 5655 _RYML_CB_CHECK(m_stack.m_callbacks, (val >= src.begin() && val <= src.end()) || (src.str == nullptr && val == nullptr)); 5656 // ok. search the first stored newline after the given ptr 5657 using lineptr_type = size_t const* C4_RESTRICT; 5658 lineptr_type lineptr = nullptr; 5659 size_t offset = (size_t)(val - src.begin()); 5660 if(m_newline_offsets_size < 30) // TODO magic number 5661 { 5662 // just do a linear search if the size is small. 
5663 for(lineptr_type curr = m_newline_offsets, last = m_newline_offsets + m_newline_offsets_size; curr < last; ++curr) 5664 { 5665 if(*curr > offset) 5666 { 5667 lineptr = curr; 5668 break; 5669 } 5670 } 5671 } 5672 else 5673 { 5674 // do a bisection search if the size is not small. 5675 // 5676 // We could use std::lower_bound but this is simple enough and 5677 // spares the include of <algorithm>. 5678 size_t count = m_newline_offsets_size; 5679 size_t step; 5680 lineptr_type it; 5681 lineptr = m_newline_offsets; 5682 while(count) 5683 { 5684 step = count >> 1; 5685 it = lineptr + step; 5686 if(*it < offset) 5687 { 5688 lineptr = ++it; 5689 count -= step + 1; 5690 } 5691 else 5692 { 5693 count = step; 5694 } 5695 } 5696 } 5697 _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr >= m_newline_offsets); 5698 _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr <= m_newline_offsets + m_newline_offsets_size); 5699 _RYML_CB_ASSERT(m_stack.m_callbacks, *lineptr > offset); 5700 Location loc; 5701 loc.name = m_file; 5702 loc.offset = offset; 5703 loc.line = (size_t)(lineptr - m_newline_offsets); 5704 if(lineptr > m_newline_offsets) 5705 loc.col = (offset - *(lineptr-1) - 1u); 5706 else 5707 loc.col = offset; 5708 return loc; 5709 } 5710 5711 void Parser::_prepare_locations() 5712 { 5713 m_newline_offsets_buf = m_buf; 5714 size_t numnewlines = 1u + m_buf.count('\n'); 5715 _resize_locations(numnewlines); 5716 m_newline_offsets_size = 0; 5717 for(size_t i = 0; i < m_buf.len; i++) 5718 if(m_buf[i] == '\n') 5719 m_newline_offsets[m_newline_offsets_size++] = i; 5720 m_newline_offsets[m_newline_offsets_size++] = m_buf.len; 5721 _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size == numnewlines); 5722 } 5723 5724 void Parser::_resize_locations(size_t numnewlines) 5725 { 5726 if(numnewlines > m_newline_offsets_capacity) 5727 { 5728 if(m_newline_offsets) 5729 _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity); 5730 m_newline_offsets = 
_RYML_CB_ALLOC_HINT(m_stack.m_callbacks, size_t, numnewlines, m_newline_offsets); 5731 m_newline_offsets_capacity = numnewlines; 5732 } 5733 } 5734 5735 bool Parser::_locations_dirty() const 5736 { 5737 return !m_newline_offsets_size; 5738 } 5739 5740 } // namespace yml 5741 } // namespace c4 5742 5743 5744 #if defined(_MSC_VER) 5745 # pragma warning(pop) 5746 #elif defined(__clang__) 5747 # pragma clang diagnostic pop 5748 #elif defined(__GNUC__) 5749 # pragma GCC diagnostic pop 5750 #endif