capnproto

FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
git clone https://git.neptards.moe/neptards/capnproto.git
Log | Files | Refs | README | LICENSE

char.h (12244B)


      1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
      2 // Licensed under the MIT License:
      3 //
      4 // Permission is hereby granted, free of charge, to any person obtaining a copy
      5 // of this software and associated documentation files (the "Software"), to deal
      6 // in the Software without restriction, including without limitation the rights
      7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
      8 // copies of the Software, and to permit persons to whom the Software is
      9 // furnished to do so, subject to the following conditions:
     10 //
     11 // The above copyright notice and this permission notice shall be included in
     12 // all copies or substantial portions of the Software.
     13 //
     14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     20 // THE SOFTWARE.
     21 
     22 // This file contains parsers useful for character stream inputs, including parsers to parse
     23 // common kinds of tokens like identifiers, numbers, and quoted strings.
     24 
     25 #pragma once
     26 
     27 #include "common.h"
     28 #include "../string.h"
     29 #include <inttypes.h>
     30 
     31 KJ_BEGIN_HEADER
     32 
     33 namespace kj {
     34 namespace parse {
     35 
     36 // =======================================================================================
     37 // Exact char/string.
     38 
     39 class ExactString_ {
     40 public:
     41   constexpr inline ExactString_(const char* str): str(str) {}
     42 
     43   template <typename Input>
     44   Maybe<Tuple<>> operator()(Input& input) const {
     45     const char* ptr = str;
     46 
     47     while (*ptr != '\0') {
     48       if (input.atEnd() || input.current() != *ptr) return nullptr;
     49       input.next();
     50       ++ptr;
     51     }
     52 
     53     return Tuple<>();
     54   }
     55 
     56 private:
     57   const char* str;
     58 };
     59 
     60 constexpr inline ExactString_ exactString(const char* str) {
     61   return ExactString_(str);
     62 }
     63 
     64 template <char c>
     65 constexpr ExactlyConst_<char, c> exactChar() {
     66   // Returns a parser that matches exactly the character given by the template argument (returning
     67   // no result).
     68   return ExactlyConst_<char, c>();
     69 }
     70 
     71 // =======================================================================================
     72 // Char ranges / sets
     73 
     74 class CharGroup_ {
     75 public:
     76   constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
     77 
     78   constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
     79     return CharGroup_(bits[0] | (oneBits(last +   1) & ~oneBits(first      )),
     80                       bits[1] | (oneBits(last -  63) & ~oneBits(first -  64)),
     81                       bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
     82                       bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
     83   }
     84 
     85   constexpr inline CharGroup_ orAny(const char* chars) const {
     86     return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
     87   }
     88 
     89   constexpr inline CharGroup_ orChar(unsigned char c) const {
     90     return CharGroup_(bits[0] | bit(c),
     91                       bits[1] | bit(c - 64),
     92                       bits[2] | bit(c - 128),
     93                       bits[3] | bit(c - 256));
     94   }
     95 
     96   constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
     97     return CharGroup_(bits[0] | other.bits[0],
     98                       bits[1] | other.bits[1],
     99                       bits[2] | other.bits[2],
    100                       bits[3] | other.bits[3]);
    101   }
    102 
    103   constexpr inline CharGroup_ invert() const {
    104     return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
    105   }
    106 
    107   constexpr inline bool contains(unsigned char c) const {
    108     return (bits[c / 64] & (1ll << (c % 64))) != 0;
    109   }
    110 
    111   inline bool containsAll(ArrayPtr<const char> text) const {
    112     for (char c: text) {
    113       if (!contains(c)) return false;
    114     }
    115     return true;
    116   }
    117 
    118   template <typename Input>
    119   Maybe<char> operator()(Input& input) const {
    120     if (input.atEnd()) return nullptr;
    121     unsigned char c = input.current();
    122     if (contains(c)) {
    123       input.next();
    124       return c;
    125     } else {
    126       return nullptr;
    127     }
    128   }
    129 
    130 private:
    131   typedef unsigned long long Bits64;
    132 
    133   constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
    134   Bits64 bits[4];
    135 
    136   static constexpr inline Bits64 oneBits(int count) {
    137     return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
    138   }
    139   static constexpr inline Bits64 bit(int index) {
    140     return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
    141   }
    142 };
    143 
    144 constexpr inline CharGroup_ charRange(char first, char last) {
    145   // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
    146   // For example: `charRange('a', 'z')` matches all lower-case letters.  The parser's result is the
    147   // character matched.
    148   //
    149   // The returned object has methods which can be used to match more characters.  The following
    150   // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
    151   //
    152   //     charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
    153   //
    154   // You can also use `.invert()` to match the opposite set of characters.
    155 
    156   return CharGroup_().orRange(first, last);
    157 }
    158 
    159 #if _MSC_VER && !defined(__clang__)
    160 #define anyOfChars(chars) CharGroup_().orAny(chars)
    161 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
    162 //   building the compiler or schema parser. We don't know why this happens, but Harris found that
    163 //   this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
    164 //   Hopefully, MSVC will get fixed soon and we'll be able to remove this.
    165 #else
    166 constexpr inline CharGroup_ anyOfChars(const char* chars) {
    167   // Returns a parser that accepts any of the characters in the given string (which should usually
    168   // be a literal).  The returned parser is of the same type as returned by `charRange()` -- see
    169   // that function for more info.
    170 
    171   return CharGroup_().orAny(chars);
    172 }
    173 #endif
    174 
    175 // =======================================================================================
    176 
    177 namespace _ {  // private
    178 
    179 struct ArrayToString {
    180   inline String operator()(const Array<char>& arr) const {
    181     return heapString(arr);
    182   }
    183 };
    184 
    185 }  // namespace _ (private)
    186 
    187 template <typename SubParser>
    188 constexpr inline auto charsToString(SubParser&& subParser)
    189     -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
    190   // Wraps a parser that returns Array<char> such that it returns String instead.
    191   return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
    192 }
    193 
    194 // =======================================================================================
    195 // Basic character classes.
    196 
    197 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
    198 constexpr auto digit = charRange('0', '9');
    199 constexpr auto alphaNumeric = alpha.orGroup(digit);
    200 constexpr auto nameStart = alpha.orChar('_');
    201 constexpr auto nameChar = alphaNumeric.orChar('_');
    202 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
    203 constexpr auto octDigit = charRange('0', '7');
    204 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
    205 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
    206 
    207 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
    208 
    209 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
    210 // Like discard(whitespace) but avoids some memory allocation.
    211 
    212 // =======================================================================================
    213 // Identifiers
    214 
    215 namespace _ { // private
    216 
    217 struct IdentifierToString {
    218   inline String operator()(char first, const Array<char>& rest) const {
    219     if (rest.size() == 0) return heapString(&first, 1);
    220     String result = heapString(rest.size() + 1);
    221     result[0] = first;
    222     memcpy(result.begin() + 1, rest.begin(), rest.size());
    223     return result;
    224   }
    225 };
    226 
    227 }  // namespace _ (private)
    228 
    229 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
    230 // Parses an identifier (e.g. a C variable name).
    231 
    232 // =======================================================================================
    233 // Integers
    234 
    235 namespace _ {  // private
    236 
    237 inline char parseDigit(char c) {
    238   if (c < 'A') return c - '0';
    239   if (c < 'a') return c - 'A' + 10;
    240   return c - 'a' + 10;
    241 }
    242 
    243 template <uint base>
    244 struct ParseInteger {
    245   inline uint64_t operator()(const Array<char>& digits) const {
    246     return operator()('0', digits);
    247   }
    248   uint64_t operator()(char first, const Array<char>& digits) const {
    249     uint64_t result = parseDigit(first);
    250     for (char digit: digits) {
    251       result = result * base + parseDigit(digit);
    252     }
    253     return result;
    254   }
    255 };
    256 
    257 
    258 }  // namespace _ (private)
    259 
    260 constexpr auto integer = sequence(
    261     oneOf(
    262       transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
    263       transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
    264       transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
    265     notLookingAt(alpha.orAny("_.")));
    266 
    267 // =======================================================================================
    268 // Numbers (i.e. floats)
    269 
    270 namespace _ {  // private
    271 
    272 struct ParseFloat {
    273   double operator()(const Array<char>& digits,
    274                     const Maybe<Array<char>>& fraction,
    275                     const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
    276 };
    277 
    278 }  // namespace _ (private)
    279 
    280 constexpr auto number = transform(
    281     sequence(
    282         oneOrMore(digit),
    283         optional(sequence(exactChar<'.'>(), many(digit))),
    284         optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
    285         notLookingAt(alpha.orAny("_."))),
    286     _::ParseFloat());
    287 
    288 // =======================================================================================
    289 // Quoted strings
    290 
    291 namespace _ {  // private
    292 
    293 struct InterpretEscape {
    294   char operator()(char c) const {
    295     switch (c) {
    296       case 'a': return '\a';
    297       case 'b': return '\b';
    298       case 'f': return '\f';
    299       case 'n': return '\n';
    300       case 'r': return '\r';
    301       case 't': return '\t';
    302       case 'v': return '\v';
    303       default: return c;
    304     }
    305   }
    306 };
    307 
    308 struct ParseHexEscape {
    309   inline char operator()(char first, char second) const {
    310     return (parseDigit(first) << 4) | parseDigit(second);
    311   }
    312 };
    313 
    314 struct ParseHexByte {
    315   inline byte operator()(char first, char second) const {
    316     return (parseDigit(first) << 4) | parseDigit(second);
    317   }
    318 };
    319 
    320 struct ParseOctEscape {
    321   inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
    322     char result = first - '0';
    323     KJ_IF_MAYBE(digit1, second) {
    324       result = (result << 3) | (*digit1 - '0');
    325       KJ_IF_MAYBE(digit2, third) {
    326         result = (result << 3) | (*digit2 - '0');
    327       }
    328     }
    329     return result;
    330   }
    331 };
    332 
    333 }  // namespace _ (private)
    334 
    335 constexpr auto escapeSequence =
    336     sequence(exactChar<'\\'>(), oneOf(
    337         transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
    338         transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
    339         transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
    340                   _::ParseOctEscape())));
    341 // A parser that parses a C-string-style escape sequence (starting with a backslash).  Returns
    342 // a char.
    343 
    344 constexpr auto doubleQuotedString = charsToString(sequence(
    345     exactChar<'\"'>(),
    346     many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
    347     exactChar<'\"'>()));
    348 // Parses a C-style double-quoted string.
    349 
    350 constexpr auto singleQuotedString = charsToString(sequence(
    351     exactChar<'\''>(),
    352     many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
    353     exactChar<'\''>()));
    354 // Parses a C-style single-quoted string.
    355 
    356 constexpr auto doubleQuotedHexBinary = sequence(
    357     exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
    358     oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
    359     discardWhitespace,
    360     exactChar<'\"'>());
    361 // Parses a double-quoted hex binary literal. Returns Array<byte>.
    362 
    363 }  // namespace parse
    364 }  // namespace kj
    365 
    366 KJ_END_HEADER