char.h (12244B)
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 // This file contains parsers useful for character stream inputs, including parsers to parse 23 // common kinds of tokens like identifiers, numbers, and quoted strings. 24 25 #pragma once 26 27 #include "common.h" 28 #include "../string.h" 29 #include <inttypes.h> 30 31 KJ_BEGIN_HEADER 32 33 namespace kj { 34 namespace parse { 35 36 // ======================================================================================= 37 // Exact char/string. 38 39 class ExactString_ { 40 public: 41 constexpr inline ExactString_(const char* str): str(str) {} 42 43 template <typename Input> 44 Maybe<Tuple<>> operator()(Input& input) const { 45 const char* ptr = str; 46 47 while (*ptr != '\0') { 48 if (input.atEnd() || input.current() != *ptr) return nullptr; 49 input.next(); 50 ++ptr; 51 } 52 53 return Tuple<>(); 54 } 55 56 private: 57 const char* str; 58 }; 59 60 constexpr inline ExactString_ exactString(const char* str) { 61 return ExactString_(str); 62 } 63 64 template <char c> 65 constexpr ExactlyConst_<char, c> exactChar() { 66 // Returns a parser that matches exactly the character given by the template argument (returning 67 // no result). 68 return ExactlyConst_<char, c>(); 69 } 70 71 // ======================================================================================= 72 // Char ranges / sets 73 74 class CharGroup_ { 75 public: 76 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} 77 78 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { 79 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), 80 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), 81 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), 82 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); 83 } 84 85 constexpr inline CharGroup_ orAny(const char* chars) const { 86 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); 87 } 88 89 constexpr inline CharGroup_ orChar(unsigned char c) const { 90 return CharGroup_(bits[0] | bit(c), 91 bits[1] | bit(c - 64), 92 bits[2] | bit(c - 128), 93 bits[3] | bit(c - 256)); 94 } 95 96 constexpr inline CharGroup_ orGroup(CharGroup_ other) const { 97 return CharGroup_(bits[0] | other.bits[0], 98 bits[1] | other.bits[1], 99 bits[2] | other.bits[2], 100 bits[3] | other.bits[3]); 101 } 102 103 constexpr inline CharGroup_ invert() const { 104 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); 105 } 106 107 constexpr inline bool contains(unsigned char c) const { 108 return (bits[c / 64] & (1ll << (c % 64))) != 0; 109 } 110 111 inline bool containsAll(ArrayPtr<const char> text) const { 112 for (char c: text) { 113 if (!contains(c)) return false; 114 } 115 return true; 116 } 117 118 template <typename Input> 119 Maybe<char> operator()(Input& input) const { 120 if (input.atEnd()) return nullptr; 121 unsigned char c = input.current(); 122 if (contains(c)) { 123 input.next(); 124 return c; 125 } else { 126 return nullptr; 127 } 128 } 129 130 private: 131 typedef unsigned long long Bits64; 132 133 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} 134 Bits64 bits[4]; 135 136 static constexpr inline Bits64 oneBits(int count) { 137 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); 138 } 139 static constexpr inline Bits64 bit(int index) { 140 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); 141 } 142 }; 143 144 constexpr inline CharGroup_ charRange(char first, char last) { 145 // Create a parser which accepts any character in the range from `first` to `last`, inclusive. 146 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the 147 // character matched. 148 // 149 // The returned object has methods which can be used to match more characters. The following 150 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. 151 // 152 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") 153 // 154 // You can also use `.invert()` to match the opposite set of characters. 155 156 return CharGroup_().orRange(first, last); 157 } 158 159 #if _MSC_VER && !defined(__clang__) 160 #define anyOfChars(chars) CharGroup_().orAny(chars) 161 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from 162 // building the compiler or schema parser. We don't know why this happens, but Harris found that 163 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. 164 // Hopefully, MSVC will get fixed soon and we'll be able to remove this. 165 #else 166 constexpr inline CharGroup_ anyOfChars(const char* chars) { 167 // Returns a parser that accepts any of the characters in the given string (which should usually 168 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see 169 // that function for more info. 170 171 return CharGroup_().orAny(chars); 172 } 173 #endif 174 175 // ======================================================================================= 176 177 namespace _ { // private 178 179 struct ArrayToString { 180 inline String operator()(const Array<char>& arr) const { 181 return heapString(arr); 182 } 183 }; 184 185 } // namespace _ (private) 186 187 template <typename SubParser> 188 constexpr inline auto charsToString(SubParser&& subParser) 189 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) { 190 // Wraps a parser that returns Array<char> such that it returns String instead. 191 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString()); 192 } 193 194 // ======================================================================================= 195 // Basic character classes. 196 197 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); 198 constexpr auto digit = charRange('0', '9'); 199 constexpr auto alphaNumeric = alpha.orGroup(digit); 200 constexpr auto nameStart = alpha.orChar('_'); 201 constexpr auto nameChar = alphaNumeric.orChar('_'); 202 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); 203 constexpr auto octDigit = charRange('0', '7'); 204 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v"); 205 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); 206 207 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v")); 208 209 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v")))); 210 // Like discard(whitespace) but avoids some memory allocation. 211 212 // ======================================================================================= 213 // Identifiers 214 215 namespace _ { // private 216 217 struct IdentifierToString { 218 inline String operator()(char first, const Array<char>& rest) const { 219 if (rest.size() == 0) return heapString(&first, 1); 220 String result = heapString(rest.size() + 1); 221 result[0] = first; 222 memcpy(result.begin() + 1, rest.begin(), rest.size()); 223 return result; 224 } 225 }; 226 227 } // namespace _ (private) 228 229 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); 230 // Parses an identifier (e.g. a C variable name). 231 232 // ======================================================================================= 233 // Integers 234 235 namespace _ { // private 236 237 inline char parseDigit(char c) { 238 if (c < 'A') return c - '0'; 239 if (c < 'a') return c - 'A' + 10; 240 return c - 'a' + 10; 241 } 242 243 template <uint base> 244 struct ParseInteger { 245 inline uint64_t operator()(const Array<char>& digits) const { 246 return operator()('0', digits); 247 } 248 uint64_t operator()(char first, const Array<char>& digits) const { 249 uint64_t result = parseDigit(first); 250 for (char digit: digits) { 251 result = result * base + parseDigit(digit); 252 } 253 return result; 254 } 255 }; 256 257 258 } // namespace _ (private) 259 260 constexpr auto integer = sequence( 261 oneOf( 262 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), 263 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), 264 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), 265 notLookingAt(alpha.orAny("_."))); 266 267 // ======================================================================================= 268 // Numbers (i.e. floats) 269 270 namespace _ { // private 271 272 struct ParseFloat { 273 double operator()(const Array<char>& digits, 274 const Maybe<Array<char>>& fraction, 275 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const; 276 }; 277 278 } // namespace _ (private) 279 280 constexpr auto number = transform( 281 sequence( 282 oneOrMore(digit), 283 optional(sequence(exactChar<'.'>(), many(digit))), 284 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))), 285 notLookingAt(alpha.orAny("_."))), 286 _::ParseFloat()); 287 288 // ======================================================================================= 289 // Quoted strings 290 291 namespace _ { // private 292 293 struct InterpretEscape { 294 char operator()(char c) const { 295 switch (c) { 296 case 'a': return '\a'; 297 case 'b': return '\b'; 298 case 'f': return '\f'; 299 case 'n': return '\n'; 300 case 'r': return '\r'; 301 case 't': return '\t'; 302 case 'v': return '\v'; 303 default: return c; 304 } 305 } 306 }; 307 308 struct ParseHexEscape { 309 inline char operator()(char first, char second) const { 310 return (parseDigit(first) << 4) | parseDigit(second); 311 } 312 }; 313 314 struct ParseHexByte { 315 inline byte operator()(char first, char second) const { 316 return (parseDigit(first) << 4) | parseDigit(second); 317 } 318 }; 319 320 struct ParseOctEscape { 321 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { 322 char result = first - '0'; 323 KJ_IF_MAYBE(digit1, second) { 324 result = (result << 3) | (*digit1 - '0'); 325 KJ_IF_MAYBE(digit2, third) { 326 result = (result << 3) | (*digit2 - '0'); 327 } 328 } 329 return result; 330 } 331 }; 332 333 } // namespace _ (private) 334 335 constexpr auto escapeSequence = 336 sequence(exactChar<'\\'>(), oneOf( 337 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()), 338 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), 339 transform(sequence(octDigit, optional(octDigit), optional(octDigit)), 340 _::ParseOctEscape()))); 341 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns 342 // a char. 343 344 constexpr auto doubleQuotedString = charsToString(sequence( 345 exactChar<'\"'>(), 346 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)), 347 exactChar<'\"'>())); 348 // Parses a C-style double-quoted string. 349 350 constexpr auto singleQuotedString = charsToString(sequence( 351 exactChar<'\''>(), 352 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)), 353 exactChar<'\''>())); 354 // Parses a C-style single-quoted string. 355 356 constexpr auto doubleQuotedHexBinary = sequence( 357 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), 358 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), 359 discardWhitespace, 360 exactChar<'\"'>()); 361 // Parses a double-quoted hex binary literal. Returns Array<byte>. 362 363 } // namespace parse 364 } // namespace kj 365 366 KJ_END_HEADER