lexer.c++ (12086B)
// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
21 22 #include "lexer.h" 23 #include <kj/parse/char.h> 24 #include <kj/debug.h> 25 26 namespace capnp { 27 namespace compiler { 28 29 namespace p = kj::parse; 30 31 bool lex(kj::ArrayPtr<const char> input, LexedStatements::Builder result, 32 ErrorReporter& errorReporter) { 33 Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); 34 35 auto parser = p::sequence(lexer.getParsers().statementSequence, p::endOfInput); 36 37 Lexer::ParserInput parserInput(input.begin(), input.end()); 38 kj::Maybe<kj::Array<Orphan<Statement>>> parseOutput = parser(parserInput); 39 40 KJ_IF_MAYBE(output, parseOutput) { 41 auto l = result.initStatements(output->size()); 42 for (uint i = 0; i < output->size(); i++) { 43 l.adoptWithCaveats(i, kj::mv((*output)[i])); 44 } 45 return true; 46 } else { 47 uint32_t best = parserInput.getBest(); 48 errorReporter.addError(best, best, kj::str("Parse error.")); 49 return false; 50 } 51 } 52 53 bool lex(kj::ArrayPtr<const char> input, LexedTokens::Builder result, 54 ErrorReporter& errorReporter) { 55 Lexer lexer(Orphanage::getForMessageContaining(result), errorReporter); 56 57 auto parser = p::sequence(lexer.getParsers().tokenSequence, p::endOfInput); 58 59 Lexer::ParserInput parserInput(input.begin(), input.end()); 60 kj::Maybe<kj::Array<Orphan<Token>>> parseOutput = parser(parserInput); 61 62 KJ_IF_MAYBE(output, parseOutput) { 63 auto l = result.initTokens(output->size()); 64 for (uint i = 0; i < output->size(); i++) { 65 l.adoptWithCaveats(i, kj::mv((*output)[i])); 66 } 67 return true; 68 } else { 69 uint32_t best = parserInput.getBest(); 70 errorReporter.addError(best, best, kj::str("Parse error.")); 71 return false; 72 } 73 } 74 75 namespace { 76 77 typedef p::Span<uint32_t> Location; 78 79 Token::Builder initTok(Orphan<Token>& t, const Location& loc) { 80 auto builder = t.get(); 81 builder.setStartByte(loc.begin()); 82 builder.setEndByte(loc.end()); 83 return builder; 84 } 85 86 void 
buildTokenSequenceList(List<List<Token>>::Builder builder, 87 kj::Array<kj::Array<Orphan<Token>>>&& items) { 88 for (uint i = 0; i < items.size(); i++) { 89 auto& item = items[i]; 90 auto itemBuilder = builder.init(i, item.size()); 91 for (uint j = 0; j < item.size(); j++) { 92 itemBuilder.adoptWithCaveats(j, kj::mv(item[j])); 93 } 94 } 95 } 96 97 void attachDocComment(Statement::Builder statement, kj::Array<kj::String>&& comment) { 98 size_t size = 0; 99 for (auto& line: comment) { 100 size += line.size() + 1; // include newline 101 } 102 Text::Builder builder = statement.initDocComment(size); 103 char* pos = builder.begin(); 104 for (auto& line: comment) { 105 memcpy(pos, line.begin(), line.size()); 106 pos += line.size(); 107 *pos++ = '\n'; 108 } 109 KJ_ASSERT(pos == builder.end()); 110 } 111 112 constexpr auto discardComment = 113 sequence(p::exactChar<'#'>(), p::discard(p::many(p::discard(p::anyOfChars("\n").invert()))), 114 p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); 115 constexpr auto saveComment = 116 sequence(p::exactChar<'#'>(), p::discard(p::optional(p::exactChar<' '>())), 117 p::charsToString(p::many(p::anyOfChars("\n").invert())), 118 p::oneOf(p::exactChar<'\n'>(), p::endOfInput)); 119 120 constexpr auto utf8Bom = 121 sequence(p::exactChar<'\xef'>(), p::exactChar<'\xbb'>(), p::exactChar<'\xbf'>()); 122 123 constexpr auto bomsAndWhitespace = 124 sequence(p::discardWhitespace, 125 p::discard(p::many(sequence(utf8Bom, p::discardWhitespace)))); 126 127 constexpr auto commentsAndWhitespace = 128 sequence(bomsAndWhitespace, 129 p::discard(p::many(sequence(discardComment, bomsAndWhitespace)))); 130 131 constexpr auto discardLineWhitespace = 132 p::discard(p::many(p::discard(p::whitespaceChar.invert().orAny("\r\n").invert()))); 133 constexpr auto newline = p::oneOf( 134 p::exactChar<'\n'>(), 135 sequence(p::exactChar<'\r'>(), p::discard(p::optional(p::exactChar<'\n'>())))); 136 137 constexpr auto docComment = p::optional(p::sequence( 138 
discardLineWhitespace, 139 p::discard(p::optional(newline)), 140 p::oneOrMore(p::sequence(discardLineWhitespace, saveComment)))); 141 // Parses a set of comment lines preceded by at most one newline and with no intervening blank 142 // lines. 143 144 } // namespace 145 146 Lexer::Lexer(Orphanage orphanageParam, ErrorReporter& errorReporter) 147 : orphanage(orphanageParam) { 148 149 // Note that because passing an lvalue to a parser constructor uses it by-referencee, it's safe 150 // for us to use parsers.tokenSequence even though we haven't yet constructed it. 151 auto& tokenSequence = parsers.tokenSequence; 152 153 auto& commaDelimitedList = arena.copy(p::transform( 154 p::sequence(tokenSequence, p::many(p::sequence(p::exactChar<','>(), tokenSequence))), 155 [](kj::Array<Orphan<Token>>&& first, kj::Array<kj::Array<Orphan<Token>>>&& rest) 156 -> kj::Array<kj::Array<Orphan<Token>>> { 157 if (first == nullptr && rest == nullptr) { 158 // Completely empty list. 159 return nullptr; 160 } else { 161 uint restSize = rest.size(); 162 if (restSize > 0 && rest[restSize - 1] == nullptr) { 163 // Allow for trailing commas by shortening the list by one item if the final token is 164 // nullptr 165 restSize--; 166 } 167 auto result = kj::heapArrayBuilder<kj::Array<Orphan<Token>>>(1 + restSize); // first+rest 168 result.add(kj::mv(first)); 169 for (uint i = 0; i < restSize ; i++) { 170 result.add(kj::mv(rest[i])); 171 } 172 return result.finish(); 173 } 174 })); 175 176 auto& token = arena.copy(p::oneOf( 177 p::transformWithLocation(p::identifier, 178 [this](Location loc, kj::String name) -> Orphan<Token> { 179 auto t = orphanage.newOrphan<Token>(); 180 initTok(t, loc).setIdentifier(name); 181 return t; 182 }), 183 p::transformWithLocation(p::doubleQuotedString, 184 [this](Location loc, kj::String text) -> Orphan<Token> { 185 auto t = orphanage.newOrphan<Token>(); 186 initTok(t, loc).setStringLiteral(text); 187 return t; 188 }), 189 
p::transformWithLocation(p::doubleQuotedHexBinary, 190 [this](Location loc, kj::Array<byte> data) -> Orphan<Token> { 191 auto t = orphanage.newOrphan<Token>(); 192 initTok(t, loc).setBinaryLiteral(data); 193 return t; 194 }), 195 p::transformWithLocation(p::integer, 196 [this](Location loc, uint64_t i) -> Orphan<Token> { 197 auto t = orphanage.newOrphan<Token>(); 198 initTok(t, loc).setIntegerLiteral(i); 199 return t; 200 }), 201 p::transformWithLocation(p::number, 202 [this](Location loc, double x) -> Orphan<Token> { 203 auto t = orphanage.newOrphan<Token>(); 204 initTok(t, loc).setFloatLiteral(x); 205 return t; 206 }), 207 p::transformWithLocation( 208 p::charsToString(p::oneOrMore(p::anyOfChars("!$%&*+-./:<=>?@^|~"))), 209 [this](Location loc, kj::String text) -> Orphan<Token> { 210 auto t = orphanage.newOrphan<Token>(); 211 initTok(t, loc).setOperator(text); 212 return t; 213 }), 214 p::transformWithLocation( 215 sequence(p::exactChar<'('>(), commaDelimitedList, p::exactChar<')'>()), 216 [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { 217 auto t = orphanage.newOrphan<Token>(); 218 buildTokenSequenceList( 219 initTok(t, loc).initParenthesizedList(items.size()), kj::mv(items)); 220 return t; 221 }), 222 p::transformWithLocation( 223 sequence(p::exactChar<'['>(), commaDelimitedList, p::exactChar<']'>()), 224 [this](Location loc, kj::Array<kj::Array<Orphan<Token>>>&& items) -> Orphan<Token> { 225 auto t = orphanage.newOrphan<Token>(); 226 buildTokenSequenceList( 227 initTok(t, loc).initBracketedList(items.size()), kj::mv(items)); 228 return t; 229 }), 230 p::transformOrReject(p::transformWithLocation( 231 p::oneOf(sequence(p::exactChar<'\xff'>(), p::exactChar<'\xfe'>()), 232 sequence(p::exactChar<'\xfe'>(), p::exactChar<'\xff'>()), 233 sequence(p::exactChar<'\x00'>())), 234 [&errorReporter](Location loc) -> kj::Maybe<Orphan<Token>> { 235 errorReporter.addError(loc.begin(), loc.end(), 236 "Non-UTF-8 input detected. 
Cap'n Proto schema files must be UTF-8 text."); 237 return nullptr; 238 }), [](kj::Maybe<Orphan<Token>> param) { return param; }))); 239 parsers.tokenSequence = arena.copy(p::sequence( 240 commentsAndWhitespace, p::many(p::sequence(token, commentsAndWhitespace)))); 241 242 auto& statementSequence = parsers.statementSequence; 243 244 auto& statementEnd = arena.copy(p::oneOf( 245 transform(p::sequence(p::exactChar<';'>(), docComment), 246 [this](kj::Maybe<kj::Array<kj::String>>&& comment) -> Orphan<Statement> { 247 auto result = orphanage.newOrphan<Statement>(); 248 auto builder = result.get(); 249 KJ_IF_MAYBE(c, comment) { 250 attachDocComment(builder, kj::mv(*c)); 251 } 252 builder.setLine(); 253 return result; 254 }), 255 transform( 256 p::sequence(p::exactChar<'{'>(), docComment, statementSequence, p::exactChar<'}'>(), 257 docComment), 258 [this](kj::Maybe<kj::Array<kj::String>>&& comment, 259 kj::Array<Orphan<Statement>>&& statements, 260 kj::Maybe<kj::Array<kj::String>>&& lateComment) 261 -> Orphan<Statement> { 262 auto result = orphanage.newOrphan<Statement>(); 263 auto builder = result.get(); 264 KJ_IF_MAYBE(c, comment) { 265 attachDocComment(builder, kj::mv(*c)); 266 } else KJ_IF_MAYBE(c, lateComment) { 267 attachDocComment(builder, kj::mv(*c)); 268 } 269 auto list = builder.initBlock(statements.size()); 270 for (uint i = 0; i < statements.size(); i++) { 271 list.adoptWithCaveats(i, kj::mv(statements[i])); 272 } 273 return result; 274 }) 275 )); 276 277 auto& statement = arena.copy(p::transformWithLocation(p::sequence(tokenSequence, statementEnd), 278 [](Location loc, kj::Array<Orphan<Token>>&& tokens, Orphan<Statement>&& statement) { 279 auto builder = statement.get(); 280 auto tokensBuilder = builder.initTokens(tokens.size()); 281 for (uint i = 0; i < tokens.size(); i++) { 282 tokensBuilder.adoptWithCaveats(i, kj::mv(tokens[i])); 283 } 284 builder.setStartByte(loc.begin()); 285 builder.setEndByte(loc.end()); 286 return kj::mv(statement); 287 })); 288 289 
parsers.statementSequence = arena.copy(sequence( 290 commentsAndWhitespace, many(sequence(statement, commentsAndWhitespace)))); 291 292 parsers.token = token; 293 parsers.statement = statement; 294 parsers.emptySpace = commentsAndWhitespace; 295 } 296 297 Lexer::~Lexer() noexcept(false) {} 298 299 } // namespace compiler 300 } // namespace capnp