#include <cctype> #include <cstring> #include <string> #include <memory> #include <ostream> #include <vector> #include <arbor/arbexcept.hpp> #include <arbor/util/variant.hpp> #include "s_expr.hpp" #include "strprintf.hpp" namespace pyarb { inline bool is_alphanumeric(char c) { return std::isdigit(c) || std::isalpha(c); } inline bool is_plusminus(char c) { return (c=='-' || c=='+'); } std::ostream& operator<<(std::ostream& o, const tok& t) { switch (t) { case tok::nil: return o << "nil"; case tok::lparen: return o << "lparen"; case tok::rparen: return o << "rparen"; case tok::real: return o << "real"; case tok::integer:return o << "integer"; case tok::name: return o << "name"; case tok::string: return o << "string"; case tok::eof: return o << "eof"; case tok::error: return o << "error"; } return o << "<unknown>"; } std::ostream& operator<<(std::ostream& o, const token& t) { if (t.kind==tok::string) { return o << util::pprintf("\"{}\"", t.spelling); } return o << util::pprintf("{}", t.spelling); } // // lexer // struct parser_error: public arb::arbor_exception { int loc; parser_error(const std::string& msg, int l): arbor_exception(msg), loc(l) {} }; static std::unordered_map<tok, const char*> tok_to_keyword = { {tok::nil, "nil"}, }; static std::unordered_map<std::string, tok> keyword_to_tok = { {"nil", tok::nil}, }; class lexer { const char* data_;; const char* end_;; const char* current_; int loc_; token token_; public: lexer(const char* s): data_(s), end_(data_ + std::strlen(data_)), current_(data_) { // Prime the first token. parse(); } // Return the current token in the stream. const token& current() { return token_; } const token& next() { parse(); return token_; } private: // Consume the and return the next token in the stream. void parse() { using namespace std::string_literals; while (current_!=end_) { loc_ = current_-data_; switch (*current_) { // end of file case 0 : // end of string token_ = {loc_, tok::eof, "eof"s}; return; // white space case ' ' : case '\t' : case '\v' : case '\f' : case '\n' : character(); continue; // skip to next character case ';': eat_comment(); continue; case '(': token_ = {loc_, tok::lparen, {character()}}; return; case ')': token_ = {loc_, tok::rparen, {character()}}; return; case 'a' ... 'z': case 'A' ... 'Z': token_ = name(); return; case '0' ... '9': token_ = number(); return; case '"': token_ = string(); return; case '-': case '+': { char c = current_[1]; if (std::isdigit(c) or c=='.') { token_ = number(); return; } } token_ = {loc_, tok::error, util::pprintf("Unexpected character '{}'.", character())}; return; default: token_ = {loc_, tok::error, util::pprintf("Unexpected character '{}'.", character())}; return; } } if (current_!=end_) { // todo: handle error: should never hit this } token_ = {loc_, tok::eof, "eof"s}; return; } // Consumes characters in the stream until end of stream or a new line. // Assumes that the current_ location is the `;` that starts the comment. void eat_comment() { while (*current_ && *current_!='\n') { character(); } } // Parse alphanumeric sequence that starts with an alphabet character, // and my contain alphabet, numeric or underscore '_' characters. // // Valid names: // sub_dendrite // sub-dendrite // temp_ // branch3 // A // Invalid names: // _cat ; can't start with underscore // -cat ; can't start with hyphen // 2ndvar ; can't start with numeric character // // Returns the appropriate token kind if name is a keyword. token name() { std::string name; char c = *current_; // Assert that current position is at the start of an identifier if( !(std::isalpha(c)) ) { throw parser_error( "Lexer attempting to read identifier when none is available", loc_); } name += c; ++current_; while(1) { c = *current_; if(is_alphanumeric(c) || c=='_' || c=='-') { name += character(); } else { break; } } // test if the name matches a keyword auto it = keyword_to_tok.find(name.c_str()); if (it!=keyword_to_tok.end()) { return {loc_, it->second, std::move(name)}; } return {loc_, tok::name, std::move(name)}; } token string() { using namespace std::string_literals; ++current_; const char* begin = current_; while (current_!=end_ && character()!='"'); if (current_==end_) return {loc_, tok::error, "string missing closing \""}; return {loc_, tok::string, std::string(begin, current_-1)}; } token number() { using namespace std::string_literals; std::string str; char c = *current_; // Start counting the number of points in the number. auto num_point = (c=='.' ? 1 : 0); auto uses_scientific_notation = 0; str += c; current_++; while(1) { c = *current_; if(std::isdigit(c)) { str += c; current_++; } else if(c=='.') { if (++num_point>1) { // Can't have more than one '.' in a number return {int(current_-data_), tok::error, "unexpected '.'"s}; } str += c; current_++; if(uses_scientific_notation) { // Can't have a '.' in the mantissa return {int(current_-data_), tok::error, "unexpected '.'"s}; } } else if(!uses_scientific_notation && (c=='e' || c=='E')) { if(std::isdigit(current_[1]) || (is_plusminus(current_[1]) && std::isdigit(current_[2]))) { uses_scientific_notation++; str += c; current_++; // Consume the next char if +/- if (is_plusminus(*current_)) { str += *current_++; } } else { // the 'e' or 'E' is the beginning of a new token break; } } else { break; } } const bool is_real = uses_scientific_notation || num_point>0; return {loc_, (is_real? tok::real: tok::integer), std::move(str)}; } char character() { return *current_++; } }; bool test_identifier(const char* in) { lexer L(in); auto x = L.current(); return x.kind==tok::name && x.spelling==in; } // // s expression members // bool s_expr::is_atom() const { return state.index()==0; } const token& s_expr::atom() const { return state.get<0>(); } const s_expr& s_expr::head() const { return state.get<1>().head.get(); } const s_expr& s_expr::tail() const { return state.get<1>().tail.get(); } s_expr& s_expr::head() { return state.get<1>().head.get(); } s_expr& s_expr::tail() { return state.get<1>().tail.get(); } s_expr::operator bool() const { return !(is_atom() && atom().kind==tok::nil); } std::ostream& operator<<(std::ostream& o, const s_expr& x) { if (x.is_atom()) return o << x.atom(); #if 0 // print full tree with terminating 'nil' return o << "(" << x.head() << " . " << x.tail() << ")"; #else // print '(a . nil)' as 'a' return x.tail()? o << "(" << x.head() << " . " << x.tail() << ")" : o << x.head(); #endif } std::size_t length(const s_expr& l) { if (l.is_atom() && l) { throw arb::arbor_internal_error( util::pprintf("Internal error: can't take length of an atom in '{}'.", l)); } if (!l) { // nil return 0u; } return 1+length(l.tail()); } int location(const s_expr& l) { if (l.is_atom()) return l.atom().column; return location(l.head()); } // // parsing s expressions // // If there is a parsing error, then an atom with kind==tok::error is returned // with the error string in its spelling. s_expr parse(lexer& L) { using namespace std::string_literals; s_expr node; auto t = L.current(); if (t.kind==tok::lparen) { t = L.next(); s_expr* n = &node; while (true) { if (t.kind == tok::eof) { return token{t.column, tok::error, "Unexpected end of input. Missing a closing parenthesis ')'."};; } if (t.kind == tok::error) { return t; } else if (t.kind == tok::rparen) { *n = token{t.column, tok::nil, "nil"}; break; } else if (t.kind == tok::lparen) { auto e = parse(L); if (e.is_atom() && e.atom().kind==tok::error) return e; *n = {std::move(e), {}}; } else { *n = {s_expr(t), {}}; } n = &n->tail(); t = L.next(); } } else { return token{t.column, tok::error, "Missing opening parenthesis'('."};; } return node; } s_expr parse(const char* in) { lexer l(in); s_expr result = parse(l); const bool err = result.is_atom()? result.atom().kind==tok::error: false; if (!err) { auto t = l.next(); // pop the last rparen token. if (t.kind!=tok::eof) { return token{t.column, tok::error, util::pprintf("Unexpected '{}' at the end of input.", t)}; } } return result; } s_expr parse(const std::string& in) { return parse(in.c_str()); } } // namespace pyarb