Tighten number lexing

* Tokenize strings of the form "7E+X", "2.3E ", etc. as a number (integer or real) followed by a new token starting with 'E'. Note that as a consequence the string "1.2E3E4" no longer causes a lexer error status.
* Add tests for the corresponding numeric disambiguation in `test_lexer.cpp`.
* Add tests for stoichiometric expressions to ensure that only otherwise ambiguous phrases such as "7E+2F" fail to parse ("7E+F" should parse correctly as a stoich expression).
* Add the missing comma in the list of 'good' expressions in the `Parser.parse_line_expression` test.

Tighten number lexing
* Tokenize strings of the form "7E+X", "2.3E ", etc. as a number (integer or real) followed by a new token starting with 'E'. Note that as a consequence the string "1.2E3E4" no longer causes a lexer error status.
* Add tests for the corresponding numeric disambiguation in `test_lexer.cpp`.
* Add tests for stoichiometric expressions to ensure that only otherwise ambiguous phrases such as "7E+2F" fail to parse ("7E+F" should parse correctly as a stoich expression).
* Add the missing comma in the list of 'good' expressions in the `Parser.parse_line_expression` test.
584c70fe · Sam Yates · 3850c41a · 584c70fe · 584c70fe · 584c70fe
Commit 584c70fe authored 8 years ago by Sam Yates
--- a/modcc/lexer.cpp
+++ b/modcc/lexer.cpp
@@ -28,6 +28,9 @@ inline bool is_eof(char c) {
 inline bool is_operator(char c) {
    return (c=='+' || c=='-' || c=='*' || c=='/' || c=='^' || c=='\'');
 }
+inline bool is_plusminus(char c) {
+    return (c=='+' || c=='-');
+}
 //*********************
 // Lexer
@@ -258,13 +261,21 @@ Token Lexer::number() {
                incorrectly_formed_mantisa = true;
            }
        }
-        else if(c=='e' || c=='E') {
-            uses_scientific_notation++;
-            str += c;
-            current_++;
-            // Consume the next char if +/-
-            if (*current_ == '+' || *current_ == '-') {
-                str += *current_++;
+        else if(!uses_scientific_notation && (c=='e' || c=='E')) {
+            if(is_numeric(current_[1]) ||
+               is_plusminus(current_[1]) && is_numeric(current_[2]))
+            {
+                uses_scientific_notation++;
+                str += c;
+                current_++;
+                // Consume the next char if +/-
+                if (is_plusminus(*current_)) {
+                    str += *current_++;
+                }
+            }
+            else {
+                // the 'e' or 'E' is the beginning of a new token
+                break;
            }
        }
        else {
@@ -283,11 +294,6 @@ Token Lexer::number() {
        error_string_ = pprintf("too many .'s when reading the number '%'", yellow(str));
        status_ = lexerStatus::error;
    }
-    // check that e or E is not used more than once in the number
-    if(uses_scientific_notation>1) {
-        error_string_ = pprintf("can't parse the number '%'", yellow(str));
-        status_ = lexerStatus::error;
-    }
    tok type;
    if(status_==lexerStatus::error) {

--- a/tests/modcc/test_lexer.cpp
+++ b/tests/modcc/test_lexer.cpp
+#include <cctype>
 #include <cmath>
+#include <cstdio>
 #include <iterator>
 #include <utility>
@@ -31,6 +33,22 @@ public:
        }
        return tok;
    }
+    char character() {
+        char c = Lexer::character();
+        if (g_verbose_flag) {
+            std::cout << "character: ";
+            if (!std::isprint(c)) {
+                char buf[5] = "XXXX";
+                snprintf(buf, sizeof buf, "0x%02x", (unsigned)c);
+                std::cout << buf << '\n';
+            }
+            else {
+                std::cout << c << '\n';
+            }
+        }
+        return c;
+    }
 };
 /**************************************************************
@@ -307,4 +325,29 @@ TEST(Lexer, numbers) {
    EXPECT_EQ(floats.cend(), iter);
    EXPECT_EQ(tok::eof, t.type);
    EXPECT_EQ(check_ints, ints);
+    // check case where 'E' is not followed by +, -, or a digit explicitly
+    lexer = VerboseLexer("7.2E");
+    t = lexer.parse();
+    EXPECT_EQ(lexerStatus::happy, lexer.status());
+    EXPECT_EQ(tok::real, t.type);
+    EXPECT_EQ(t.spelling, "7.2");
+    EXPECT_EQ(lexer.character(), 'E');
+    lexer = VerboseLexer("3E+E2");
+    t = lexer.parse();
+    EXPECT_EQ(lexerStatus::happy, lexer.status());
+    EXPECT_EQ(tok::integer, t.type);
+    EXPECT_EQ(t.spelling, "3");
+    EXPECT_EQ(lexer.character(), 'E');
+    EXPECT_EQ(lexer.character(), '+');
+    // 'bad' numbers should give errors
+    lexer = VerboseLexer("1.2.3");
+    lexer.parse();
+    EXPECT_EQ(lexerStatus::error, lexer.status());
+    lexer = VerboseLexer("1.2E4.3");
+    lexer.parse();
+    EXPECT_EQ(lexerStatus::error, lexer.status());
 }
--- a/tests/modcc/test_parser.cpp
+++ b/tests/modcc/test_parser.cpp
@@ -282,7 +282,7 @@ TEST(Parser, parse_parenthesis_expression) {
 // test parsing of line expressions
 TEST(Parser, parse_line_expression) {
    const char* good_expr[] = {
-        "qt=q10^((celsius-22)/10)"
+        "qt=q10^((celsius-22)/10)",
        "x=2        ",
        "x=2        ",
        "x = -y\n   "
@@ -319,7 +319,7 @@ TEST(Parser, parse_line_expression) {
 TEST(Parser, parse_stoich_term) {
    const char* good_pos_expr[] = {
-        "B", "B3", "3B3", "0A", "12A"
+        "B", "B3", "3B3", "0A", "12A", "4E"
    };
    for (auto& text: good_pos_expr) {
@@ -338,7 +338,7 @@ TEST(Parser, parse_stoich_term) {
        EXPECT_TRUE((s && s->negative()));
    }
    const char* bad_expr[] = {
-        "0.2A", "5"
+        "0.2A", "5", "3e2" // "3e2" should lex as real number 300.0
    };
    for (auto& text: bad_expr) {
@@ -403,6 +403,7 @@ TEST(Parser, parse_reaction_expression) {
        "~ A + B <-> C + D (k1, k2)",
        "~ 2B <-> C + D + E (k1(3,v), k2)",
        "~ <-> C + D + 7 E (k1, f(a,b)-2)",
+        "~ <-> C + D + 7E+F (k1, f(a,b)-2)",
        "~ <-> (f,g)",
        "~ A + 3B + C<-> (f,g)"
    };
@@ -417,6 +418,7 @@ TEST(Parser, parse_reaction_expression) {
        "~ A + B <-> C + (k1, k2)",
        "~ 2.3B <-> C + D + E (k1(3,v), k2)",
        "~ <-> C + D + 7E",
+        "~ <-> C + D + 7E+2F (k1, f(a,b)-2)", // "7E+2" will lex as real number
        "~ <-> (,g)",
        "~ A - 3B + C<-> (f,g)",
        "  A <-> B (k1, k2)",