fix lex_number for leading-zero handling and digit separators in fraction

johnbartholomew · johnbartholomew · commit 82ebe7de83a5 · 2026-01-21T18:16:02.000Z
There are some cases which are a little strange but lexically valid.

- `1.2.3.4` lexically tokenises as `1.2` DOT `3.4`, because a dot
  in the fractional or exponent part of a number is simply treated the
  same as any other possible terminating character (any character that
  isn't part of the valid number lexical syntax).
- `1e2.34` lexically is `1e2` DOT `34` (same as the first case)
- `1e2e34` lexically is `1e2` (number) `e34` (identifier)

These behaviours are basically preserved/extrapolated in the case of
digit separators, so for example `1_2.3_4.5_6` is lexically parsed
as `12.34` DOT `56`. And `1e2_3e4` is lexically parsed as
`1e23` (number), `e4` (identifier). These both look very confusing,
but it probably doesn't matter because those token sequences are,
I think, not valid syntactically so they'll just be rejected by
the parser.

Note that in JSON (and jsonnet), leading zeros are not allowed in
numeric literals. This rule is kept when digit separators are used,
so `0_5` is explicitly rejected with a lex error. The alternatives are:

- Treat underscore after an initial zero the same as any terminator
  character, so `0_5` lexes as tokens `0` followed by identifier `_5`.
- Allow underscore, thereby breaking the no-leading-zeros rule, so
  `0_5` tokenises as `05`.

Either option seems confusing, hence it seems better to explicitly
reject an underscore after an initial zero.
diff --git a/core/lexer.cpp b/core/lexer.cpp
@@ -227,9 +227,10 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
         BEGIN,
         AFTER_ZERO,
         AFTER_ONE_TO_NINE,
+        AFTER_INT_UNDERSCORE,
         AFTER_DOT,
         AFTER_DIGIT,
-        AFTER_UNDERSCORE,
+        AFTER_FRAC_UNDERSCORE,
         AFTER_E,
         AFTER_EXP_SIGN,
         AFTER_EXP_DIGIT,
@@ -266,7 +267,11 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
                     case 'e':
                     case 'E': state = AFTER_E; break;
 
-                    case '_': state = AFTER_UNDERSCORE; goto skip_char;
+                    case '_': {
+                        std::stringstream ss;
+                        ss << "couldn't lex number, _ not allowed after leading 0";
+                        throw StaticError(filename, begin, ss.str());
+                    }
 
                     default: goto end;
                 }
@@ -290,12 +295,34 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
                     case '8':
                     case '9': state = AFTER_ONE_TO_NINE; break;
 
-                    case '_': state = AFTER_UNDERSCORE; goto skip_char;
+                    case '_': state = AFTER_INT_UNDERSCORE; goto skip_char;
 
                     default: goto end;
                 }
                 break;
 
+            case AFTER_INT_UNDERSCORE:
+                switch (*c) {
+                    // The only valid transition from _ is to a digit.
+                    case '0':
+                    case '1':
+                    case '2':
+                    case '3':
+                    case '4':
+                    case '5':
+                    case '6':
+                    case '7':
+                    case '8':
+                    case '9': state = AFTER_ONE_TO_NINE; break;
+
+                    default: {
+                        std::stringstream ss;
+                        ss << "couldn't lex number, junk after _: " << *c;
+                        throw StaticError(filename, begin, ss.str());
+                    }
+                }
+                break;
+
             case AFTER_DOT:
                 switch (*c) {
                     case '0':
@@ -333,13 +360,13 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
                     case '8':
                     case '9': state = AFTER_DIGIT; break;
 
-                    case '_': state = AFTER_UNDERSCORE; goto skip_char;
+                    case '_': state = AFTER_FRAC_UNDERSCORE; goto skip_char;
 
                     default: goto end;
                 }
                 break;
 
-            case AFTER_UNDERSCORE:
+            case AFTER_FRAC_UNDERSCORE:
                 switch (*c) {
                     // The only valid transition from _ is to a digit.
                     case '0':
@@ -351,7 +378,7 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
                     case '6':
                     case '7':
                     case '8':
-                    case '9': state = AFTER_ONE_TO_NINE; break;
+                    case '9': state = AFTER_DIGIT; break;
 
                     default: {
                         std::stringstream ss;
diff --git a/core/lexer_test.cpp b/core/lexer_test.cpp
@@ -115,6 +115,18 @@ TEST(Lexer, TestNumbers)
             "1e+!",
             {},
             "number 1e+!:1:1: couldn't lex number, junk after exponent sign: !");
+    testLex("number 1.2.3.4",
+            "1.2.3.4",
+            {Token(Token::Kind::NUMBER, "1.2"),
+             Token(Token::Kind::DOT, ""),
+             Token(Token::Kind::NUMBER, "3.4")},
+            "");
+    testLex("number 1e2.34",
+            "1e2.34",
+            {Token(Token::Kind::NUMBER, "1e2"),
+             Token(Token::Kind::DOT, ""),
+             Token(Token::Kind::NUMBER, "34")},
+            "");
 }
 
 TEST(Lexer, TestNumbersWithSeparators)
@@ -131,7 +143,23 @@ TEST(Lexer, TestNumbersWithSeparators)
     testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
     testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
     testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
+    // Strange cases of adjacent tokens.
+    testLex("number 1_2.3_4.5_6.7_8",
+            "1_2.3_4.5_6.7_8",
+            {Token(Token::Kind::NUMBER, "12.34"),
+             Token(Token::Kind::DOT, ""),
+             Token(Token::Kind::NUMBER, "56.78")},
+             {});
+    testLex("number 1e2_3e4",
+            "1e2_3e4",
+            {Token(Token::Kind::NUMBER, "1e23"),
+             Token(Token::Kind::IDENTIFIER, "e4")},
+            "");
 
+    testLex("number 0_5",
+            "0_5",
+            {},
+            "number 0_5:1:1: couldn't lex number, _ not allowed after leading 0");
     testLex("number 123456_!",
             "123456_!",
             {},