Skip to content

Commit 82ebe7d

Browse files
fix lex_number for leading-zero handling and digit separators in fraction
There are some cases which are a little strange but lexically valid:

- `1.2.3.4` lexically tokenises as `1.2` DOT `3.4`, because a dot in the fractional or exponent part of a number is simply treated the same as any other possible terminating character (any character that isn't part of the valid number lexical syntax).
- `1e2.34` lexically is `1e2` DOT `34` (same as the first case).
- `1e2e34` lexically is `1e2` (number) followed by `e34` (identifier).

These behaviours are basically preserved/extrapolated in the case of digit separators, so for example `1_2.3_4.5_6` is lexically parsed as `12.34` DOT `56`, and `1e2_3e4` is lexically parsed as `1e23` (number) followed by `e4` (identifier). These both look very confusing, but it probably doesn't matter because those token sequences are, I think, not valid syntactically, so they'll just be rejected by the parser.

Note that in JSON (and Jsonnet), leading zeros are not allowed in numeric literals. This behaviour is deliberately kept with digit separators, so `0_5` is explicitly rejected. The alternatives are:

- Treat an underscore after an initial zero the same as any terminator character, so `0_5` lexes as the token `0` followed by the identifier `_5`.
- Allow the underscore, thereby breaking the no-leading-zeros rule, so `0_5` tokenises as `05`.

Either option seems confusing, hence it seems better to explicitly reject an underscore after an initial zero.
1 parent 7784da1 commit 82ebe7d

File tree

2 files changed

+61
-6
lines changed

2 files changed

+61
-6
lines changed

core/lexer.cpp

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,9 +227,10 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
227227
BEGIN,
228228
AFTER_ZERO,
229229
AFTER_ONE_TO_NINE,
230+
AFTER_INT_UNDERSCORE,
230231
AFTER_DOT,
231232
AFTER_DIGIT,
232-
AFTER_UNDERSCORE,
233+
AFTER_FRAC_UNDERSCORE,
233234
AFTER_E,
234235
AFTER_EXP_SIGN,
235236
AFTER_EXP_DIGIT,
@@ -266,7 +267,11 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
266267
case 'e':
267268
case 'E': state = AFTER_E; break;
268269

269-
case '_': state = AFTER_UNDERSCORE; goto skip_char;
270+
case '_': {
271+
std::stringstream ss;
272+
ss << "couldn't lex number, _ not allowed after leading 0";
273+
throw StaticError(filename, begin, ss.str());
274+
}
270275

271276
default: goto end;
272277
}
@@ -290,12 +295,34 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
290295
case '8':
291296
case '9': state = AFTER_ONE_TO_NINE; break;
292297

293-
case '_': state = AFTER_UNDERSCORE; goto skip_char;
298+
case '_': state = AFTER_INT_UNDERSCORE; goto skip_char;
294299

295300
default: goto end;
296301
}
297302
break;
298303

304+
case AFTER_INT_UNDERSCORE:
305+
switch (*c) {
306+
// The only valid transition from _ is to a digit.
307+
case '0':
308+
case '1':
309+
case '2':
310+
case '3':
311+
case '4':
312+
case '5':
313+
case '6':
314+
case '7':
315+
case '8':
316+
case '9': state = AFTER_ONE_TO_NINE; break;
317+
318+
default: {
319+
std::stringstream ss;
320+
ss << "couldn't lex number, junk after _: " << *c;
321+
throw StaticError(filename, begin, ss.str());
322+
}
323+
}
324+
break;
325+
299326
case AFTER_DOT:
300327
switch (*c) {
301328
case '0':
@@ -333,13 +360,13 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
333360
case '8':
334361
case '9': state = AFTER_DIGIT; break;
335362

336-
case '_': state = AFTER_UNDERSCORE; goto skip_char;
363+
case '_': state = AFTER_FRAC_UNDERSCORE; goto skip_char;
337364

338365
default: goto end;
339366
}
340367
break;
341368

342-
case AFTER_UNDERSCORE:
369+
case AFTER_FRAC_UNDERSCORE:
343370
switch (*c) {
344371
// The only valid transition from _ is to a digit.
345372
case '0':
@@ -351,7 +378,7 @@ std::string lex_number(const char *&c, const std::string &filename, const Locati
351378
case '6':
352379
case '7':
353380
case '8':
354-
case '9': state = AFTER_ONE_TO_NINE; break;
381+
case '9': state = AFTER_DIGIT; break;
355382

356383
default: {
357384
std::stringstream ss;

core/lexer_test.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,18 @@ TEST(Lexer, TestNumbers)
115115
"1e+!",
116116
{},
117117
"number 1e+!:1:1: couldn't lex number, junk after exponent sign: !");
118+
testLex("number 1.2.3.4",
119+
"1.2.3.4",
120+
{Token(Token::Kind::NUMBER, "1.2"),
121+
Token(Token::Kind::DOT, ""),
122+
Token(Token::Kind::NUMBER, "3.4")},
123+
"");
124+
testLex("number 1e2.34",
125+
"1e2.34",
126+
{Token(Token::Kind::NUMBER, "1e2"),
127+
Token(Token::Kind::DOT, ""),
128+
Token(Token::Kind::NUMBER, "34")},
129+
"");
118130
}
119131

120132
TEST(Lexer, TestNumbersWithSeparators)
@@ -131,7 +143,23 @@ TEST(Lexer, TestNumbersWithSeparators)
131143
testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
132144
testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
133145
testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
146+
// Strange cases of adjacent tokens.
147+
testLex("number 1_2.3_4.5_6.7_8",
148+
"1_2.3_4.5_6.7_8",
149+
{Token(Token::Kind::NUMBER, "12.34"),
150+
Token(Token::Kind::DOT, ""),
151+
Token(Token::Kind::NUMBER, "56.78")},
152+
{});
153+
testLex("number 1e2_3e4",
154+
"1e2_3e4",
155+
{Token(Token::Kind::NUMBER, "1e23"),
156+
Token(Token::Kind::IDENTIFIER, "e4")},
157+
"");
134158

159+
testLex("number 0_5",
160+
"0_5",
161+
{},
162+
"number 0_5:1:1: couldn't lex number, _ not allowed after leading 0");
135163
testLex("number 123456_!",
136164
"123456_!",
137165
{},

0 commit comments

Comments
 (0)