From ac85b18155b77e737f90212e7b84f41ac574ba6a Mon Sep 17 00:00:00 2001
From: David Grant
Date: Thu, 20 Jun 2024 20:44:18 -0700
Subject: [PATCH 01/10] Initial separator lexing.

---
 internal/parser/lexer.go      | 50 ++++++++++++++++++++++++++++++++---
 internal/parser/lexer_test.go | 20 ++++++++++++++
 2 files changed, 66 insertions(+), 4 deletions(-)

diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 0b3437263..175ecc257 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
 	l.tokenStartLoc = l.location()
 }
 
+// tokenKindPostprocessors defines a transformation of the lexed token string
+// before it is stored in the tokens list. It is optional for each token kind.
+var tokenKindPostprocessors = map[tokenKind]func(string) string{
+	tokenNumber: func(s string) string {
+		// Get rid of underscore digit separators.
+		return strings.ReplaceAll(s, "_", "")
+	},
+}
+
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
+	// Run the postprocessor if the token kind has one defined.
+	if pp, ok := tokenKindPostprocessors[kind]; ok {
+		data = pp(data)
+	}
+
 	l.tokens = append(l.tokens, token{
 		kind:              kind,
 		fodder:            l.fodder,
@@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
 // that the next rune to be served by the lexer will be a leading digit.
 func (l *lexer) lexNumber() error {
 	// This function should be understood with reference to the linked image:
-	// http://www.json.org/number.gif
+	// https://www.json.org/img/number.png
 
 	// Note, we deviate from the json.org documentation as follows:
 	// There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@@ -465,9 +479,11 @@ func (l *lexer) lexNumber() error {
 		numAfterOneToNine
 		numAfterDot
 		numAfterDigit
+		numAfterUnderscore
 		numAfterE
 		numAfterExpSign
 		numAfterExpDigit
+		numAfterExpUnderscore
 	)
 
 	state := numBegin
@@ -492,6 +508,9 @@ outerLoop:
 				state = numAfterDot
 			case 'e', 'E':
 				state = numAfterE
+			case '_':
+				state = numAfterUnderscore
+
 			default:
 				break outerLoop
 			}
@@ -503,6 +522,8 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
+			case r == '_':
+				state = numAfterUnderscore
 			default:
 				break outerLoop
 			}
@@ -521,9 +542,28 @@ outerLoop:
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterDigit
+			case r == '_':
+				state = numAfterUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterUnderscore:
+			// The only valid transition out of _ is to a digit.
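+			// Illustrative note (a sketch of the intended walk, not
+			// load-bearing): in "1_25" the '2' takes us back to a digit
+			// state via the switch below, while a second '_' (as in
+			// "1__25") is reported as an error.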
+			switch {
+			case r == '_':
+				return l.makeStaticErrorPoint(
+					"Couldn't lex number, multiple consecutive _'s",
+					l.location())
+
+			case r >= '0' && r <= '9':
+				state = numAfterExpDigit
+
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 
 		case numAfterE:
 			switch {
 			case r == '+' || r == '-':
@@ -545,9 +585,12 @@ outerLoop:
 			}
 
 		case numAfterExpDigit:
-			if r >= '0' && r <= '9' {
+			switch {
+			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
-			} else {
+			case r == '_':
+				state = numAfterUnderscore
+			default:
 				break outerLoop
 			}
 		}
@@ -978,7 +1021,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
 				fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
 				l.location())
 		}
-
 	}
 }
diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 66949d51f..9949caa44 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -268,6 +268,26 @@ func TestNumber(t *testing.T) {
 	}
 }
 
+func TestNumberSeparators(t *testing.T) {
+
+	SingleTest(t, "123_456", "", Tokens{{kind: tokenNumber, data: "123456"}})
+
+	/*
+		testLex("number 123_456", "123_456", {Token(Token::Kind::NUMBER, "123456")}, "");
+		testLex("number 1_750_000", "1_750_000", {Token(Token::Kind::NUMBER, "1750000")}, "");
+		testLex("number 1_2_3", "1_2_3", {Token(Token::Kind::NUMBER, "123")}, "");
+		testLex("number 3.141_592", "3.141_592", {Token(Token::Kind::NUMBER, "3.141592")}, "");
+		testLex("number 01_100", "01_100", {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "1100")}, "");
+		testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, "");
+		testLex("number 0e1_01", "0e1_01", {Token(Token::Kind::NUMBER, "0e101")}, "");
+		testLex("number 10_10e3", "10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, "");
+		testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, "");
+		testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
+		testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
+		testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
+	*/
+}
+
 func TestDoublestring1(t *testing.T) {
 	SingleTest(t, "\"hi\"", "", Tokens{
 		{kind: tokenStringDouble, data: "hi"},

From c700c000f9816961cde5c91722e0691a651876a8 Mon Sep 17 00:00:00 2001
From: David Grant
Date: Thu, 20 Jun 2024 20:57:28 -0700
Subject: [PATCH 02/10] More tests. Some fail.

The failing cases are the ones with a '.' or an 'e' following an
underscore-separated digit group (e.g. 1_200.0, 10_10e3): the digit
transition out of numAfterUnderscore lands in numAfterExpDigit, which
accepts no further '.' or 'e'. The next commit redirects that
transition.
---
 internal/parser/lexer_test.go | 90 ++++++++++++++++++++++++++++-------
 1 file changed, 74 insertions(+), 16 deletions(-)

diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 9949caa44..d544d3759 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -269,23 +269,81 @@ func TestNumber(t *testing.T) {
 }
 
 func TestNumberSeparators(t *testing.T) {
+	cases := [...]struct {
+		input  string
+		err    string
+		tokens Tokens
+	}{
+		{
+			input:  "123_456",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "123456"}},
+		},
+		{
+			input:  "1_750_000",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "1750000"}},
+		},
+		{
+			input:  "1_2_3",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "123"}},
+		},
+		{
+			input:  "3.141_592",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "3.141592"}},
+		},
+		{
+			input: "01_100",
+			err:   "",
+			tokens: Tokens{
+				{kind: tokenNumber, data: "0"},
+				{kind: tokenNumber, data: "1100"},
+			},
+		},
+		{
+			input:  "1_200.0",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "1200.0"}},
+		},
+		{
+			input:  "0e1_01",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "0e101"}},
+		},
+		{
+			input:  "10_10e3",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "1010e3"}},
+		},
+		{
+			input:  "2_3e1_2",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "23e12"}},
+		},
+		{
+			input:  "1.1_2e100",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "1.12e100"}},
+		},
+		{
+			input:  "1.1e-10_1",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "1.1e-101"}},
+		},
+		{
+			input:  "9.109_383_56e-31",
+			err:    "",
+			tokens: Tokens{{kind: tokenNumber, data: "9.10938356e-31"}},
+		},
+	}
 
-	SingleTest(t, "123_456", "", Tokens{{kind: tokenNumber, data: "123456"}})
-
-	/*
-		testLex("number 123_456", "123_456", {Token(Token::Kind::NUMBER, "123456")}, "");
-		testLex("number 1_750_000", "1_750_000", {Token(Token::Kind::NUMBER, "1750000")}, "");
-		testLex("number 1_2_3", "1_2_3", {Token(Token::Kind::NUMBER, "123")}, "");
-		testLex("number 3.141_592", "3.141_592", {Token(Token::Kind::NUMBER, "3.141592")}, "");
-		testLex("number 01_100", "01_100", {Token(Token::Kind::NUMBER, "0"), Token(Token::Kind::NUMBER, "1100")}, "");
-		testLex("number 1_200.0", "1_200.0", {Token(Token::Kind::NUMBER, "1200.0")}, "");
-		testLex("number 0e1_01", "0e1_01", {Token(Token::Kind::NUMBER, "0e101")}, "");
-		testLex("number 10_10e3", "10_10e3", {Token(Token::Kind::NUMBER, "1010e3")}, "");
-		testLex("number 2_3e1_2", "2_3e1_2", {Token(Token::Kind::NUMBER, "23e12")}, "");
-		testLex("number 1.1_2e100", "1.1_2e100", {Token(Token::Kind::NUMBER, "1.12e100")}, "");
-		testLex("number 1.1e-10_1", "1.1e-10_1", {Token(Token::Kind::NUMBER, "1.1e-101")}, "");
-		testLex("number 9.109_383_56e-31", "9.109_383_56e-31", {Token(Token::Kind::NUMBER, "9.10938356e-31")}, "");
-	*/
+	for _, c := range cases {
+		t.Run(fmt.Sprintf("number %s", c.input), func(t *testing.T) {
+			SingleTest(t, c.input, c.err, c.tokens)
+		})
+	}
 }
 
 func TestDoublestring1(t *testing.T) {

From 30ab4ea56c8dfc22e16f346f5a189bab2832d295 Mon Sep 17 00:00:00 2001
From: David Grant
Date: Thu, 20 Jun 2024 21:01:31 -0700
Subject: [PATCH 03/10] Fix the test.
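With the digit transition out of numAfterUnderscore returned to
numAfterOneToNine (and a dedicated numAfterExpUnderscore state for the
exponent part), a previously failing input now walks the states as
follows (illustrative trace, derived from the code below):

    10_10e3 : numBegin -> numAfterOneToNine ("10")
              -> numAfterUnderscore -> numAfterOneToNine ("10")
              -> numAfterE ("e") -> numAfterExpDigit ("3")
              => emitted as "1010e3"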
---
 internal/parser/lexer.go | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 175ecc257..52d50f07f 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -557,7 +557,7 @@ outerLoop:
 					l.location())
 
 			case r >= '0' && r <= '9':
-				state = numAfterExpDigit
+				state = numAfterOneToNine
 
 			default:
 				return l.makeStaticErrorPoint(
@@ -589,11 +589,29 @@ outerLoop:
 			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
 			case r == '_':
-				state = numAfterUnderscore
+				state = numAfterExpUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterExpUnderscore:
+			// The only valid transition out of _ is to a digit.
+			switch {
+			case r == '_':
+				return l.makeStaticErrorPoint(
+					"Couldn't lex number, multiple consecutive _'s",
+					l.location())
+
+			case r >= '0' && r <= '9':
+				state = numAfterExpDigit
+
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		}
+
 		l.next()
 	}

From 1c972d05a291b15b153ad22373e96552d059a59e Mon Sep 17 00:00:00 2001
From: David Grant
Date: Fri, 21 Jun 2024 08:28:11 -0700
Subject: [PATCH 04/10] Add exceptional test cases. Make case table less
 crazy.

---
 internal/parser/lexer_test.go | 96 +++++++++++------------------------
 1 file changed, 29 insertions(+), 67 deletions(-)

diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index d544d3759..2b91b16ff 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -269,77 +269,39 @@ func TestNumber(t *testing.T) {
 }
 
 func TestNumberSeparators(t *testing.T) {
-	cases := [...]struct {
+	type numcase struct {
 		input  string
 		err    string
 		tokens Tokens
-	}{
-		{
-			input:  "123_456",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "123456"}},
-		},
-		{
-			input:  "1_750_000",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "1750000"}},
-		},
-		{
-			input:  "1_2_3",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "123"}},
-		},
-		{
-			input:  "3.141_592",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "3.141592"}},
-		},
-		{
-			input: "01_100",
-			err:   "",
-			tokens: Tokens{
-				{kind: tokenNumber, data: "0"},
-				{kind: tokenNumber, data: "1100"},
-			},
-		},
-		{
-			input:  "1_200.0",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "1200.0"}},
-		},
-		{
-			input:  "0e1_01",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "0e101"}},
-		},
-		{
-			input:  "10_10e3",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "1010e3"}},
-		},
-		{
-			input:  "2_3e1_2",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "23e12"}},
-		},
-		{
-			input:  "1.1_2e100",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "1.12e100"}},
-		},
-		{
-			input:  "1.1e-10_1",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "1.1e-101"}},
-		},
-		{
-			input:  "9.109_383_56e-31",
-			err:    "",
-			tokens: Tokens{{kind: tokenNumber, data: "9.10938356e-31"}},
-		},
 	}
-
-	for _, c := range cases {
+	mknumcase := func(input string, err string, tokens Tokens) numcase {
+		return numcase{input, err, tokens}
+	}
+	for _, c := range [...]numcase{
+		mknumcase("123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}),
+		mknumcase("1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}),
+		mknumcase("1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}),
+		mknumcase("3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}),
+		mknumcase("01_100", "", Tokens{
+			{kind: tokenNumber, data: "0"},
+			{kind: tokenNumber, data: "1100"},
+		}),
+		mknumcase("1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}),
+		mknumcase("0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}),
+		mknumcase("10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}),
+		mknumcase("2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}),
+		mknumcase("1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}),
+		mknumcase("1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}),
+		mknumcase("9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}),
+		mknumcase("123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}),
+		mknumcase("123__456", "snippet:1:5 Couldn't lex number, multiple consecutive _'s", Tokens{}),
+		mknumcase("1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}),
+		mknumcase("1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}),
+		mknumcase("1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}),
+		mknumcase("1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}),
+		mknumcase("200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}),
+		mknumcase("200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}),
+	} {
 		t.Run(fmt.Sprintf("number %s", c.input), func(t *testing.T) {
 			SingleTest(t, c.input, c.err, c.tokens)
 		})
 	}

From 502cac2427c27256d3c4c0fc23a4c13d07a005b1 Mon Sep 17 00:00:00 2001
From: David Grant
Date: Fri, 21 Jun 2024 08:35:02 -0700
Subject: [PATCH 05/10] Just use struct literals.

---
 internal/parser/lexer_test.go | 51 +++++++++++++++--------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 2b91b16ff..3b30c7ff3 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -269,38 +269,31 @@ func TestNumber(t *testing.T) {
 }
 
 func TestNumberSeparators(t *testing.T) {
-	type numcase struct {
+	for _, c := range []struct {
 		input  string
 		err    string
 		tokens Tokens
-	}
-	mknumcase := func(input string, err string, tokens Tokens) numcase {
-		return numcase{input, err, tokens}
-	}
-	for _, c := range [...]numcase{
-		mknumcase("123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}),
-		mknumcase("1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}),
-		mknumcase("1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}),
-		mknumcase("3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}),
-		mknumcase("01_100", "", Tokens{
-			{kind: tokenNumber, data: "0"},
-			{kind: tokenNumber, data: "1100"},
-		}),
-		mknumcase("1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}),
-		mknumcase("0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}),
-		mknumcase("10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}),
-		mknumcase("2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}),
-		mknumcase("1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}),
-		mknumcase("1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}),
-		mknumcase("9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}),
-		mknumcase("123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}),
-		mknumcase("123__456", "snippet:1:5 Couldn't lex number, multiple consecutive _'s", Tokens{}),
-		mknumcase("1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}),
-		mknumcase("1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}),
-		mknumcase("1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}),
-		mknumcase("1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}),
-		mknumcase("200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}),
-		mknumcase("200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}),
+	}{
+		{"123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}},
+		{"1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}},
+		{"1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}},
+		{"3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}},
+		{"01_100", "", Tokens{{kind: tokenNumber, data: "0"}, {kind: tokenNumber, data: "1100"}}},
+		{"1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}},
+		{"0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}},
+		{"10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}},
+		{"2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}},
+		{"1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}},
+		{"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}},
+		{"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}},
+		{"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}},
+		{"123__456", "snippet:1:5 Couldn't lex number, multiple consecutive _'s", Tokens{}},
+		{"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}},
+		{"1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}},
+		{"1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}},
+		{"1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}},
+		{"200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
+		{"200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
 	} {
 		t.Run(fmt.Sprintf("number %s", c.input), func(t *testing.T) {
 			SingleTest(t, c.input, c.err, c.tokens)

From f664e6d82fb3c5f997289e410f47ea4ad5dd6af5 Mon Sep 17 00:00:00 2001
From: David Grant
Date: Fri, 21 Jun 2024 20:33:25 -0700
Subject: [PATCH 06/10] Add a test for _123 lexing as identifier.

---
 internal/parser/lexer_test.go | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 3b30c7ff3..9245357bc 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -478,6 +478,12 @@ func TestIdentifiers(t *testing.T) {
 	})
 }
 
+func TestIdentifierUnderscore(t *testing.T) {
+	SingleTest(t, "_123", "", Tokens{
+		{kind: tokenIdentifier, data: "_123"},
+	})
+}
+
 func TestCppComment(t *testing.T) {
 	SingleTest(t, "// hi", "", Tokens{
 		{kind: tokenEndOfFile, fodder: ast.Fodder{{Kind: ast.FodderParagraph, Comment: []string{"// hi"}}}},

From 449c302be91daa58879570b2712500680c0baccf Mon Sep 17 00:00:00 2001
From: David Grant
Date: Sun, 23 Jun 2024 14:59:24 -0700
Subject: [PATCH 07/10] Simpler to not special-case consecutive _s.

---
 internal/parser/lexer.go      | 12 ------------
 internal/parser/lexer_test.go |  2 +-
 2 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 52d50f07f..4327788be 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -551,14 +551,8 @@ outerLoop:
 		case numAfterUnderscore:
 			// The only valid transition out of _ is to a digit.
 			switch {
-			case r == '_':
-				return l.makeStaticErrorPoint(
-					"Couldn't lex number, multiple consecutive _'s",
-					l.location())
-
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
-
 			default:
 				return l.makeStaticErrorPoint(
 					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
@@ -597,14 +591,8 @@ outerLoop:
 		case numAfterExpUnderscore:
 			// The only valid transition out of _ is to a digit.
 			switch {
-			case r == '_':
-				return l.makeStaticErrorPoint(
-					"Couldn't lex number, multiple consecutive _'s",
-					l.location())
-
 			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
-
 			default:
 				return l.makeStaticErrorPoint(
 					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 9245357bc..1a08da719 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -287,7 +287,7 @@ func TestNumberSeparators(t *testing.T) {
 		{"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}},
 		{"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}},
 		{"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}},
-		{"123__456", "snippet:1:5 Couldn't lex number, multiple consecutive _'s", Tokens{}},
+		{"123__456", "snippet:1:5 Couldn't lex number, junk after '_': '_'", Tokens{}},
 		{"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}},
 		{"1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}},
 		{"1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}},

From c9f1b48d2d1a2b08d53b6e99e58506a3af7d3e31 Mon Sep 17 00:00:00 2001
From: John Bartholomew
Date: Mon, 26 Jan 2026 20:30:20 +0000
Subject: [PATCH 08/10] fix underscore handling in numbers to avoid repeated
 fractions or exponents

See also the corresponding C++ jsonnet commit:
https://github.com/google/jsonnet/pull/1160/commits/82ebe7de83a5a2a2726e85b56c506c41446dc7c7

There are some cases which are a little strange but lexically valid:

- `1.2.3.4` lexically tokenises as `1.2` DOT `3.4`, because a dot in the
  fractional or exponent part of a number is treated the same as any
  other terminating character (any character that isn't part of the
  valid number lexical syntax).
- `1e2.34` lexically is `1e2` DOT `34` (same as the first case).
- `1e2e34` lexically is `1e2` (number) `e34` (identifier).

These behaviours are preserved/extrapolated for digit separators, so for
example `1_2.3_4.5_6` is lexically parsed as `12.34` DOT `56`, and
`1e2_3e4` is lexically parsed as `1e23` (number), `e4` (identifier).
These both look very confusing, but it probably doesn't matter, because
those token sequences are, I think, not syntactically valid, so they
will simply be rejected by the parser.

Note that in JSON (and jsonnet), leading zeros are not allowed in
numeric literals. This behaviour is explicitly kept with digit
separators, so `0_5` is explicitly rejected. The alternatives are:

- Treat an underscore after an initial zero the same as any terminator
  character, so `0_5` lexes as tokens `0` followed by identifier `_5`.
- Allow the underscore, thereby breaking the no-leading-zeros rule, so
  `0_5` tokenises as `05`.

Either option seems confusing, hence it seems better to explicitly
reject an underscore after an initial zero.
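Expressed as lexer-test expectations, the cases above come out as
follows (a sketch: the test name is hypothetical, but SingleTest,
Tokens and the token kinds are the helpers already used in
internal/parser/lexer_test.go, and the expectations match the cases
added below):

    func TestNumberSeparatorOddCases(t *testing.T) {
        // A dot after a separated fraction terminates the number.
        SingleTest(t, "1_2.3_4.5_6", "", Tokens{
            {kind: tokenNumber, data: "12.34"},
            {kind: tokenDot, data: "."},
            {kind: tokenNumber, data: "56"},
        })
        // A second exponent starts a fresh identifier token.
        SingleTest(t, "1e2_3e4", "", Tokens{
            {kind: tokenNumber, data: "1e23"},
            {kind: tokenIdentifier, data: "e4"},
        })
        // No separator may follow a leading zero.
        SingleTest(t, "0_5",
            "snippet:1:2 Couldn't lex number, _ not allowed after leading 0",
            Tokens{})
    }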
---
 internal/parser/lexer.go      | 27 +++++++++++++++++++--------
 internal/parser/lexer_test.go | 20 ++++++++++++++++++++
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 4327788be..b45e6e7a5 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -477,9 +477,10 @@ func (l *lexer) lexNumber() error {
 		numBegin numLexState = iota
 		numAfterZero
 		numAfterOneToNine
+		numAfterIntUnderscore
 		numAfterDot
 		numAfterDigit
-		numAfterUnderscore
+		numAfterFracUnderscore
 		numAfterE
 		numAfterExpSign
 		numAfterExpDigit
@@ -508,8 +509,9 @@ outerLoop:
 			case 'e', 'E':
 				state = numAfterE
 			case '_':
-				state = numAfterUnderscore
-
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, _ not allowed after leading 0"),
+					l.location())
 			default:
 				break outerLoop
 			}
@@ -522,10 +524,20 @@ outerLoop:
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
 			case r == '_':
-				state = numAfterUnderscore
+				state = numAfterIntUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterIntUnderscore:
+			// The only valid transition out of _ is to a digit.
+			switch {
+			case r >= '0' && r <= '9':
+				state = numAfterOneToNine
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		case numAfterDot:
 			switch {
 			case r >= '0' && r <= '9':
@@ -542,15 +554,15 @@ outerLoop:
 			case r >= '0' && r <= '9':
 				state = numAfterDigit
 			case r == '_':
-				state = numAfterUnderscore
+				state = numAfterFracUnderscore
 			default:
 				break outerLoop
 			}
 
-		case numAfterUnderscore:
+		case numAfterFracUnderscore:
 			// The only valid transition out of _ is to a digit.
 			switch {
 			case r >= '0' && r <= '9':
-				state = numAfterOneToNine
+				state = numAfterDigit
 			default:
 				return l.makeStaticErrorPoint(
diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index 1a08da719..ea06d8b64 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -258,6 +258,16 @@ func TestNumber(t *testing.T) {
 			{kind: tokenOperator, data: "+"},
 			{kind: tokenNumber, data: "10"},
 		}},
+		{"1.2.3.4", "", Tokens{
+			{kind: tokenNumber, data: "1.2"},
+			{kind: tokenDot, data: "."},
+			{kind: tokenNumber, data: "3.4"},
+		}},
+		{"1e2.34", "", Tokens{
+			{kind: tokenNumber, data: "1e2"},
+			{kind: tokenDot, data: "."},
+			{kind: tokenNumber, data: "34"},
+		}},
 		{"1.+3", "snippet:1:3 Couldn't lex number, junk after decimal point: '+'", Tokens{}},
 		{"1e!", "snippet:1:3 Couldn't lex number, junk after 'E': '!'", Tokens{}},
 		{"1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{}},
@@ -286,6 +296,16 @@ func TestNumberSeparators(t *testing.T) {
 		{"1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}},
 		{"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}},
 		{"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}},
+		{"1_2.3_4.5_6.7_8", "", Tokens{
+			{kind: tokenNumber, data: "12.34"},
+			{kind: tokenDot, data: "."},
+			{kind: tokenNumber, data: "56.78"},
+		}},
+		{"1e2_3e4", "", Tokens{
+			{kind: tokenNumber, data: "1e23"},
+			{kind: tokenIdentifier, data: "e4"},
+		}},
+		{"0_5", "snippet:1:2 Couldn't lex number, _ not allowed after leading 0", Tokens{}},
 		{"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}},
 		{"123__456", "snippet:1:5 Couldn't lex number, junk after '_': '_'", Tokens{}},
 		{"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}},
From a44ee9452f03fb77555575e1fc8ccbe68e642920 Mon Sep 17 00:00:00 2001
From: John Bartholomew
Date: Mon, 26 Jan 2026 20:44:56 +0000
Subject: [PATCH 09/10] add end to end tests for number literals with
 underscore digit separators

---
 testdata/digitsep.golden        | 15 +++++++++++++++
 testdata/digitsep.jsonnet       | 19 +++++++++++++++++++
 testdata/digitsep.linter.golden |  0
 3 files changed, 34 insertions(+)
 create mode 100644 testdata/digitsep.golden
 create mode 100644 testdata/digitsep.jsonnet
 create mode 100644 testdata/digitsep.linter.golden

diff --git a/testdata/digitsep.golden b/testdata/digitsep.golden
new file mode 100644
index 000000000..e9795a491
--- /dev/null
+++ b/testdata/digitsep.golden
@@ -0,0 +1,15 @@
+{
+   "test_results": [
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true,
+      true
+   ]
+}
diff --git a/testdata/digitsep.jsonnet b/testdata/digitsep.jsonnet
new file mode 100644
index 000000000..cf80540e7
--- /dev/null
+++ b/testdata/digitsep.jsonnet
@@ -0,0 +1,19 @@
+local cases = [
+  [123456, '123_456'],
+  [1750000, '1_750_000'],
+  [123, '1_2_3'],
+  [3.141592, '3.141_592'],
+  [1200.0, '1_200.0'],
+  [0e101, '0e1_01'],
+  [1010e3, '10_10e3'],
+  [23e12, '2_3e1_2'],
+  [1.12e100, '1.1_2e100'],
+  [1.1e-101, '1.1e-10_1'],
+  [9.10938356e-31, '9.109_383_56e-31'],
+];
+
+local sepParse(s) = std.parseJson(std.strReplace(s, '_', ''));
+
+{
+  test_results: [std.assertEqual(c[0], sepParse(c[1])) for c in cases],
+}
diff --git a/testdata/digitsep.linter.golden b/testdata/digitsep.linter.golden
new file mode 100644
index 000000000..e69de29bb

From a52ac8dcebf10bc30aed9ea4afcb2ba90bfe5fc2 Mon Sep 17 00:00:00 2001
From: John Bartholomew
Date: Tue, 27 Jan 2026 16:16:20 +0000
Subject: [PATCH 10/10] inline processing of number text into lexNumber

---
 internal/parser/lexer.go | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index b45e6e7a5..8c8a3f065 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -358,21 +358,7 @@ func (l *lexer) resetTokenStart() {
 	l.tokenStartLoc = l.location()
 }
 
-// tokenKindPostprocessors defines a transformation of the lexed token string
-// before it is stored in the tokens list. It is optional for each token kind.
-var tokenKindPostprocessors = map[tokenKind]func(string) string{
-	tokenNumber: func(s string) string {
-		// Get rid of underscore digit separators.
-		return strings.ReplaceAll(s, "_", "")
-	},
-}
-
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
-	// Run the postprocessor if the token kind has one defined.
-	if pp, ok := tokenKindPostprocessors[kind]; ok {
-		data = pp(data)
-	}
-
 	l.tokens = append(l.tokens, token{
 		kind:              kind,
 		fodder:            l.fodder,
@@ -487,6 +473,7 @@ func (l *lexer) lexNumber() error {
 		numAfterExpUnderscore
 	)
 
+	var cb bytes.Buffer
 	state := numBegin
 
 outerLoop:
@@ -611,9 +598,13 @@ outerLoop:
 			}
 		}
 
+		if r != '_' {
+			cb.WriteRune(r)
+		}
 		l.next()
 	}
 
-	l.emitToken(tokenNumber)
+	l.emitFullToken(tokenNumber, cb.String(), "", "")
+	l.resetTokenStart()
 	return nil
 }
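Taken together, the final shape of the feature is: the lexNumber state
machine validates where '_' may appear, and the token text is
accumulated with the separators dropped. The accumulation step can be
sketched in isolation (illustrative only — in the patch it is inlined
in lexNumber rather than written as a separate function):

    package main

    import (
        "bytes"
        "fmt"
    )

    // stripDigitSeparators mirrors what patch 10 does while lexing:
    // copy every rune of an already-validated number except '_', so
    // the emitted token text carries no separators.
    func stripDigitSeparators(number string) string {
        var cb bytes.Buffer
        for _, r := range number {
            if r != '_' {
                cb.WriteRune(r)
            }
        }
        return cb.String()
    }

    func main() {
        fmt.Println(stripDigitSeparators("9.109_383_56e-31")) // 9.10938356e-31
    }

Doing this inline, rather than via the earlier tokenKindPostprocessors
map, avoids a second pass over the token text and keeps the
number-specific behaviour local to lexNumber.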