diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go index 0b343726..8c8a3f06 100644 --- a/internal/parser/lexer.go +++ b/internal/parser/lexer.go @@ -451,7 +451,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) { // that the next rune to be served by the lexer will be a leading digit. func (l *lexer) lexNumber() error { // This function should be understood with reference to the linked image: - // http://www.json.org/number.gif + // https://www.json.org/img/number.png // Note, we deviate from the json.org documentation as follows: // There is no reason to lex negative numbers as atomic tokens, it is better to parse them @@ -463,13 +463,17 @@ func (l *lexer) lexNumber() error { numBegin numLexState = iota numAfterZero numAfterOneToNine + numAfterIntUnderscore numAfterDot numAfterDigit + numAfterFracUnderscore numAfterE numAfterExpSign numAfterExpDigit + numAfterExpUnderscore ) + var cb bytes.Buffer state := numBegin outerLoop: @@ -492,6 +496,10 @@ outerLoop: state = numAfterDot case 'e', 'E': state = numAfterE + case '_': + return l.makeStaticErrorPoint( + "Couldn't lex number, _ not allowed after leading 0", + l.location()) default: break outerLoop } @@ -503,9 +511,21 @@ outerLoop: state = numAfterE case r >= '0' && r <= '9': state = numAfterOneToNine + case r == '_': + state = numAfterIntUnderscore default: break outerLoop } + case numAfterIntUnderscore: + // The only valid transition out of _ is to a digit.
+ switch { + case r >= '0' && r <= '9': + state = numAfterOneToNine + default: + return l.makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)), + l.location()) + } case numAfterDot: switch { case r >= '0' && r <= '9': @@ -521,9 +541,21 @@ outerLoop: state = numAfterE case r >= '0' && r <= '9': state = numAfterDigit + case r == '_': + state = numAfterFracUnderscore default: break outerLoop } + case numAfterFracUnderscore: + // The only valid transition out of _ is to a digit. + switch { + case r >= '0' && r <= '9': + state = numAfterDigit + default: + return l.makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)), + l.location()) + } case numAfterE: switch { case r == '+' || r == '-': @@ -545,16 +577,35 @@ } case numAfterExpDigit: - if r >= '0' && r <= '9' { + switch { + case r >= '0' && r <= '9': state = numAfterExpDigit - } else { + case r == '_': + state = numAfterExpUnderscore + default: break outerLoop } + + case numAfterExpUnderscore: + // The only valid transition out of _ is to a digit.
+ switch { + case r >= '0' && r <= '9': + state = numAfterExpDigit + default: + return l.makeStaticErrorPoint( + fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)), + l.location()) + } + } + + if r != '_' { + cb.WriteRune(r) } l.next() } - l.emitToken(tokenNumber) + l.emitFullToken(tokenNumber, cb.String(), "", "") + l.resetTokenStart() return nil } @@ -978,7 +1029,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)), l.location()) } - } } diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go index 66949d51..ea06d8b6 100644 --- a/internal/parser/lexer_test.go +++ b/internal/parser/lexer_test.go @@ -258,6 +258,16 @@ func TestNumber(t *testing.T) { {kind: tokenOperator, data: "+"}, {kind: tokenNumber, data: "10"}, }}, + {"1.2.3.4", "", Tokens{ + {kind: tokenNumber, data: "1.2"}, + {kind: tokenDot, data: "."}, + {kind: tokenNumber, data: "3.4"}, + }}, + {"1e2.34", "", Tokens{ + {kind: tokenNumber, data: "1e2"}, + {kind: tokenDot, data: "."}, + {kind: tokenNumber, data: "34"}, + }}, {"1.+3", "snippet:1:3 Couldn't lex number, junk after decimal point: '+'", Tokens{}}, {"1e!", "snippet:1:3 Couldn't lex number, junk after 'E': '!'", Tokens{}}, {"1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{}}, @@ -268,6 +278,49 @@ func TestNumber(t *testing.T) { } } +func TestNumberSeparators(t *testing.T) { + for _, c := range []struct { + input string + err string + tokens Tokens + }{ + {"123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}}, + {"1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}}, + {"1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}}, + {"3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}}, + {"01_100", "", Tokens{{kind: tokenNumber, data: "0"}, {kind: tokenNumber, data: "1100"}}}, + {"1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}}, + 
{"0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}}, + {"10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}}, + {"2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}}, + {"1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}}, + {"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}}, + {"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}}, + {"1_2.3_4.5_6.7_8", "", Tokens{ + {kind: tokenNumber, data: "12.34"}, + {kind: tokenDot, data: "."}, + {kind: tokenNumber, data: "56.78"}, + }}, + {"1e2_3e4", "", Tokens{ + {kind: tokenNumber, data: "1e23"}, + {kind: tokenIdentifier, data: "e4"}, + }}, + {"0_5", "snippet:1:2 Couldn't lex number, _ not allowed after leading 0", Tokens{}}, + {"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}}, + {"123__456", "snippet:1:5 Couldn't lex number, junk after '_': '_'", Tokens{}}, + {"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}}, + {"1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}}, + {"1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}}, + {"1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}}, + {"200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}}, + {"200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}}, + } { + t.Run(fmt.Sprintf("number %s", c.input), func(t *testing.T) { + SingleTest(t, c.input, c.err, c.tokens) + }) + } +} + func TestDoublestring1(t *testing.T) { SingleTest(t, "\"hi\"", "", Tokens{ {kind: tokenStringDouble, data: "hi"}, @@ -445,6 +498,12 @@ func TestIdentifiers(t *testing.T) { }) } +func TestIdentifierUnderscore(t *testing.T) { + SingleTest(t, "_123", "", Tokens{ + {kind: tokenIdentifier, data: "_123"}, + }) +} + func TestCppComment(t *testing.T) { SingleTest(t, "// hi", "", Tokens{ {kind: tokenEndOfFile, fodder: ast.Fodder{{Kind: 
ast.FodderParagraph, Comment: []string{"// hi"}}}}, diff --git a/testdata/digitsep.golden b/testdata/digitsep.golden new file mode 100644 index 00000000..e9795a49 --- /dev/null +++ b/testdata/digitsep.golden @@ -0,0 +1,15 @@ +{ + "test_results": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ] +} diff --git a/testdata/digitsep.jsonnet b/testdata/digitsep.jsonnet new file mode 100644 index 00000000..cf80540e --- /dev/null +++ b/testdata/digitsep.jsonnet @@ -0,0 +1,19 @@ +local cases = [ + [123456, '123_456'], + [1750000, '1_750_000'], + [123, '1_2_3'], + [3.141592, '3.141_592'], + [1200.0, '1_200.0'], + [0e101, '0e1_01'], + [1010e3, '10_10e3'], + [23e12, '2_3e1_2'], + [1.12e100, '1.1_2e100'], + [1.1e-101, '1.1e-10_1'], + [9.10938356e-31, '9.109_383_56e-31'], +]; + +local sepParse(s) = std.parseJson(std.strReplace(s, '_', '')); + +{ + test_results: [std.assertEqual(c[0], sepParse(c[1])) for c in cases], +} diff --git a/testdata/digitsep.linter.golden b/testdata/digitsep.linter.golden new file mode 100644 index 00000000..e69de29b