From dde11a0d719f4d4ed10c857809e2d9c243d3b8e8 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 10:28:07 +0100 Subject: [PATCH 1/8] Copy pyimportparse into parsing.rs --- rust/Cargo.lock | 66 ++++ rust/Cargo.toml | 4 + rust/src/parsing.rs | 755 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 825 insertions(+) create mode 100644 rust/src/parsing.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 5e4650e5..f60f25f0 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -13,7 +13,11 @@ dependencies = [ "indexmap 2.7.1", "itertools", "lazy_static", + "nom 8.0.0", + "nom-regex", + "nom_locate", "parameterized", + "pyimportparse", "pyo3", "rayon", "regex", @@ -46,6 +50,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "230c5f1ca6a325a32553f8640d31ac9b49f2411e901e427570154868b46da4f7" +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + [[package]] name = "cfg-if" version = "1.0.0" @@ -227,6 +237,52 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom-regex" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e5c7731c4c1370b61604ed52a2475e861aac9e08dec9f23903d4ddfdc91c18" +dependencies = [ + "nom 7.1.3", + "regex", +] + +[[package]] +name = "nom_locate" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" +dependencies = [ + "bytecount", + "memchr", + "nom 8.0.0", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -291,6 +347,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pyimportparse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49f9c2424bb74b5fdc6d38e75c904890a8ac4d2ff52ba3da87cec4c8a412148b" +dependencies = [ + "nom 8.0.0", + "nom_locate", +] + [[package]] name = "pyo3" version = "0.24.1" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 3e6a9457..94e81cc1 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -22,6 +22,10 @@ rustc-hash = "2.1.0" indexmap = "2.7.1" regex = "1.11.1" const_format = "0.2.34" +pyimportparse = "0.2.1" +nom = "8.0.0" +nom_locate = "5.0.0" +nom-regex = "0.2.0" [dependencies.pyo3] version = "0.24.1" diff --git a/rust/src/parsing.rs b/rust/src/parsing.rs new file mode 100644 index 00000000..ee10f23c --- /dev/null +++ b/rust/src/parsing.rs @@ -0,0 +1,755 @@ +// This file was originally copied from the pyimportparse crate. +// License copied below, as per terms of the license. 
+// +// MIT License +// +// Copyright (c) 2025 Peter Byfield +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + + +use nom::branch::alt; +use nom::bytes::complete::{tag, take_until}; +use nom::character::complete::{ + alphanumeric1, line_ending, multispace1, not_line_ending, space0, space1, +}; +use nom::combinator::{all_consuming, opt, recognize, value, verify}; +use nom::multi::{many0, many1, separated_list1}; +use nom::sequence::{delimited, preceded, terminated}; +use nom::{IResult, Input, Parser}; +use nom_locate::{LocatedSpan, position}; + +type Span<'a> = LocatedSpan<&'a str>; + +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct Import { + pub imported_object: String, + pub line_number: u32, + pub line_contents: String, + pub typechecking_only: bool, +} + +impl Import { + pub fn new( + imported_object: String, + line_number: u32, + line_contents: String, + typechecking_only: bool, + ) -> Self { + Self { + imported_object, + line_number, + line_contents, + typechecking_only, + } + } +} + +pub fn parse_imports(s: &str) -> Result<Vec<Import>, String> { + let s = Span::new(s); + let (_, result) = all_consuming(parse_block(false)) + .parse(s) + .map_err(|e| e.to_string())?; + Ok(result) +} + +fn parse_block(typechecking_only: bool) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let (s, result) = many0(alt(( + parse_if_typechecking, + value(vec![], parse_space1), + value(vec![], line_ending), + value(vec![], parse_multiline_comment), + value(vec![], parse_comment), + parse_import_statement_list(typechecking_only), + value(vec![], verify(not_line_ending, |s: &Span| !s.is_empty())), + ))) + .parse(s)?; + Ok((s, result.into_iter().flatten().collect())) + } +} + +fn parse_import_statement_list( + typechecking_only: bool, +) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let (s, imports) = separated_list1( + delimited(parse_space0, tag(";"), parse_space0), + alt(( + parse_import_statement(typechecking_only), + parse_from_import_statement(typechecking_only), + parse_multiline_from_import_statement(typechecking_only), + parse_wildcard_from_import_statement(typechecking_only), + )), + ) + .parse(s)?; + let (s, _) = (opt(parse_space0), opt(tag(";"))).parse(s)?; + Ok((s, imports.into_iter().flatten().collect())) + } +} + +fn parse_import_statement(typechecking_only: bool) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let input = s; + let (s, position) = position.parse(s)?; + let (s, _) = (tag("import"), parse_space1).parse(s)?; + let (s, imported_modules) = separated_list1(
delimited(parse_space0, tag(","), parse_space0), + terminated( + parse_module, + opt((parse_space1, tag("as"), parse_space1, parse_identifier)), + ), + ) + .parse(s)?; + + let (_, span) = input.take_split(s.location_offset() - input.location_offset()); + Ok(( + s, + imported_modules + .into_iter() + .map(|imported_module| { + Import::new( + imported_module.to_owned(), + position.location_line(), + (*span.fragment()).to_owned(), + typechecking_only, + ) + }) + .collect(), + )) + } +} + +fn parse_from_import_statement( + typechecking_only: bool, +) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let input = s; + let (s, position) = position.parse(s)?; + let (s, _) = (tag("from"), parse_space1).parse(s)?; + let (s, imported_module_base) = parse_relative_module.parse(s)?; + let (s, _) = (parse_space1, tag("import"), parse_space1).parse(s)?; + + let (s, imported_identifiers) = separated_list1( + delimited(parse_space0, tag(","), parse_space0), + terminated( + parse_identifier, + opt((parse_space1, tag("as"), parse_space1, parse_identifier)), + ), + ) + .parse(s)?; + + let (_, span) = input.take_split(s.location_offset() - input.location_offset()); + Ok(( + s, + imported_identifiers + .into_iter() + .map(|imported_identifier| { + let imported_object = if imported_module_base.ends_with(".") { + format!("{}{}", imported_module_base, imported_identifier) + } else { + format!("{}.{}", imported_module_base, imported_identifier) + }; + Import::new( + imported_object, + position.location_line(), + (*span.fragment()).to_owned(), + typechecking_only, + ) + }) + .collect(), + )) + } +} + +fn parse_multiline_from_import_statement( + typechecking_only: bool, +) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let input = s; + let (s, position) = position.parse(s)?; + let (s, _) = (tag("from"), parse_space1).parse(s)?; + let (s, imported_module_base) = parse_relative_module.parse(s)?; + let (s, _) = (parse_space1, tag("import"), parse_space1).parse(s)?; + + let (s, imported_identifiers) = delimited( + (tag("("), parse_multispace0_or_comment), + separated_list1( + delimited( + parse_multispace0_or_comment, + tag(","), + parse_multispace0_or_comment, + ), + terminated( + parse_identifier, + opt((multispace1, tag("as"), multispace1, parse_identifier)), + ), + ), + ( + parse_multispace0_or_comment, + opt(tag(",")), + parse_multispace0_or_comment, + tag(")"), + ), + ) + .parse(s)?; + + let (_, span) = input.take_split(s.location_offset() - input.location_offset()); + Ok(( + s, + imported_identifiers + .into_iter() + .map(|imported_identifier| { + let imported_object = if imported_module_base.ends_with(".") { + format!("{}{}", imported_module_base, imported_identifier) + } else { + format!("{}.{}", imported_module_base, imported_identifier) + }; + Import::new( + imported_object, + position.location_line(), + (*span.fragment()).to_owned(), + typechecking_only, + ) + }) + .collect(), + )) + } +} + +fn parse_wildcard_from_import_statement( + typechecking_only: bool, +) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { + move |s| { + let input = s; + let (s, position) = position.parse(s)?; + let (s, _) = (tag("from"), parse_space1).parse(s)?; + let (s, imported_module) = parse_relative_module.parse(s)?; + let (s, _) = (parse_space1, tag("import"), parse_space1, tag("*")).parse(s)?; + + let imported_object = if imported_module.ends_with(".") { + format!("{}*", imported_module) + } else { + format!("{}.*", imported_module) + }; + + let (_, span) = input.take_split(s.location_offset() - input.location_offset()); + Ok(( + s, + vec![Import::new(
+ imported_object, + position.location_line(), + (*span.fragment()).to_owned(), + typechecking_only, + )], + )) + } +} + +fn parse_module(s: Span) -> IResult<Span, &str> { + let (s, result) = recognize(separated_list1(tag("."), parse_identifier)).parse(s)?; + Ok((s, result.fragment())) +} + +fn parse_relative_module(s: Span) -> IResult<Span, &str> { + let (s, result) = alt(( + recognize((many0(tag(".")), parse_module)), + recognize(many1(tag("."))), + )) + .parse(s)?; + Ok((s, result.fragment())) +} + +fn parse_identifier(s: Span) -> IResult<Span, &str> { + let (s, result) = recognize(many1(alt((alphanumeric1, tag("_"))))).parse(s)?; + Ok((s, result.fragment())) +} + +fn parse_comment(s: Span) -> IResult<Span, ()> { + let (s, _) = (tag("#"), not_line_ending).parse(s)?; + Ok((s, ())) +} + +fn parse_multispace0_or_comment(s: Span) -> IResult<Span, ()> { + let (s, _) = many0(alt((value((), multispace1), parse_comment))).parse(s)?; + Ok((s, ())) +} + +fn parse_multiline_comment(s: Span) -> IResult<Span, ()> { + let (s, _) = alt(( + delimited(tag(r#"""""#), take_until(r#"""""#), tag(r#"""""#)), + delimited(tag(r#"'''"#), take_until(r#"'''"#), tag(r#"'''"#)), + )) + .parse(s)?; + Ok((s, ())) +} + +fn parse_space0(s: Span) -> IResult<Span, ()> { + let (s, _) = many0(alt((space1, tag("\\\n")))).parse(s)?; + Ok((s, ())) +} + +fn parse_space1(s: Span) -> IResult<Span, ()> { + let (s, _) = many1(alt((space1, tag("\\\n")))).parse(s)?; + Ok((s, ())) +} + +fn parse_if_typechecking(s: Span) -> IResult<Span, Vec<Import>> { + let (s, _) = ( + tag("if"), + parse_space1, + alt((tag("TYPE_CHECKING"), tag("typing.TYPE_CHECKING"))), + parse_space0, + tag(":"), + ) + .parse(s)?; + + if let Ok((s, imports)) = preceded( + parse_space0, + terminated( + parse_import_statement_list(true), + (parse_space0, opt(parse_comment)), + ), + ) + .parse(s) + { + return Ok((s, imports)); + }; + + let (s, _) = (parse_space0, opt(parse_comment), line_ending).parse(s)?; + let (s, indented_block) = parse_indented_block.parse(s)?; + let (_, imports) = all_consuming(parse_block(true)).parse(indented_block)?; + Ok((s, imports)) +} + +fn parse_indented_block(s: Span) -> IResult<Span, Span> { + let input = s; + + let (s, _) = many0((space0, line_ending)).parse(s)?; + let (s, (indentation, _, _)) = (space1, not_line_ending, opt(line_ending)).parse(s)?; + + let (s, _) = many0(alt(( + value((), (space0, line_ending)), + value( + (), + ( + tag(*indentation.fragment()), + not_line_ending, + opt(line_ending), + ), + ), + ))) + .parse(s)?; + + Ok(input.take_split(s.location_offset() - input.location_offset())) +} + +#[cfg(test)] +mod tests { + use super::parse_imports; + use parameterized::parameterized; + + #[test] + fn test_parse_empty_string() { + let imports = parse_imports("").unwrap(); + assert!(imports.is_empty()); + } + + fn parse_and_check(case: (&str, &[&str])) { + let (code, expected_imports) = case; + let imports = parse_imports(code).unwrap(); + assert_eq!( + expected_imports, + imports + .into_iter() + .map(|i| i.imported_object) + .collect::<Vec<_>>() + ); + } + + fn parse_and_check_with_typechecking_only(case: (&str, &[(&str, bool)])) { + let (code, expected_imports) = case; + let imports = parse_imports(code).unwrap(); + assert_eq!( + expected_imports + .iter() + .map(|i| (i.0.to_owned(), i.1)) + .collect::<Vec<_>>(), + imports + .into_iter() + .map(|i| (i.imported_object, i.typechecking_only)) + .collect::<Vec<_>>() + ); + } + + #[parameterized(case = { + ("import foo", &["foo"]), + ("import foo_FOO_123", &["foo_FOO_123"]), + ("import foo.bar", &["foo.bar"]), + ("import foo.bar.baz", &["foo.bar.baz"]), + ("import foo, bar, bax", &["foo", "bar", "bax"]),
("import foo as FOO", &["foo"]), + ("import foo as FOO, bar as BAR", &["foo", "bar"]), + ("import foo as FOO , bar as BAR", &["foo", "bar"]), + ("import foo # Comment", &["foo"]), + })] + fn test_parse_import_statement(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + ("from foo import bar", &["foo.bar"]), + ("from foo import bar_BAR_123", &["foo.bar_BAR_123"]), + ("from .foo import bar", &[".foo.bar"]), + ("from ..foo import bar", &["..foo.bar"]), + ("from . import foo", &[".foo"]), + ("from .. import foo", &["..foo"]), + ("from foo.bar import baz", &["foo.bar.baz"]), + ("from .foo.bar import baz", &[".foo.bar.baz"]), + ("from ..foo.bar import baz", &["..foo.bar.baz"]), + ("from foo import bar, baz, bax", &["foo.bar", "foo.baz", "foo.bax"]), + ("from foo import bar as BAR", &["foo.bar"]), + ("from foo import bar as BAR, baz as BAZ", &["foo.bar", "foo.baz"]), + ("from foo import bar as BAR , baz as BAZ", &["foo.bar", "foo.baz"]), + ("from foo import bar # Comment", &["foo.bar"]), + })] + fn test_parse_from_import_statement(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + ("from foo import (bar)", &["foo.bar"]), + ("from foo import (bar,)", &["foo.bar"]), + ("from foo import (bar, baz)", &["foo.bar", "foo.baz"]), + ("from foo import (bar, baz,)", &["foo.bar", "foo.baz"]), + ("from foo import (bar as BAR, baz as BAZ,)", &["foo.bar", "foo.baz"]), + ("from foo import ( bar as BAR , baz as BAZ , )", &["foo.bar", "foo.baz"]), + ("from foo import (bar, baz,) # Comment", &["foo.bar", "foo.baz"]), + + (r#" +from foo import ( + bar, + baz +) + "#, &["foo.bar", "foo.baz"]), + + (r#" +from foo import ( + bar, + baz, +) + "#, &["foo.bar", "foo.baz"]), + + (r#" +from foo import ( + a, b, + c, d, +) + "#, &["foo.a", "foo.b", "foo.c", "foo.d"]), + + // As name + (r#" +from foo import ( + bar as BAR, + baz as BAZ, +) + "#, &["foo.bar", "foo.baz"]), + + // Whitespace + (r#" +from foo import ( + + bar as BAR , + + baz as BAZ , + +) + "#, &["foo.bar", "foo.baz"]), + + // Comments + (r#" +from foo import ( # C + # C + bar as BAR, # C + # C + baz as BAZ, # C + # C +) # C + "#, &["foo.bar", "foo.baz"]), + })] + fn test_parse_multiline_from_import_statement(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + ("from foo import *", &["foo.*"]), + ("from .foo import *", &[".foo.*"]), + ("from ..foo import *", &["..foo.*"]), + ("from . import *", &[".*"]), + ("from .. 
import *", &["..*"]), + ("from foo import *", &["foo.*"]), + ("from foo import * # Comment", &["foo.*"]), + })] + fn test_parse_wildcard_from_import_statement(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + ("import a; import b", &["a", "b"]), + ("import a; import b;", &["a", "b"]), + ("import a ; import b ;", &["a", "b"]), + ("import a; import b # Comment", &["a", "b"]), + ("import a; from b import c; from d import (e); from f import *", &["a", "b.c", "d.e", "f.*"]), + })] + fn test_parse_import_statement_list(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + (r#" +import a, b, \ + c, d + "#, &["a", "b", "c", "d"]), + + (r#" +from foo import a, b, \ + c, d + "#, &["foo.a", "foo.b", "foo.c", "foo.d"]), + + (r#" +from foo \ + import * + "#, &["foo.*"]), + })] + fn test_backslash_continuation(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + (r#" +import a +def foo(): + import b +import c + "#, &["a", "b", "c"]), + + (r#" +import a +class Foo: + import b +import c + "#, &["a", "b", "c"]), + })] + fn test_parse_nested_imports(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[parameterized(case = { + (r#" +import foo +if typing.TYPE_CHECKING: import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING: import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING : import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING: import bar as BAR +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo # C +if TYPE_CHECKING: import bar # C +import baz # C +"#, &[("foo", false), ("bar", true), ("baz", false)]), + })] + fn test_singleline_if_typechecking(case: (&str, &[(&str, bool)])) { + parse_and_check_with_typechecking_only(case); + } + + #[parameterized(case = { + (r#" +import foo +if typing.TYPE_CHECKING: + import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING: + import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo + +if TYPE_CHECKING : + + import bar + +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING: + import bar as BAR +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo # C +if TYPE_CHECKING: # C + # C + import bar # C + # C +import baz # C +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if TYPE_CHECKING: + """ + Comment + """ + import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + })] + fn test_multiline_if_typechecking(case: (&str, &[(&str, bool)])) { + parse_and_check_with_typechecking_only(case); + } + + #[parameterized(case = { + (r#" +import foo +""" +import bar +""" +import baz +"#, &["foo", "baz"]), + + (r#" +import foo +""" +import bar +""" # foo +import baz +"#, &["foo", "baz"]), + + (r#" +import foo +''' +import bar +''' +import baz +"#, &["foo", "baz"]), + })] + fn test_multiline_strings(case: (&str, &[&str])) { + parse_and_check(case); + } + + #[test] + fn test_parse_line_numbers() { + let imports = parse_imports( + " +import a +from b import c +from d import (e) +from f import *", + ) + .unwrap(); + assert_eq!( + vec![ + ("a".to_owned(), 2_u32), + ("b.c".to_owned(), 3_u32), + ("d.e".to_owned(), 4_u32), + 
("f.*".to_owned(), 5_u32), + ], + imports + .into_iter() + .map(|i| (i.imported_object, i.line_number)) + .collect::>() + ); + } + + #[test] + fn test_parse_line_numbers_if_typechecking() { + let imports = parse_imports( + " +import a +if TYPE_CHECKING: + from b import c +from d import (e) +if TYPE_CHECKING: + from f import *", + ) + .unwrap(); + assert_eq!( + vec![ + ("a".to_owned(), 2_u32, false), + ("b.c".to_owned(), 4_u32, true), + ("d.e".to_owned(), 5_u32, false), + ("f.*".to_owned(), 7_u32, true), + ], + imports + .into_iter() + .map(|i| (i.imported_object, i.line_number, i.typechecking_only)) + .collect::>() + ); + } + + #[test] + fn test_parse_line_contents() { + let imports = parse_imports( + " +import a +from b import c +from d import (e) +from f import *", + ) + .unwrap(); + assert_eq!( + vec![ + ("a".to_owned(), "import a".to_owned()), + ("b.c".to_owned(), "from b import c".to_owned()), + ("d.e".to_owned(), "from d import (e)".to_owned()), + ("f.*".to_owned(), "from f import *".to_owned()), + ], + imports + .into_iter() + .map(|i| (i.imported_object, i.line_contents)) + .collect::>() + ); + } +} \ No newline at end of file From 598f14af10b646eb0a62e8bca4174ae95a457770 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 11:25:05 +0100 Subject: [PATCH 2/8] Add Python test that comments are included --- tests/unit/adaptors/test_importscanner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index f01480ee..601b9108 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -41,7 +41,7 @@ importer=Module("foo.one"), imported=Module("externaltwo"), line_number=3, - line_contents="import externaltwo.subpackage", + line_contents="import externaltwo.subpackage # with comment afterwards.", ), }, ), @@ -54,7 +54,7 @@ def test_absolute_imports(include_external_packages, expected_result): "/path/to/foo/one.py": """ import foo.two import externalone - import externaltwo.subpackage + import externaltwo.subpackage # with comment afterwards. arbitrary_expression = 1 """ } @@ -344,7 +344,7 @@ def test_import_of_portion_not_in_graph(include_external_packages): importer=Module("foo.one.blue"), imported=Module("external"), line_number=6, - line_contents="from external.two import blue", + line_contents="from external.two import blue # with comment afterwards.", ), }, ), @@ -382,7 +382,7 @@ def test_absolute_from_imports(include_external_packages, expected_result): if t.TYPE_CHECKING: from foo import three from external import one - from external.two import blue + from external.two import blue # with comment afterwards. 
arbitrary_expression = 1 """ }, From eab7b15cb50a8936e0dcf7fe4c2aca6df43 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 10:28:38 +0100 Subject: [PATCH 3/8] Expose parse_to_imported_objects function --- rust/src/lib.rs | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b4506b0d..994d92e7 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -2,6 +2,7 @@ pub mod errors; pub mod exceptions; pub mod graph; pub mod module_expressions; +mod parsing; use crate::errors::{GrimpError, GrimpResult}; use crate::exceptions::{InvalidModuleExpression, ModuleNotPresent, NoSuchContainer}; @@ -13,13 +14,15 @@ use itertools::Itertools; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyDict, PyFrozenSet, PyList, PySet, PyString, PyTuple}; -use pyo3::IntoPyObjectExt; +use pyo3::{IntoPyObjectExt}; use rayon::prelude::*; use rustc_hash::FxHashSet; use std::collections::HashSet; + #[pymodule] fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_wrapped(wrap_pyfunction!(parse_to_imported_objects))?; m.add_class::<GraphWrapper>()?; m.add("ModuleNotPresent", py.get_type::<ModuleNotPresent>())?; m.add("NoSuchContainer", py.get_type::<NoSuchContainer>())?; @@ -30,6 +33,32 @@ fn _rustgrimp(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { Ok(()) } + + + +#[pyfunction] +fn parse_to_imported_objects<'py>(py: Python<'py>, python_code: &str) -> PyResult<Vec<Bound<'py, PyDict>>> { + let imports = match parsing::parse_imports(python_code) { + Ok(imports) => imports, + Err(message) => return Err(PyValueError::new_err(message)), + }; + + let imports_as_dicts = imports.into_iter().map( + |import| { + let dict = PyDict::new(py); + dict.set_item("name", import.imported_object).unwrap(); + dict.set_item("line_number", import.line_number).unwrap(); + dict.set_item("line_contents", import.line_contents).unwrap(); + dict.set_item("typechecking_only", import.typechecking_only).unwrap(); + dict + } + ).collect(); + + Ok(imports_as_dicts) +} + + + #[pyclass(name = "Graph")] struct GraphWrapper { _graph: Graph, From c80bfd11b97039d7941b97431847a7d62f635551 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 10:29:13 +0100 Subject: [PATCH 4/8] Call rust parser from Python --- src/grimp/adaptors/importscanner.py | 26 +++++++++++++--------- tests/assets/syntaxerrorpackage/foo/one.py | 4 +++- tests/assets/syntaxerrorpackage/foo/two.py | 0 tests/functional/test_error_handling.py | 23 +++++++++---------- 4 files changed, 29 insertions(+), 24 deletions(-) create mode 100644 tests/assets/syntaxerrorpackage/foo/two.py diff --git a/src/grimp/adaptors/importscanner.py b/src/grimp/adaptors/importscanner.py index 863ebb64..36317efe 100644 --- a/src/grimp/adaptors/importscanner.py +++ b/src/grimp/adaptors/importscanner.py @@ -11,6 +11,7 @@ from grimp.application.ports.importscanner import AbstractImportScanner from grimp.application.ports.modulefinder import FoundPackage from grimp.domain.valueobjects import DirectImport, Module +from grimp import _rustgrimp as rust # type: ignore[attr-defined] logger = logging.getLogger(__name__) @@ -138,16 +139,21 @@ def _module_is_package(self, module_filename: str) -> bool: @staticmethod def _get_raw_imported_objects(module_contents: str) -> Set[_ImportedObject]: -
dicts_from_rust = rust.parse_to_imported_objects(module_contents) + objects_from_rust = {_ImportedObject(**object_kwargs) for object_kwargs in dicts_from_rust} + + # TODO - remove these lines once we're confident the rust way + # is consistent with ast. + # + # module_lines = module_contents.splitlines() + # ast_tree = ast.parse(module_contents) + # visitor = _TreeVisitor(module_lines=module_lines) + # visitor.visit(ast_tree) + # objects_from_ast = visitor.imported_objects + # + # assert objects_from_rust == objects_from_ast, "Discrepancy!" + + return objects_from_rust @staticmethod def _get_absolute_imported_object_name( diff --git a/tests/assets/syntaxerrorpackage/foo/one.py b/tests/assets/syntaxerrorpackage/foo/one.py index cbf77873..5587e1b8 100644 --- a/tests/assets/syntaxerrorpackage/foo/one.py +++ b/tests/assets/syntaxerrorpackage/foo/one.py @@ -2,4 +2,6 @@ x = 1 -fromb . import two +fromb . import three + +from . import two \ No newline at end of file diff --git a/tests/assets/syntaxerrorpackage/foo/two.py b/tests/assets/syntaxerrorpackage/foo/two.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/functional/test_error_handling.py b/tests/functional/test_error_handling.py index 2e0cd137..ab816b2c 100644 --- a/tests/functional/test_error_handling.py +++ b/tests/functional/test_error_handling.py @@ -1,4 +1,3 @@ -import os import re import pytest # type: ignore @@ -6,19 +5,17 @@ from grimp import build_graph, exceptions -def test_syntax_error_includes_module(): - dirname = os.path.dirname(__file__) - filename = os.path.abspath( - os.path.join(dirname, "..", "assets", "syntaxerrorpackage", "foo", "one.py") - ) +def test_does_not_raise_exception_if_encounters_syntax_error(): + """ + We don't make any promises about what to do if there's a syntax error, + but the parser isn't a complete implementation of the Python parser, so + it may just ignore it - finding other imports, as in this case. + """ + graph = build_graph("syntaxerrorpackage", cache_dir=None) - with pytest.raises(exceptions.SourceSyntaxError) as excinfo: - build_graph("syntaxerrorpackage", cache_dir=None) - - expected_exception = exceptions.SourceSyntaxError( - filename=filename, lineno=5, text="fromb . import two\n" - ) - assert expected_exception == excinfo.value + assert graph.find_modules_directly_imported_by("syntaxerrorpackage.foo.one") == { + "syntaxerrorpackage.foo.two" + } def test_missing_root_init_file(): From 08157bc6ddc53d427d0cb6a19cd12b1aa72227a7 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 15:55:07 +0100 Subject: [PATCH 5/8] Update CHANGELOG --- CHANGELOG.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7ae982e5..cea66adb 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,13 @@ Changelog ========= +latest +------ + +* Use Rust instead of Python's built-in ast module for import parsing. + Note that `SourceSyntaxError` will no longer be raised if the parser encounters + invalid Python. + 3.8.2 (2025-04-24) ------------------ From 9d1f214685b60b6711bf335d02f827a4d275e6c6 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 13:47:10 +0100 Subject: [PATCH 6/8] Correct inconsistencies with Rust-based parsing Prior to this, the parsing behaved differently from the Python-based parsing in three ways: 1. The line contents only included up to the last matched token, meaning comments afterwards were excluded. 2. The type_checking conditional did not work for an alias of the `typing` module, e.g. 
`if t.TYPE_CHECKING`. 3. Non-ascii Python identifiers weren't supported. This commit fixes all three of those. --- rust/Cargo.lock | 10 ++++ rust/Cargo.toml | 1 + rust/src/parsing.rs | 73 ++++++++++++++++++++--- tests/unit/adaptors/test_importscanner.py | 21 +++++++ 4 files changed, 96 insertions(+), 9 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index f60f25f0..27774e70 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -15,6 +15,7 @@ dependencies = [ "lazy_static", "nom 8.0.0", "nom-regex", + "nom-unicode", "nom_locate", "parameterized", "pyimportparse", @@ -272,6 +273,15 @@ dependencies = [ "regex", ] +[[package]] +name = "nom-unicode" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00dea06b6b61609274f105c468f1ea996d2f7226bfa528b4de79c05eabf5f759" +dependencies = [ + "nom 8.0.0", +] + [[package]] name = "nom_locate" version = "5.0.0" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 94e81cc1..aab46c79 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -26,6 +26,7 @@ pyimportparse = "0.2.1" nom = "8.0.0" nom_locate = "5.0.0" nom-regex = "0.2.0" +nom-unicode = "0.4.0" [dependencies.pyo3] version = "0.24.1" diff --git a/rust/src/parsing.rs b/rust/src/parsing.rs index ee10f23c..b9500126 100644 --- a/rust/src/parsing.rs +++ b/rust/src/parsing.rs @@ -27,8 +27,9 @@ use nom::branch::alt; use nom::bytes::complete::{tag, take_until}; use nom::character::complete::{ - alphanumeric1, line_ending, multispace1, not_line_ending, space0, space1, + line_ending, multispace1, not_line_ending, space0, space1, }; +use nom_unicode::complete::alphanumeric1; use nom::combinator::{all_consuming, opt, recognize, value, verify}; use nom::multi::{many0, many1, separated_list1}; use nom::sequence::{delimited, preceded, terminated}; @@ -61,12 +62,28 @@ impl Import { } } -pub fn parse_imports(s: &str) -> Result<Vec<Import>, String> { - let s = Span::new(s); - let (_, result) = all_consuming(parse_block(false)) - .parse(s) +pub fn parse_imports(python_file_contents: &str) -> Result<Vec<Import>, String> { + let span = Span::new(python_file_contents); + let (_, imports) = all_consuming(parse_block(false)) + .parse(span) .map_err(|e| e.to_string())?; - Ok(result) + Ok(with_corrected_line_contents(python_file_contents, imports)) +} + +// Return the imports, but with the full line contents. +// Currently our nom parsing only pulls out to the latest token, so we correct it here +// by finding the corresponding whole line, based on the line number. +// TODO: adjust the parsing code so it figures it out correctly in the first place. +fn with_corrected_line_contents(python_file_contents: &str, imports: Vec<Import>) -> Vec<Import> { + let lines: Vec<&str> = python_file_contents.lines().collect(); + imports.into_iter().map( + |import| Import { + imported_object: import.imported_object, + line_number: import.line_number, + line_contents: lines[(import.line_number as usize) - 1].trim_start().to_string(), + typechecking_only: import.typechecking_only, + } + ).collect() } fn parse_block(typechecking_only: bool) -> impl Fn(Span) -> IResult<Span, Vec<Import>> { @@ -276,8 +293,18 @@ fn parse_relative_module(s: Span) -> IResult<Span, &str> { Ok((s, result.fragment())) } + +// Parse a valid Python identifier. +// +// Note this is not implemented as thoroughly as in the Python spec. +// Some identifiers will be valid here (e.g. ones that begin with digits) +// that aren't actually valid in Python. Unicode identifiers are supported.
+// +// See https://docs.python.org/3/reference/lexical_analysis.html#identifiers fn parse_identifier(s: Span) -> IResult<Span, &str> { - let (s, result) = recognize(many1(alt((alphanumeric1, tag("_"))))).parse(s)?; + let (s, result) = recognize( + many1(alt((alphanumeric1, tag("_")))) + ).parse(s)?; Ok((s, result.fragment())) } @@ -310,11 +337,17 @@ fn parse_space1(s: Span) -> IResult<Span, ()> { Ok((s, ())) } + fn parse_if_typechecking(s: Span) -> IResult<Span, Vec<Import>> { let (s, _) = ( tag("if"), parse_space1, - alt((tag("TYPE_CHECKING"), tag("typing.TYPE_CHECKING"))), + alt( + ( + tag("TYPE_CHECKING"), + preceded(parse_identifier, tag(".TYPE_CHECKING")), + ), + ), parse_space0, tag(":"), ) @@ -575,6 +608,18 @@ import baz (r#" import foo +if t.TYPE_CHECKING: import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo +if some_WE1RD_alias.TYPE_CHECKING: import bar +import baz +"#, &[("foo", false), ("bar", true), ("baz", false)]), + + (r#" +import foo if TYPE_CHECKING : import bar import baz "#, &[("foo", false), ("bar", true), ("baz", false)]), @@ -734,17 +779,27 @@ if TYPE_CHECKING: let imports = parse_imports( " import a +import a.b # Comment afterwards. from b import c from d import (e) -from f import *", +from f import * +from something.foo import * # Comment afterwards. +if True: + from indented import foo +from ñon_ascii_变 import ラーメン +", ) .unwrap(); assert_eq!( vec![ ("a".to_owned(), "import a".to_owned()), + ("a.b".to_owned(), "import a.b # Comment afterwards.".to_owned()), ("b.c".to_owned(), "from b import c".to_owned()), ("d.e".to_owned(), "from d import (e)".to_owned()), ("f.*".to_owned(), "from f import *".to_owned()), + ("something.foo.*".to_owned(), "from something.foo import * # Comment afterwards.".to_owned()), + ("indented.foo".to_owned(), "from indented import foo".to_owned()), + ("ñon_ascii_变.ラーメン".to_owned(), "from ñon_ascii_变 import ラーメン".to_owned()), ], imports .into_iter() diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index 601b9108..5fe19d7f 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -854,6 +854,9 @@ def test_get_raw_imports(): from . import g from .. import h from i import * +from ñon_ascii_变 import * +from . import ñon_ascii_变 +import ñon_ascii_变.ラーメン """ raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents) @@ -901,6 +904,24 @@ def test_get_raw_imports(): line_contents="from i import *", typechecking_only=False, ), + _ImportedObject( + name="ñon_ascii_变.*", + line_number=9, + line_contents="from ñon_ascii_变 import *", + typechecking_only=False, + ), + _ImportedObject( + name=".ñon_ascii_变", + line_number=10, + line_contents="from . 
import ñon_ascii_变", + typechecking_only=False, + ), + _ImportedObject( + name="ñon_ascii_变.ラーメン", + line_number=11, + line_contents="import ñon_ascii_变.ラーメン", + typechecking_only=False, + ), } From c1341c4d4b389e76a32c80cda575400c5455466a Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 15:48:39 +0100 Subject: [PATCH 7/8] Turn test_get_raw_imports into a class --- tests/unit/adaptors/test_importscanner.py | 161 +++++++++++----------- 1 file changed, 82 insertions(+), 79 deletions(-) diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index 5fe19d7f..fca4ab08 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -2,6 +2,7 @@ import pytest # type: ignore +from textwrap import dedent from grimp.adaptors.importscanner import ImportScanner, _ImportedObject from grimp.application.ports.modulefinder import FoundPackage, ModuleFile from grimp.domain.valueobjects import DirectImport, Module @@ -844,85 +845,87 @@ def _modules_to_module_files(modules: Set[Module]) -> Set[ModuleFile]: return {ModuleFile(module=module, mtime=some_mtime) for module in modules} -def test_get_raw_imports(): - module_contents = """\ -import a -if TYPE_CHECKING: - import b -from c import d -from .e import f -from . import g -from .. import h -from i import * -from ñon_ascii_变 import * -from . import ñon_ascii_变 -import ñon_ascii_变.ラーメン -""" - - raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents) - - assert raw_imported_objects == { - _ImportedObject( - name="a", - line_number=1, - line_contents="import a", - typechecking_only=False, - ), - _ImportedObject( - name="b", - line_number=3, - line_contents="import b", - typechecking_only=True, - ), - _ImportedObject( - name="c.d", - line_number=4, - line_contents="from c import d", - typechecking_only=False, - ), - _ImportedObject( - name=".e.f", - line_number=5, - line_contents="from .e import f", - typechecking_only=False, - ), - _ImportedObject( - name=".g", - line_number=6, - line_contents="from . import g", - typechecking_only=False, - ), - _ImportedObject( - name="..h", - line_number=7, - line_contents="from .. import h", - typechecking_only=False, - ), - _ImportedObject( - name="i.*", - line_number=8, - line_contents="from i import *", - typechecking_only=False, - ), - _ImportedObject( - name="ñon_ascii_变.*", - line_number=9, - line_contents="from ñon_ascii_变 import *", - typechecking_only=False, - ), - _ImportedObject( - name=".ñon_ascii_变", - line_number=10, - line_contents="from . import ñon_ascii_变", - typechecking_only=False, - ), - _ImportedObject( - name="ñon_ascii_变.ラーメン", - line_number=11, - line_contents="import ñon_ascii_变.ラーメン", - typechecking_only=False, - ), - } +class TestGetRawImportedObjects: + def test_get_raw_imports(self): + module_contents = dedent( + """\ + import a + if TYPE_CHECKING: + import b + from c import d + from .e import f + from . import g + from .. import h + from i import * + from ñon_ascii_变 import * + from . 
import ñon_ascii_变 + import ñon_ascii_变.ラーメン + """ + ) + raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents) + + assert raw_imported_objects == { + _ImportedObject( + name="a", + line_number=1, + line_contents="import a", + typechecking_only=False, + ), + _ImportedObject( + name="b", + line_number=3, + line_contents="import b", + typechecking_only=True, + ), + _ImportedObject( + name="c.d", + line_number=4, + line_contents="from c import d", + typechecking_only=False, + ), + _ImportedObject( + name=".e.f", + line_number=5, + line_contents="from .e import f", + typechecking_only=False, + ), + _ImportedObject( + name=".g", + line_number=6, + line_contents="from . import g", + typechecking_only=False, + ), + _ImportedObject( + name="..h", + line_number=7, + line_contents="from .. import h", + typechecking_only=False, + ), + _ImportedObject( + name="i.*", + line_number=8, + line_contents="from i import *", + typechecking_only=False, + ), + _ImportedObject( + name="ñon_ascii_变.*", + line_number=9, + line_contents="from ñon_ascii_变 import *", + typechecking_only=False, + ), + _ImportedObject( + name=".ñon_ascii_变", + line_number=10, + line_contents="from . import ñon_ascii_变", + typechecking_only=False, + ), + _ImportedObject( + name="ñon_ascii_变.ラーメン", + line_number=11, + line_contents="import ñon_ascii_变.ラーメン", + typechecking_only=False, + ), + } @pytest.mark.parametrize( From a39e4045df717443132137c0ab630f940c9e0570 Mon Sep 17 00:00:00 2001 From: David Seddon Date: Fri, 2 May 2025 16:50:29 +0100 Subject: [PATCH 8/8] Add failing tests for multiline comments --- tests/unit/adaptors/test_importscanner.py | 61 +++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/unit/adaptors/test_importscanner.py b/tests/unit/adaptors/test_importscanner.py index fca4ab08..a0db2017 100644 --- a/tests/unit/adaptors/test_importscanner.py +++ b/tests/unit/adaptors/test_importscanner.py @@ -927,6 +927,67 @@ def test_get_raw_imports(self): ), } + @pytest.mark.xfail(strict=True) + def test_multiline_comment_a(self): + module_contents = dedent( + """\ + import a + + FOO = \"\"\" + ... + \"\"\" + + \"\"\"import b + import c + \"\"\" + + import d + """ + ) + raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents) + + assert raw_imported_objects == { + _ImportedObject( + name="a", + line_number=1, + line_contents="import a", + typechecking_only=False, + ), + _ImportedObject( + name="d", + line_number=11, + line_contents="import d", + typechecking_only=False, + ), + } + + @pytest.mark.xfail(strict=True) + def test_multiline_comment_b(self): + module_contents = dedent( + """\ + r\"\"\" + A raw multiline string. + \"\"\" + + import a + + \"\"\" + Another multiline string. + \"\"\" + """ + ) + + raw_imported_objects = ImportScanner._get_raw_imported_objects(module_contents) + + assert raw_imported_objects == { + _ImportedObject( + name="a", + line_number=5, + line_contents="import a", + typechecking_only=False, + ), + } + @pytest.mark.parametrize( "is_package,imported_object_name,expected_absolute_imported_object_name",