diff --git a/src/cli/commands.rs b/src/cli/commands.rs index 397ba77..f56927c 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -78,7 +78,7 @@ impl StorageBudgetMode { name = "acb", about = "AgenticCodebase \u{2014} Semantic code compiler for AI agents", long_about = "AgenticCodebase compiles multi-language codebases into navigable concept \ - graphs that AI agents can query. Supports Python, Rust, TypeScript, and Go.\n\n\ + graphs that AI agents can query. Supports Python, Rust, TypeScript, JavaScript, Go, C++, Java, and C#.\n\n\ Quick start:\n\ \x20 acb compile ./my-project # build a graph\n\ \x20 acb info my-project.acb # inspect the graph\n\ @@ -127,7 +127,7 @@ pub enum Command { /// Compile a repository into an .acb graph file. /// /// Recursively scans the source directory, parses all supported languages - /// (Python, Rust, TypeScript, Go), performs semantic analysis, and writes + /// (Python, Rust, TypeScript, JavaScript, Go, C++, Java, C#), performs semantic analysis, and writes /// a compact binary .acb file for fast querying. /// /// Examples: @@ -1211,12 +1211,14 @@ fn cmd_compile( if cli.verbose { eprintln!(" {} Running semantic analysis...", s.info()); } - let unit_count = parse_result.units.len(); - progress("Analyzing", 0, unit_count); + const ANALYZE_PHASES: usize = 6; + progress("Analyzing", 0, ANALYZE_PHASES); let analyzer = SemanticAnalyzer::new(); let analyze_opts = AnalyzeOptions::default(); - let graph = analyzer.analyze(parse_result.units, &analyze_opts)?; - progress("Analyzing", unit_count, unit_count); + let graph = + analyzer.analyze_with_progress(parse_result.units, &analyze_opts, |step, total| { + progress("Analyzing", step, total); + })?; progress_done(); if cli.verbose { diff --git a/src/parse/java.rs b/src/parse/java.rs index f518ac4..68f4b86 100644 --- a/src/parse/java.rs +++ b/src/parse/java.rs @@ -2,16 +2,32 @@ //! //! Extracts classes, interfaces, enums, methods, constructors, imports, packages. 
-use std::path::Path; +use std::collections::HashMap; +use std::path::{Component, Path}; -use crate::types::{AcbResult, CodeUnitType, Language, Visibility}; +use crate::types::{AcbResult, CodeUnitType, Language, Span, Visibility}; use super::treesitter::{get_node_text, node_to_span}; -use super::{LanguageParser, RawCodeUnit}; +use super::{LanguageParser, RawCodeUnit, RawReference, ReferenceKind}; /// Java language parser. pub struct JavaParser; +#[derive(Default)] +struct SyntheticCounters { + lambda: u32, + anonymous: u32, +} + +#[derive(Clone)] +struct TraversalFrame<'a> { + node: tree_sitter::Node<'a>, + scope_qname: String, + current_type_qname: Option, + callable_temp_id: Option, + callable_qname: Option, +} + impl Default for JavaParser { fn default() -> Self { Self::new() @@ -24,109 +40,315 @@ impl JavaParser { Self } - fn extract_from_node( + fn walk_tree_iterative<'a>( &self, - node: tree_sitter::Node, + root: tree_sitter::Node<'a>, source: &str, file_path: &Path, + namespace_root: &str, units: &mut Vec, + index_by_temp_id: &mut HashMap, next_id: &mut u64, - parent_qname: &str, + counters: &mut SyntheticCounters, ) { - let mut cursor = node.walk(); - for child in node.children(&mut cursor) { - match child.kind() { + let mut pending_refs: HashMap> = HashMap::new(); + let mut stack = vec![TraversalFrame { + node: root, + scope_qname: namespace_root.to_string(), + current_type_qname: None, + callable_temp_id: None, + callable_qname: None, + }]; + + while let Some(frame) = stack.pop() { + let node = frame.node; + + if let Some(callable_id) = frame.callable_temp_id { + self.collect_inline_callable_refs(node, source, callable_id, &mut pending_refs); + } + + match node.kind() { "class_declaration" => { - self.extract_class( - child, + if let Some(unit) = self.extract_class_node( + node, source, file_path, - units, - next_id, - parent_qname, + &frame.scope_qname, CodeUnitType::Type, - ); + next_id, + ) { + let class_qname = unit.qualified_name.clone(); + 
self.push_unit(units, index_by_temp_id, unit); + self.push_children( + &mut stack, + node, + class_qname.clone(), + Some(class_qname), + None, + None, + ); + continue; + } } "interface_declaration" => { - self.extract_class( - child, + if let Some(unit) = self.extract_class_node( + node, source, file_path, - units, - next_id, - parent_qname, + &frame.scope_qname, CodeUnitType::Trait, - ); + next_id, + ) { + let class_qname = unit.qualified_name.clone(); + self.push_unit(units, index_by_temp_id, unit); + self.push_children( + &mut stack, + node, + class_qname.clone(), + Some(class_qname), + None, + None, + ); + continue; + } } - "enum_declaration" => { - self.extract_class( - child, + "enum_declaration" | "record_declaration" => { + if let Some(unit) = self.extract_class_node( + node, source, file_path, - units, - next_id, - parent_qname, + &frame.scope_qname, CodeUnitType::Type, - ); + next_id, + ) { + let class_qname = unit.qualified_name.clone(); + self.push_unit(units, index_by_temp_id, unit); + self.push_children( + &mut stack, + node, + class_qname.clone(), + Some(class_qname), + None, + None, + ); + continue; + } } - "record_declaration" => { - self.extract_class( - child, + "method_declaration" | "constructor_declaration" => { + if let Some(unit) = self.extract_method_node( + node, source, file_path, - units, + &frame.scope_qname, next_id, - parent_qname, - CodeUnitType::Type, - ); - } - "method_declaration" => { - if let Some(unit) = - self.extract_method(child, source, file_path, parent_qname, next_id) - { - units.push(unit); + ) { + let callable_id = unit.temp_id; + let callable_qname = unit.qualified_name.clone(); + self.push_unit(units, index_by_temp_id, unit); + self.push_children( + &mut stack, + node, + callable_qname.clone(), + frame.current_type_qname.clone(), + Some(callable_id), + Some(callable_qname), + ); + continue; } } - "constructor_declaration" => { - if let Some(unit) = - self.extract_method(child, source, file_path, parent_qname, next_id) 
- { - units.push(unit); + "import_declaration" => { + if let Some(unit) = self.extract_import_node( + node, + source, + file_path, + &frame.scope_qname, + next_id, + ) { + self.push_unit(units, index_by_temp_id, unit); } } - "import_declaration" => { - if let Some(unit) = - self.extract_import(child, source, file_path, parent_qname, next_id) - { - units.push(unit); + "lambda_expression" => { + if let Some(unit) = self.extract_lambda_node( + node, + source, + file_path, + &frame.scope_qname, + next_id, + counters, + ) { + let callable_id = unit.temp_id; + let callable_qname = unit.qualified_name.clone(); + self.push_unit(units, index_by_temp_id, unit); + self.push_children( + &mut stack, + node, + callable_qname.clone(), + frame.current_type_qname.clone(), + Some(callable_id), + Some(callable_qname), + ); + continue; } } - "package_declaration" => { - // Captured as module-level metadata, skip as unit + "object_creation_expression" => { + if let Some(class_body) = find_anonymous_class_body(node) { + if let Some(unit) = self.extract_anonymous_class_node( + node, + source, + file_path, + &frame.scope_qname, + next_id, + counters, + ) { + let anon_qname = unit.qualified_name.clone(); + self.push_unit(units, index_by_temp_id, unit); + + let mut children = collect_direct_children(node); + children.reverse(); + for child in children { + if child == class_body { + stack.push(TraversalFrame { + node: child, + scope_qname: anon_qname.clone(), + current_type_qname: Some(anon_qname.clone()), + callable_temp_id: None, + callable_qname: None, + }); + } else { + stack.push(TraversalFrame { + node: child, + scope_qname: frame.scope_qname.clone(), + current_type_qname: frame.current_type_qname.clone(), + callable_temp_id: frame.callable_temp_id, + callable_qname: frame.callable_qname.clone(), + }); + } + } + continue; + } + } } _ => {} } + + self.push_children( + &mut stack, + node, + frame.scope_qname, + frame.current_type_qname, + frame.callable_temp_id, + frame.callable_qname, + 
); + } + + for (temp_id, refs) in pending_refs { + if let Some(idx) = index_by_temp_id.get(&temp_id).copied() { + let unit = &mut units[idx]; + for r in refs { + push_reference(&mut unit.references, r.name, r.kind, r.span); + } + } + } + } + + fn push_children<'a>( + &self, + stack: &mut Vec>, + node: tree_sitter::Node<'a>, + scope_qname: String, + current_type_qname: Option, + callable_temp_id: Option, + callable_qname: Option, + ) { + let mut children = collect_direct_children(node); + children.reverse(); + for child in children { + stack.push(TraversalFrame { + node: child, + scope_qname: scope_qname.clone(), + current_type_qname: current_type_qname.clone(), + callable_temp_id, + callable_qname: callable_qname.clone(), + }); + } + } + + fn push_unit( + &self, + units: &mut Vec, + index_by_temp_id: &mut HashMap, + unit: RawCodeUnit, + ) { + index_by_temp_id.insert(unit.temp_id, units.len()); + units.push(unit); + } + + fn collect_inline_callable_refs( + &self, + node: tree_sitter::Node, + source: &str, + callable_temp_id: u64, + pending_refs: &mut HashMap>, + ) { + let refs = pending_refs.entry(callable_temp_id).or_default(); + + match node.kind() { + "method_invocation" => { + let name = node + .child_by_field_name("name") + .map(|n| get_node_text(n, source).trim().to_string()) + .unwrap_or_else(|| parse_call_name(get_node_text(node, source))); + push_reference(refs, name, ReferenceKind::Call, node_to_span(node)); + } + "object_creation_expression" => { + let ctor = node + .child_by_field_name("type") + .map(|n| normalize_type_text(get_node_text(n, source))) + .unwrap_or_else(|| parse_new_target(get_node_text(node, source))); + push_reference(refs, ctor, ReferenceKind::Call, node_to_span(node)); + } + "explicit_constructor_invocation" => { + let name = parse_call_name(get_node_text(node, source)); + push_reference(refs, name, ReferenceKind::Call, node_to_span(node)); + } + "field_access" => { + let name = node + .child_by_field_name("field") + .or_else(|| 
node.child_by_field_name("name")) + .map(|n| get_node_text(n, source).trim().to_string()) + .unwrap_or_else(|| parse_access_name(get_node_text(node, source))); + push_reference(refs, name, ReferenceKind::Access, node_to_span(node)); + } + "local_variable_declaration" + | "formal_parameter" + | "spread_parameter" + | "receiver_parameter" + | "catch_formal_parameter" => { + if let Some(type_node) = node.child_by_field_name("type") { + push_type_refs_from_text( + get_node_text(type_node, source), + node_to_span(type_node), + refs, + ); + } + } + _ => {} } } - #[allow(clippy::too_many_arguments)] - fn extract_class( + fn extract_class_node( &self, node: tree_sitter::Node, source: &str, file_path: &Path, - units: &mut Vec, - next_id: &mut u64, parent_qname: &str, unit_type: CodeUnitType, - ) { - let name = match node.child_by_field_name("name") { - Some(n) => get_node_text(n, source).to_string(), - None => return, - }; - let qname = java_qname(parent_qname, &name); - let span = node_to_span(node); - let vis = extract_java_visibility(node, source); + next_id: &mut u64, + ) -> Option { + let name = node + .child_by_field_name("name") + .map(|n| get_node_text(n, source).to_string())?; + let qname = java_qname(parent_qname, &name, None); let id = *next_id; *next_id += 1; @@ -136,20 +358,17 @@ impl JavaParser { Language::Java, name, file_path.to_path_buf(), - span, + node_to_span(node), ); unit.temp_id = id; - unit.qualified_name = qname.clone(); - unit.visibility = vis; - units.push(unit); - - // Recurse into the class body - if let Some(body) = node.child_by_field_name("body") { - self.extract_from_node(body, source, file_path, units, next_id, &qname); - } + unit.qualified_name = qname; + unit.visibility = extract_java_visibility(node, source); + extract_heritage_refs(node, source, &mut unit.references); + extract_direct_class_field_type_refs(node, source, &mut unit.references); + Some(unit) } - fn extract_method( + fn extract_method_node( &self, node: tree_sitter::Node, 
source: &str, @@ -159,9 +378,8 @@ impl JavaParser { ) -> Option { let name_node = node.child_by_field_name("name")?; let name = get_node_text(name_node, source).to_string(); - let qname = java_qname(parent_qname, &name); - let span = node_to_span(node); - let vis = extract_java_visibility(node, source); + let overload = method_overload_suffix(node, source); + let qname = java_qname(parent_qname, &name, Some(&overload)); let id = *next_id; *next_id += 1; @@ -178,18 +396,19 @@ impl JavaParser { let mut unit = RawCodeUnit::new( unit_type, Language::Java, - name, + name.clone(), file_path.to_path_buf(), - span, + node_to_span(node), ); unit.temp_id = id; unit.qualified_name = qname; - unit.visibility = vis; - + unit.signature = Some(method_signature(node, source, &name)); + unit.visibility = extract_java_visibility(node, source); + extract_method_header_type_refs(node, source, &mut unit.references); Some(unit) } - fn extract_import( + fn extract_import_node( &self, node: tree_sitter::Node, source: &str, @@ -216,7 +435,100 @@ impl JavaParser { span, ); unit.temp_id = id; - unit.qualified_name = java_qname(parent_qname, "import"); + unit.qualified_name = java_qname( + parent_qname, + &format!("import.{}", sanitize_qname_leaf(&unit.name)), + None, + ); + unit.references.push(RawReference { + name: unit.name.clone(), + kind: ReferenceKind::Import, + span, + }); + Some(unit) + } + + fn extract_lambda_node( + &self, + node: tree_sitter::Node, + source: &str, + file_path: &Path, + parent_qname: &str, + next_id: &mut u64, + counters: &mut SyntheticCounters, + ) -> Option { + counters.lambda += 1; + let span = node_to_span(node); + let name = format!( + "lambda${}_{}_{}", + span.start_line, span.start_col, counters.lambda + ); + let qname = java_qname(parent_qname, &name, None); + + let id = *next_id; + *next_id += 1; + + let mut unit = RawCodeUnit::new( + CodeUnitType::Function, + Language::Java, + name, + file_path.to_path_buf(), + span, + ); + unit.temp_id = id; + 
unit.qualified_name = qname; + unit.visibility = Visibility::Private; + + if let Some(params) = node.child_by_field_name("parameters") { + unit.signature = Some(get_node_text(params, source).trim().to_string()); + collect_type_refs_from_param_list(params, source, &mut unit.references); + } + + Some(unit) + } + + fn extract_anonymous_class_node( + &self, + node: tree_sitter::Node, + source: &str, + file_path: &Path, + parent_qname: &str, + next_id: &mut u64, + counters: &mut SyntheticCounters, + ) -> Option { + let class_body = find_anonymous_class_body(node)?; + counters.anonymous += 1; + + let span = node_to_span(class_body); + let name = format!( + "anonymous${}_{}_{}", + span.start_line, span.start_col, counters.anonymous + ); + let qname = java_qname(parent_qname, &name, None); + + let id = *next_id; + *next_id += 1; + + let mut unit = RawCodeUnit::new( + CodeUnitType::Type, + Language::Java, + name, + file_path.to_path_buf(), + span, + ); + unit.temp_id = id; + unit.qualified_name = qname; + unit.visibility = Visibility::Private; + + if let Some(type_node) = node.child_by_field_name("type") { + let type_text = normalize_type_text(get_node_text(type_node, source)); + push_reference( + &mut unit.references, + type_text, + ReferenceKind::Inherit, + node_to_span(type_node), + ); + } Some(unit) } @@ -230,36 +542,53 @@ impl LanguageParser for JavaParser { file_path: &Path, ) -> AcbResult> { let mut units = Vec::new(); + let mut index_by_temp_id: HashMap = HashMap::new(); let mut next_id = 0u64; + let mut counters = SyntheticCounters::default(); let module_name = file_path .file_stem() .and_then(|s| s.to_str()) .unwrap_or("unknown") .to_string(); + let package_name = extract_package_name(tree.root_node(), source) + .or_else(|| fallback_package_from_path(file_path)) + .unwrap_or_default(); + let namespace_root = if package_name.is_empty() { + module_name.clone() + } else { + package_name.clone() + }; + + self.walk_tree_iterative( + tree.root_node(), + source, + 
file_path, + &namespace_root, + &mut units, + &mut index_by_temp_id, + &mut next_id, + &mut counters, + ); - let root_span = node_to_span(tree.root_node()); + // Emit module after traversal to keep type/function lookup precedence + // for same-name symbols in a file. let mut module_unit = RawCodeUnit::new( CodeUnitType::Module, Language::Java, module_name.clone(), file_path.to_path_buf(), - root_span, + node_to_span(tree.root_node()), ); module_unit.temp_id = next_id; - module_unit.qualified_name = module_name.clone(); - next_id += 1; + let module_leaf = sanitize_qname_leaf(&module_name); + module_unit.qualified_name = if namespace_root.is_empty() { + format!("$module.{}", module_leaf) + } else { + format!("{}.$module.{}", namespace_root, module_leaf) + }; units.push(module_unit); - self.extract_from_node( - tree.root_node(), - source, - file_path, - &mut units, - &mut next_id, - &module_name, - ); - Ok(units) } @@ -272,11 +601,34 @@ impl LanguageParser for JavaParser { } } -fn java_qname(parent: &str, name: &str) -> String { - if parent.is_empty() { +fn collect_direct_children<'a>(node: tree_sitter::Node<'a>) -> Vec> { + let mut cursor = node.walk(); + node.children(&mut cursor).collect() +} + +fn find_anonymous_class_body(node: tree_sitter::Node) -> Option { + if let Some(class_body) = node.child_by_field_name("class_body") { + return Some(class_body); + } + + let mut cursor = node.walk(); + let found = node + .children(&mut cursor) + .find(|child| child.kind() == "class_body"); + found +} + +fn java_qname(parent: &str, name: &str, overload_suffix: Option<&str>) -> String { + let leaf = if let Some(suffix) = overload_suffix { + format!("{}${}", name, sanitize_qname_leaf(suffix)) + } else { name.to_string() + }; + + if parent.is_empty() { + leaf } else { - format!("{}.{}", parent, name) + format!("{}.{}", parent, leaf) } } @@ -288,14 +640,17 @@ fn extract_java_visibility(node: tree_sitter::Node, source: &str) -> Visibility let text = get_node_text(child, source); 
if text.contains("public") { return Visibility::Public; - } else if text.contains("private") { + } + if text.contains("private") { return Visibility::Private; - } else if text.contains("protected") { - return Visibility::Public; // close enough for graph purposes + } + if text.contains("protected") { + return Visibility::Protected; } } } - // Java default (package-private) — treat as public for graph + + // Java default (package-private): treat as public for graph Visibility::Public } @@ -317,3 +672,324 @@ fn has_annotation(node: tree_sitter::Node, source: &str, annotation: &str) -> bo } false } + +fn extract_package_name(root: tree_sitter::Node, source: &str) -> Option { + let mut cursor = root.walk(); + for child in root.children(&mut cursor) { + if child.kind() == "package_declaration" { + let package_text = get_node_text(child, source) + .trim_start_matches("package") + .trim_end_matches(';') + .trim(); + if is_valid_package(package_text) { + return Some(package_text.to_string()); + } + } + } + None +} + +fn fallback_package_from_path(file_path: &Path) -> Option { + let parent = file_path.parent()?; + let parts: Vec = parent + .components() + .filter_map(|c| match c { + Component::Normal(s) => s.to_str().map(|x| x.to_string()), + _ => None, + }) + .collect(); + + if parts.is_empty() { + return None; + } + + let markers = ["sources", "source", "src", "java", "kotlin"]; + if let Some(idx) = parts.iter().rposition(|p| { + let lower = p.to_ascii_lowercase(); + markers.contains(&lower.as_str()) + }) { + let candidate: Vec = parts[idx + 1..] 
+ .iter() + .filter(|seg| is_valid_package_segment(seg)) + .cloned() + .collect(); + if !candidate.is_empty() { + return Some(candidate.join(".")); + } + } + + let mut tail: Vec = parts + .iter() + .rev() + .filter(|seg| is_valid_package_segment(seg)) + .take(4) + .cloned() + .collect(); + tail.reverse(); + + if tail.is_empty() { + None + } else { + Some(tail.join(".")) + } +} + +fn is_valid_package(package_name: &str) -> bool { + !package_name.is_empty() && package_name.split('.').all(is_valid_package_segment) +} + +fn is_valid_package_segment(segment: &str) -> bool { + let mut chars = segment.chars(); + let Some(first) = chars.next() else { + return false; + }; + if !(first.is_ascii_alphabetic() || first == '_' || first == '$') { + return false; + } + chars.all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '$') +} + +fn method_signature(node: tree_sitter::Node, source: &str, method_name: &str) -> String { + let param_types = collect_parameter_type_texts(node, source); + let params = if param_types.is_empty() { + "()".to_string() + } else { + format!("({})", param_types.join(", ")) + }; + + let return_ty = node + .child_by_field_name("type") + .map(|n| format!(" -> {}", normalize_type_text(get_node_text(n, source)))) + .unwrap_or_default(); + + format!("{}{}{}", method_name, params, return_ty) +} + +fn method_overload_suffix(node: tree_sitter::Node, source: &str) -> String { + let param_types = collect_parameter_type_texts(node, source); + if !param_types.is_empty() { + return format!("sig_{}", sanitize_qname_leaf(¶m_types.join("_"))); + } + format!("arity_{}", parameter_arity(node)) +} + +fn parameter_arity(node: tree_sitter::Node) -> usize { + let Some(params) = node.child_by_field_name("parameters") else { + return 0; + }; + + let mut count = 0usize; + let mut cursor = params.walk(); + for child in params.children(&mut cursor) { + match child.kind() { + "formal_parameter" | "spread_parameter" | "receiver_parameter" => { + count += 1; + } + _ => {} + } + } + 
count +} + +fn collect_parameter_type_texts(node: tree_sitter::Node, source: &str) -> Vec { + let mut result = Vec::new(); + let Some(params) = node.child_by_field_name("parameters") else { + return result; + }; + + let mut cursor = params.walk(); + for child in params.children(&mut cursor) { + match child.kind() { + "formal_parameter" | "spread_parameter" | "receiver_parameter" => { + if let Some(type_node) = child.child_by_field_name("type") { + let text = normalize_type_text(get_node_text(type_node, source)); + if !text.is_empty() { + result.push(text); + } + } + } + _ => {} + } + } + + result +} + +fn collect_type_refs_from_param_list( + params: tree_sitter::Node, + source: &str, + refs: &mut Vec, +) { + let mut cursor = params.walk(); + for child in params.children(&mut cursor) { + match child.kind() { + "formal_parameter" | "spread_parameter" | "receiver_parameter" => { + if let Some(type_node) = child.child_by_field_name("type") { + push_type_refs_from_text( + get_node_text(type_node, source), + node_to_span(type_node), + refs, + ); + } + } + _ => {} + } + } +} + +fn extract_heritage_refs(node: tree_sitter::Node, source: &str, refs: &mut Vec) { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match child.kind() { + "superclass" => { + let text = get_node_text(child, source).trim(); + for ty in split_type_candidates(text.trim_start_matches("extends ")) { + push_reference(refs, ty, ReferenceKind::Inherit, node_to_span(child)); + } + } + "super_interfaces" | "interfaces" => { + let text = get_node_text(child, source).trim(); + for ty in split_type_candidates( + text.trim_start_matches("implements ") + .trim_start_matches("extends "), + ) { + push_reference(refs, ty, ReferenceKind::Implement, node_to_span(child)); + } + } + _ => {} + } + } +} + +fn extract_direct_class_field_type_refs( + class_node: tree_sitter::Node, + source: &str, + refs: &mut Vec, +) { + let Some(body) = class_node.child_by_field_name("body") else { + return; + 
}; + + let mut cursor = body.walk(); + for child in body.children(&mut cursor) { + if child.kind() == "field_declaration" { + if let Some(type_node) = child.child_by_field_name("type") { + push_type_refs_from_text( + get_node_text(type_node, source), + node_to_span(type_node), + refs, + ); + } + } + } +} + +fn extract_method_header_type_refs( + method_node: tree_sitter::Node, + source: &str, + refs: &mut Vec, +) { + if let Some(return_type) = method_node.child_by_field_name("type") { + push_type_refs_from_text( + get_node_text(return_type, source), + node_to_span(return_type), + refs, + ); + } + + if let Some(parameters) = method_node.child_by_field_name("parameters") { + collect_type_refs_from_param_list(parameters, source, refs); + } + + let mut cursor = method_node.walk(); + for child in method_node.children(&mut cursor) { + if child.kind() == "throws" { + let text = get_node_text(child, source) + .trim_start_matches("throws ") + .trim(); + for ty in split_type_candidates(text) { + push_reference(refs, ty, ReferenceKind::TypeUse, node_to_span(child)); + } + } + } +} + +fn push_type_refs_from_text(raw: &str, span: Span, refs: &mut Vec) { + for ty in split_type_candidates(raw) { + push_reference(refs, ty, ReferenceKind::TypeUse, span); + } +} + +fn split_type_candidates(raw: &str) -> Vec { + let mut out = Vec::new(); + for token in + raw.split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '$' || c == '.')) + { + if token.is_empty() || is_java_primitive(token) { + continue; + } + let Some(first) = token.chars().next() else { + continue; + }; + if first.is_ascii_uppercase() || token.contains('.') || token.contains('$') { + let normalized = normalize_type_text(token); + if !normalized.is_empty() && !out.contains(&normalized) { + out.push(normalized); + } + } + } + out +} + +fn is_java_primitive(token: &str) -> bool { + matches!( + token, + "byte" | "short" | "int" | "long" | "float" | "double" | "char" | "boolean" | "void" + ) +} + +fn 
normalize_type_text(raw: &str) -> String { + raw.split_whitespace().collect::() +} + +fn parse_call_name(raw: &str) -> String { + let head = raw.split('(').next().unwrap_or("").trim(); + head.rsplit('.').next().unwrap_or(head).trim().to_string() +} + +fn parse_new_target(raw: &str) -> String { + let tail = raw.trim_start_matches("new ").trim(); + tail.split('(').next().unwrap_or("").trim().to_string() +} + +fn parse_access_name(raw: &str) -> String { + raw.rsplit('.').next().unwrap_or(raw).trim().to_string() +} + +fn push_reference( + refs: &mut Vec, + name: impl Into, + kind: ReferenceKind, + span: Span, +) { + let name = name.into(); + if name.is_empty() { + return; + } + if refs.iter().any(|r| r.kind == kind && r.name == name) { + return; + } + refs.push(RawReference { name, kind, span }); +} + +fn sanitize_qname_leaf(raw: &str) -> String { + raw.chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || matches!(ch, '_' | '$' | '.' | '-') { + ch + } else { + '_' + } + }) + .collect() +} diff --git a/src/semantic/analyzer.rs b/src/semantic/analyzer.rs index 7187c83..b3328f2 100644 --- a/src/semantic/analyzer.rs +++ b/src/semantic/analyzer.rs @@ -63,11 +63,29 @@ impl SemanticAnalyzer { raw_units: Vec, options: &AnalyzeOptions, ) -> AcbResult { + self.analyze_with_progress(raw_units, options, |_step, _total_steps| {}) + } + + /// Analyze with coarse-grained phase progress callbacks. 
+ pub fn analyze_with_progress( + &self, + raw_units: Vec, + options: &AnalyzeOptions, + mut on_progress: F, + ) -> AcbResult + where + F: FnMut(usize, usize), + { + const TOTAL_STEPS: usize = 6; + on_progress(0, TOTAL_STEPS); + // Phase 1: Build symbol table let symbol_table = SymbolTable::build(&raw_units)?; + on_progress(1, TOTAL_STEPS); // Phase 2: Resolve references let resolved = self.resolver.resolve_all(&raw_units, &symbol_table)?; + on_progress(2, TOTAL_STEPS); // Phase 3: Trace FFI boundaries let ffi_edges = if options.trace_ffi { @@ -75,6 +93,7 @@ impl SemanticAnalyzer { } else { Vec::new() }; + on_progress(3, TOTAL_STEPS); // Phase 4: Detect patterns let patterns = if options.detect_patterns { @@ -82,6 +101,7 @@ impl SemanticAnalyzer { } else { Vec::new() }; + on_progress(4, TOTAL_STEPS); // Phase 5: Extract concepts let concepts = if options.extract_concepts { @@ -89,9 +109,12 @@ impl SemanticAnalyzer { } else { Vec::new() }; + on_progress(5, TOTAL_STEPS); // Phase 6: Build final graph - self.build_graph(resolved, ffi_edges, patterns, concepts) + let graph = self.build_graph(resolved, ffi_edges, patterns, concepts)?; + on_progress(6, TOTAL_STEPS); + Ok(graph) } /// Build the CodeGraph from resolved units and analysis results. diff --git a/src/semantic/concept_extractor.rs b/src/semantic/concept_extractor.rs index e1a817d..c378c5f 100644 --- a/src/semantic/concept_extractor.rs +++ b/src/semantic/concept_extractor.rs @@ -250,43 +250,66 @@ impl ConceptExtractor { /// Extract concepts from the resolved units. pub fn extract(&self, units: &[ResolvedUnit]) -> AcbResult> { - let mut extracted = Vec::new(); + let mut concept_units_by_idx: Vec> = + (0..self.concepts.len()).map(|_| Vec::new()).collect(); + + // Precompute normalized text once per unit to avoid repeated lowercasing + // for each concept definition. 
+ for unit in units { + let name_lower = unit.unit.name.to_lowercase(); + let qname_lower = unit.unit.qualified_name.to_lowercase(); + let doc_lower = unit.unit.doc.as_ref().map(|d| d.to_lowercase()); + let role = self.determine_role(unit); - for concept_def in &self.concepts { - let mut concept_units = Vec::new(); + for (idx, concept_def) in self.concepts.iter().enumerate() { + let score = self.score_unit_normalized( + &name_lower, + &qname_lower, + doc_lower.as_deref(), + unit.unit.unit_type, + concept_def, + ); - for unit in units { - let score = self.score_unit(unit, concept_def); if score > 0.3 { - concept_units.push(ConceptUnit { + concept_units_by_idx[idx].push(ConceptUnit { unit_id: unit.unit.temp_id, - role: self.determine_role(unit), + role, score, }); } } + } - if !concept_units.is_empty() { - let avg_score = - concept_units.iter().map(|u| u.score).sum::() / concept_units.len() as f32; - - extracted.push(ExtractedConcept { - name: concept_def.name.clone(), - units: concept_units, - confidence: avg_score, - }); + let mut extracted = Vec::new(); + for (idx, concept_def) in self.concepts.iter().enumerate() { + let concept_units = std::mem::take(&mut concept_units_by_idx[idx]); + if concept_units.is_empty() { + continue; } + + let avg_score = + concept_units.iter().map(|u| u.score).sum::() / concept_units.len() as f32; + + extracted.push(ExtractedConcept { + name: concept_def.name.clone(), + units: concept_units, + confidence: avg_score, + }); } Ok(extracted) } - fn score_unit(&self, unit: &ResolvedUnit, concept: &ConceptDefinition) -> f32 { + fn score_unit_normalized( + &self, + name_lower: &str, + qname_lower: &str, + doc_lower: Option<&str>, + unit_type: CodeUnitType, + concept: &ConceptDefinition, + ) -> f32 { let mut score = 0.0f32; - let name_lower = unit.unit.name.to_lowercase(); - let qname_lower = unit.unit.qualified_name.to_lowercase(); - // Keyword matching in name for keyword in &concept.keywords { if name_lower.contains(keyword.as_str()) { @@ 
-297,8 +320,7 @@ impl ConceptExtractor { } // Doc matching - if let Some(ref doc) = unit.unit.doc { - let doc_lower = doc.to_lowercase(); + if let Some(doc_lower) = doc_lower { for keyword in &concept.keywords { if doc_lower.contains(keyword.as_str()) { score += 0.15; @@ -307,7 +329,7 @@ impl ConceptExtractor { } // Type bonus - if concept.typical_types.contains(&unit.unit.unit_type) { + if concept.typical_types.contains(&unit_type) { score += 0.1; } diff --git a/src/semantic/pattern_detector.rs b/src/semantic/pattern_detector.rs index a6b3f9f..4e42c9a 100644 --- a/src/semantic/pattern_detector.rs +++ b/src/semantic/pattern_detector.rs @@ -3,6 +3,8 @@ //! Detects common design patterns in code: Singleton, Factory, Repository, //! Decorator, Observer, Strategy patterns. +use std::collections::{HashMap, HashSet}; + use crate::types::{AcbResult, CodeUnitType, Visibility}; use super::resolver::ResolvedUnit; @@ -69,46 +71,48 @@ struct SingletonMatcher; impl PatternMatcher for SingletonMatcher { fn detect(&self, units: &[ResolvedUnit]) -> Vec { let mut instances = Vec::new(); + let (functions_by_owner, functions_by_file) = build_function_indexes(units); for unit in units { if unit.unit.unit_type != CodeUnitType::Type { continue; } - // Look for singleton indicators by checking sibling methods - let type_name = &unit.unit.name; - let type_name_lower = type_name.to_lowercase(); + let type_name_lower = unit.unit.name.to_lowercase(); + let candidate_methods = + gather_candidate_methods(unit, &functions_by_owner, &functions_by_file); let mut has_instance_method = false; let mut has_private_constructor = false; let mut participants = vec![unit.unit.temp_id]; + let mut seen = HashSet::from([unit.unit.temp_id]); - for other in units { - if other.unit.unit_type != CodeUnitType::Function { + for other in candidate_methods { + if !method_belongs_to_type(other, &type_name_lower) { continue; } - let other_qname_lower = other.unit.qualified_name.to_lowercase(); let other_name_lower = 
other.unit.name.to_lowercase(); - // Check if this is a method of the type - if other_qname_lower.contains(&type_name_lower) { - // Check for get_instance, instance, or shared patterns - if other_name_lower.contains("instance") - || other_name_lower.contains("shared") - || other_name_lower == "default" - { - has_instance_method = true; + // Check for get_instance, instance, or shared patterns + if other_name_lower.contains("instance") + || other_name_lower.contains("shared") + || other_name_lower == "default" + { + has_instance_method = true; + if seen.insert(other.unit.temp_id) { participants.push(other.unit.temp_id); } + } - // Check for private constructors - if (other_name_lower == "__init__" - || other_name_lower == "new" - || other_name_lower == "constructor") - && other.unit.visibility == Visibility::Private - { - has_private_constructor = true; + // Check for private constructors + if (other_name_lower == "__init__" + || other_name_lower == "new" + || other_name_lower == "constructor") + && other.unit.visibility == Visibility::Private + { + has_private_constructor = true; + if seen.insert(other.unit.temp_id) { participants.push(other.unit.temp_id); } } @@ -176,6 +180,7 @@ struct RepositoryMatcher; impl PatternMatcher for RepositoryMatcher { fn detect(&self, units: &[ResolvedUnit]) -> Vec { let mut instances = Vec::new(); + let (functions_by_owner, functions_by_file) = build_function_indexes(units); for unit in units { if unit.unit.unit_type != CodeUnitType::Type { @@ -189,27 +194,25 @@ impl PatternMatcher for RepositoryMatcher { || name_lower.contains("dao") || name_lower.contains("store") { - // Look for CRUD methods + // Look for CRUD methods among likely methods for this type. 
+ let methods = + gather_candidate_methods(unit, &functions_by_owner, &functions_by_file); + let mut crud_count = 0; - for other in units { - if other.unit.unit_type == CodeUnitType::Function { - let method_lower = other.unit.name.to_lowercase(); - let in_type = other - .unit - .qualified_name - .to_lowercase() - .contains(&name_lower); - if in_type - && (method_lower.starts_with("get") - || method_lower.starts_with("find") - || method_lower.starts_with("create") - || method_lower.starts_with("update") - || method_lower.starts_with("delete") - || method_lower.starts_with("save") - || method_lower.starts_with("list")) - { - crud_count += 1; - } + for other in methods { + if !method_belongs_to_type(other, &name_lower) { + continue; + } + + let method_lower = other.unit.name.to_lowercase(); + if method_lower.starts_with("get") + || method_lower.starts_with("find") + || method_lower.starts_with("create") + || method_lower.starts_with("update") + || method_lower.starts_with("delete") + || method_lower.starts_with("save") + || method_lower.starts_with("list") + { + crud_count += 1; } } @@ -264,3 +267,81 @@ impl PatternMatcher for DecoratorMatcher { instances } } + +fn build_function_indexes<'a>( + units: &'a [ResolvedUnit], +) -> ( + HashMap<String, Vec<&'a ResolvedUnit>>, + HashMap<String, Vec<&'a ResolvedUnit>>, +) { + let mut by_owner: HashMap<String, Vec<&'a ResolvedUnit>> = HashMap::new(); + let mut by_file: HashMap<String, Vec<&'a ResolvedUnit>> = HashMap::new(); + + for unit in units { + if unit.unit.unit_type != CodeUnitType::Function { + continue; + } + + let file_key = unit.unit.file_path.to_string_lossy().to_string(); + by_file.entry(file_key).or_default().push(unit); + + if let Some(owner) = infer_owner_type_name(&unit.unit.qualified_name, &unit.unit.name) { + by_owner.entry(owner).or_default().push(unit); + } + } + + (by_owner, by_file) +} + +fn gather_candidate_methods<'a>( + type_unit: &ResolvedUnit, + functions_by_owner: &HashMap<String, Vec<&'a ResolvedUnit>>, + functions_by_file: &HashMap<String, Vec<&'a ResolvedUnit>>, +) -> Vec<&'a ResolvedUnit> { + let type_name_lower = type_unit.unit.name.to_lowercase(); + if let Some(methods)
= functions_by_owner.get(&type_name_lower) { + return methods.clone(); + } + + let file_key = type_unit.unit.file_path.to_string_lossy().to_string(); + functions_by_file + .get(&file_key) + .cloned() + .unwrap_or_default() +} + +fn method_belongs_to_type(method: &ResolvedUnit, type_name_lower: &str) -> bool { + if let Some(owner) = infer_owner_type_name(&method.unit.qualified_name, &method.unit.name) { + if owner == type_name_lower { + return true; + } + } + + method + .unit + .qualified_name + .to_lowercase() + .contains(type_name_lower) +} + +fn infer_owner_type_name(qname: &str, function_name: &str) -> Option<String> { + let segments: Vec<&str> = qname + .split(|c| c == '.' || c == ':' || c == '/' || c == '$') + .filter(|segment| !segment.is_empty()) + .collect(); + + if segments.len() < 2 { + return None; + } + + let fn_segment_idx = segments.iter().rposition(|segment| { + let clean = segment.split('(').next().unwrap_or(segment); + clean == function_name || clean.starts_with(function_name) + })?; + + if fn_segment_idx == 0 { + return None; + } + + Some(segments[fn_segment_idx - 1].to_lowercase()) +} diff --git a/src/semantic/resolver.rs b/src/semantic/resolver.rs index 2d43aa8..be43d95 100644 --- a/src/semantic/resolver.rs +++ b/src/semantic/resolver.rs @@ -163,10 +163,18 @@ impl Resolver { units: &[RawCodeUnit], symbol_table: &SymbolTable, ) -> AcbResult<Vec<ResolvedUnit>> { + let mut file_imports: HashMap<String, Vec<&RawCodeUnit>> = HashMap::new(); + for unit in units { + if unit.unit_type == CodeUnitType::Import { + let file_key = unit.file_path.to_string_lossy().to_string(); + file_imports.entry(file_key).or_default().push(unit); + } + } + let mut resolved = Vec::with_capacity(units.len()); for unit in units { - let resolved_refs = self.resolve_unit_references(unit, units, symbol_table)?; + let resolved_refs = self.resolve_unit_references(unit, symbol_table, &file_imports)?; resolved.push(ResolvedUnit { unit: unit.clone(), resolved_refs, @@ -179,13 +187,13 @@ impl Resolver { fn resolve_unit_references(
&self, unit: &RawCodeUnit, - all_units: &[RawCodeUnit], symbol_table: &SymbolTable, + file_imports: &HashMap<String, Vec<&RawCodeUnit>>, ) -> AcbResult<Vec<ResolvedReference>> { let mut resolved = Vec::new(); for raw_ref in &unit.references { - let resolution = self.resolve_reference(raw_ref, unit, all_units, symbol_table); + let resolution = self.resolve_reference(raw_ref, unit, symbol_table, file_imports); resolved.push(ResolvedReference { raw: raw_ref.clone(), resolution, @@ -199,8 +207,8 @@ impl Resolver { &self, raw_ref: &RawReference, unit: &RawCodeUnit, - all_units: &[RawCodeUnit], symbol_table: &SymbolTable, + file_imports: &HashMap<String, Vec<&RawCodeUnit>>, ) -> Resolution { // Strategy 1: Try exact qualified name match if let Some(target_id) = symbol_table.lookup_qualified(&raw_ref.name) { @@ -210,13 +218,12 @@ // Strategy 2: Try local resolution (same file, then by simple name) - if let Some(local_id) = self.resolve_local(&raw_ref.name, unit, all_units, symbol_table) { + if let Some(local_id) = self.resolve_local(raw_ref, unit, symbol_table) { return Resolution::Local(local_id); } // Strategy 3: Try imported symbol resolution - if let Some(imported) = self.resolve_imported(&raw_ref.name, unit, all_units, symbol_table) - { + if let Some(imported) = self.resolve_imported(raw_ref, unit, file_imports) { return Resolution::Imported(imported); } @@ -230,11 +237,11 @@ fn resolve_local( &self, - name: &str, + raw_ref: &RawReference, unit: &RawCodeUnit, - _all_units: &[RawCodeUnit], symbol_table: &SymbolTable, ) -> Option<u64> { + let name = raw_ref.name.as_str(); let file_key = unit.file_path.to_string_lossy().to_string(); let file_units = symbol_table.units_in_file(&file_key); @@ -247,7 +254,14 @@ // Match on the simple name part of the qname let simple = qname.rsplit('.').next().unwrap_or(qname); let simple2 = qname.rsplit("::").next().unwrap_or(qname); - if simple == name || simple2 == name || qname == name { + let simple_overload = strip_overload_suffix(simple); + let simple2_overload
= strip_overload_suffix(simple2); + if simple == name + || simple2 == name + || qname == name + || simple_overload == name + || simple2_overload == name + { return Some(id); } } @@ -260,30 +274,22 @@ fn resolve_imported( &self, - name: &str, + raw_ref: &RawReference, unit: &RawCodeUnit, - all_units: &[RawCodeUnit], - symbol_table: &SymbolTable, + file_imports: &HashMap<String, Vec<&RawCodeUnit>>, ) -> Option<ImportedSymbol> { - // Check if any import in the same file matches this name + let name = raw_ref.name.as_str(); + // Check imports declared in the same file only. let file_key = unit.file_path.to_string_lossy().to_string(); - let file_unit_ids = symbol_table.units_in_file(&file_key); - - for &fid in file_unit_ids { - // Find the unit for this ID - if let Some(file_unit) = all_units.iter().find(|u| u.temp_id == fid) { - if file_unit.unit_type == CodeUnitType::Import { - // Check if this import's name matches the reference - let import_name = &file_unit.name; - if import_name.contains(name) - || name.contains(import_name.rsplit('/').next().unwrap_or(import_name)) - { - return Some(ImportedSymbol { - unit_id: fid, - import_path: import_name.clone(), - }); - } - } + let imports_in_file = file_imports.get(&file_key)?; + + for file_unit in imports_in_file { + let import_name = &file_unit.name; + if import_matches(unit.language, raw_ref.kind, import_name, name) { + return Some(ImportedSymbol { + unit_id: file_unit.temp_id, + import_path: import_name.clone(), + }); } } @@ -511,6 +517,62 @@ } } +fn strip_overload_suffix(name: &str) -> &str { + name.split('$').next().unwrap_or(name) +} + +fn import_matches(language: Language, kind: ReferenceKind, import_name: &str, name: &str) -> bool { + if import_name == name { + return true; + } + if import_name.ends_with(&format!(".{}", name)) { + return true; + } + if name.contains('.') && name.ends_with(import_name) { + return true; + } + + if language == Language::Java { + return import_matches_java(kind, import_name, name); + } + +
import_name.contains(name) + || name.contains(import_name.rsplit('/').next().unwrap_or(import_name)) +} + +fn import_matches_java(kind: ReferenceKind, import_name: &str, name: &str) -> bool { + let normalized_import = import_name.trim(); + let normalized_name = name.trim(); + + if normalized_import + .rsplit('.') + .next() + .is_some_and(|leaf| leaf == normalized_name) + { + return true; + } + + if let Some(prefix) = normalized_import.strip_suffix(".*") { + if normalized_name.starts_with(prefix) { + return true; + } + if kind != ReferenceKind::Call && !normalized_name.contains('.') { + return true; + } + } + + if kind == ReferenceKind::Call + && normalized_import + .rsplit('.') + .next() + .is_some_and(|leaf| leaf == normalized_name) + { + return true; + } + + false +} + impl Default for Resolver { fn default() -> Self { Self::new() diff --git a/testdata/java/com/example/core/Base.java b/testdata/java/com/example/core/Base.java new file mode 100644 index 0000000..850aeaa --- /dev/null +++ b/testdata/java/com/example/core/Base.java @@ -0,0 +1,6 @@ +package com.example.core; + +public class Base { + public void ping() { + } +} diff --git a/testdata/java/com/example/core/Workable.java b/testdata/java/com/example/core/Workable.java new file mode 100644 index 0000000..30a6514 --- /dev/null +++ b/testdata/java/com/example/core/Workable.java @@ -0,0 +1,5 @@ +package com.example.core; + +public interface Workable { + void process(String item); +} diff --git a/testdata/java/com/example/core/Worker.java b/testdata/java/com/example/core/Worker.java new file mode 100644 index 0000000..a7ccb91 --- /dev/null +++ b/testdata/java/com/example/core/Worker.java @@ -0,0 +1,39 @@ +package com.example.core; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Consumer; +import com.example.shared.Helper; + +public class Worker extends Base implements Workable { + private List<String> names = new ArrayList<>(); + private Helper helper = new Helper(); + + public
Worker() { + super(); + } + + @Override + public void process(String item) { + Helper.log(item); + names.add(item); + + Runnable r = new Runnable() { + @Override + public void run() { + Helper.log("anon"); + } + }; + + Consumer<String> consumer = s -> Helper.log(s); + r.run(); + consumer.accept(item); + } + + public void process(String item, int count) { + for (int i = 0; i < count; i++) { + process(item); + } + } +} + diff --git a/testdata/java/com/example/other/Helper.java b/testdata/java/com/example/other/Helper.java new file mode 100644 index 0000000..b0ca324 --- /dev/null +++ b/testdata/java/com/example/other/Helper.java @@ -0,0 +1,6 @@ +package com.example.other; + +public class Helper { + public void ping() { + } +} diff --git a/testdata/java/com/example/shared/Helper.java b/testdata/java/com/example/shared/Helper.java new file mode 100644 index 0000000..ef02537 --- /dev/null +++ b/testdata/java/com/example/shared/Helper.java @@ -0,0 +1,7 @@ +package com.example.shared; + +public class Helper { + public static void log(String value) { + System.out.println(value); + } +} diff --git a/testdata/java/com/obf/a.java b/testdata/java/com/obf/a.java new file mode 100644 index 0000000..270d132 --- /dev/null +++ b/testdata/java/com/obf/a.java @@ -0,0 +1,15 @@ +package com.obf; + +import java.util.List; + +public class a { + public final List<String> b; + + public a(List<String> list) { + this.b = list; + } + + public void c(String v) { + b.add(v); + } +} diff --git a/tests/phase2_parsing.rs b/tests/phase2_parsing.rs index 6cad379..888007b 100644 --- a/tests/phase2_parsing.rs +++ b/tests/phase2_parsing.rs @@ -1,6 +1,6 @@ //! Phase 2 tests: Multi-language parsing engine. //! -//! Tests the tree-sitter based parsers for Python, Rust, TypeScript, and Go. +//! Tests the tree-sitter based parsers for Python, Rust, TypeScript, Go, and Java.
use std::path::Path; @@ -51,6 +51,7 @@ fn test_parser_new() { assert!(parser.should_parse(Path::new("foo.ts"))); assert!(parser.should_parse(Path::new("foo.js"))); assert!(parser.should_parse(Path::new("foo.go"))); + assert!(parser.should_parse(Path::new("foo.java"))); assert!(!parser.should_parse(Path::new("foo.txt"))); assert!(!parser.should_parse(Path::new("foo.c"))); } @@ -678,6 +679,233 @@ fn test_go_benchmark_detection() { assert_eq!(bench.unwrap().unit_type, CodeUnitType::Test); } +// ============================================================ +// Java parsing +// ============================================================ + +#[test] +fn test_java_parse_worker_module() { + let units = parse_test_file("java/com/example/core/Worker.java"); + assert!(!units.is_empty()); + + let modules = find_units_by_type(&units, CodeUnitType::Module); + assert_eq!(modules.len(), 1); + assert_eq!(modules[0].language, Language::Java); +} + +#[test] +fn test_java_extracts_types_and_signatures() { + let units = parse_test_file("java/com/example/core/Worker.java"); + + let worker = find_unit_by_name(&units, "Worker").expect("Worker not found"); + assert_eq!(worker.unit_type, CodeUnitType::Type); + assert!( + worker.qualified_name.starts_with("com.example.core.Worker"), + "Worker qname should be package-rooted, got {}", + worker.qualified_name + ); + + let process_methods: Vec<_> = units.iter().filter(|u| u.name == "process").collect(); + assert!( + process_methods.len() >= 2, + "Expected overloaded process methods" + ); + let qnames: std::collections::HashSet<_> = process_methods + .iter() + .map(|u| u.qualified_name.as_str()) + .collect(); + assert_eq!( + qnames.len(), + process_methods.len(), + "Overloaded methods should have unique qnames" + ); +} + +#[test] +fn test_java_module_and_type_qnames_are_distinct() { + let units = parse_test_file("java/com/example/core/Worker.java"); + + let module = units + .iter() + .find(|u| u.name == "Worker" && u.unit_type == 
CodeUnitType::Module) + .expect("Worker module not found"); + let ty = units + .iter() + .find(|u| u.name == "Worker" && u.unit_type == CodeUnitType::Type) + .expect("Worker type not found"); + + assert_ne!( + module.qualified_name, ty.qualified_name, + "Module and top-level type should have distinct qnames" + ); +} + +#[test] +fn test_java_extracts_reference_kinds() { + let units = parse_test_file("java/com/example/core/Worker.java"); + + let worker = find_unit_by_name(&units, "Worker").expect("Worker not found"); + assert!( + worker + .references + .iter() + .any(|r| r.kind == agentic_codebase::parse::ReferenceKind::Inherit), + "Worker should contain inheritance refs" + ); + assert!( + worker + .references + .iter() + .any(|r| r.kind == agentic_codebase::parse::ReferenceKind::Implement), + "Worker should contain interface refs" + ); + assert!( + worker + .references + .iter() + .any(|r| r.kind == agentic_codebase::parse::ReferenceKind::TypeUse), + "Worker should contain type-use refs" + ); + + let process = units + .iter() + .find(|u| { + u.name == "process" + && u.signature + .as_deref() + .is_some_and(|s| s.contains("String)")) + }) + .expect("single-arg process not found"); + assert!( + process + .references + .iter() + .any(|r| r.kind == agentic_codebase::parse::ReferenceKind::Call), + "process should contain call refs" + ); +} + +#[test] +fn test_java_import_units_have_import_refs() { + let units = parse_test_file("java/com/example/core/Worker.java"); + let imports = find_units_by_type(&units, CodeUnitType::Import); + assert!(imports.len() >= 3, "Expected import units for Worker.java"); + assert!(imports.iter().all(|u| { + u.references + .iter() + .any(|r| r.kind == agentic_codebase::parse::ReferenceKind::Import) + })); +} + +#[test] +fn test_java_synthetic_lambda_and_anonymous_nodes() { + let units = parse_test_file("java/com/example/core/Worker.java"); + assert!( + units.iter().any(|u| u.name.starts_with("lambda$")), + "Expected synthetic lambda node" + ); 
+ assert!( + units.iter().any(|u| u.name.starts_with("anonymous$")), + "Expected synthetic anonymous class node" + ); +} + +#[test] +fn test_java_test_file_detection() { + let parser = agentic_codebase::parse::java::JavaParser::new(); + assert!(agentic_codebase::parse::LanguageParser::is_test_file( + &parser, + Path::new("WorkerTest.java"), + "" + )); + assert!(!agentic_codebase::parse::LanguageParser::is_test_file( + &parser, + Path::new("Worker.java"), + "" + )); +} + +#[test] +fn test_java_qname_collision_regression() { + let parser = Parser::new(); + let root = testdata_path("java"); + let opts = ParseOptions { + languages: vec![Language::Java], + ..Default::default() + }; + let result = parser + .parse_directory(&root, &opts) + .expect("parse_directory failed"); + + let helpers: Vec<_> = result + .units + .iter() + .filter(|u| u.name == "Helper" && u.unit_type == CodeUnitType::Type) + .collect(); + assert!( + helpers.len() >= 2, + "Expected Helper classes in separate packages" + ); + + let helper_qnames: std::collections::HashSet<_> = + helpers.iter().map(|u| u.qualified_name.as_str()).collect(); + assert_eq!( + helper_qnames.len(), + helpers.len(), + "Helper qnames should be unique across packages" + ); +} + +#[test] +fn test_java_deep_nesting_generated_source() { + let parser = Parser::new(); + let depth = 4000usize; + + let mut source = String::from("package stress; public class Deep { public void run() {"); + for _ in 0..depth { + source.push_str("if (true) {"); + } + for _ in 0..depth { + source.push('}'); + } + source.push_str(" } }"); + + let units = parser + .parse_file(Path::new("Deep.java"), &source) + .expect("deep nesting Java parse failed"); + + assert!(!units.is_empty(), "Deep.java should produce units"); + assert!( + units.iter().any(|u| u.name == "run"), + "Expected run() method in deep nesting source" + ); +} + +#[test] +fn test_java_large_body_generated_source() { + let parser = Parser::new(); + let statements = 15000usize; + + let mut source 
= + String::from("package stress; public class Large { public void run() { String s = \"x\";"); + for _ in 0..statements { + source.push_str("helper(); s.length();"); + } + source.push_str(" } private void helper() {} }"); + + let units = parser + .parse_file(Path::new("Large.java"), &source) + .expect("large-body Java parse failed"); + + let run = units + .iter() + .find(|u| u.name == "run") + .expect("run() not found in Large.java"); + assert!( + !run.references.is_empty(), + "run() should contain extracted references in large body" + ); +} // ============================================================ // Cross-language tests // ============================================================ @@ -689,6 +917,7 @@ fn test_all_units_have_spans() { "rust/simple_lib.rs", "typescript/simple_module.ts", "go/simple_module.go", + "java/com/example/core/Worker.java", ] { let units = parse_test_file(file); for unit in &units { @@ -710,6 +939,7 @@ fn test_all_units_have_qualified_names() { "rust/simple_lib.rs", "typescript/simple_module.ts", "go/simple_module.go", + "java/com/example/core/Worker.java", ] { let units = parse_test_file(file); for unit in &units { @@ -756,6 +986,15 @@ fn test_all_units_have_correct_language() { for u in &go_units { assert_eq!(u.language, Language::Go, "Go units should be Language::Go"); } + + let java_units = parse_test_file("java/com/example/core/Worker.java"); + for u in &java_units { + assert_eq!( + u.language, + Language::Java, + "Java units should be Language::Java" + ); + } } #[test] @@ -765,6 +1004,7 @@ fn test_unique_temp_ids_per_file() { "rust/simple_lib.rs", "typescript/simple_module.ts", "go/simple_module.go", + "java/com/example/core/Worker.java", ] { let units = parse_test_file(file); let mut ids: Vec<u64> = units.iter().map(|u| u.temp_id).collect(); @@ -916,6 +1156,16 @@ fn test_parse_malformed_rust() { } } +#[test] +fn test_parse_malformed_java() { + let parser = Parser::new(); + let source = "class Broken { void run( { int x = 1; }"; +
let result = parser.parse_file(Path::new("broken.java"), source); + if let Ok(units) = result { + assert!(!units.is_empty()); + } +} + #[test] fn test_parse_unicode_identifiers() { let parser = Parser::new(); diff --git a/tests/phase3_semantic.rs b/tests/phase3_semantic.rs index ac1d1b4..8a71f6a 100644 --- a/tests/phase3_semantic.rs +++ b/tests/phase3_semantic.rs @@ -765,6 +765,91 @@ fn test_full_analysis_go() { assert!(graph.edge_count() > 0); } +#[test] +fn test_full_analysis_java() { + let units = parse_test_file("java/com/example/core/Worker.java"); + let analyzer = SemanticAnalyzer::new(); + let graph = analyzer + .analyze(units, &AnalyzeOptions::default()) + .expect("analysis failed"); + + assert!(graph.unit_count() > 0); +} + +#[test] +fn test_full_analysis_java_edge_categories() { + let parser = Parser::new(); + let testdata = testdata_path("java"); + let opts = agentic_codebase::parse::ParseOptions { + languages: vec![Language::Java], + ..Default::default() + }; + let parsed = parser + .parse_directory(&testdata, &opts) + .expect("parse failed"); + + let analyzer = SemanticAnalyzer::new(); + let graph = analyzer + .analyze(parsed.units, &AnalyzeOptions::default()) + .expect("analysis failed"); + + let edge_count = |edge_type| { + graph + .edges() + .iter() + .filter(|e| e.edge_type == edge_type) + .count() + }; + + assert!( + edge_count(EdgeType::Contains) > 0, + "Expected contains edges" + ); + assert!(edge_count(EdgeType::Calls) > 0, "Expected call edges"); + assert!(edge_count(EdgeType::Imports) > 0, "Expected import edges"); + assert!( + edge_count(EdgeType::Inherits) > 0, + "Expected inherits edges" + ); + assert!( + edge_count(EdgeType::Implements) > 0, + "Expected implements edges" + ); + assert!( + edge_count(EdgeType::UsesType) > 0, + "Expected type-use edges" + ); +} + +#[test] +fn test_full_analysis_java_overloads_keep_unique_qnames() { + let parser = Parser::new(); + let path = testdata_path("java/com/example/core/Worker.java"); + let content = 
std::fs::read_to_string(&path).expect("Could not read test file"); + let units = parser.parse_file(&path, &content).expect("Parse failed"); + + let analyzer = SemanticAnalyzer::new(); + let graph = analyzer + .analyze(units, &AnalyzeOptions::default()) + .expect("analysis failed"); + + let processes: Vec<_> = (0..graph.unit_count() as u64) + .filter_map(|id| graph.get_unit(id)) + .filter(|u| u.language == Language::Java && u.name == "process") + .collect(); + assert!(processes.len() >= 2, "Expected overloaded process units"); + + let unique: std::collections::HashSet<_> = processes + .iter() + .map(|u| u.qualified_name.as_str()) + .collect(); + assert_eq!( + unique.len(), + processes.len(), + "Overloaded Java methods should have unique qnames" + ); +} + #[test] fn test_full_analysis_containment_edges() { let units = parse_test_file("python/simple_module.py");