diff --git a/agent-brain-cli/agent_brain_cli/commands/types.py b/agent-brain-cli/agent_brain_cli/commands/types.py index 309d1fd..9ecc5b1 100644 --- a/agent-brain-cli/agent_brain_cli/commands/types.py +++ b/agent-brain-cli/agent_brain_cli/commands/types.py @@ -26,6 +26,7 @@ "csharp": ["*.cs"], "c": ["*.c", "*.h"], "cpp": ["*.cpp", "*.hpp", "*.cc", "*.hh"], + "pascal": ["*.pas", "*.pp", "*.lpr", "*.dpr"], "web": ["*.html", "*.css", "*.scss", "*.jsx", "*.tsx"], "docs": ["*.md", "*.txt", "*.rst", "*.pdf"], "text": ["*.md", "*.txt", "*.rst"], @@ -50,6 +51,10 @@ "*.hpp", "*.cc", "*.hh", + "*.pas", + "*.pp", + "*.lpr", + "*.dpr", ], } diff --git a/agent-brain-cli/tests/test_types_cli.py b/agent-brain-cli/tests/test_types_cli.py index 4568d8d..e94a884 100644 --- a/agent-brain-cli/tests/test_types_cli.py +++ b/agent-brain-cli/tests/test_types_cli.py @@ -118,6 +118,7 @@ def test_all_expected_presets_exist(self) -> None: "csharp", "c", "cpp", + "pascal", "web", "docs", "text", @@ -136,6 +137,13 @@ def test_docs_preset_patterns(self) -> None: assert "*.md" in FILE_TYPE_PRESETS["docs"] assert "*.pdf" in FILE_TYPE_PRESETS["docs"] + def test_pascal_preset_patterns(self) -> None: + """Test Pascal preset includes all four extensions.""" + assert "*.pas" in FILE_TYPE_PRESETS["pascal"] + assert "*.pp" in FILE_TYPE_PRESETS["pascal"] + assert "*.lpr" in FILE_TYPE_PRESETS["pascal"] + assert "*.dpr" in FILE_TYPE_PRESETS["pascal"] + def test_code_preset_is_superset(self) -> None: """Test that 'code' preset contains all language patterns.""" code_patterns = set(FILE_TYPE_PRESETS["code"]) diff --git a/agent-brain-server/agent_brain_server/indexing/chunking.py b/agent-brain-server/agent_brain_server/indexing/chunking.py index b31cabe..8de9927 100644 --- a/agent-brain-server/agent_brain_server/indexing/chunking.py +++ b/agent-brain-server/agent_brain_server/indexing/chunking.py @@ -457,6 +457,7 @@ def _setup_language(self) -> None: "cpp": "cpp", "c": "c", "csharp": "csharp", + "pascal": "pascal", } lang_id = lang_map.get(self.language) @@ -486,7 +487,7 @@ def _get_symbols(self, text: str) -> list[dict[str, Any]]: logger.error(f"Failed to parse AST: {e}") return [] - symbols = [] + symbols: list[dict[str, Any]] = [] # Define queries for common languages query_str = "" @@ -557,6 +558,15 @@ def _get_symbols(self, text: str) -> list[dict[str, Any]]: (namespace_declaration name: (identifier) @name) @symbol """ + elif self.language == "pascal": + # Pascal: use manual AST walking because qualified method names + # (TClass.Method via genericDot) can't be unambiguously captured + # with a single tree-sitter query capture. + try: + self._collect_pascal_symbols(root, symbols) + except Exception as e: + logger.error(f"Error querying AST for {self.language}: {e}") + return symbols if not query_str: return [] @@ -640,6 +650,79 @@ def _extract_xml_doc_comment(self, text: str, declaration_line: int) -> str | No return plain_text if plain_text else None + def _collect_pascal_symbols( + self, + node: tree_sitter.Node, + symbols: list[dict[str, Any]], + ) -> None: + """Recursively walk a Pascal AST and collect procedure/function/type symbols. + + Args: + node: Current tree-sitter AST node. + symbols: Accumulator list to append symbol dicts into. + """ + if node.type == "defProc": + # Procedure or function implementation body. + for child in node.children: + if child.type == "declProc": + name = self._pascal_proc_name(child) + if name: + symbols.append( + { + "name": name, + "kind": "defProc", + "start_line": node.start_point[0] + 1, + "end_line": node.end_point[0] + 1, + } + ) + break + elif node.type == "declType": + # Type declaration (class, record, enum, etc.). + for child in node.children: + if child.type == "identifier": + raw = child.text + if raw: + symbols.append( + { + "name": raw.decode("utf-8"), + "kind": "declType", + "start_line": node.start_point[0] + 1, + "end_line": node.end_point[0] + 1, + } + ) + break + + for child in node.children: + self._collect_pascal_symbols(child, symbols) + + def _pascal_proc_name(self, decl_proc: tree_sitter.Node) -> str | None: + """Extract the bare name from a Pascal ``declProc`` AST node. + + Handles two forms: + - Simple: ``procedure Foo;`` → ``identifier`` is a direct child. + - Qualified: ``procedure TClass.Method;`` → name lives inside + ``genericDot``; the last ``identifier`` child is the method name. + + Args: + decl_proc: The ``declProc`` node to inspect. + + Returns: + The extracted name string, or ``None`` if no identifier was found. + """ + for child in decl_proc.children: + if child.type == "identifier": + raw = child.text + return raw.decode("utf-8") if raw else None + if child.type == "genericDot": + # Walk children of genericDot and keep the last identifier. + last_ident: tree_sitter.Node | None = None + for sub in child.children: + if sub.type == "identifier": + last_ident = sub + if last_ident is not None and last_ident.text: + return last_ident.text.decode("utf-8") + return None + def count_tokens(self, text: str) -> int: """Count the number of tokens in a text string.""" return len(self.tokenizer.encode(text)) diff --git a/agent-brain-server/agent_brain_server/indexing/document_loader.py b/agent-brain-server/agent_brain_server/indexing/document_loader.py index e0a5730..87d486e 100644 --- a/agent-brain-server/agent_brain_server/indexing/document_loader.py +++ b/agent-brain-server/agent_brain_server/indexing/document_loader.py @@ -37,8 +37,10 @@ class LanguageDetector: """ Utility for detecting programming languages from file paths and content. - Supports the 10 languages with tree-sitter parsers: - - Python, TypeScript, JavaScript, Kotlin, C, C++, Java, Go, Rust, Swift + Supports languages with tree-sitter parsers (AST-aware chunking): + - Python, TypeScript, JavaScript, Go, Rust, Java, C, C++, C#, Pascal + + Also detects Kotlin and Swift by extension/content (no AST chunking). """ # Language detection by file extension @@ -76,6 +78,11 @@ class LanguageDetector: # C# ".cs": "csharp", ".csx": "csharp", + # Object Pascal / Delphi / Free Pascal / Lazarus + ".pas": "pascal", + ".pp": "pascal", + ".lpr": "pascal", + ".dpr": "pascal", } # Language detection by content patterns (fallback) @@ -141,6 +148,17 @@ class LanguageDetector: re.MULTILINE, ), ], + "pascal": [ + re.compile( + r"^\s*(unit|program|library)\s+\w+\s*;", + re.MULTILINE | re.IGNORECASE, + ), + re.compile( + r"^\s*(procedure|function)\s+\w+", + re.MULTILINE | re.IGNORECASE, + ), + re.compile(r"\bbegin\b", re.MULTILINE | re.IGNORECASE), + ], } @classmethod @@ -280,6 +298,10 @@ class DocumentLoader: ".swift", # Swift ".cs", ".csx", # C# + ".pas", + ".pp", + ".lpr", + ".dpr", # Object Pascal / Delphi / Free Pascal } SUPPORTED_EXTENSIONS: set[str] = DOCUMENT_EXTENSIONS | CODE_EXTENSIONS diff --git a/agent-brain-server/agent_brain_server/services/file_type_presets.py b/agent-brain-server/agent_brain_server/services/file_type_presets.py index e850dfb..abddf1b 100644 --- a/agent-brain-server/agent_brain_server/services/file_type_presets.py +++ b/agent-brain-server/agent_brain_server/services/file_type_presets.py @@ -19,6 +19,7 @@ "csharp": ["*.cs"], "c": ["*.c", "*.h"], "cpp": ["*.cpp", "*.hpp", "*.cc", "*.hh"], + "pascal": ["*.pas", "*.pp", "*.lpr", "*.dpr"], "web": ["*.html", "*.css", "*.scss", "*.jsx", "*.tsx"], "docs": ["*.md", "*.txt", "*.rst", "*.pdf"], "text": ["*.md", "*.txt", "*.rst"], @@ -53,6 +54,11 @@ "*.hpp", "*.cc", "*.hh", + # pascal + "*.pas", + "*.pp", + "*.lpr", + "*.dpr", ], } diff --git a/agent-brain-server/tests/fixtures/sample.pas b/agent-brain-server/tests/fixtures/sample.pas new file mode 100644 index 0000000..b40bb65 --- /dev/null +++ b/agent-brain-server/tests/fixtures/sample.pas @@ -0,0 +1,119 @@ +{ Agent Brain - Sample Object Pascal unit for AST chunking tests } + +unit SampleUnit; + +interface + +uses + SysUtils; + +const + MAX_ITEMS = 100; + DEFAULT_COLOR = 'red'; + +type + TDirection = (dNorth, dSouth, dEast, dWest); + + TPoint = record + X: Integer; + Y: Integer; + end; + + TShape = class + private + FColor: string; + FVisible: Boolean; + public + constructor Create(const AColor: string); + destructor Destroy; override; + procedure SetColor(const AColor: string); + function GetColor: string; + function IsVisible: Boolean; + procedure Draw; virtual; + end; + + TCircle = class(TShape) + private + FRadius: Double; + public + constructor Create(const AColor: string; ARadius: Double); + function GetArea: Double; + procedure Draw; override; + end; + +function CalculateDistance(const A, B: TPoint): Double; +procedure PrintPoint(const P: TPoint); + +implementation + +uses + Math; + +{ TShape implementation } + +constructor TShape.Create(const AColor: string); +begin + FColor := AColor; + FVisible := True; +end; + +destructor TShape.Destroy; +begin + inherited Destroy; +end; + +procedure TShape.SetColor(const AColor: string); +begin + FColor := AColor; +end; + +function TShape.GetColor: string; +begin + Result := FColor; +end; + +function TShape.IsVisible: Boolean; +begin + Result := FVisible; +end; + +procedure TShape.Draw; +begin + WriteLn('Drawing shape with color: ', FColor); +end; + +{ TCircle implementation } + +constructor TCircle.Create(const AColor: string; ARadius: Double); +begin + inherited Create(AColor); + FRadius := ARadius; +end; + +function TCircle.GetArea: Double; +begin + Result := Pi * FRadius * FRadius; +end; + +procedure TCircle.Draw; +begin + WriteLn('Drawing circle with radius: ', FRadius:0:2); +end; + +{ Standalone routines } + +function CalculateDistance(const A, B: TPoint): Double; +var + DX, DY: Double; +begin + DX := B.X - A.X; + DY := B.Y - A.Y; + Result := Sqrt(DX * DX + DY * DY); +end; + +procedure PrintPoint(const P: TPoint); +begin + WriteLn(Format('Point(%d, %d)', [P.X, P.Y])); +end; + +end. diff --git a/agent-brain-server/tests/test_file_type_presets.py b/agent-brain-server/tests/test_file_type_presets.py index 396a60c..6c295ce 100644 --- a/agent-brain-server/tests/test_file_type_presets.py +++ b/agent-brain-server/tests/test_file_type_presets.py @@ -104,6 +104,7 @@ def test_code_preset_includes_all_languages(self) -> None: "csharp", "c", "cpp", + "pascal", ] for lang in language_presets: lang_patterns = resolve_file_types([lang]) @@ -175,8 +176,8 @@ def test_c_preset_patterns(self) -> None: result = resolve_file_types(["c"]) assert result == ["*.c", "*.h"] - def test_all_14_presets_exist(self) -> None: - """Test that all 14 expected presets are defined.""" + def test_all_15_presets_exist(self) -> None: + """Test that all 15 expected presets are defined.""" expected_presets = { "python", "javascript", @@ -187,6 +188,7 @@ def test_all_14_presets_exist(self) -> None: "csharp", "c", "cpp", + "pascal", "web", "docs", "code", diff --git a/agent-brain-server/tests/unit/test_chunking.py b/agent-brain-server/tests/unit/test_chunking.py index c665d78..f971b20 100644 --- a/agent-brain-server/tests/unit/test_chunking.py +++ b/agent-brain-server/tests/unit/test_chunking.py @@ -300,3 +300,153 @@ async def test_csharp_code_chunker_chunking() -> None: # Check that symbols are found across chunks symbol_names = [c.metadata.symbol_name for c in chunks if c.metadata.symbol_name] assert "Calculator" in symbol_names or "Add" in symbol_names + + +# --- Pascal Code Chunker Tests --- + + +def test_pascal_code_chunker_initialization() -> None: + """Test CodeChunker can be initialized for pascal.""" + chunker = CodeChunker(language="pascal") + assert chunker.language == "pascal" + assert chunker.ts_language is not None + + +def test_pascal_symbol_extraction_basic() -> None: + """Test symbol extraction from basic Pascal code (standalone routines).""" + chunker = CodeChunker(language="pascal") + code = """program TestProgs; +procedure HelloWorld; +begin + WriteLn('Hello'); +end; +function AddNumbers(A, B: Integer): Integer; +begin + Result := A + B; +end; +begin +end. +""" + symbols = chunker._get_symbols(code) + symbol_names = [s["name"] for s in symbols] + assert "HelloWorld" in symbol_names + assert "AddNumbers" in symbol_names + + +def test_pascal_symbol_extraction_class() -> None: + """Test symbol extraction finds class type declarations and methods.""" + chunker = CodeChunker(language="pascal") + code = """unit TestUnit; +interface +type + TGreeter = class + procedure Greet(const Name: string); + function GetGreeting: string; + end; + TPoint = record + X: Integer; + Y: Integer; + end; +implementation +procedure TGreeter.Greet(const Name: string); +begin WriteLn(Name); end; +function TGreeter.GetGreeting: string; +begin Result := 'hello'; end; +end. +""" + symbols = chunker._get_symbols(code) + symbol_names = [s["name"] for s in symbols] + symbol_kinds = [s["kind"] for s in symbols] + + # Type declarations + assert "TGreeter" in symbol_names + assert "TPoint" in symbol_names + # Method implementations (qualified names resolve to the method part) + assert "Greet" in symbol_names + assert "GetGreeting" in symbol_names + + # Verify kind strings match actual grammar node names + assert "declType" in symbol_kinds + assert "defProc" in symbol_kinds + + +def test_pascal_fixture_file_symbols() -> None: + """Test symbol extraction from the sample.pas fixture file.""" + fixture_path = FIXTURES_DIR / "sample.pas" + if not fixture_path.exists(): + pytest.skip("sample.pas fixture not found") + + code = fixture_path.read_text() + chunker = CodeChunker(language="pascal") + symbols = chunker._get_symbols(code) + symbol_names = [s["name"] for s in symbols] + + # Type declarations from the interface section + assert "TShape" in symbol_names + assert "TCircle" in symbol_names + assert "TPoint" in symbol_names + + # Method implementations from the implementation section + assert "Create" in symbol_names + assert "Draw" in symbol_names + assert "GetColor" in symbol_names + assert "SetColor" in symbol_names + + # Standalone routines + assert "CalculateDistance" in symbol_names + assert "PrintPoint" in symbol_names + + +@pytest.mark.asyncio +async def test_pascal_code_chunker_chunking() -> None: + """Test full chunking pipeline for Pascal code.""" + code = ( + """unit TestUnit; +interface +type + TShape = class + procedure Draw; + function GetColor: string; + end; +implementation +procedure TShape.Draw; +begin + WriteLn('Drawing'); +end; +function TShape.GetColor: string; +begin + Result := 'red'; +end; +{ padding } +""" + + ("{ padding line }\n" * 30) + + """ +procedure StandaloneProc; +begin + WriteLn('standalone'); +end; +end. +""" + ) + doc = LoadedDocument( + text=code, + source="test.pas", + file_name="test.pas", + file_path="test.pas", + file_size=len(code), + metadata={"source_type": "code", "language": "pascal"}, + ) + + chunker = CodeChunker(language="pascal", chunk_lines=5, max_chars=200) + chunks = await chunker.chunk_code_document(doc) + + assert len(chunks) > 0 + + # At least one chunk should have a non-None symbol_name + symbol_names = [c.metadata.symbol_name for c in chunks if c.metadata.symbol_name] + assert len(symbol_names) > 0 + assert ( + "Draw" in symbol_names + or "TShape" in symbol_names + or "StandaloneProc" in symbol_names + ) diff --git a/agent-brain-server/tests/unit/test_document_loader.py b/agent-brain-server/tests/unit/test_document_loader.py index d713295..4e1e371 100644 --- a/agent-brain-server/tests/unit/test_document_loader.py +++ b/agent-brain-server/tests/unit/test_document_loader.py @@ -129,6 +129,100 @@ def test_docx_included_when_available(self) -> None: assert ".docx" not in loader.DOCUMENT_EXTENSIONS +class TestPascalExtensionDetection: + """Tests for Object Pascal file extension detection.""" + + def test_pascal_pas_extension(self) -> None: + """Test .pas extension is detected as pascal.""" + assert LanguageDetector.detect_from_path("Unit1.pas") == "pascal" + + def test_pascal_pp_extension(self) -> None: + """Test .pp extension is detected as pascal.""" + assert LanguageDetector.detect_from_path("module.pp") == "pascal" + + def test_pascal_lpr_extension(self) -> None: + """Test .lpr (Lazarus project) extension is detected as pascal.""" + assert LanguageDetector.detect_from_path("project.lpr") == "pascal" + + def test_pascal_dpr_extension(self) -> None: + """Test .dpr (Delphi project) extension is detected as pascal.""" + assert LanguageDetector.detect_from_path("project.dpr") == "pascal" + + def test_pascal_case_insensitive(self) -> None: + """Test extension detection is case-insensitive.""" + assert LanguageDetector.detect_from_path("Unit1.PAS") == "pascal" + + def test_pascal_nested_path(self) -> None: + """Test detection works with nested file paths.""" + assert LanguageDetector.detect_from_path("src/units/Calculator.pas") == "pascal" + + +class TestPascalIsSupported: + """Tests for Pascal language support check.""" + + def test_pascal_is_supported(self) -> None: + """Test pascal is listed as a supported language.""" + assert LanguageDetector.is_supported_language("pascal") is True + + def test_pascal_in_supported_languages(self) -> None: + """Test pascal appears in get_supported_languages().""" + assert "pascal" in LanguageDetector.get_supported_languages() + + +class TestPascalContentDetection: + """Tests for Pascal content-based language detection.""" + + def test_pascal_unit_pattern(self) -> None: + """Test detection of 'unit' header pattern.""" + content = "unit MyUnit;\ninterface\n" + matches = LanguageDetector.detect_from_content(content) + assert len(matches) > 0 + assert matches[0][0] == "pascal" + + def test_pascal_program_pattern(self) -> None: + """Test detection of 'program' header pattern.""" + content = "program HelloWorld;\nbegin\n WriteLn('Hello');\nend.\n" + matches = LanguageDetector.detect_from_content(content) + lang_names = [m[0] for m in matches] + assert "pascal" in lang_names + + def test_pascal_procedure_function_pattern(self) -> None: + """Test detection of procedure/function keywords.""" + content = ( + "procedure Greet;\nbegin\nend;\n" + "function Add(A, B: Integer): Integer;\nbegin\nend;\n" + ) + matches = LanguageDetector.detect_from_content(content) + lang_names = [m[0] for m in matches] + assert "pascal" in lang_names + + def test_pascal_full_content_detection(self) -> None: + """Test comprehensive Pascal content returns pascal as top match.""" + content = """unit MyUnit; +interface +procedure Hello; +function Add(A, B: Integer): Integer; +implementation +procedure Hello; +begin + WriteLn('Hello'); +end; +function Add(A, B: Integer): Integer; +begin + Result := A + B; +end; +end. +""" + matches = LanguageDetector.detect_from_content(content) + assert len(matches) > 0 + assert matches[0][0] == "pascal" + + def test_pascal_detect_language_with_path(self) -> None: + """Test detect_language prefers path-based detection for .pas files.""" + result = LanguageDetector.detect_language("Module.pas", "some content") + assert result == "pascal" + + class TestDefaultExcludePatterns: """Tests for default exclude patterns."""