diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..b3d33a0 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,98 @@ +# Copilot Agent Instructions for arcflow + +This file provides guidance for GitHub Copilot agents working on the arcflow repository. + +## Commit Style + +When making changes to this repository, use **granular, single-purpose commits**: + +### Guidelines + +- **One commit per logical change** - Each commit should do one thing and do it well +- **Separate refactoring from features** - Don't mix code restructuring with new functionality +- **Clear, descriptive messages** - Explain what the commit does and why +- **Include imports with usage** - Add necessary imports in the same commit where they're used, not as separate commits + +### Examples + +Good commit sequence: +``` +1. Refactor XML injection logic for extensibility +2. Add linked_agents to resolve parameter +3. Add get_creator_bioghist method + (includes import of xml.sax.saxutils.escape used in the method) +4. Integrate bioghist into XML injection +5. Update comment to reflect new behavior +``` + +Bad commit sequences: + +Too dense: +``` +1. Add creator biographical information to EAD XML exports + (combines refactoring, new imports, new methods, and integration) +``` + +Too granular: +``` +1. Import xml.sax.saxutils.escape +2. 
Add get_creator_bioghist method that uses xml.sax.saxutils.escape + (import should have been included in this commit) +``` + +### Commit Message Format + +- **First line**: Clear, concise summary (50-72 characters) +- **Body** (optional): Bullet points explaining the changes +- **Keep it focused**: If you need many bullets, consider splitting into multiple commits + +### Why This Matters + +- Makes code review easier +- Helps understand the progression of changes +- Easier to revert specific changes if needed +- Clear history for future maintainers + +--- + +## XML Content Handling in EAD Pipeline + +When injecting content into EAD XML files, distinguish between plain text and structured XML: + +### Escaping Strategy + +- **Plain text labels** (recordgroup, subgroup): Use `xml_escape()` to escape special characters (`&`, `<`, `>`) + - These are simple strings that may contain characters that break XML syntax + - Example: `xml_escape(rg_label)` → converts `"Group & Co"` to `"Group &amp; Co"` + +- **Structured EAD XML content** (bioghist, scopecontent): Do NOT escape + - Content from ArchivesSpace already contains valid EAD XML markup (`<emph>`, `<title>`, etc.) + - These are legitimate XML nodes that must be preserved + - Escaping would convert them to literal text: `<emph>` → `&lt;emph&gt;` + - Example: Pass through as-is: `f'<p>{subnote["content"]}</p>'` + +### Why This Matters + +The Traject indexing pipeline and ArcLight display rely on proper XML structure: +1. Traject's `.to_html` converts XML nodes to HTML +2. ArcLight's `render_html_tags` processes the HTML for display +3. If XML nodes are escaped (treated as text), they can't be processed and appear as raw markup + +### Pattern for Future Fields + +When adding new EAD fields to the pipeline: +1. Determine if content is plain text or structured XML +2. Apply escaping only to plain text +3. Pass structured XML through unchanged +4. 
Document the decision in code comments + +--- + +## Adding More Instructions + +To add additional instructions to this file: + +1. Add a new section with a clear heading (e.g., `## Testing Strategy`, `## Code Style`) +2. Keep instructions concise and actionable +3. Use examples where helpful +4. Maintain the simple, scannable format diff --git a/arcflow/main.py b/arcflow/main.py index 815903b..0cd5ba1 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -9,6 +9,7 @@ import re import logging from xml.dom.pulldom import parse, START_ELEMENT +from xml.sax.saxutils import escape as xml_escape from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool @@ -205,7 +206,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): resource = self.client.get( f'{repo["uri"]}/resources/{resource_id}', params={ - 'resolve': ['classifications', 'classification_terms'], + 'resolve': ['classifications', 'classification_terms', 'linked_agents'], }).json() xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml' @@ -225,24 +226,41 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): 'ead3': 'false', }) - # add record group and subgroup labels to EAD inside <archdesc level="collection"> + # add custom XML elements to EAD inside <archdesc level="collection"> + # (record group/subgroup labels and biographical/historical notes) if xml.content: - rg_label, sg_label = extract_labels(resource)[1:3] - if rg_label: - xml_content = xml.content.decode('utf-8') - insert_pos = xml_content.find('<archdesc level="collection">') + xml_content = xml.content.decode('utf-8') + insert_pos = xml_content.find('<archdesc level="collection">') + + if insert_pos != -1: + # Find the position after the closing </did> tag + insert_pos = xml_content.find('</did>', insert_pos) + if insert_pos != -1: - # Find the position after the opening tag - insert_pos = xml_content.find('</did>', insert_pos) 
def get_creator_bioghist(self, resource, indent_size=0):
    """
    Build <bioghist> XML for the creator agents linked to a resource.

    Every entry of ``resource['linked_agents']`` whose role is ``'creator'``
    is fetched through ``self.client``. For each ``note_bioghist`` note on
    the agent, one ``<bioghist id="aspace_<id>">`` element is produced,
    holding a ``<head>`` with the agent name followed by one ``<p>`` per
    non-blank line of the note's subnote content.

    The agent name is XML-escaped. Subnote content is inserted as-is
    (no escaping), so any markup it contains is passed through.

    Args:
        resource: Resource record (dict-like). ``linked_agents`` may be
            missing or null.
        indent_size: Number of spaces to prefix log messages with.

    Returns:
        The concatenated ``<bioghist>`` elements as a string, in
        ``linked_agents`` order, or ``None`` when nothing was produced.

    Any exception raised while fetching or processing one agent is logged
    with ``self.log.error`` and that agent is skipped; elements already
    built (including earlier notes of the same agent) are kept.
    """
    indent = ' ' * indent_size
    bioghist_elements = []

    # `or []` covers both a missing key and an explicit null value.
    for linked_agent in resource.get('linked_agents') or []:
        # Only creators contribute biographical notes.
        if linked_agent.get('role') != 'creator':
            continue
        agent_ref = linked_agent.get('ref')
        if not agent_ref:
            continue

        try:
            agent = self.client.get(agent_ref).json()

            # Last URI segment becomes the id attribute suffix.
            # NOTE(review): every bioghist note of one agent gets the same
            # id, and only the last URI segment is used -- confirm that
            # consumers do not require these ids to be unique.
            agent_id = agent_ref.split('/')[-1]

            # Fall back step by step so a null display_name or an empty
            # sort_name yields 'Unknown' instead of raising and dropping
            # the whole agent.
            display_name = agent.get('display_name') or {}
            agent_name = (agent.get('title')
                          or display_name.get('sort_name')
                          or 'Unknown')
            head_xml = f'<head>{xml_escape(agent_name)}</head>'

            for note in agent.get('notes') or []:
                if note.get('jsonmodel_type') != 'note_bioghist':
                    continue

                # One <p> per non-blank line of each subnote's content.
                # NOTE(review): content is not escaped and is split on
                # newlines, so markup spanning several lines would be cut
                # across <p> elements -- confirm content never does that.
                paragraphs = []
                for subnote in note.get('subnotes') or []:
                    content = subnote.get('content') or ''
                    paragraphs.extend(
                        f'<p>{line.strip()}</p>'
                        for line in content.split('\n')
                        if line.strip()
                    )

                # Notes without any text produce no element at all.
                if paragraphs:
                    paragraphs_xml = ''.join(paragraphs)
                    bioghist_elements.append(
                        f'<bioghist id="aspace_{agent_id}">{head_xml}{paragraphs_xml}</bioghist>'
                    )
        except Exception as e:
            # Best effort: a failing agent must not abort the export.
            self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')

    if bioghist_elements:
        return ''.join(bioghist_elements)
    return None