Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions .github/copilot-instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copilot Agent Instructions for arcflow

This file provides guidance for GitHub Copilot agents working on the arcflow repository.

## Commit Style

When making changes to this repository, use **granular, single-purpose commits**:

### Guidelines

- **One commit per logical change** - Each commit should do one thing and do it well
- **Separate refactoring from features** - Don't mix code restructuring with new functionality
- **Clear, descriptive messages** - Explain what the commit does and why
- **Include imports with usage** - Add necessary imports in the same commit where they're used, not as separate commits

### Examples

Good commit sequence:
```
1. Refactor XML injection logic for extensibility
2. Add linked_agents to resolve parameter
3. Add get_creator_bioghist method
(includes import of xml.sax.saxutils.escape used in the method)
4. Integrate bioghist into XML injection
5. Update comment to reflect new behavior
```

Bad commit sequences:

Too dense:
```
1. Add creator biographical information to EAD XML exports
(combines refactoring, new imports, new methods, and integration)
```

Too granular:
```
1. Import xml.sax.saxutils.escape
2. Add get_creator_bioghist method that uses xml.sax.saxutils.escape
(import should have been included in this commit)
```

### Commit Message Format

- **First line**: Clear, concise summary (50-72 characters)
- **Body** (optional): Bullet points explaining the changes
- **Keep it focused**: If you need many bullets, consider splitting into multiple commits

### Why This Matters

- Makes code review easier
- Helps reviewers understand the progression of changes
- Makes it easier to revert specific changes if needed
- Provides a clear history for future maintainers

---

## Adding More Instructions

To add additional instructions to this file:

1. Add a new section with a clear heading (e.g., `## Testing Strategy`, `## Code Style`)
2. Keep instructions concise and actionable
3. Use examples where helpful
4. Maintain the simple, scannable format
91 changes: 74 additions & 17 deletions arcflow/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import re
import logging
from xml.dom.pulldom import parse, START_ELEMENT
from xml.sax.saxutils import escape as xml_escape
from datetime import datetime, timezone
from asnake.client import ASnakeClient
from multiprocessing.pool import ThreadPool as Pool
Expand Down Expand Up @@ -205,7 +206,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
resource = self.client.get(
f'{repo["uri"]}/resources/{resource_id}',
params={
'resolve': ['classifications', 'classification_terms'],
'resolve': ['classifications', 'classification_terms', 'linked_agents'],
}).json()

xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml'
Expand All @@ -225,24 +226,41 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
'ead3': 'false',
})

# add record group and subgroup labels to EAD inside <archdesc level="collection">
# add custom XML elements to EAD inside <archdesc level="collection">
# (record group/subgroup labels and biographical/historical notes)
if xml.content:
rg_label, sg_label = extract_labels(resource)[1:3]
if rg_label:
xml_content = xml.content.decode('utf-8')
insert_pos = xml_content.find('<archdesc level="collection">')
xml_content = xml.content.decode('utf-8')
insert_pos = xml_content.find('<archdesc level="collection">')

if insert_pos != -1:
# Find the position after the closing </did> tag
insert_pos = xml_content.find('</did>', insert_pos)

if insert_pos != -1:
# Find the position after the opening tag
insert_pos = xml_content.find('</did>', insert_pos)
extra_xml = f'<recordgroup>{rg_label}</recordgroup>'
if sg_label:
extra_xml += f'<subgroup>{sg_label}</subgroup>'
xml_content = (xml_content[:insert_pos] +
extra_xml +
xml_content[insert_pos:])
xml_content = xml_content.encode('utf-8')
else:
xml_content = xml.content
# Move to after the </did> tag
insert_pos += len('</did>')
extra_xml = ''

# Add record group and subgroup labels
rg_label, sg_label = extract_labels(resource)[1:3]
if rg_label:
extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
if sg_label:
extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'

# Add biographical/historical notes from creator agents
bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
if bioghist_content:
extra_xml += f'\n<bioghist>{bioghist_content}</bioghist>'

if extra_xml:
xml_content = (xml_content[:insert_pos] +
extra_xml +
xml_content[insert_pos:])

xml_content = xml_content.encode('utf-8')
else:
xml_content = xml.content

# next level of indentation for nested operations
indent_size += 2
Expand Down Expand Up @@ -499,6 +517,45 @@ def index(self, repo_id, xml_file_path, indent_size=0):
self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}')


def get_creator_bioghist(self, resource, indent_size=0):
    """
    Get biographical/historical notes from creator agents linked to the resource.

    Each non-empty subnote of every ``note_bioghist`` note on a linked agent
    whose role is ``creator`` becomes one XML-escaped ``<p>`` element, in the
    order the agents, notes and subnotes appear.

    Args:
        resource: Resource record (dict). Its optional ``linked_agents`` list
            holds dicts with ``role``, ``ref`` and, when the resource was
            requested with linked agents resolved, an embedded ``_resolved``
            agent record.
        indent_size: Number of leading spaces used to prefix log messages.

    Returns:
        The concatenated ``<p>...</p>`` string, or None when the resource has
        no creator agents or none of them carries bioghist content.
    """
    indent = ' ' * indent_size
    bioghist_paragraphs = []

    # `or []` covers both a missing key and an explicit null value.
    for linked_agent in resource.get('linked_agents') or []:
        # Only process agents with 'creator' role
        if linked_agent.get('role') != 'creator':
            continue
        agent_ref = linked_agent.get('ref')
        if not agent_ref:
            continue

        # Prefer the agent record already embedded in the resource response
        # (present when the resource was fetched with linked_agents resolved);
        # this avoids one extra HTTP round trip per creator.
        agent = linked_agent.get('_resolved')
        if not isinstance(agent, dict):
            try:
                agent = self.client.get(agent_ref).json()
            except Exception as e:
                # Best effort: a single unreachable agent must not abort the
                # export of the whole resource.
                self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
                continue
            if not isinstance(agent, dict):
                continue

        for note in agent.get('notes') or []:
            # Look for biographical/historical notes
            if note.get('jsonmodel_type') != 'note_bioghist':
                continue
            for subnote in note.get('subnotes') or []:
                content = subnote.get('content')
                # Skip subnotes without text so no empty <p></p> is emitted.
                if isinstance(content, str) and content.strip():
                    # Escape XML special characters and wrap in paragraph tags
                    bioghist_paragraphs.append(f'<p>{xml_escape(content)}</p>')

    return ''.join(bioghist_paragraphs) or None


def get_repo_id(self, repo):
"""
Get the repository ID from the repository URI.
Expand Down