UIUCLibrary · Copilot · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -0,0 +1,65 @@
+# Copilot Agent Instructions for arcflow
+
+This file provides guidance for GitHub Copilot agents working on the arcflow repository.
+
+## Commit Style
+
+When making changes to this repository, use **granular, single-purpose commits**:
+
+### Guidelines
+
+- **One commit per logical change** - Each commit should do one thing and do it well
+- **Separate refactoring from features** - Don't mix code restructuring with new functionality
+- **Clear, descriptive messages** - Explain what the commit does and why
+- **Include imports with usage** - Add necessary imports in the same commit where they're used, not as separate commits
+
+### Examples
+
+Good commit sequence:
+```
+1. Refactor XML injection logic for extensibility
+2. Add linked_agents to resolve parameter
+3. Add get_creator_bioghist method
+   (includes import of xml.sax.saxutils.escape used in the method)
+4. Integrate bioghist into XML injection
+5. Update comment to reflect new behavior
+```
+
+Bad commit sequences:
+
+Too dense:
+```
+1. Add creator biographical information to EAD XML exports
+   (combines refactoring, new imports, new methods, and integration)
+```
+
+Too granular:
+```
+1. Import xml.sax.saxutils.escape
+2. Add get_creator_bioghist method that uses xml.sax.saxutils.escape
+   (import should have been included in this commit)
+```
+
+### Commit Message Format
+
+- **First line**: Clear, concise summary (ideally 50 characters or fewer, 72 at most)
+- **Body** (optional): Bullet points explaining the changes
+- **Keep it focused**: If you need many bullets, consider splitting into multiple commits
+
+### Why This Matters
+
+- Makes code review easier
+- Helps reviewers understand the progression of changes
+- Makes it easier to revert specific changes if needed
+- Provides a clear history for future maintainers
+
+---
+
+## Adding More Instructions
+
+To add additional instructions to this file:
+
+1. Add a new section with a clear heading (e.g., `## Testing Strategy`, `## Code Style`)
+2. Keep instructions concise and actionable
+3. Use examples where helpful
+4. Maintain the simple, scannable format
diff --git a/arcflow/main.py b/arcflow/main.py
@@ -9,6 +9,7 @@
 import re
 import logging
 from xml.dom.pulldom import parse, START_ELEMENT
+from xml.sax.saxutils import escape as xml_escape
 from datetime import datetime, timezone
 from asnake.client import ASnakeClient
 from multiprocessing.pool import ThreadPool as Pool
@@ -205,7 +206,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
         resource = self.client.get(
             f'{repo["uri"]}/resources/{resource_id}',
             params={
-                'resolve': ['classifications', 'classification_terms'],
+                'resolve': ['classifications', 'classification_terms', 'linked_agents'],
             }).json()
 
         xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml'
@@ -225,24 +226,41 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0):
                     'ead3': 'false',
                 })
 
-            # add record group and subgroup labels to EAD inside <archdesc level="collection">
+            # add custom XML elements to EAD inside <archdesc level="collection">
+            # (record group/subgroup labels and biographical/historical notes)
             if xml.content:
-                rg_label, sg_label = extract_labels(resource)[1:3]
-                if rg_label:
-                    xml_content = xml.content.decode('utf-8')
-                    insert_pos = xml_content.find('<archdesc level="collection">')
+                xml_content = xml.content.decode('utf-8')
+                insert_pos = xml_content.find('<archdesc level="collection">')
+
+                if insert_pos != -1:
+                    # Find the position after the closing </did> tag
+                    insert_pos = xml_content.find('</did>', insert_pos)
+
                     if insert_pos != -1:
-                        # Find the position after the opening tag
-                        insert_pos = xml_content.find('</did>', insert_pos)
-                        extra_xml = f'<recordgroup>{rg_label}</recordgroup>'
-                        if sg_label:
-                            extra_xml += f'<subgroup>{sg_label}</subgroup>'
-                        xml_content = (xml_content[:insert_pos] + 
-                            extra_xml + 
-                            xml_content[insert_pos:])
-                    xml_content = xml_content.encode('utf-8')
-                else:
-                    xml_content = xml.content
+                        # Move to after the </did> tag
+                        insert_pos += len('</did>')
+                        extra_xml = ''
+
+                        # Add record group and subgroup labels
+                        rg_label, sg_label = extract_labels(resource)[1:3]
+                        if rg_label:
+                            extra_xml += f'\n<recordgroup>{xml_escape(rg_label)}</recordgroup>'
+                            if sg_label:
+                                extra_xml += f'\n<subgroup>{xml_escape(sg_label)}</subgroup>'
+
+                        # Add biographical/historical notes from creator agents
+                        bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size)
+                        if bioghist_content:
+                            extra_xml += f'\n<bioghist>{bioghist_content}</bioghist>'
+
+                        if extra_xml:
+                            xml_content = (xml_content[:insert_pos] + 
+                                extra_xml + 
+                                xml_content[insert_pos:])
+
+                xml_content = xml_content.encode('utf-8')
+            else:
+                xml_content = xml.content
 
             # next level of indentation for nested operations
             indent_size += 2
@@ -499,6 +517,45 @@ def index(self, repo_id, xml_file_path, indent_size=0):
             self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}')
 
 
+    def get_creator_bioghist(self, resource, indent_size=0):
+        """
+        Get biographical/historical notes from creator agents linked to the resource.
+
+        Each linked agent whose role is 'creator' is fetched through self.client and
+        the text of its 'note_bioghist' subnotes is XML-escaped and wrapped in <p> tags.
+        A failure while processing one agent is logged and the remaining agents are still tried.
+
+        :param resource: resource record dict; the 'linked_agents' key is optional
+        :param indent_size: number of spaces prefixed to log messages
+        :return: concatenated <p>...</p> markup, or None if no creator agent has notes
+        """
+        indent = ' ' * indent_size
+        bioghist_paragraphs = []
+
+        for linked_agent in resource.get('linked_agents') or []:
+            agent_ref = linked_agent.get('ref')
+            # Only process agents with 'creator' role that carry a ref to fetch
+            if linked_agent.get('role') != 'creator' or not agent_ref:
+                continue
+            try:
+                agent = self.client.get(agent_ref).json()
+                for note in agent.get('notes') or []:
+                    # Look for biographical/historical notes only
+                    if note.get('jsonmodel_type') != 'note_bioghist':
+                        continue
+                    for subnote in note.get('subnotes') or []:
+                        content = subnote.get('content')
+                        # xml_escape() needs str; a list-valued content would raise and
+                        # drop this agent's remaining notes, so emit one <p> per string
+                        for text in ([content] if isinstance(content, str) else content or []):
+                            bioghist_paragraphs.append(f'<p>{xml_escape(text)}</p>')
+            except Exception as e:
+                # Best-effort: a broken agent record must not abort the whole export
+                self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')
+
+        return ''.join(bioghist_paragraphs) or None
+
+
     def get_repo_id(self, repo):
         """
         Get the repository ID from the repository URI.