From 6f410952362390e7e00bc035a00d00009c2ae3a9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:06:09 +0000 Subject: [PATCH 01/22] Initial plan From e49106bb67d7b9f6a5eef2abf4487da5b0a52391 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:10:55 +0000 Subject: [PATCH 02/22] Implement bioghist extraction from creator agents - Add get_creator_bioghist method to fetch biographical notes - Update task_resource to resolve linked_agents - Inject bioghist element into EAD XML after tag - Follow existing pattern for custom XML injection Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 90 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 815903b..c04acf3 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -205,7 +205,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): resource = self.client.get( f'{repo["uri"]}/resources/{resource_id}', params={ - 'resolve': ['classifications', 'classification_terms'], + 'resolve': ['classifications', 'classification_terms', 'linked_agents'], }).json() xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml' @@ -227,22 +227,43 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # add record group and subgroup labels to EAD inside if xml.content: - rg_label, sg_label = extract_labels(resource)[1:3] - if rg_label: - xml_content = xml.content.decode('utf-8') - insert_pos = xml_content.find('') + xml_content = xml.content.decode('utf-8') + insert_pos = xml_content.find('') + + if insert_pos != -1: + # Find the position after the closing tag + insert_pos = xml_content.find('', insert_pos) + if insert_pos != -1: - # Find the position after the opening tag - insert_pos = xml_content.find('', 
insert_pos) - extra_xml = f'{rg_label}' - if sg_label: - extra_xml += f'{sg_label}' - xml_content = (xml_content[:insert_pos] + - extra_xml + - xml_content[insert_pos:]) - xml_content = xml_content.encode('utf-8') - else: - xml_content = xml.content + # Move to after the tag + insert_pos += len('') + extra_xml = '' + + # Add record group and subgroup labels + rg_label, sg_label = extract_labels(resource)[1:3] + if rg_label: + extra_xml += f'{rg_label}' + if sg_label: + extra_xml += f'{sg_label}' + + # Add biographical/historical notes from creator agents + bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) + if bioghist_content: + # Escape XML special characters + bioghist_content = (bioghist_content + .replace('&', '&') + .replace('<', '<') + .replace('>', '>')) + extra_xml += f'

{bioghist_content}

' + + if extra_xml: + xml_content = (xml_content[:insert_pos] + + extra_xml + + xml_content[insert_pos:]) + + xml_content = xml_content.encode('utf-8') + else: + xml_content = xml.content # next level of indentation for nested operations indent_size += 2 @@ -499,6 +520,43 @@ def index(self, repo_id, xml_file_path, indent_size=0): self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}') + def get_creator_bioghist(self, resource, indent_size=0): + """ + Get biographical/historical notes from creator agents linked to the resource. + Returns the concatenated notes as a string, or None if no creator agents have notes. + """ + indent = ' ' * indent_size + bioghist_notes = [] + + if 'linked_agents' not in resource: + return None + + for linked_agent in resource['linked_agents']: + # Only process agents with 'creator' role + if linked_agent.get('role') == 'creator': + agent_ref = linked_agent.get('ref') + if agent_ref: + try: + agent = self.client.get(agent_ref).json() + + # Check for notes in the agent record + if 'notes' in agent: + for note in agent['notes']: + # Look for biographical/historical notes + if note.get('jsonmodel_type') == 'note_bioghist': + # Extract note content from subnotes + if 'subnotes' in note: + for subnote in note['subnotes']: + if 'content' in subnote: + bioghist_notes.append(subnote['content']) + except Exception as e: + self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') + + if bioghist_notes: + return '\n'.join(bioghist_notes) + return None + + def get_repo_id(self, repo): """ Get the repository ID from the repository URI. 
From 3c608debfa0a63db4939e3623e813419d3ce8566 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:14:54 +0000 Subject: [PATCH 03/22] Improve XML escaping and bioghist formatting - Use xml.sax.saxutils.escape for proper XML escaping - Format bioghist notes as separate paragraphs - Remove manual XML escaping in favor of library function Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index c04acf3..ef67bfc 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -9,6 +9,7 @@ import re import logging from xml.dom.pulldom import parse, START_ELEMENT +from xml.sax.saxutils import escape as xml_escape from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool @@ -249,12 +250,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # Add biographical/historical notes from creator agents bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) if bioghist_content: - # Escape XML special characters - bioghist_content = (bioghist_content - .replace('&', '&') - .replace('<', '<') - .replace('>', '>')) - extra_xml += f'

{bioghist_content}

' + extra_xml += f'{bioghist_content}' if extra_xml: xml_content = (xml_content[:insert_pos] + @@ -523,10 +519,10 @@ def index(self, repo_id, xml_file_path, indent_size=0): def get_creator_bioghist(self, resource, indent_size=0): """ Get biographical/historical notes from creator agents linked to the resource. - Returns the concatenated notes as a string, or None if no creator agents have notes. + Returns the notes formatted as XML paragraphs, or None if no creator agents have notes. """ indent = ' ' * indent_size - bioghist_notes = [] + bioghist_paragraphs = [] if 'linked_agents' not in resource: return None @@ -548,12 +544,14 @@ def get_creator_bioghist(self, resource, indent_size=0): if 'subnotes' in note: for subnote in note['subnotes']: if 'content' in subnote: - bioghist_notes.append(subnote['content']) + # Escape XML special characters and wrap in paragraph tags + escaped_content = xml_escape(subnote['content']) + bioghist_paragraphs.append(f'

{escaped_content}

') except Exception as e: self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') - if bioghist_notes: - return '\n'.join(bioghist_notes) + if bioghist_paragraphs: + return ''.join(bioghist_paragraphs) return None From 63ffe60f090f0eb93e9096c0a6114ec722aa2bd8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:17:51 +0000 Subject: [PATCH 04/22] Improve error message for bioghist fetching - Make error message more specific about the operation that failed Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index ef67bfc..df3c52d 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -548,7 +548,7 @@ def get_creator_bioghist(self, resource, indent_size=0): escaped_content = xml_escape(subnote['content']) bioghist_paragraphs.append(f'

{escaped_content}

') except Exception as e: - self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') + self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') if bioghist_paragraphs: return ''.join(bioghist_paragraphs) From c1bea750da60ef7f28cce57d8bc329e19b8f16b3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:48:44 +0000 Subject: [PATCH 05/22] Revert "Improve error message for bioghist fetching" This reverts commit 63ffe60f090f0eb93e9096c0a6114ec722aa2bd8. --- arcflow/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index df3c52d..ef67bfc 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -548,7 +548,7 @@ def get_creator_bioghist(self, resource, indent_size=0): escaped_content = xml_escape(subnote['content']) bioghist_paragraphs.append(f'

{escaped_content}

') except Exception as e: - self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') + self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') if bioghist_paragraphs: return ''.join(bioghist_paragraphs) From 48320f3fa9a98f74554702b5bf5de1199dcba242 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:48:44 +0000 Subject: [PATCH 06/22] Revert "Improve XML escaping and bioghist formatting" This reverts commit 3c608debfa0a63db4939e3623e813419d3ce8566. --- arcflow/main.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index ef67bfc..c04acf3 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -9,7 +9,6 @@ import re import logging from xml.dom.pulldom import parse, START_ELEMENT -from xml.sax.saxutils import escape as xml_escape from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool @@ -250,7 +249,12 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # Add biographical/historical notes from creator agents bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) if bioghist_content: - extra_xml += f'{bioghist_content}' + # Escape XML special characters + bioghist_content = (bioghist_content + .replace('&', '&') + .replace('<', '<') + .replace('>', '>')) + extra_xml += f'

{bioghist_content}

' if extra_xml: xml_content = (xml_content[:insert_pos] + @@ -519,10 +523,10 @@ def index(self, repo_id, xml_file_path, indent_size=0): def get_creator_bioghist(self, resource, indent_size=0): """ Get biographical/historical notes from creator agents linked to the resource. - Returns the notes formatted as XML paragraphs, or None if no creator agents have notes. + Returns the concatenated notes as a string, or None if no creator agents have notes. """ indent = ' ' * indent_size - bioghist_paragraphs = [] + bioghist_notes = [] if 'linked_agents' not in resource: return None @@ -544,14 +548,12 @@ def get_creator_bioghist(self, resource, indent_size=0): if 'subnotes' in note: for subnote in note['subnotes']: if 'content' in subnote: - # Escape XML special characters and wrap in paragraph tags - escaped_content = xml_escape(subnote['content']) - bioghist_paragraphs.append(f'

{escaped_content}

') + bioghist_notes.append(subnote['content']) except Exception as e: self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') - if bioghist_paragraphs: - return ''.join(bioghist_paragraphs) + if bioghist_notes: + return '\n'.join(bioghist_notes) return None From a54c4ba3b764bcb8fc11eec7cf74efb870b022ea Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:48:44 +0000 Subject: [PATCH 07/22] Revert "Implement bioghist extraction from creator agents" This reverts commit e49106bb67d7b9f6a5eef2abf4487da5b0a52391. --- arcflow/main.py | 90 +++++++++---------------------------------------- 1 file changed, 16 insertions(+), 74 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index c04acf3..815903b 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -205,7 +205,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): resource = self.client.get( f'{repo["uri"]}/resources/{resource_id}', params={ - 'resolve': ['classifications', 'classification_terms', 'linked_agents'], + 'resolve': ['classifications', 'classification_terms'], }).json() xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml' @@ -227,43 +227,22 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # add record group and subgroup labels to EAD inside if xml.content: - xml_content = xml.content.decode('utf-8') - insert_pos = xml_content.find('') - - if insert_pos != -1: - # Find the position after the closing tag - insert_pos = xml_content.find('', insert_pos) - + rg_label, sg_label = extract_labels(resource)[1:3] + if rg_label: + xml_content = xml.content.decode('utf-8') + insert_pos = xml_content.find('') if insert_pos != -1: - # Move to after the tag - insert_pos += len('') - extra_xml = '' - - # Add record group and subgroup labels - rg_label, sg_label = extract_labels(resource)[1:3] - if rg_label: - extra_xml += f'{rg_label}' - if sg_label: - extra_xml += 
f'{sg_label}' - - # Add biographical/historical notes from creator agents - bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) - if bioghist_content: - # Escape XML special characters - bioghist_content = (bioghist_content - .replace('&', '&') - .replace('<', '<') - .replace('>', '>')) - extra_xml += f'

{bioghist_content}

' - - if extra_xml: - xml_content = (xml_content[:insert_pos] + - extra_xml + - xml_content[insert_pos:]) - - xml_content = xml_content.encode('utf-8') - else: - xml_content = xml.content + # Find the position after the opening tag + insert_pos = xml_content.find('', insert_pos) + extra_xml = f'{rg_label}' + if sg_label: + extra_xml += f'{sg_label}' + xml_content = (xml_content[:insert_pos] + + extra_xml + + xml_content[insert_pos:]) + xml_content = xml_content.encode('utf-8') + else: + xml_content = xml.content # next level of indentation for nested operations indent_size += 2 @@ -520,43 +499,6 @@ def index(self, repo_id, xml_file_path, indent_size=0): self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}') - def get_creator_bioghist(self, resource, indent_size=0): - """ - Get biographical/historical notes from creator agents linked to the resource. - Returns the concatenated notes as a string, or None if no creator agents have notes. - """ - indent = ' ' * indent_size - bioghist_notes = [] - - if 'linked_agents' not in resource: - return None - - for linked_agent in resource['linked_agents']: - # Only process agents with 'creator' role - if linked_agent.get('role') == 'creator': - agent_ref = linked_agent.get('ref') - if agent_ref: - try: - agent = self.client.get(agent_ref).json() - - # Check for notes in the agent record - if 'notes' in agent: - for note in agent['notes']: - # Look for biographical/historical notes - if note.get('jsonmodel_type') == 'note_bioghist': - # Extract note content from subnotes - if 'subnotes' in note: - for subnote in note['subnotes']: - if 'content' in subnote: - bioghist_notes.append(subnote['content']) - except Exception as e: - self.log.error(f'{indent}Error fetching agent {agent_ref}: {e}') - - if bioghist_notes: - return '\n'.join(bioghist_notes) - return None - - def get_repo_id(self, repo): """ Get the repository ID from the repository URI. 
From 430557fa93c28e45e69f0b7b4ad4005fcb43b6b8 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:49:29 +0000 Subject: [PATCH 08/22] Push revert commits From b39d03b6bfa61851ef9600365527cc111b5f337e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:50:22 +0000 Subject: [PATCH 09/22] Refactor XML injection logic for extensibility Restructure the XML injection code to make it easier to add additional XML elements. Changes: - Move XML decode/encode outside the label-conditional block - Calculate insertion point (after tag) upfront - Use extra_xml variable to accumulate multiple XML elements - Maintains existing recordgroup/subgroup functionality Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 815903b..e686e6c 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -227,22 +227,33 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # add record group and subgroup labels to EAD inside if xml.content: - rg_label, sg_label = extract_labels(resource)[1:3] - if rg_label: - xml_content = xml.content.decode('utf-8') - insert_pos = xml_content.find('') + xml_content = xml.content.decode('utf-8') + insert_pos = xml_content.find('') + + if insert_pos != -1: + # Find the position after the closing tag + insert_pos = xml_content.find('', insert_pos) + if insert_pos != -1: - # Find the position after the opening tag - insert_pos = xml_content.find('', insert_pos) - extra_xml = f'{rg_label}' - if sg_label: - extra_xml += f'{sg_label}' - xml_content = (xml_content[:insert_pos] + - extra_xml + - xml_content[insert_pos:]) - xml_content = xml_content.encode('utf-8') - else: - xml_content = xml.content + # Move to after 
the tag + insert_pos += len('') + extra_xml = '' + + # Add record group and subgroup labels + rg_label, sg_label = extract_labels(resource)[1:3] + if rg_label: + extra_xml += f'{rg_label}' + if sg_label: + extra_xml += f'{sg_label}' + + if extra_xml: + xml_content = (xml_content[:insert_pos] + + extra_xml + + xml_content[insert_pos:]) + + xml_content = xml_content.encode('utf-8') + else: + xml_content = xml.content # next level of indentation for nested operations indent_size += 2 From a50571c3ee4faf7bf27df9b24d7c94a4aae46d1a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:50:54 +0000 Subject: [PATCH 10/22] Add linked_agents to resolve parameter Include linked_agents in the resource API call resolution to fetch agent information along with the resource. This prepares for extracting creator biographical information. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index e686e6c..ec22c20 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -205,7 +205,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): resource = self.client.get( f'{repo["uri"]}/resources/{resource_id}', params={ - 'resolve': ['classifications', 'classification_terms'], + 'resolve': ['classifications', 'classification_terms', 'linked_agents'], }).json() xml_file_path = f'{xml_dir}/{resource["ead_id"]}.xml' From 600442f61a25971419581e216d681432a5855722 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:51:31 +0000 Subject: [PATCH 11/22] Import xml.sax.saxutils.escape for proper XML escaping Add import for xml_escape function to properly escape XML special characters (ampersands, less-than, greater-than, quotes) when injecting content into EAD XML. 
Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arcflow/main.py b/arcflow/main.py index ec22c20..5b92039 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -9,6 +9,7 @@ import re import logging from xml.dom.pulldom import parse, START_ELEMENT +from xml.sax.saxutils import escape as xml_escape from datetime import datetime, timezone from asnake.client import ASnakeClient from multiprocessing.pool import ThreadPool as Pool From d7f1bc122163e4b7100f2b8080187ee30bdc1f30 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:52:22 +0000 Subject: [PATCH 12/22] Add get_creator_bioghist method Implement method to fetch biographical/historical notes from ArchivesSpace agent records. The method: - Filters agents by 'creator' role - Fetches agent details via API - Extracts note_bioghist content from subnotes - Escapes XML special characters using xml_escape - Wraps each note in

tags for proper XML structure - Returns concatenated paragraphs or None if no notes found Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arcflow/main.py b/arcflow/main.py index 5b92039..40662bc 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -511,6 +511,45 @@ def index(self, repo_id, xml_file_path, indent_size=0): self.log.error(f'{indent}Error indexing pending resources in repository ID {repo_id} to ArcLight Solr: {e}') + def get_creator_bioghist(self, resource, indent_size=0): + """ + Get biographical/historical notes from creator agents linked to the resource. + Returns the notes formatted as XML paragraphs, or None if no creator agents have notes. + """ + indent = ' ' * indent_size + bioghist_paragraphs = [] + + if 'linked_agents' not in resource: + return None + + for linked_agent in resource['linked_agents']: + # Only process agents with 'creator' role + if linked_agent.get('role') == 'creator': + agent_ref = linked_agent.get('ref') + if agent_ref: + try: + agent = self.client.get(agent_ref).json() + + # Check for notes in the agent record + if 'notes' in agent: + for note in agent['notes']: + # Look for biographical/historical notes + if note.get('jsonmodel_type') == 'note_bioghist': + # Extract note content from subnotes + if 'subnotes' in note: + for subnote in note['subnotes']: + if 'content' in subnote: + # Escape XML special characters and wrap in paragraph tags + escaped_content = xml_escape(subnote['content']) + bioghist_paragraphs.append(f'

{escaped_content}

') + except Exception as e: + self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') + + if bioghist_paragraphs: + return ''.join(bioghist_paragraphs) + return None + + def get_repo_id(self, repo): """ Get the repository ID from the repository URI. From 2ae937ef63b343d636ff8ff62c5b49f4335dc4e4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:53:09 +0000 Subject: [PATCH 13/22] Integrate bioghist into XML injection Call get_creator_bioghist to fetch biographical notes and inject them into the EAD XML as a element. The bioghist element is added after tag, following recordgroup/subgroup labels. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arcflow/main.py b/arcflow/main.py index 40662bc..df3c52d 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -247,6 +247,11 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): if sg_label: extra_xml += f'{sg_label}' + # Add biographical/historical notes from creator agents + bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) + if bioghist_content: + extra_xml += f'{bioghist_content}' + if extra_xml: xml_content = (xml_content[:insert_pos] + extra_xml + From db640cef55e5a78d5f8b9696a19b6096ae41dfa7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 16:53:53 +0000 Subject: [PATCH 14/22] Update comment to reflect new behavior Update the comment on the XML injection section to indicate that it now handles multiple types of custom XML elements (record group/ subgroup labels AND biographical/historical notes), not just labels. 
Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arcflow/main.py b/arcflow/main.py index df3c52d..b37f7f5 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -226,7 +226,8 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): 'ead3': 'false', }) - # add record group and subgroup labels to EAD inside + # add custom XML elements to EAD inside + # (record group/subgroup labels and biographical/historical notes) if xml.content: xml_content = xml.content.decode('utf-8') insert_pos = xml_content.find('') From b485f55fddc64c2404ef976eb246719ce5ba7d0b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 17:00:35 +0000 Subject: [PATCH 15/22] Add copilot-instructions.md with commit style guidelines Create onboarding instructions for GitHub Copilot agents with focus on granular, single-purpose commit style. Includes: - Guidelines for commit granularity - Examples of good vs bad commit sequences - Commit message format recommendations - Instructions for extending the file Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- .github/copilot-instructions.md | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..9b2e5e9 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,55 @@ +# Copilot Agent Instructions for arcflow + +This file provides guidance for GitHub Copilot agents working on the arcflow repository. 
+ +## Commit Style + +When making changes to this repository, use **granular, single-purpose commits**: + +### Guidelines + +- **One commit per logical change** - Each commit should do one thing and do it well +- **Separate refactoring from features** - Don't mix code restructuring with new functionality +- **Clear, descriptive messages** - Explain what the commit does and why + +### Examples + +Good commit sequence: +``` +1. Refactor XML injection logic for extensibility +2. Add linked_agents to resolve parameter +3. Import xml.sax.saxutils.escape for proper XML escaping +4. Add get_creator_bioghist method +5. Integrate bioghist into XML injection +6. Update comment to reflect new behavior +``` + +Bad commit (too dense): +``` +1. Add creator biographical information to EAD XML exports + (combines refactoring, new imports, new methods, and integration) +``` + +### Commit Message Format + +- **First line**: Clear, concise summary (50-72 characters) +- **Body** (optional): Bullet points explaining the changes +- **Keep it focused**: If you need many bullets, consider splitting into multiple commits + +### Why This Matters + +- Makes code review easier +- Helps understand the progression of changes +- Easier to revert specific changes if needed +- Clear history for future maintainers + +--- + +## Adding More Instructions + +To add additional instructions to this file: + +1. Add a new section with a clear heading (e.g., `## Testing Strategy`, `## Code Style`) +2. Keep instructions concise and actionable +3. Use examples where helpful +4. Maintain the simple, scannable format From 76ac7515f25aeb010b687c20417f3856591e273d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 17:09:51 +0000 Subject: [PATCH 16/22] Clarify that imports should be included with usage Update copilot-instructions.md to specify that imports should be added in the same commit where they're used, not as separate commits. 
Added guideline and examples showing both good and too-granular commit patterns. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- .github/copilot-instructions.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 9b2e5e9..5ec323d 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -11,6 +11,7 @@ When making changes to this repository, use **granular, single-purpose commits** - **One commit per logical change** - Each commit should do one thing and do it well - **Separate refactoring from features** - Don't mix code restructuring with new functionality - **Clear, descriptive messages** - Explain what the commit does and why +- **Include imports with usage** - Add necessary imports in the same commit where they're used, not as separate commits ### Examples @@ -18,18 +19,27 @@ Good commit sequence: ``` 1. Refactor XML injection logic for extensibility 2. Add linked_agents to resolve parameter -3. Import xml.sax.saxutils.escape for proper XML escaping -4. Add get_creator_bioghist method -5. Integrate bioghist into XML injection -6. Update comment to reflect new behavior +3. Add get_creator_bioghist method + (includes import of xml.sax.saxutils.escape used in the method) +4. Integrate bioghist into XML injection +5. Update comment to reflect new behavior ``` -Bad commit (too dense): +Bad commit sequences: + +Too dense: ``` 1. Add creator biographical information to EAD XML exports (combines refactoring, new imports, new methods, and integration) ``` +Too granular: +``` +1. Import xml.sax.saxutils.escape +2. 
Add get_creator_bioghist method that uses xml.sax.saxutils.escape + (import should have been included in this commit) +``` + ### Commit Message Format - **First line**: Clear, concise summary (50-72 characters) From 0fd29ad3c4bd39223c81d7219de5ccbdcbc064f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 20:15:24 +0000 Subject: [PATCH 17/22] Add newlines between injected XML elements for readability Add newline characters before each injected XML element (recordgroup, subgroup, bioghist) to improve readability of the generated EAD XML for human reviewers. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index b37f7f5..4a63c5f 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -244,14 +244,14 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # Add record group and subgroup labels rg_label, sg_label = extract_labels(resource)[1:3] if rg_label: - extra_xml += f'{rg_label}' + extra_xml += f'\n{rg_label}' if sg_label: - extra_xml += f'{sg_label}' + extra_xml += f'\n{sg_label}' # Add biographical/historical notes from creator agents bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) if bioghist_content: - extra_xml += f'{bioghist_content}' + extra_xml += f'\n{bioghist_content}' if extra_xml: xml_content = (xml_content[:insert_pos] + From eabad7c36ebfab5674a49ba733876979be2a0faf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 20:32:17 +0000 Subject: [PATCH 18/22] Escape XML special characters in recordgroup/subgroup labels Apply xml_escape to recordgroup and subgroup labels to prevent XML syntax errors when labels contain special characters like &, <, or >. 
This was a pre-existing issue but is now fixed to be consistent with bioghist content escaping. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 4a63c5f..a3dafae 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -244,9 +244,9 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # Add record group and subgroup labels rg_label, sg_label = extract_labels(resource)[1:3] if rg_label: - extra_xml += f'\n{rg_label}' + extra_xml += f'\n{xml_escape(rg_label)}' if sg_label: - extra_xml += f'\n{sg_label}' + extra_xml += f'\n{xml_escape(sg_label)}' # Add biographical/historical notes from creator agents bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) From fa57624598e12fd785e26f45ae56d427707ea941 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 18:35:01 +0000 Subject: [PATCH 19/22] Remove xml_escape from bioghist content The bioghist content from ArchivesSpace already contains valid EAD XML markup (e.g., <emph>, <title> tags) that should be preserved as actual XML nodes, not escaped text. Only recordgroup/subgroup labels need escaping since they're plain text strings. 
Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index a3dafae..e31e62a 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -545,9 +545,8 @@ def get_creator_bioghist(self, resource, indent_size=0): if 'subnotes' in note: for subnote in note['subnotes']: if 'content' in subnote: - # Escape XML special characters and wrap in paragraph tags - escaped_content = xml_escape(subnote['content']) - bioghist_paragraphs.append(f'<p>{escaped_content}</p>') + # Content already contains valid EAD XML markup - pass through as-is + bioghist_paragraphs.append(f'<p>{subnote["content"]}</p>') except Exception as e: self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') From 167cfe7699a1d9e8a8f74e0115c77d7ce037adb5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 18:35:56 +0000 Subject: [PATCH 20/22] Document XML escaping patterns in copilot-instructions Add guidelines for handling plain text vs structured XML content in the EAD pipeline. Explains when to use xml_escape() and when to pass content through unchanged, with rationale for the Traject and ArcLight display pipeline. 
Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- .github/copilot-instructions.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 5ec323d..b3d33a0 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -55,6 +55,39 @@ Too granular: --- +## XML Content Handling in EAD Pipeline + +When injecting content into EAD XML files, distinguish between plain text and structured XML: + +### Escaping Strategy + +- **Plain text labels** (recordgroup, subgroup): Use `xml_escape()` to escape special characters (`&`, `<`, `>`) + - These are simple strings that may contain characters that break XML syntax + - Example: `xml_escape(rg_label)` → converts `"Group & Co"` to `"Group &amp; Co"` + +- **Structured EAD XML content** (bioghist, scopecontent): Do NOT escape + - Content from ArchivesSpace already contains valid EAD XML markup (`<emph>`, `<title>`, etc.) + - These are legitimate XML nodes that must be preserved + - Escaping would convert them to literal text: `<emph>` → `&lt;emph&gt;` + - Example: Pass through as-is: `f'<p>{subnote["content"]}</p>'` + +### Why This Matters + +The Traject indexing pipeline and ArcLight display rely on proper XML structure: +1. Traject's `.to_html` converts XML nodes to HTML +2. ArcLight's `render_html_tags` processes the HTML for display +3. If XML nodes are escaped (treated as text), they can't be processed and appear as raw markup + +### Pattern for Future Fields + +When adding new EAD fields to the pipeline: +1. Determine if content is plain text or structured XML +2. Apply escaping only to plain text +3. Pass structured XML through unchanged +4. 
Document the decision in code comments + +--- + ## Adding More Instructions To add additional instructions to this file: From 65340e078d3308ed80b703fe43fd533ea0b50b85 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 9 Jan 2026 01:09:10 +0000 Subject: [PATCH 21/22] Structure bioghist with nested elements per creator Implement improved bioghist structure: - Create separate <bioghist> element for each creator - Add <head> element with creator name (XML-escaped) - Include id attribute with ArchivesSpace agent ID - Split content on single newlines to create <p> paragraphs - Maintain linked_agents order for consistency with origination Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index e31e62a..0cd5ba1 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -251,7 +251,7 @@ def task_resource(self, repo, resource_id, xml_dir, pdf_dir, indent_size=0): # Add biographical/historical notes from creator agents bioghist_content = self.get_creator_bioghist(resource, indent_size=indent_size) if bioghist_content: - extra_xml += f'\n<bioghist>{bioghist_content}</bioghist>' + extra_xml += f'\n{bioghist_content}' if extra_xml: xml_content = (xml_content[:insert_pos] + @@ -520,14 +520,16 @@ def index(self, repo_id, xml_file_path, indent_size=0): def get_creator_bioghist(self, resource, indent_size=0): """ Get biographical/historical notes from creator agents linked to the resource. - Returns the notes formatted as XML paragraphs, or None if no creator agents have notes. + Returns nested bioghist elements for each creator, or None if no creator agents have notes. + Each bioghist element includes the creator name in a head element and an id attribute. 
""" indent = ' ' * indent_size - bioghist_paragraphs = [] + bioghist_elements = [] if 'linked_agents' not in resource: return None + # Process linked_agents in order to maintain consistency with origination order for linked_agent in resource['linked_agents']: # Only process agents with 'creator' role if linked_agent.get('role') == 'creator': @@ -536,22 +538,40 @@ def get_creator_bioghist(self, resource, indent_size=0): try: agent = self.client.get(agent_ref).json() + # Extract agent ID from URI for id attribute + agent_id = agent_ref.split('/')[-1] if agent_ref else '' + + # Get agent name for head element + agent_name = agent.get('title') or agent.get('display_name', {}).get('sort_name', 'Unknown') + # Check for notes in the agent record if 'notes' in agent: for note in agent['notes']: # Look for biographical/historical notes if note.get('jsonmodel_type') == 'note_bioghist': # Extract note content from subnotes + paragraphs = [] if 'subnotes' in note: for subnote in note['subnotes']: if 'content' in subnote: - # Content already contains valid EAD XML markup - pass through as-is - bioghist_paragraphs.append(f'<p>{subnote["content"]}</p>') + # Split content on single newlines to create paragraphs + content = subnote['content'] + # Split on newline and filter out empty strings + lines = [line.strip() for line in content.split('\n') if line.strip()] + # Wrap each line in <p> tags + for line in lines: + paragraphs.append(f'<p>{line}</p>') + + # Create nested bioghist element if we have paragraphs + if paragraphs: + paragraphs_xml = ''.join(paragraphs) + bioghist_el = f'<bioghist id="aspace_{agent_id}"><head>{xml_escape(agent_name)}</head>{paragraphs_xml}</bioghist>' + bioghist_elements.append(bioghist_el) except Exception as e: self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}') - if bioghist_paragraphs: - return ''.join(bioghist_paragraphs) + if bioghist_elements: + return ''.join(bioghist_elements) return None From 
8d33aef5ab5b81aa503b85bfe1bd4da70303b381 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 9 Jan 2026 21:45:06 +0000 Subject: [PATCH 22/22] Add line breaks between bioghist paragraphs for legibility Add newline characters between <p> elements within bioghist content and after head/before closing tag to improve readability of the generated XML. Co-authored-by: alexdryden <47127862+alexdryden@users.noreply.github.com> --- arcflow/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arcflow/main.py b/arcflow/main.py index 0cd5ba1..b90b2fa 100644 --- a/arcflow/main.py +++ b/arcflow/main.py @@ -564,8 +564,8 @@ def get_creator_bioghist(self, resource, indent_size=0): # Create nested bioghist element if we have paragraphs if paragraphs: - paragraphs_xml = ''.join(paragraphs) - bioghist_el = f'<bioghist id="aspace_{agent_id}"><head>{xml_escape(agent_name)}</head>{paragraphs_xml}</bioghist>' + paragraphs_xml = '\n'.join(paragraphs) + bioghist_el = f'<bioghist id="aspace_{agent_id}"><head>{xml_escape(agent_name)}</head>\n{paragraphs_xml}\n</bioghist>' bioghist_elements.append(bioghist_el) except Exception as e: self.log.error(f'{indent}Error fetching biographical information for agent {agent_ref}: {e}')