From 4448c4552f26aa5f269fb03fc7b06f4ca114afba Mon Sep 17 00:00:00 2001 From: jmp111 <60651038+jmp111@users.noreply.github.com> Date: Fri, 14 Feb 2025 11:48:05 +0000 Subject: [PATCH 1/6] Add in infons.type for passages --- autocorpus/bioc_passages.py | 151 +++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 70 deletions(-) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index 6dbe7163..0897db5a 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -1,70 +1,81 @@ -"""BioC Passage builder script.""" - - -class BioCPassage: - """BioC Passage builder class.""" - - @classmethod - def from_title(cls, title, offset): - """Creates a BioCPassage object from a title. - - Args: - title (str): Passage title - offset (int): Passage offset - - Returns: - (dict): BioCPassage object - """ - title_passage = { - "section_heading": "", - "subsection_heading": "", - "body": title, - "section_type": [{"iao_name": "document title", "iao_id": "IAO:0000305"}], - } - return cls(title_passage, offset) - - def __build_passage(self, passage, offset): - defaultkeys = ["section_heading", "subsection_heading", "body", "section_type"] - passage_dict = { - "offset": offset, - "infons": {}, - "text": passage["body"], - "sentences": [], - "annotations": [], - "relations": [], - } - for key in passage.keys(): - if key not in defaultkeys: - passage_dict["infons"][key] = passage[key] - # TODO: currently assumes section_heading and subsection_heading will always exist, should ideally check for existence. - # Also doesn't account for subsubsection headings which might exist - if passage["section_heading"] != "": - passage_dict["infons"]["section_title_1"] = passage["section_heading"] - if passage["subsection_heading"] != "": - passage_dict["infons"]["section_title_2"] = passage["subsection_heading"] - counter = 1 - for section_type in passage["section_type"]: - passage_dict["infons"][f"iao_name_{counter}"] = section_type["iao_name"] - passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] - counter += 1 - - return passage_dict - - def __init__(self, passage, offset): - """Construct a passage object from the provided passage dict and offset. - - Args: - passage (dict): Article passage dictionary - offset (int): Passage offset to use - """ - self.offset = 0 - self.passage = self.__build_passage(passage, offset) - pass - - def as_dict(self): - """Returns a dictionary representation of the passage. - - Returns: - (dict): Dictionary representation of the passage - """ - return self.passage +"""BioC Passage builder script.""" + + +class BioCPassage: + """BioC Passage builder class.""" + + @classmethod + def from_title(cls, title, offset): + """Creates a BioCPassage object from a title. + + Args: + title (str): Passage title + offset (int): Passage offset + + Returns: + (dict): BioCPassage object + """ + title_passage = { + "section_heading": "", + "subsection_heading": "", + "body": title, + "section_type": [{"iao_name": "document title", "iao_id": "IAO:0000305"}], + } + return cls(title_passage, offset) + + def __build_passage(self, passage, offset): + defaultkeys = ["section_heading", "subsection_heading", "body", "section_type"] + passage_dict = { + "offset": offset, + "infons": {}, + "text": passage["body"], + "sentences": [], + "annotations": [], + "relations": [], + } + for key in passage.keys(): + if key not in defaultkeys: + passage_dict["infons"][key] = passage[key] + # TODO: currently assumes section_heading and subsection_heading will always exist, should ideally check for existence. + # Also doesn't account for subsubsection headings which might exist + if passage["section_heading"] != "": + passage_dict["infons"]["section_title_1"] = passage["section_heading"] + if passage["subsection_heading"] != "": + passage_dict["infons"]["section_title_2"] = passage["subsection_heading"] + counter = 1 + for section_type in passage["section_type"]: + passage_dict["infons"][f"iao_name_{counter}"] = section_type["iao_name"] + passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] + counter += 1 + if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": + passage_dict["infons"]["type"] = "front" + else: + passage_dict["infons"]["type"] = "paragraph" + # TODO: make optional input to AutoCORPus to have section (sub)headers a separate passage in the document. + # section headers: + # passage_dict["infons"]["type"] = "title_1" + # subsection headers: + # passage_dict["infons"]["type"] = "title_2" + # subsubsection headers: + # passage_dict["infons"]["type"] = "title_3" + + return passage_dict + + def __init__(self, passage, offset): + """Construct a passage object from the provided passage dict and offset. + + Args: + passage (dict): Article passage dictionary + offset (int): Passage offset to use + """ + self.offset = 0 + self.passage = self.__build_passage(passage, offset) + pass + + def as_dict(self): + """Returns a dictionary representation of the passage. + + Returns: + (dict): Dictionary representation of the passage + """ + return self.passage From 2f12aaad3938c83a38a511c3d82b554b9a22abbf Mon Sep 17 00:00:00 2001 From: jmp111 <60651038+jmp111@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:10:28 +0000 Subject: [PATCH 2/6] Add in ref type --- autocorpus/bioc_passages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index 0897db5a..fc2062f4 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -49,6 +49,8 @@ def __build_passage(self, passage, offset): counter += 1 if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": passage_dict["infons"]["type"] = "front" + elif passage_dict["infons"]["iao_id_1"] == "IAO:0000320": + passage_dict["infons"]["type"] = "ref" else: passage_dict["infons"]["type"] = "paragraph" # TODO: make optional input to AutoCORPus to have section (sub)headers a separate passage in the document. From 0543877bf17f8bf3b323720a40aea3dbb21dba69 Mon Sep 17 00:00:00 2001 From: jmp111 <60651038+jmp111@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:24:36 +0000 Subject: [PATCH 3/6] Updated for references. --- autocorpus/bioc_passages.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index fc2062f4..8322f53c 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -48,17 +48,17 @@ def __build_passage(self, passage, offset): passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] counter += 1 if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": - passage_dict["infons"]["type"] = "front" + passage_dict["infons"]["type"] = "front" # Suggest to make this (title) title_1 instead of front elif passage_dict["infons"]["iao_id_1"] == "IAO:0000320": passage_dict["infons"]["type"] = "ref" else: passage_dict["infons"]["type"] = "paragraph" # TODO: make optional input to AutoCORPus to have section (sub)headers a separate passage in the document. - # section headers: + # section headers: # Suggest to make this (section) title_2 # passage_dict["infons"]["type"] = "title_1" - # subsection headers: + # subsection headers: # Suggest to make this (subsection) title_3 # passage_dict["infons"]["type"] = "title_2" - # subsubsection headers: + # subsubsection headers: # suggest not to use this as we only look at 2 levels currently, see above. # passage_dict["infons"]["type"] = "title_3" return passage_dict From dcfefb0085608c1cceb6c57edc6f27dce51409ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:24:47 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- autocorpus/bioc_passages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index 8322f53c..ca01084b 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -48,7 +48,9 @@ def __build_passage(self, passage, offset): passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] counter += 1 if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": - passage_dict["infons"]["type"] = "front" # Suggest to make this (title) title_1 instead of front + passage_dict["infons"]["type"] = ( + "front" # Suggest to make this (title) title_1 instead of front + ) elif passage_dict["infons"]["iao_id_1"] == "IAO:0000320": passage_dict["infons"]["type"] = "ref" else: From 137aae925d440ba252cfcc849c476b37e398673a Mon Sep 17 00:00:00 2001 From: jmp111 <60651038+jmp111@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:26:46 +0000 Subject: [PATCH 5/6] Added more comments --- autocorpus/bioc_passages.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index ca01084b..655b9ca0 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -48,11 +48,9 @@ def __build_passage(self, passage, offset): passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] counter += 1 if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": - passage_dict["infons"]["type"] = ( - "front" # Suggest to make this (title) title_1 instead of front - ) + passage_dict["infons"]["type"] = "title_1" # Suggest to make this (title) title_1 instead of front elif passage_dict["infons"]["iao_id_1"] == "IAO:0000320": - passage_dict["infons"]["type"] = "ref" + passage_dict["infons"]["type"] = "ref" # does not look as nice, consider keeping as paragraph else: passage_dict["infons"]["type"] = "paragraph" # TODO: make optional input to AutoCORPus to have section (sub)headers a separate passage in the document. From 9bcd3a33261d4cb0f9ececce826dc00c829ab09a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Feb 2025 12:26:57 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- autocorpus/bioc_passages.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/autocorpus/bioc_passages.py b/autocorpus/bioc_passages.py index 655b9ca0..21207a98 100644 --- a/autocorpus/bioc_passages.py +++ b/autocorpus/bioc_passages.py @@ -48,9 +48,13 @@ def __build_passage(self, passage, offset): passage_dict["infons"][f"iao_id_{counter}"] = section_type["iao_id"] counter += 1 if passage_dict["infons"]["iao_id_1"] == "IAO:0000305": - passage_dict["infons"]["type"] = "title_1" # Suggest to make this (title) title_1 instead of front + passage_dict["infons"]["type"] = ( + "title_1" # Suggest to make this (title) title_1 instead of front + ) elif passage_dict["infons"]["iao_id_1"] == "IAO:0000320": - passage_dict["infons"]["type"] = "ref" # does not look as nice, consider keeping as paragraph + passage_dict["infons"]["type"] = ( + "ref" # does not look as nice, consider keeping as paragraph + ) else: passage_dict["infons"]["type"] = "paragraph" # TODO: make optional input to AutoCORPus to have section (sub)headers a separate passage in the document.