"""BioC Passage builder script."""


class BioCPassage:
    """BioC Passage builder class.

    Wraps a single article passage (a plain dict with ``body`` and
    ``section_type`` entries, optional ``section_heading`` /
    ``subsection_heading`` entries, and any number of extra keys) and converts
    it into a BioC-style passage dict with ``offset``, ``infons``, ``text``,
    ``sentences``, ``annotations`` and ``relations`` entries.
    """

    # Input keys that are mapped explicitly; every other key of the input
    # passage is copied verbatim into the passage infons.
    _DEFAULT_KEYS = frozenset(
        {"section_heading", "subsection_heading", "body", "section_type"}
    )

    # IAO identifiers that select a dedicated passage "type" infon.
    _TITLE_IAO_ID = "IAO:0000305"
    _REFERENCES_IAO_ID = "IAO:0000320"

    @classmethod
    def from_title(cls, title: str, offset: int) -> "BioCPassage":
        """Create a BioCPassage object from a document title.

        Args:
            title (str): Passage title
            offset (int): Passage offset

        Returns:
            (BioCPassage): Passage whose text is the title and whose only
                section type is the "document title" IAO term.
        """
        title_passage = {
            "section_heading": "",
            "subsection_heading": "",
            "body": title,
            "section_type": [
                {"iao_name": "document title", "iao_id": cls._TITLE_IAO_ID}
            ],
        }
        return cls(title_passage, offset)

    def __build_passage(self, passage: dict, offset: int) -> dict:
        """Build the BioC passage dict for the given article passage.

        Args:
            passage (dict): Article passage dictionary. ``body`` and
                ``section_type`` are required; ``section_heading`` and
                ``subsection_heading`` are optional and treated as empty
                when absent.
            offset (int): Passage offset to store in the result

        Returns:
            (dict): BioC passage dictionary

        Raises:
            KeyError: If ``body`` or ``section_type`` is missing, or a section
                type entry lacks ``iao_name`` / ``iao_id``.
        """
        # Extra (non-default) keys go first so the infons key order matches
        # the order produced by the previous implementation.
        infons = {
            key: value
            for key, value in passage.items()
            if key not in self._DEFAULT_KEYS
        }

        # Headings are optional: a missing heading behaves like an empty one.
        # TODO: sub-subsection headings are not accounted for.
        section_heading = passage.get("section_heading", "")
        if section_heading != "":
            infons["section_title_1"] = section_heading
        subsection_heading = passage.get("subsection_heading", "")
        if subsection_heading != "":
            infons["section_title_2"] = subsection_heading

        for index, section_type in enumerate(passage["section_type"], start=1):
            infons[f"iao_name_{index}"] = section_type["iao_name"]
            infons[f"iao_id_{index}"] = section_type["iao_id"]

        # The first IAO id decides the passage type. Use .get() because the
        # key is absent when the passage has no section types; such passages
        # are plain paragraphs.
        first_iao_id = infons.get("iao_id_1")
        if first_iao_id == self._TITLE_IAO_ID:
            # Document title: "title_1" was suggested instead of "front".
            infons["type"] = "title_1"
        elif first_iao_id == self._REFERENCES_IAO_ID:
            # NOTE: "ref" does not look as nice; consider keeping "paragraph".
            infons["type"] = "ref"
        else:
            infons["type"] = "paragraph"
        # TODO: make it an optional input to AutoCORPus to emit section
        # (sub)headers as separate passages in the document. The suggestion is
        # "title_2" for section headers and "title_3" for subsection headers,
        # with no type for sub-subsection headers since only two heading
        # levels are handled at the moment.

        return {
            "offset": offset,
            "infons": infons,
            "text": passage["body"],
            "sentences": [],
            "annotations": [],
            "relations": [],
        }

    def __init__(self, passage: dict, offset: int) -> None:
        """Construct a passage object from the provided passage dict and offset.

        Args:
            passage (dict): Article passage dictionary
            offset (int): Passage offset to use
        """
        # NOTE(review): this attribute is always 0; the supplied offset is
        # only recorded inside the passage dict. Kept unchanged because
        # callers are not visible here - confirm whether it should be `offset`.
        self.offset = 0
        self.passage = self.__build_passage(passage, offset)

    def as_dict(self) -> dict:
        """Return a dictionary representation of the passage.

        Returns:
            (dict): Dictionary representation of the passage
        """
        return self.passage