From b09d8e803dfe6ffd7c9c1bc594ba36f1c0805977 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 00:23:03 +0800 Subject: [PATCH 01/38] regenerate ontology --- ftx/medical/clinical_ontology.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ftx/medical/clinical_ontology.py b/ftx/medical/clinical_ontology.py index 52019a75..6f782423 100644 --- a/ftx/medical/clinical_ontology.py +++ b/ftx/medical/clinical_ontology.py @@ -47,6 +47,8 @@ "MedicalArticle", "Abbreviation", "Hyponym", + "Disease", + "Chemical" ] @@ -492,3 +494,23 @@ class Hyponym(Link): def __init__(self, pack: DataPack, parent: Optional[Entry] = None, child: Optional[Entry] = None): super().__init__(pack, parent, child) self.hyponym_link: Optional[str] = None + + +@dataclass +class Disease(Annotation): + """ + A span based annotation `Disease`, used to represent the diseases in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Chemical(Annotation): + """ + A span based annotation `Chemical`, used to represent the chemical in a piece of clinical text. 
+ """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) From 4e5893aefc227b3c79c905c30beee39a0b388447 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 00:24:10 +0800 Subject: [PATCH 02/38] Add Disease and Chemical in ontology --- .../ontology_specs/clinical_ontology.json | 828 +++++++++--------- 1 file changed, 419 insertions(+), 409 deletions(-) diff --git a/fortex/health/ontology_specs/clinical_ontology.json b/fortex/health/ontology_specs/clinical_ontology.json index a9269abd..ed8df678 100644 --- a/fortex/health/ontology_specs/clinical_ontology.json +++ b/fortex/health/ontology_specs/clinical_ontology.json @@ -1,407 +1,407 @@ -{ - "name": "clinical_ontology", - "imports": [ - "base_ontology.json" - ], - "additional_prefixes": [ - "ftx.medical.clinical_ontology" - ], - "definitions": [ - { - "entry_name": "ftx.medical.clinical_ontology.ClinicalEntityMention", - "parent_entry": "ft.onto.base_ontology.EntityMention", - "description": "A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text." - }, - { - "entry_name": "ftx.medical.clinical_ontology.Description", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `Description`, used to represent the description in a piece of clinical note." - }, - { - "entry_name": "ftx.medical.clinical_ontology.Body", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `Body`, used to represent the actual content in a piece of clinical note." - }, - { - "entry_name": "ftx.medical.clinical_ontology.FrequencyAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The frequency determination for the Drug NER profile." 
- }, - { - "entry_name": "ftx.medical.clinical_ontology.DurationAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The duration determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.RouteAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The route determination for the Drug NER profile.", - "attributes": [ - { - "name": "in_take_method", - "type": "str" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.SuffixStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The suffix portion of dosage strength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.FractionStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The fraction portion of dosages strength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.RangeStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The range portion of dosages stength determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.DecimalStrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The decimal portion of dosages stength determination for the Drug NER profile" - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugChangeStatusAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The change status of dosages determination for the Drug NER profile.", - "attributes": [ - { - "name": "change_status", - "type": "str", - "description": "Indicates the drug change status of 'stop', 'start', 'increase', 'decrease', or 'noChange'." 
- } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DosagesAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The dosage determination for the Drug NER profile." - }, - { - "entry_name": "ftx.medical.clinical_ontology.StrengthAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "Holds the value representing the unit of the drug dosage." - }, - { - "entry_name": "ftx.medical.clinical_ontology.StrengthUnitAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "" - }, - { - "entry_name": "ftx.medical.clinical_ontology.FrequencyUnitAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the unit portion of the drug frequency.", - "attributes": [ - { - "name": "period", - "type": "float", - "description": "The periodic unit used, e.g day, month, hour, etc." - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.FormAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the form portion of the drug mention." - }, - { - "entry_name": "ftx.medical.clinical_ontology.SubSectionAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "", - "attributes": [ - { - "name": "sub_ssection_body_begin", - "type": "int", - "description": "Sub-section body begin offset." - }, - { - "name": "sub_section_body_end", - "type": "int", - "description": "Sub-section body end offset." - }, - { - "name": "status", - "type": "int", - "description": "Status of 'possible', 'history of', or 'family history of'." 
- }, - { - "name": "sub_section_header_begin", - "type": "int", - "description": "Begin offset of subSection header" - }, - { - "name": "sub_section_header_end", - "type": "int", - "description": "Ending offset of subsection header" - }, - { - "name": "parent_section_id", - "type": "str", - "description": "The section in which the subsection was found." - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugMentionAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "", - "attributes": [ - { - "name": "status", - "type": "int", - "description": "" - }, - { - "name": "confidence", - "type": "float", - "description": "The confidence of the annotation." - }, - { - "name": "frequency", - "type": "str", - "description": "Frequency refers to how often the patient needs to take the drug. Frequency is divided into frequency number and frequency unit. E.g. twice daily" - }, - { - "name": "frequency_begin", - "type": "int", - "description": "" - }, - { - "name": "frequency_end", - "type": "int", - "description": "" - }, - { - "name": "duration", - "type": "str", - "description": "Duration refers to for how long the patient is expected to take the drug. E.g. 'for 2 weeks' Strongly encouraged to use bold text" - }, - { - "name": "duration_begin", - "type": "int", - "description": "" - }, - { - "name": "duration_end", - "type": "int", - "description": "" - }, - { - "name": "route", - "type": "str", - "description": "Medication route refers to the way that a drug is introduced into the body. E.g oral Strongly encouraged to use bold text" - }, - { - "name": "route_begin", - "type": "int", - "description": "" - }, - { - "name": "route_end", - "type": "int", - "description": "" - }, - { - "name": "drug_change_status", - "type": "str", - "description": "Status refers to the whether the medication is currently being taken or not." 
- }, - { - "name": "dosage", - "type": "str", - "description": "Dosage refers to how many of each drug the patient is taking. E.g. 5 mg" - }, - { - "name": "dosage_begin", - "type": "int", - "description": "" - }, - { - "name": "dosage_end", - "type": "int", - "description": "" - }, - { - "name": "strength", - "type": "str", - "description": "" - }, - { - "name": "strength_begin", - "type": "int", - "description": "" - }, - { - "name": "strength_end", - "type": "int", - "description": "" - }, - { - "name": "strength_unit", - "type": "str", - "description": "" - }, - { - "name": "su_begin", - "type": "int", - "description": "" - }, - { - "name": "su_end", - "type": "int", - "description": "" - }, - { - "name": "form", - "type": "str", - "description": "Form refers to the physical appearance of the drug. E.g. cream" - }, - { - "name": "form_begin", - "type": "int", - "description": "" - }, - { - "name": "form_end", - "type": "int", - "description": "" - }, - { - "name": "frequency_unit", - "type": "str", - "description": "" - }, - { - "name": "fu_begin", - "type": "int", - "description": "" - }, - { - "name": "fu_end", - "type": "int", - "description": "" - }, - { - "name": "start_date", - "type": "str", - "description": "" - }, - { - "name": "reason", - "type": "Dict", - "key_type": "str", - "value_type": "int" - }, - { - "name": "change_status_begin", - "type": "int", - "description": "" - }, - { - "name": "change_status_end", - "type": "int", - "description": "" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.ChunkAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "The value represents the unit portion of the drug frequency.", - "attributes": [ - { - "name": "sentence_id", - "type": "str", - "description": "" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.DrugLookupWindowAnnotation", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "Similar to LookupWindowAnnotation 
however, these annotations are restricted to the segments/sections specified in the parameter - sectionOverrideSet - in DrugCNP2LookupWindow" - }, - { - "entry_name": "ftx.medical.clinical_ontology.NegationContext", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", - "attributes": [ - { - "name": "polarity", - "type": "bool" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.UMLSConceptLink", - "parent_entry": "forte.data.ontology.top.Generics", - "description": "A umls concept entity, used to represent basic information of a umls concept", - "attributes": [ - { - "name": "cui", - "type": "str" - }, - { - "name": "name", - "type": "str" - }, - { - "name": "definition", - "type": "str" - }, - { - "name": "tuis", - "type": "List", - "item_type": "str" - }, - { - "name": "aliases", - "type": "List", - "item_type": "str" - }, - { - "name": "score", - "type": "str" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", - "parent_entry": "ft.onto.base_ontology.EntityMention", - "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", - "attributes": [ - { - "name": "umls_link", - "type": "str" - }, - { - "name": "umls_entities", - "type": "List", - "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" - } - ] - }, - { - "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", - "parent_entry": "forte.data.ontology.top.Annotation", - "description": "An annotation which represents the whole medical text chunk/document", - "attributes": [ - { - "name": "icd_version", - "type": "int", - "description": "The version of ICD-Coding being used." - }, - { - "name": "icd_code", - "type": "str", - "description": "The ICD code assigned to current medical article." 
- } - ] - }, - { +{ + "name": "clinical_ontology", + "imports": [ + "base_ontology.json" + ], + "additional_prefixes": [ + "ftx.medical.clinical_ontology" + ], + "definitions": [ + { + "entry_name": "ftx.medical.clinical_ontology.ClinicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text." + }, + { + "entry_name": "ftx.medical.clinical_ontology.Description", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Description`, used to represent the description in a piece of clinical note." + }, + { + "entry_name": "ftx.medical.clinical_ontology.Body", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Body`, used to represent the actual content in a piece of clinical note." + }, + { + "entry_name": "ftx.medical.clinical_ontology.FrequencyAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The frequency determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.DurationAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The duration determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.RouteAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The route determination for the Drug NER profile.", + "attributes": [ + { + "name": "in_take_method", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.SuffixStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The suffix portion of dosage strength determination for the Drug NER profile." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.FractionStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The fraction portion of dosages strength determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.RangeStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The range portion of dosages stength determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.DecimalStrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The decimal portion of dosages stength determination for the Drug NER profile" + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugChangeStatusAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The change status of dosages determination for the Drug NER profile.", + "attributes": [ + { + "name": "change_status", + "type": "str", + "description": "Indicates the drug change status of 'stop', 'start', 'increase', 'decrease', or 'noChange'." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DosagesAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The dosage determination for the Drug NER profile." + }, + { + "entry_name": "ftx.medical.clinical_ontology.StrengthAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Holds the value representing the unit of the drug dosage." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.StrengthUnitAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "" + }, + { + "entry_name": "ftx.medical.clinical_ontology.FrequencyUnitAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the unit portion of the drug frequency.", + "attributes": [ + { + "name": "period", + "type": "float", + "description": "The periodic unit used, e.g day, month, hour, etc." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.FormAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the form portion of the drug mention." + }, + { + "entry_name": "ftx.medical.clinical_ontology.SubSectionAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "", + "attributes": [ + { + "name": "sub_ssection_body_begin", + "type": "int", + "description": "Sub-section body begin offset." + }, + { + "name": "sub_section_body_end", + "type": "int", + "description": "Sub-section body end offset." + }, + { + "name": "status", + "type": "int", + "description": "Status of 'possible', 'history of', or 'family history of'." + }, + { + "name": "sub_section_header_begin", + "type": "int", + "description": "Begin offset of subSection header" + }, + { + "name": "sub_section_header_end", + "type": "int", + "description": "Ending offset of subsection header" + }, + { + "name": "parent_section_id", + "type": "str", + "description": "The section in which the subsection was found." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugMentionAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "", + "attributes": [ + { + "name": "status", + "type": "int", + "description": "" + }, + { + "name": "confidence", + "type": "float", + "description": "The confidence of the annotation." 
+ }, + { + "name": "frequency", + "type": "str", + "description": "Frequency refers to how often the patient needs to take the drug. Frequency is divided into frequency number and frequency unit. E.g. twice daily" + }, + { + "name": "frequency_begin", + "type": "int", + "description": "" + }, + { + "name": "frequency_end", + "type": "int", + "description": "" + }, + { + "name": "duration", + "type": "str", + "description": "Duration refers to for how long the patient is expected to take the drug. E.g. 'for 2 weeks' Strongly encouraged to use bold text" + }, + { + "name": "duration_begin", + "type": "int", + "description": "" + }, + { + "name": "duration_end", + "type": "int", + "description": "" + }, + { + "name": "route", + "type": "str", + "description": "Medication route refers to the way that a drug is introduced into the body. E.g oral Strongly encouraged to use bold text" + }, + { + "name": "route_begin", + "type": "int", + "description": "" + }, + { + "name": "route_end", + "type": "int", + "description": "" + }, + { + "name": "drug_change_status", + "type": "str", + "description": "Status refers to the whether the medication is currently being taken or not." + }, + { + "name": "dosage", + "type": "str", + "description": "Dosage refers to how many of each drug the patient is taking. E.g. 5 mg" + }, + { + "name": "dosage_begin", + "type": "int", + "description": "" + }, + { + "name": "dosage_end", + "type": "int", + "description": "" + }, + { + "name": "strength", + "type": "str", + "description": "" + }, + { + "name": "strength_begin", + "type": "int", + "description": "" + }, + { + "name": "strength_end", + "type": "int", + "description": "" + }, + { + "name": "strength_unit", + "type": "str", + "description": "" + }, + { + "name": "su_begin", + "type": "int", + "description": "" + }, + { + "name": "su_end", + "type": "int", + "description": "" + }, + { + "name": "form", + "type": "str", + "description": "Form refers to the physical appearance of the drug. 
E.g. cream" + }, + { + "name": "form_begin", + "type": "int", + "description": "" + }, + { + "name": "form_end", + "type": "int", + "description": "" + }, + { + "name": "frequency_unit", + "type": "str", + "description": "" + }, + { + "name": "fu_begin", + "type": "int", + "description": "" + }, + { + "name": "fu_end", + "type": "int", + "description": "" + }, + { + "name": "start_date", + "type": "str", + "description": "" + }, + { + "name": "reason", + "type": "Dict", + "key_type": "str", + "value_type": "int" + }, + { + "name": "change_status_begin", + "type": "int", + "description": "" + }, + { + "name": "change_status_end", + "type": "int", + "description": "" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.ChunkAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "The value represents the unit portion of the drug frequency.", + "attributes": [ + { + "name": "sentence_id", + "type": "str", + "description": "" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.DrugLookupWindowAnnotation", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Similar to LookupWindowAnnotation however, these annotations are restricted to the segments/sections specified in the parameter - sectionOverrideSet - in DrugCNP2LookupWindow" + }, + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.UMLSConceptLink", + "parent_entry": "forte.data.ontology.top.Generics", + "description": "A umls concept entity, used to represent basic information of a umls concept", + "attributes": [ + { + "name": "cui", + "type": "str" + }, + { + "name": "name", + "type": "str" + }, + { + "name": "definition", +
"type": "str" + }, + { + "name": "tuis", + "type": "List", + "item_type": "str" + }, + { + "name": "aliases", + "type": "List", + "item_type": "str" + }, + { + "name": "score", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation which represents the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." + } + ] + }, + { "entry_name": "ftx.medical.clinical_ontology.Abbreviation", "parent_entry": "forte.data.ontology.top.Annotation", "description": "A span based annotation `Abbreviation`, used to represent an abbreviated token..", @@ -411,8 +411,8 @@ "type": "str" } ] - }, - { + }, + { "entry_name": "ftx.medical.clinical_ontology.Hyponym", "parent_entry": "forte.data.ontology.top.Link", "description": "A `Link` type entry which represent a hyponym pair.", @@ -425,6 +425,16 @@ ], "parent_type": "ft.onto.base_ontology.Phrase", "child_type": "ft.onto.base_ontology.Phrase" - } - ] - } + }, + { + "entry_name": "ftx.medical.clinical_ontology.Disease", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Disease`, used to represent the diseases in a piece of clinical text."
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.Chemical", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Chemical`, used to represent the chemicals in a piece of clinical text." + } + ] +} \ No newline at end of file From 266d64846aa9c348aaaa6accfd46d53f2bf1b109 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 00:52:58 +0800 Subject: [PATCH 03/38] Add NER Label Processor --- .../health/processors/ner_label_processor.py | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 fortex/health/processors/ner_label_processor.py diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py new file mode 100644 index 00000000..92dbfcd5 --- /dev/null +++ b/fortex/health/processors/ner_label_processor.py @@ -0,0 +1,109 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +NER Labeling Processor +""" + +import spacy +from typing import Dict, Set +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor +from forte.common.configuration import Config +from forte.common.resources import Resources +from forte.common import ProcessExecutionException + + +from ftx.medical.clinical_ontology import Disease, Chemical + + +__all__ = [ + "NERLabelProcessor", +] + + +class NERLabelProcessor(PackProcessor): + r""" + Implementation of this NERLabelProcessor has been based on spaCy + pretrained model. A rendition of it that exists on github has + been referred to as well. + + Referred repository link: + https://github.com/explosion/spaCy + """ + + def __init__(self): + super().__init__() + self.nlp = None + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.nlp = spacy.load("en_ner_bc5cdr_md") + + def _process(self, input_pack: DataPack): + r""" + Annotate `Disease` and `Chemical` spans from the spaCy entities. + """ + labels = self.configs.labels + + doc = input_pack.text + + # Fail fast if `initialize` has not loaded the spaCy model. + if self.nlp is None: + raise ProcessExecutionException( + "The SpaCy pipeline is not initialized, maybe you " + "haven't called the initialization function." + ) + result = self.nlp(doc) + + if "disease" in labels: + for ent in result.ents: + if ent.label_ == "DISEASE": + Disease( + pack=input_pack, + begin=ent.start_char, + end=ent.end_char + ) + + if "chemical" in labels: + for ent in result.ents: + if ent.label_ == "CHEMICAL": + Chemical( + pack=input_pack, + begin=ent.start_char, + end=ent.end_char + ) + + @classmethod + def default_configs(cls): + r""" + This defines a basic config structure for `NERLabelProcessor`. + + Following are the keys for this dictionary: + - `labels`: entity types to annotate ("disease", "chemical") + + Returns: A dictionary with the default config for this processor.
+ """ + return { + "labels": ["disease", "chemical"] + } + + def record(self, record_meta: Dict[str, Set[str]]): + r"""Record the entry types that this processor adds to the pack. + + Args: + record_meta: the field in the datapack for type record that need to + fill in for consistency checking. + """ + record_meta["ftx.medical.clinical_ontology.Disease"] = set() + record_meta["ftx.medical.clinical_ontology.Chemical"] = set() From 3c9865e74201f2b60cc7fb0544f95eb5e0e93556 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:01:37 +0800 Subject: [PATCH 04/38] Add mimic iii reader --- examples/label_example/mimic3_note_reader.py | 80 ++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 examples/label_example/mimic3_note_reader.py diff --git a/examples/label_example/mimic3_note_reader.py b/examples/label_example/mimic3_note_reader.py new file mode 100644 index 00000000..b3f02de6 --- /dev/null +++ b/examples/label_example/mimic3_note_reader.py @@ -0,0 +1,80 @@ +# Copyright 2021 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import csv +import logging +from pathlib import Path +from typing import Any, Iterator, Union, List + +from smart_open import open + +from demo.clinical import Description, Body +from forte.data.data_pack import DataPack +from forte.data.base_reader import PackReader +from ft.onto.base_ontology import Document + + +class Mimic3DischargeNoteReader(PackReader): + """This class is designed to read the discharge notes from MIMIC3 dataset + as plain text packs. + + For more information for the dataset, visit: + https://mimic.physionet.org/ + """ + + def __init__(self): + super().__init__() + self.headers: List[str] = [] + self.text_col = -1 # Default to be last column. + self.description_col = 0 # Default to be first column. + self.__note_count = 0 # Count number of notes processed. + + def _collect( # type: ignore + self, mimic3_path: Union[Path, str] + ) -> Iterator[Any]: + with open(mimic3_path) as f: + for r in csv.reader(f): + if 0 < self.configs.max_num_notes <= self.__note_count: + break + yield r + + def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: + if len(self.headers) == 0: + self.headers.extend(row) + for i, h in enumerate(self.headers): + if h == "TEXT": + self.text_col = i + logging.info("Text Column is %d", i) + if h == "DESCRIPTION": + self.description_col = i + logging.info("Description Column is %d", i) + else: + pack: DataPack = DataPack() + description: str = row[self.description_col] + text: str = row[self.text_col] + delimiter = "\n-----------------\n" + full_text = description + delimiter + text + pack.set_text(full_text) + + Description(pack, 0, len(description)) + Body(pack, len(description) + len(delimiter), len(full_text)) + Document(pack, 0, len(pack.text)) + self.__note_count += 1 + yield pack + + @classmethod + def default_configs(cls): + # If this is set (>0), the reader will only read up to + # the number specified. 
+ return {'max_num_notes':-1} From 745d3544a93b6a50c899428bb507b4a13e016a8c Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:02:07 +0800 Subject: [PATCH 05/38] Add demo --- examples/label_example/demo/__init__.py | 1 + examples/label_example/demo/clinical.py | 49 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 examples/label_example/demo/__init__.py create mode 100644 examples/label_example/demo/clinical.py diff --git a/examples/label_example/demo/__init__.py b/examples/label_example/demo/__init__.py new file mode 100644 index 00000000..49ecbbf8 --- /dev/null +++ b/examples/label_example/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/examples/label_example/demo/clinical.py b/examples/label_example/demo/clinical.py new file mode 100644 index 00000000..68541b46 --- /dev/null +++ b/examples/label_example/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. +""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. 
+ """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) From c18184ee74999595c3f7e18dd27cfc30b8cfa66b Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:23:09 +0800 Subject: [PATCH 06/38] design clinical pipeline --- examples/label_example/clinical_pipeline.py | 63 +++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 examples/label_example/clinical_pipeline.py diff --git a/examples/label_example/clinical_pipeline.py b/examples/label_example/clinical_pipeline.py new file mode 100644 index 00000000..e4644c45 --- /dev/null +++ b/examples/label_example/clinical_pipeline.py @@ -0,0 +1,63 @@ +import sys +import time + +sys.path.insert(0,"E:\\NLP\\Forte\\ForteHealthBranches\\53\\ForteHealth") +print(sys.path) + +from forte.data.data_pack import DataPack +from forte.data.readers import PlainTextReader +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.health.processors.ner_label_processor import NERLabelProcessor +# from ner_label_processor import NERLabelProcessor + +from mimic3_note_reader import Mimic3DischargeNoteReader + +# from stave_backend.lib.stave_session import StaveSession + + +def main( + input_path: str, output_path: str, max_packs: int = -1, use_mimiciii_reader=1 + ): + + pl = Pipeline[DataPack]() + if use_mimiciii_reader == 1: + pl.set_reader( + Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs} + ) + else: + pl.set_reader(PlainTextReader()) + + config_for_ner = { + "labels": ["disease", "chemical"] + } + pl.add(NERLabelProcessor(), config=config_for_ner) + + pl.add( + 
ElasticSearchPackIndexProcessor(), + { + "indexer": { + "other_kwargs": {"refresh": True}, + } + }, + ) + pl.add( + PackIdJsonPackWriter(), + { + "output_dir": output_path, + "indent": 2, + "overwrite": True, + "drop_record": True, + "zip_pack": False, + }, + ) + + pl.initialize() + + for idx, pack in enumerate(pl.process_dataset(input_path)): + if (idx + 1) % 50 == 0: + print(f"{time.strftime('%m-%d %H:%M')}: Processed {idx + 1} packs") + + +main(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4])) From 451c4882342576d54e4a696901ccca6cc85a1f58 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:26:23 +0800 Subject: [PATCH 07/38] Search engine and related utils --- examples/label_example/search_engine.py | 93 +++++++++++++++++++++++++ examples/label_example/search_utils.py | 57 +++++++++++++++ examples/label_example/sqlite_utils.py | 79 +++++++++++++++++++++ examples/label_example/templates.py | 22 ++++++ 4 files changed, 251 insertions(+) create mode 100644 examples/label_example/search_engine.py create mode 100644 examples/label_example/search_utils.py create mode 100644 examples/label_example/sqlite_utils.py create mode 100644 examples/label_example/templates.py diff --git a/examples/label_example/search_engine.py b/examples/label_example/search_engine.py new file mode 100644 index 00000000..03d58640 --- /dev/null +++ b/examples/label_example/search_engine.py @@ -0,0 +1,93 @@ +import sqlite3 +from typing import List +import streamlit as st +from forte.common.configuration import Config +import yaml +from elasticsearch import Elasticsearch +from search_utils import all_search, index_search +from sqlite_utils import create_links, sqlite_insert, get_json, update_stave_db +import templates + + +st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide") + +es = Elasticsearch(hosts=["http://localhost:9200/"]) +INDEX = "elastic_indexer" + +config = yaml.safe_load(open("stave_config.yml", "r")) +config = 
Config(config, default_hparams=None) + +default_project_json = get_json("default_onto_project.json") + +base_project_id = update_stave_db( + default_project_json, config + ) + +st.title("Search the MIMIC III Data...") +search = st.text_input("Enter search words:") + +if not search: + records = {} + results = all_search(es, INDEX) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + + # Now you can write the pack into the database and generate url. + item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + print(pack_id, db_id) + + links: List[str] = create_links(config.Stave.url, answers) + + for link in links: + st.write(link, unsafe_allow_html=True) + +if search: + results = index_search(es, INDEX, search) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + docs = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + highlight = "...".join(hit["highlight"]["content"]) + # Now you can write the pack into the database and generate url. 
+ item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + + docs.append(highlight) + + links: List[str] = create_links(config.Stave.url, answers) + + for i in range(len(links)): + st.write(links[i], unsafe_allow_html=True) + st.write( + templates.search_result( + docs[i] + .replace("\n", " ") + ), + unsafe_allow_html=True, + ) diff --git a/examples/label_example/search_utils.py b/examples/label_example/search_utils.py new file mode 100644 index 00000000..eb85da10 --- /dev/null +++ b/examples/label_example/search_utils.py @@ -0,0 +1,57 @@ +''' +this file defines search functions for searching data in elasticsearch. +''' + + +def all_search(es, index: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + size: Number of results returned in each search. + """ + # search query + body = {"query": {"match_all": {}}} + + res = es.search(index=index, body=body) + + return res + + +def index_search(es, index: str, keywords: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + keywords: Search keywords. + from_i: Start index of the results for pagination. + size: Number of results returned in each search. 
+ """ + # search query + body = { + "query": { + "bool": { + "must": [ + { + "query_string": { + "query": keywords, + "fields": ["content"], + "default_operator": "AND", + } + } + ], + } + }, + "highlight": { + "pre_tags": [' '], + "post_tags": [""], + "fields": {"content": {}}, + }, + # "from": from_i, + # "size": size, + "aggs": {"match_count": {"value_count": {"field": "_id"}}}, + } + + res = es.search(index=index, body=body) + + return res diff --git a/examples/label_example/sqlite_utils.py b/examples/label_example/sqlite_utils.py new file mode 100644 index 00000000..6cc7c036 --- /dev/null +++ b/examples/label_example/sqlite_utils.py @@ -0,0 +1,79 @@ +""" +this file defines sqlite3 related utils for inserting data to the database of stave. +""" +import json +from typing import List +from stave_backend.lib.stave_session import StaveSession +import sqlite3 + + +def sqlite_insert(conn, table, row): + """ + Args: + conn: connection + table: table name + row: inserted item + """ + cols: str = ", ".join('"{}"'.format(col) for col in row.keys()) + vals: str = ", ".join(":{}".format(col) for col in row.keys()) + sql: str = 'INSERT INTO "{0}" ({1}) VALUES ({2})'.format(table, cols, vals) + cursor = conn.cursor() + cursor.execute(sql, row) + conn.commit() + return cursor.lastrowid + + +def create_links(url_stub: str, ids: List[int]) -> List[str]: + """ + Args: + url_stub: url of stave + ids: the doc ids of the reports + """ + links: List[str] = [] + + url_stub: str = url_stub.strip("/") + for temp_idm in ids: + links.append( + f"Report #{temp_idm}" + ) + return links + + +def get_json(path: str): + """ + Args: + path: the file path of the json file + """ + file_obj = open(path) + data = json.load(file_obj) + file_obj.close() + return data + + +def update_stave_db(default_project_json, config): + """ + Args: + default_project_json: the ontology configuration file + config: the configuration of Stave, including url, name, password, etc. 
+ """ + project_id_base = 0 + with StaveSession(url=config.Stave.url) as session: + session.login(username=config.Stave.username, password=config.Stave.pw) + + projects = session.get_project_list().json() + project_names = [project["name"] for project in projects] + + if ( + default_project_json["name"] in project_names + ): + + base_project = [ + proj + for proj in projects + if proj["name"] == default_project_json["name"] + ][0] + return base_project["id"] + + resp1 = session.create_project(default_project_json) + project_id_base = json.loads(resp1.text)["id"] + return project_id_base diff --git a/examples/label_example/templates.py b/examples/label_example/templates.py new file mode 100644 index 00000000..63bf9aa2 --- /dev/null +++ b/examples/label_example/templates.py @@ -0,0 +1,22 @@ +""" +This file defines some HTML templates +""" + + +def number_of_results(total_hits: int, duration: float) -> str: + """HTML scripts to display number of results and duration.""" + return f""" +
+ {total_hits} results ({duration:.2f} seconds) +

+ """ + + +def search_result(highlights: str) -> str: + """HTML scripts to display search results.""" + return f""" + +
+ {highlights} +
+ """ From f201997991bf3a52b54d5048f9c88e965bff7790 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:26:45 +0800 Subject: [PATCH 08/38] add stave config --- examples/label_example/stave_config.yml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 examples/label_example/stave_config.yml diff --git a/examples/label_example/stave_config.yml b/examples/label_example/stave_config.yml new file mode 100644 index 00000000..f9ff6f02 --- /dev/null +++ b/examples/label_example/stave_config.yml @@ -0,0 +1,5 @@ +Stave: + stave_db_path: "C://Users//Leo//.stave//db.sqlite3" + url: "http://localhost:8899" + username: admin + pw: admin From 4642690f12abcd78310cff20aca97eb3657bbea8 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 01:29:02 +0800 Subject: [PATCH 09/38] add stave project ontology file --- .../label_example/default_onto_project.json | 751 ++++++++++++++++++ 1 file changed, 751 insertions(+) create mode 100644 examples/label_example/default_onto_project.json diff --git a/examples/label_example/default_onto_project.json b/examples/label_example/default_onto_project.json new file mode 100644 index 00000000..901ce4f1 --- /dev/null +++ b/examples/label_example/default_onto_project.json @@ -0,0 +1,751 @@ +{ + "name": "clinical_pipeline_base", + "ontology": { + "name": "base_ontology", + "definitions": [ + { + "entry_name": "ft.onto.base_ontology.Token", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation :class:`Token`, used to represent a token or a word.", + "attributes": [ + { + "name": "pos", + "type": "str" + }, + { + "name": "ud_xpos", + "type": "str", + "description": "Language specific pos tag. Used in CoNLL-U Format. Refer to https://universaldependencies.org/format.html" + }, + { + "name": "lemma", + "type": "str", + "description": "Lemma or stem of word form." 
+ }, + { + "name": "chunk", + "type": "str" + }, + { + "name": "ner", + "type": "str" + }, + { + "name": "sense", + "type": "str" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "ud_features", + "type": "Dict", + "key_type": "str", + "value_type": "str" + }, + { + "name": "ud_misc", + "type": "Dict", + "key_type": "str", + "value_type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Subword", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "Used to represent subword tokenization results.", + "attributes": [ + { + "name": "is_first_segment", + "type": "bool" + }, + { + "name": "is_unk", + "type": "bool" + }, + { + "name": "vocab_id", + "type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Classification", + "parent_entry": "forte.data.ontology.top.Generics", + "description": "Used to store values for classification prediction", + "attributes": [ + { + "name": "classification_result", + "type": "Dict", + "key_type": "str", + "value_type": "float" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Document", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Document`, normally used to represent a document.", + "attributes": [ + { + "name": "document_class", + "type": "List", + "item_type": "str", + "description": "A list of class names that the document belongs to." + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this document. The key is the name/task of the classification, the value is an classification object storing the results." 
+ } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Sentence", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Sentence`, normally used to represent a sentence.", + "attributes": [ + { + "name": "speaker", + "type": "str" + }, + { + "name": "part_id", + "type": "int" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classification", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "classifications", + "type": "Dict", + "key_type": "str", + "value_type": "ft.onto.base_ontology.Classification", + "description": "Stores the classification results for this sentence. The key is the name/task of the classification, the value is an classification object storing the results." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Phrase", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Phrase`.", + "attributes": [ + { + "name": "phrase_type", + "type": "str" + }, + { + "name": "headword", + "type": "ft.onto.base_ontology.Token" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.UtteranceContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "`UtteranceContext` represents the context part in dialogue." 
+ }, + { + "entry_name": "ft.onto.base_ontology.Utterance", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Utterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateArgument", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `PredicateArgument`, normally used to represent an argument of a predicate, can be linked to the predicate via the predicate link.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + }, + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EntityMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EntityMention`, normally used to represent an Entity Mention in a piece of text.", + "attributes": [ + { + "name": "ner_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.EventMention", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `EventMention`, used to refer to a mention of an event.", + "attributes": [ + { + "name": "event_type", + "type": "str" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateMention", + "parent_entry": "ft.onto.base_ontology.Phrase", + "description": "A span based annotation `PredicateMention`, normally used to represent a predicate (normally verbs) in a piece of text.", + "attributes": [ + { + "name": "predicate_lemma", + "type": "str" + }, + { + "name": "framenet_id", + "type": "str" + }, + { + "name": "is_verb", + "type": "bool" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.PredicateLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a semantic role link between a 
predicate and its argument.", + "attributes": [ + { + "name": "arg_type", + "type": "str", + "description": "The predicate link type." + } + ], + "parent_type": "ft.onto.base_ontology.PredicateMention", + "child_type": "ft.onto.base_ontology.PredicateArgument" + }, + { + "entry_name": "ft.onto.base_ontology.Dependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a syntactic dependency.", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The dependency label." + }, + { + "name": "rel_type", + "type": "str" + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.EnhancedDependency", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a enhanced dependency: \n https://universaldependencies.org/u/overview/enhanced-syntax.html", + "attributes": [ + { + "name": "dep_label", + "type": "str", + "description": "The enhanced dependency label in Universal Dependency." + } + ], + "parent_type": "ft.onto.base_ontology.Token", + "child_type": "ft.onto.base_ontology.Token" + }, + { + "entry_name": "ft.onto.base_ontology.RelationLink", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two entity mentions", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEntityRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two entity mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." 
+ } + ], + "parent_type": "ft.onto.base_ontology.EntityMention", + "child_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.CoreferenceGroup", + "parent_entry": "forte.data.ontology.top.Group", + "description": "A group type entry that take `EntityMention`, as members, used to represent coreferent group of entities.", + "member_type": "ft.onto.base_ontology.EntityMention" + }, + { + "entry_name": "ft.onto.base_ontology.EventRelation", + "parent_entry": "forte.data.ontology.top.Link", + "description": "A `Link` type entry which represent a relation between two event mentions.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.CrossDocEventRelation", + "parent_entry": "forte.data.ontology.top.MultiPackLink", + "description": "A `Link` type entry which represent a relation between two event mentions across the packs.", + "attributes": [ + { + "name": "rel_type", + "type": "str", + "description": "The type of the relation." + } + ], + "parent_type": "ft.onto.base_ontology.EventMention", + "child_type": "ft.onto.base_ontology.EventMention" + }, + { + "entry_name": "ft.onto.base_ontology.ConstituentNode", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `ConstituentNode` to represent constituents in constituency parsing. 
This can also sentiment values annotated on the nodes.", + "attributes": [ + { + "name": "label", + "type": "str" + }, + { + "name": "sentiment", + "type": "Dict", + "key_type": "str", + "value_type": "float" + }, + { + "name": "is_root", + "type": "bool" + }, + { + "name": "is_leaf", + "type": "bool" + }, + { + "name": "parent_node", + "type": "ft.onto.base_ontology.ConstituentNode" + }, + { + "name": "children_nodes", + "type": "List", + "item_type": "ft.onto.base_ontology.ConstituentNode" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Title", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Title`, normally used to represent a title." + }, + { + "entry_name": "ft.onto.base_ontology.Body", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Body`, normally used to represent a document body." + }, + { + "entry_name": "ft.onto.base_ontology.MCOption", + "parent_entry": "forte.data.ontology.top.Annotation" + }, + { + "entry_name": "ft.onto.base_ontology.MCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "attributes": [ + { + "name": "options", + "type": "List", + "item_type": "ft.onto.base_ontology.MCOption" + }, + { + "name": "answers", + "type": "List", + "item_type": "int" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.MRCQuestion", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An `Annotation` type which represents an MRC question.", + "attributes": [ + { + "name": "qid", + "type": "int" + }, + { + "name": "answers", + "type": "List", + "item_type": "ft.onto.base_ontology.Phrase" + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.Recording", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `Recording`, normally used to represent a recording.", + "attributes": [ + { + "name": "recording_class", + "type": "List", + "item_type": "str", + 
"description": "A list of class names that the recording belongs to." + } + ] + }, + { + "entry_name": "ft.onto.base_ontology.AudioUtterance", + "parent_entry": "forte.data.ontology.top.AudioAnnotation", + "description": "A span based annotation `AudioUtterance`, normally used to represent an utterance in dialogue.", + "attributes": [ + { + "name": "speaker", + "type": "str" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation based representation for the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.Disease", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Disease`, used to represent the diseases in a piece of clinical text." 
+ }, + { + "entry_name": "ftx.medical.clinical_ontology.Chemical", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "A span based annotation `Chemical`, used to represent the chemical in a piece of clinical text." + } + ] + }, + "config": { + "legendConfigs": { + "ft.onto.base_ontology.Token": { + "is_selected": false, + "is_shown": true, + "attributes": { + "pos": false, + "ud_xpos": false, + "lemma": false, + "chunk": false, + "ner": false, + "sense": false + } + }, + "ft.onto.base_ontology.Subword": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Classification": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Document": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Sentence": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.Phrase": { + "is_selected": false, + "is_shown": true, + "attributes": { + "phrase_type": false + } + }, + "ft.onto.base_ontology.UtteranceContext": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Utterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.PredicateArgument": { + "is_selected": false, + "is_shown": false, + "attributes": { + "ner_type": false, + "predicate_lemma": false + } + }, + "ft.onto.base_ontology.EntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false + } + }, + "ft.onto.base_ontology.EventMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "event_type": false + } + }, + "ft.onto.base_ontology.PredicateMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "predicate_lemma": false, + "framenet_id": false + } + }, + "ft.onto.base_ontology.PredicateLink": { + "is_selected": false, + "is_shown": false, + 
"attributes": { + "arg_type": false + } + }, + "ft.onto.base_ontology.Dependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false, + "rel_type": false + } + }, + "ft.onto.base_ontology.EnhancedDependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false + } + }, + "ft.onto.base_ontology.RelationLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEntityRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CoreferenceGroup": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.EventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.ConstituentNode": { + "is_selected": false, + "is_shown": false, + "attributes": { + "label": false + } + }, + "ft.onto.base_ontology.Title": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Body": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCOption": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.MRCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Recording": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.AudioUtterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ftx.medical.clinical_ontology.NegationContext": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + 
"ftx.medical.clinical_ontology.MedicalEntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "umls_link": false + } + }, + "ftx.medical.clinical_ontology.MedicalArticle": { + "is_selected": false, + "is_shown": true, + "attributes": { + "icd_code": false + } + }, + "ftx.medical.clinical_ontology.Disease": { + "is_selected": false, + "is_shown": true + }, + "ftx.medical.clinical_ontology.Chemical": { + "is_selected": false, + "is_shown": true + } + }, + "scopeConfigs": { + "ft.onto.base_ontology.Token": false, + "ft.onto.base_ontology.Subword": false, + "ft.onto.base_ontology.Document": false, + "ft.onto.base_ontology.Sentence": false, + "ft.onto.base_ontology.Phrase": false, + "ft.onto.base_ontology.UtteranceContext": false, + "ft.onto.base_ontology.Utterance": false, + "ft.onto.base_ontology.PredicateArgument": false, + "ft.onto.base_ontology.EntityMention": false, + "ft.onto.base_ontology.EventMention": false, + "ft.onto.base_ontology.PredicateMention": false, + "ft.onto.base_ontology.ConstituentNode": false, + "ft.onto.base_ontology.Title": false, + "ft.onto.base_ontology.Body": false, + "ft.onto.base_ontology.MCOption": false, + "ft.onto.base_ontology.MCQuestion": false, + "ft.onto.base_ontology.MRCQuestion": false, + "ftx.medical.clinical_ontology.NegationContext": false, + "ftx.medical.clinical_ontology.MedicalEntityMention": false, + "ftx.medical.clinical_ontology.MedicalArticle": false, + "ftx.medical.clinical_ontology.Disease": false, + "ftx.medical.clinical_ontology.Chemical": false + }, + "layoutConfigs": { + "center-middle": "default-nlp", + "left": "default-meta", + "right": "default-attribute", + "center-bottom": "disable" + }, + "remoteConfigs": { + "pipelineUrl": "", + "doValidation": false, + "expectedName": "", + "inputFormat": "string", + "expectedRecords": {} + } + } +} \ No newline at end of file From f6dea81cd070a4652727b70d6c037bc20bf7d20d Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 
2022 01:48:34 +0800 Subject: [PATCH 10/38] add some CRUD to temporally fix a bug --- examples/label_example/sqlite_utils.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/examples/label_example/sqlite_utils.py b/examples/label_example/sqlite_utils.py index 6cc7c036..3d223078 100644 --- a/examples/label_example/sqlite_utils.py +++ b/examples/label_example/sqlite_utils.py @@ -2,8 +2,10 @@ this file defines sqlite3 related utils for inserting data to the database of stave. """ import json +import yaml from typing import List from stave_backend.lib.stave_session import StaveSession +from forte.common import Config import sqlite3 @@ -76,4 +78,23 @@ def update_stave_db(default_project_json, config): resp1 = session.create_project(default_project_json) project_id_base = json.loads(resp1.text)["id"] + + config = yaml.safe_load(open("stave_config.yml", "r")) + config = Config(config, default_hparams=None) + con = sqlite3.connect(config.Stave.stave_db_path) + + cursorObj = con.cursor() + cursorObj.execute('SELECT ontology, config FROM stave_backend_project WHERE id = {0}'.format(project_id_base)) + results = cursorObj.fetchall() + onto = results[0][0] + conf = results[0][1] + + onto_new = onto.replace("\'","\"") + conf_new = conf.replace("\'", "\"").replace("True", "true").replace("False", "false") + + cursorObj.execute("UPDATE stave_backend_project SET ontology ='" + onto_new + "' WHERE id = {0}".format(project_id_base)) + cursorObj.execute("UPDATE stave_backend_project SET config ='" + conf_new + "' WHERE id = {0}".format(project_id_base)) + + con.commit() + return project_id_base From 62328123008fdcb31797961734992aa1c423b4fc Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:20:16 +0800 Subject: [PATCH 11/38] black and pylint --- examples/label_example/clinical_pipeline.py | 21 +++---------- examples/label_example/search_engine.py | 31 ++++++++---------- examples/label_example/sqlite_utils.py | 35 
++++++++++++++------- 3 files changed, 40 insertions(+), 47 deletions(-) diff --git a/examples/label_example/clinical_pipeline.py b/examples/label_example/clinical_pipeline.py index e4644c45..43047d7f 100644 --- a/examples/label_example/clinical_pipeline.py +++ b/examples/label_example/clinical_pipeline.py @@ -1,37 +1,24 @@ import sys import time -sys.path.insert(0,"E:\\NLP\\Forte\\ForteHealthBranches\\53\\ForteHealth") -print(sys.path) - from forte.data.data_pack import DataPack from forte.data.readers import PlainTextReader from forte.pipeline import Pipeline from forte.processors.writers import PackIdJsonPackWriter +from mimic3_note_reader import Mimic3DischargeNoteReader from fortex.elastic import ElasticSearchPackIndexProcessor from fortex.health.processors.ner_label_processor import NERLabelProcessor -# from ner_label_processor import NERLabelProcessor - -from mimic3_note_reader import Mimic3DischargeNoteReader - -# from stave_backend.lib.stave_session import StaveSession -def main( - input_path: str, output_path: str, max_packs: int = -1, use_mimiciii_reader=1 - ): +def main(input_path: str, output_path: str, max_packs: int = -1, use_mimiciii_reader=1): pl = Pipeline[DataPack]() if use_mimiciii_reader == 1: - pl.set_reader( - Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs} - ) + pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs}) else: pl.set_reader(PlainTextReader()) - config_for_ner = { - "labels": ["disease", "chemical"] - } + config_for_ner = {"labels": ["disease", "chemical"]} pl.add(NERLabelProcessor(), config=config_for_ner) pl.add( diff --git a/examples/label_example/search_engine.py b/examples/label_example/search_engine.py index 03d58640..b1be57c2 100644 --- a/examples/label_example/search_engine.py +++ b/examples/label_example/search_engine.py @@ -19,9 +19,7 @@ default_project_json = get_json("default_onto_project.json") -base_project_id = update_stave_db( - default_project_json, config - ) 
+base_project_id = update_stave_db(default_project_json, config) st.title("Search the MIMIC III Data...") search = st.text_input("Enter search words:") @@ -41,10 +39,10 @@ # Now you can write the pack into the database and generate url. item = { - "name": f"clinical_results_{idx}", - "textPack": raw_pack_str, - "project_id": base_project_id, - } + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } db_id = sqlite_insert(conn, "stave_backend_document", item) answers += [db_id] @@ -70,10 +68,10 @@ highlight = "...".join(hit["highlight"]["content"]) # Now you can write the pack into the database and generate url. item = { - "name": f"clinical_results_{idx}", - "textPack": raw_pack_str, - "project_id": base_project_id, - } + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } db_id = sqlite_insert(conn, "stave_backend_document", item) answers += [db_id] @@ -82,12 +80,9 @@ links: List[str] = create_links(config.Stave.url, answers) - for i in range(len(links)): + for i, _ in enumerate(links): st.write(links[i], unsafe_allow_html=True) st.write( - templates.search_result( - docs[i] - .replace("\n", " ") - ), - unsafe_allow_html=True, - ) + templates.search_result(docs[i].replace("\n", " ")), + unsafe_allow_html=True, + ) diff --git a/examples/label_example/sqlite_utils.py b/examples/label_example/sqlite_utils.py index 3d223078..da82e0ed 100644 --- a/examples/label_example/sqlite_utils.py +++ b/examples/label_example/sqlite_utils.py @@ -1,12 +1,13 @@ """ -this file defines sqlite3 related utils for inserting data to the database of stave. +this file defines sqlite3 related utils for inserting data to +the database of stave. 
""" import json -import yaml from typing import List +import sqlite3 +import yaml from stave_backend.lib.stave_session import StaveSession from forte.common import Config -import sqlite3 def sqlite_insert(conn, table, row): @@ -18,7 +19,7 @@ def sqlite_insert(conn, table, row): """ cols: str = ", ".join('"{}"'.format(col) for col in row.keys()) vals: str = ", ".join(":{}".format(col) for col in row.keys()) - sql: str = 'INSERT INTO "{0}" ({1}) VALUES ({2})'.format(table, cols, vals) + sql: str = f'INSERT INTO "{table}" ({cols}) VALUES ({vals})' cursor = conn.cursor() cursor.execute(sql, row) conn.commit() @@ -65,9 +66,7 @@ def update_stave_db(default_project_json, config): projects = session.get_project_list().json() project_names = [project["name"] for project in projects] - if ( - default_project_json["name"] in project_names - ): + if default_project_json["name"] in project_names: base_project = [ proj @@ -84,16 +83,28 @@ def update_stave_db(default_project_json, config): con = sqlite3.connect(config.Stave.stave_db_path) cursorObj = con.cursor() - cursorObj.execute('SELECT ontology, config FROM stave_backend_project WHERE id = {0}'.format(project_id_base)) + cursorObj.execute( + f"SELECT ontology, config FROM stave_backend_project WHERE id = {project_id_base}" + ) results = cursorObj.fetchall() onto = results[0][0] conf = results[0][1] - onto_new = onto.replace("\'","\"") - conf_new = conf.replace("\'", "\"").replace("True", "true").replace("False", "false") + onto_new = onto.replace("'", '"') + conf_new = ( + conf.replace("'", '"').replace("True", "true").replace("False", "false") + ) - cursorObj.execute("UPDATE stave_backend_project SET ontology ='" + onto_new + "' WHERE id = {0}".format(project_id_base)) - cursorObj.execute("UPDATE stave_backend_project SET config ='" + conf_new + "' WHERE id = {0}".format(project_id_base)) + cursorObj.execute( + "UPDATE stave_backend_project SET ontology ='" + + onto_new + + f"' WHERE id = {project_id_base}" + ) + 
cursorObj.execute( + "UPDATE stave_backend_project SET config ='" + + conf_new + + f"' WHERE id = {project_id_base}" + ) con.commit() From a8e880368cd0269e22c8c1dbb3286590f2dc701a Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:20:59 +0800 Subject: [PATCH 12/38] solve pylint issues --- fortex/health/processors/ner_label_processor.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 92dbfcd5..3e52f357 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -14,9 +14,8 @@ """ NER Labeling Processor """ - -import spacy from typing import Dict, Set +import spacy from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor from forte.common.configuration import Config @@ -94,9 +93,7 @@ def default_configs(cls): Returns: A dictionary with the default config for this processor. 
""" - return { - "labels":["disease","chemical"] - } + return {"labels": ["disease", "chemical"]} def record(self, record_meta: Dict[str, Set[str]]): r""" From e5418512e0080087431c3768fae88e20d1924e45 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:21:15 +0800 Subject: [PATCH 13/38] normalize the config --- examples/label_example/stave_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/label_example/stave_config.yml b/examples/label_example/stave_config.yml index f9ff6f02..1b5e0c7f 100644 --- a/examples/label_example/stave_config.yml +++ b/examples/label_example/stave_config.yml @@ -1,5 +1,5 @@ Stave: - stave_db_path: "C://Users//Leo//.stave//db.sqlite3" + stave_db_path: "$HOME//.stave//db.sqlite3" url: "http://localhost:8899" username: admin pw: admin From 9c7d6e2d8b9c474d32a67b9a91ebb32fedc04128 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:29:34 +0800 Subject: [PATCH 14/38] solve black issue --- fortex/health/processors/ner_label_processor.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 3e52f357..939cae69 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -69,18 +69,14 @@ def _process(self, input_pack: DataPack): for ent in result.ents: if ent.label_ == "DISEASE": Disease( - pack=input_pack, - begin=ent.start_char, - end=ent.end_char + pack=input_pack, begin=ent.start_char, end=ent.end_char ) if "chemical" in labels: for ent in result.ents: if ent.label_ == "CHEMICAL": Chemical( - pack=input_pack, - begin=ent.start_char, - end=ent.end_char + pack=input_pack, begin=ent.start_char, end=ent.end_char ) @classmethod From c7ea7e16c298057e3da63fac7a6f9272ee8ad6f9 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:36:10 +0800 Subject: 
[PATCH 15/38] solve pylint issue --- examples/label_example/clinical_pipeline.py | 14 +++- examples/label_example/mimic3_note_reader.py | 80 -------------------- 2 files changed, 11 insertions(+), 83 deletions(-) delete mode 100644 examples/label_example/mimic3_note_reader.py diff --git a/examples/label_example/clinical_pipeline.py b/examples/label_example/clinical_pipeline.py index 43047d7f..3d4939b3 100644 --- a/examples/label_example/clinical_pipeline.py +++ b/examples/label_example/clinical_pipeline.py @@ -5,16 +5,24 @@ from forte.data.readers import PlainTextReader from forte.pipeline import Pipeline from forte.processors.writers import PackIdJsonPackWriter -from mimic3_note_reader import Mimic3DischargeNoteReader +from fortex.health.readers import Mimic3DischargeNoteReader from fortex.elastic import ElasticSearchPackIndexProcessor from fortex.health.processors.ner_label_processor import NERLabelProcessor -def main(input_path: str, output_path: str, max_packs: int = -1, use_mimiciii_reader=1): +def main( + input_path: str, + output_path: str, + max_packs: int = -1, + use_mimiciii_reader=1 + ): pl = Pipeline[DataPack]() if use_mimiciii_reader == 1: - pl.set_reader(Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs}) + pl.set_reader( + Mimic3DischargeNoteReader(), + config={"max_num_notes": max_packs} + ) else: pl.set_reader(PlainTextReader()) diff --git a/examples/label_example/mimic3_note_reader.py b/examples/label_example/mimic3_note_reader.py deleted file mode 100644 index b3f02de6..00000000 --- a/examples/label_example/mimic3_note_reader.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2021 The Forte Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import csv -import logging -from pathlib import Path -from typing import Any, Iterator, Union, List - -from smart_open import open - -from demo.clinical import Description, Body -from forte.data.data_pack import DataPack -from forte.data.base_reader import PackReader -from ft.onto.base_ontology import Document - - -class Mimic3DischargeNoteReader(PackReader): - """This class is designed to read the discharge notes from MIMIC3 dataset - as plain text packs. - - For more information for the dataset, visit: - https://mimic.physionet.org/ - """ - - def __init__(self): - super().__init__() - self.headers: List[str] = [] - self.text_col = -1 # Default to be last column. - self.description_col = 0 # Default to be first column. - self.__note_count = 0 # Count number of notes processed. 
- - def _collect( # type: ignore - self, mimic3_path: Union[Path, str] - ) -> Iterator[Any]: - with open(mimic3_path) as f: - for r in csv.reader(f): - if 0 < self.configs.max_num_notes <= self.__note_count: - break - yield r - - def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: - if len(self.headers) == 0: - self.headers.extend(row) - for i, h in enumerate(self.headers): - if h == "TEXT": - self.text_col = i - logging.info("Text Column is %d", i) - if h == "DESCRIPTION": - self.description_col = i - logging.info("Description Column is %d", i) - else: - pack: DataPack = DataPack() - description: str = row[self.description_col] - text: str = row[self.text_col] - delimiter = "\n-----------------\n" - full_text = description + delimiter + text - pack.set_text(full_text) - - Description(pack, 0, len(description)) - Body(pack, len(description) + len(delimiter), len(full_text)) - Document(pack, 0, len(pack.text)) - self.__note_count += 1 - yield pack - - @classmethod - def default_configs(cls): - # If this is set (>0), the reader will only read up to - # the number specified. - return {'max_num_notes':-1} From 557b8276bc1fbf4806ccf3e9afb6520715a9bc8a Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 02:40:24 +0800 Subject: [PATCH 16/38] solve pylint issue: import itself --- fortex/health/readers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fortex/health/readers/__init__.py b/fortex/health/readers/__init__.py index 076a48e7..d3745f4b 100644 --- a/fortex/health/readers/__init__.py +++ b/fortex/health/readers/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from fortex.health.readers.mimic3_note_reader import * From 2e2dcd50676d84d04c0c80f0a130fb12ba9d9fa4 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 03:20:08 +0800 Subject: [PATCH 17/38] add a README file --- examples/label_example/README.md | 89 ++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/label_example/README.md diff --git a/examples/label_example/README.md b/examples/label_example/README.md new file mode 100644 index 00000000..dbcb9bee --- /dev/null +++ b/examples/label_example/README.md @@ -0,0 +1,89 @@ +## NER Label Example + +This example shows how we start a search engine in streamlit and link the search results to stave. + +## Install extra dependencies + +To install from PyPI, +```bash +pip install forte.elastic +pip install forte.health +pip install stave +pip install streamlit +``` + +## Download spaCy model + +run the following command to download the model +```bash +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz +``` + +## Set up the configuration +Before run Elastic Searcher and Stave, we need to ensure that the current configuration is compatible with the environment of our computer. + +Please check and change the following configurations in `stave_config.yml`: + +1. Ensure `Stave.stave_db_path` is the correct path -> `$Home/.stave`, e.g., `"/home/name/.stave"`. +2. Ensure `Stave.username` and `Stave.pw`is `"admin"` and `"admin"`. + +## Prepare elastic searcher +Download corresponding elasticsearch archive from https://www.elastic.co/downloads/past-releases/elasticsearch-7-17-2, unzip it and run `elasticsearch-7-17-2/bin/elasticsearch` to start the service. 
+ +Run the following to check if elasticsearch is running properly: +```bash +curl -XGET localhost:9200/_cluster/health?pretty +``` + +Make sure you create index 'elastic_indexer' in the cluster before working with this example, you can run the following command: +```bash +curl -X PUT localhost:9200/elastic_indexer +``` + +You can also follow the online blog for more information: + +https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html + +## Run pipeline +First, you should start an Elastic Indexer backend. + +Now, open a terminal. You can run the following command to parse some files and index them. +```bash +python clinical__pipeline.py path_to_mimiciii/1.4/NOTEEVENTS.csv.gz path_to_mimiciii_output 10 1 +``` + +Here, we write out the raw data pack to `/path_to_sample_output`, and only index the first 10 notes. You can change the number to whatever you want in the above command. + +Also, we write the data into elasticsearch. You can run the command line to check whether the 10 notes are written into your database: + +```bash +curl -X GET localhost:9200/elastic_indexer/_search +``` + +## Run indexer and Stave +Again, you should start an Elastic Indexer backend. + +Then, to start the Stave server that our pipeline will connect to for visualization purposes, run +```bash +stave -s start -o -l -n 8899 +``` +Then, login with username (admin) and password (admin). + +Here, you need to make sure `Stave.url` in `stave_config.yml` is `"http://localhost:8899"`. Or you can change the port 8899 to any port you like. + +## Run streamlit + +To run streamlit, the python version should be >= 3.7.2. + +Now, open the terminal. Run the following command to start the streamlit. +```bash +streamlit run search_engine.py +``` + +Now open `http://localhost:8501` on your browser to access the streamlit interface. + +Next, you will see the reports shown on the interface. You can also search with the search engine. 
+ +Click the report with link, it will link to Stave, the visualization and annotation page. + +Click the radio (Disease and Chemical) on the sidebar, you can see the annotations on the UI. From 8ba0d6f527a63d264922f5c6dd7bef7d27f0a874 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 03:56:10 +0800 Subject: [PATCH 18/38] add ner label test --- .../processors/ner_label_processor_test.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 tests/fortex/health/processors/ner_label_processor_test.py diff --git a/tests/fortex/health/processors/ner_label_processor_test.py b/tests/fortex/health/processors/ner_label_processor_test.py new file mode 100644 index 00000000..a91f1356 --- /dev/null +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -0,0 +1,62 @@ +# Copyright 2022 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Unit tests for ICDCodingProcessor +""" + +import unittest + +from ddt import data, ddt +from forte.data.data_pack import DataPack +from forte.data.readers import StringReader +from forte.pipeline import Pipeline +from fortex.health.processors.ner_label_processor import NERLabelProcessor +from ftx.medical.clinical_ontology import Chemical, Disease + + +@ddt +class TestNERLabelProcessor(unittest.TestCase): + + @data( + "He got cancer, and he needs oxygen." 
+ ) + def test_ner_label_processor(self, input_data): + self.nlp = Pipeline[DataPack]() + self.nlp.set_reader(StringReader()) + config = { + "labels": ["disease", "chemical"] + } + + self.nlp.add(NERLabelProcessor(), config=config) + self.nlp.initialize() + pack = self.nlp.process(input_data) + + exp_disease = ["cancer"] + disease = [] + + for idx, d in enumerate(pack.get(Disease)): + disease.append(d.text) + + assert exp_disease == disease + + exp_chemical = ["oxygen"] + chemical = [] + for idx, c in enumerate(pack.get(Chemical)): + chemical.append(c.text) + + assert exp_chemical == chemical + + +if __name__ == "__main__": + unittest.main() From 555610bcb0e52b9f8e51fafdebee0be00d173fc2 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:01:52 +0800 Subject: [PATCH 19/38] solve black issue --- tests/fortex/health/processors/ner_label_processor_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/fortex/health/processors/ner_label_processor_test.py b/tests/fortex/health/processors/ner_label_processor_test.py index a91f1356..dc24cc97 100644 --- a/tests/fortex/health/processors/ner_label_processor_test.py +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -21,7 +21,9 @@ from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline -from fortex.health.processors.ner_label_processor import NERLabelProcessor +from fortex.health.processors.ner_label_processor import ( + NERLabelProcessor +) from ftx.medical.clinical_ontology import Chemical, Disease From 94ae23049d56ed7ae37d4192fa5766f4a197741f Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:06:05 +0800 Subject: [PATCH 20/38] black check --- .../health/processors/ner_label_processor_test.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/fortex/health/processors/ner_label_processor_test.py 
b/tests/fortex/health/processors/ner_label_processor_test.py index dc24cc97..b89b5806 100644 --- a/tests/fortex/health/processors/ner_label_processor_test.py +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -21,24 +21,17 @@ from forte.data.data_pack import DataPack from forte.data.readers import StringReader from forte.pipeline import Pipeline -from fortex.health.processors.ner_label_processor import ( - NERLabelProcessor -) +from fortex.health.processors.ner_label_processor import NERLabelProcessor from ftx.medical.clinical_ontology import Chemical, Disease @ddt class TestNERLabelProcessor(unittest.TestCase): - - @data( - "He got cancer, and he needs oxygen." - ) + @data("He got cancer, and he needs oxygen.") def test_ner_label_processor(self, input_data): self.nlp = Pipeline[DataPack]() self.nlp.set_reader(StringReader()) - config = { - "labels": ["disease", "chemical"] - } + config = {"labels": ["disease", "chemical"]} self.nlp.add(NERLabelProcessor(), config=config) self.nlp.initialize() From e21b7a2b93ed95d1fc4ca590983ccd2e03f59480 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:11:44 +0800 Subject: [PATCH 21/38] remove main --- tests/fortex/health/processors/ner_label_processor_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/fortex/health/processors/ner_label_processor_test.py b/tests/fortex/health/processors/ner_label_processor_test.py index b89b5806..23a7bbe0 100644 --- a/tests/fortex/health/processors/ner_label_processor_test.py +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -51,7 +51,3 @@ def test_ner_label_processor(self, input_data): chemical.append(c.text) assert exp_chemical == chemical - - -if __name__ == "__main__": - unittest.main() From 42b2b063a0f8f4092a6eb2cb5708ec7f980e3a18 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:28:52 +0800 Subject: [PATCH 22/38] add set_up --- fortex/health/processors/ner_label_processor.py 
| 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 939cae69..0812bcf1 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -15,6 +15,9 @@ NER Labeling Processor """ from typing import Dict, Set +import subprocess +import sys +import os import spacy from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor @@ -45,6 +48,13 @@ def __init__(self): super().__init__() self.nlp = None + def set_up(self): + download_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz" + command = [sys.executable, "-m", "pip", "install"] + [download_url] + subprocess.run( + command, env=os.environ.copy(), encoding="utf8", check=False + ) + def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) self.nlp = spacy.load("en_ner_bc5cdr_md") From 7015b8412cb84142048dcfa2209c7ff20a8a78b3 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:30:40 +0800 Subject: [PATCH 23/38] shorten the string --- fortex/health/processors/ner_label_processor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 0812bcf1..97b6ebd6 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -49,7 +49,8 @@ def __init__(self): self.nlp = None def set_up(self): - download_url = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz" + download_url = """https://s3-us-west-2.amazonaws.com/ + ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz""" command = [sys.executable, "-m", "pip", "install"] + [download_url] subprocess.run( command, env=os.environ.copy(), 
encoding="utf8", check=False From 560631f53a40be5eee10dc8e99b51ff9e52328e2 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:42:20 +0800 Subject: [PATCH 24/38] fix test bug --- fortex/health/processors/ner_label_processor.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 97b6ebd6..a25cb253 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -14,11 +14,13 @@ """ NER Labeling Processor """ +import importlib from typing import Dict, Set import subprocess import sys import os import spacy +from spacy.cli.download import download from forte.data.data_pack import DataPack from forte.processors.base import PackProcessor from forte.common.configuration import Config @@ -49,12 +51,7 @@ def __init__(self): self.nlp = None def set_up(self): - download_url = """https://s3-us-west-2.amazonaws.com/ - ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz""" - command = [sys.executable, "-m", "pip", "install"] + [download_url] - subprocess.run( - command, env=os.environ.copy(), encoding="utf8", check=False - ) + download("en_ner_bc5cdr_md") def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) From b83e4aefa80a96f74faefe8b8261cfbf92a8fda0 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:46:25 +0800 Subject: [PATCH 25/38] remove unused import --- fortex/health/processors/ner_label_processor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index a25cb253..1bd37df2 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -14,11 +14,8 @@ """ NER Labeling Processor """ -import importlib + from 
typing import Dict, Set -import subprocess -import sys -import os import spacy from spacy.cli.download import download from forte.data.data_pack import DataPack From 3634a07a36ad61de8fc36e172e1321be486802d4 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 04:53:01 +0800 Subject: [PATCH 26/38] fix pytest issue --- fortex/health/processors/ner_label_processor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 1bd37df2..fffe00da 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -47,11 +47,9 @@ def __init__(self): super().__init__() self.nlp = None - def set_up(self): - download("en_ner_bc5cdr_md") - def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) + download("en_ner_bc5cdr_md") self.nlp = spacy.load("en_ner_bc5cdr_md") def _process(self, input_pack: DataPack): From 0ccdfa44b5591cb5fe9c658b12cb93b147be591e Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 05:03:40 +0800 Subject: [PATCH 27/38] fix pytest bug --- .../health/processors/ner_label_processor.py | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index fffe00da..970a0b1b 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -32,6 +32,49 @@ "NERLabelProcessor", ] +CUSTOM_SPACYMODEL_URL = { + "en_core_sci_sm": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + "/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz", + "en_core_sci_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + "/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz", + "en_core_sci_lg": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + 
"/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz", + "en_ner_craft_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + "/releases/v0.3.0/en_ner_craft_md-0.3.0.tar.gz", + "en_ner_jnlpba_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + "/releases/v0.3.0/en_ner_jnlpba_md-0.3.0.tar.gz", + "en_ner_bc5cdr_md": "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy" + "/releases/v0.3.0/en_ner_bc5cdr_md-0.3.0.tar.gz", + "en_ner_bionlp13cg_md": "https://s3-us-west-2.amazonaws.com/ai2-s2" + "-scispacy/releases/v0.3.0/en_ner_bionlp13cg_md-0" + ".3.0.tar.gz", +} + +def load_lang_model(lang_model): + # pylint: disable=import-outside-toplevel + if lang_model in CUSTOM_SPACYMODEL_URL: + # download ScispaCy model using URL + import subprocess + import sys + import os + import importlib + + download_url = CUSTOM_SPACYMODEL_URL[lang_model] + command = [sys.executable, "-m", "pip", "install"] + [download_url] + subprocess.run( + command, env=os.environ.copy(), encoding="utf8", check=False + ) + cls = importlib.import_module(lang_model) + return cls.load() # type: ignore + else: + # Use spaCy download + try: + nlp = spacy.load(lang_model) # type: ignore + except OSError: + download(lang_model) + nlp = spacy.load(lang_model) # type: ignore + return nlp + class NERLabelProcessor(PackProcessor): r""" @@ -49,8 +92,7 @@ def __init__(self): def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) - download("en_ner_bc5cdr_md") - self.nlp = spacy.load("en_ner_bc5cdr_md") + self.nlp = load_lang_model("en_ner_bc5cdr_md") def _process(self, input_pack: DataPack): r""" From b9d08cf6f455873c1a8f2f9e75e3633607d44138 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 11:52:57 +0800 Subject: [PATCH 28/38] black reformat --- fortex/health/processors/ner_label_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py 
index 970a0b1b..d9d48694 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -50,6 +50,7 @@ ".3.0.tar.gz", } + def load_lang_model(lang_model): # pylint: disable=import-outside-toplevel if lang_model in CUSTOM_SPACYMODEL_URL: From deefc849b37dc10f7bb7d3093626987b954f43f1 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Fri, 22 Jul 2022 12:00:12 +0800 Subject: [PATCH 29/38] remove unused comment --- fortex/health/processors/ner_label_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index d9d48694..54ebe68e 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -66,14 +66,14 @@ def load_lang_model(lang_model): command, env=os.environ.copy(), encoding="utf8", check=False ) cls = importlib.import_module(lang_model) - return cls.load() # type: ignore + return cls.load() else: # Use spaCy download try: - nlp = spacy.load(lang_model) # type: ignore + nlp = spacy.load(lang_model) except OSError: download(lang_model) - nlp = spacy.load(lang_model) # type: ignore + nlp = spacy.load(lang_model) return nlp From 7625dd95f5deb5c38879945de1bafd4a128469ec Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Mon, 25 Jul 2022 17:51:24 +0800 Subject: [PATCH 30/38] add json dumps and remove SQL statements --- examples/label_example/sqlite_utils.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/examples/label_example/sqlite_utils.py b/examples/label_example/sqlite_utils.py index da82e0ed..38d61dce 100644 --- a/examples/label_example/sqlite_utils.py +++ b/examples/label_example/sqlite_utils.py @@ -75,37 +75,13 @@ def update_stave_db(default_project_json, config): ][0] return base_project["id"] - resp1 = session.create_project(default_project_json) + resp1 = 
session.create_project(json.dumps(default_project_json)) project_id_base = json.loads(resp1.text)["id"] config = yaml.safe_load(open("stave_config.yml", "r")) config = Config(config, default_hparams=None) con = sqlite3.connect(config.Stave.stave_db_path) - cursorObj = con.cursor() - cursorObj.execute( - f"SELECT ontology, config FROM stave_backend_project WHERE id = {project_id_base}" - ) - results = cursorObj.fetchall() - onto = results[0][0] - conf = results[0][1] - - onto_new = onto.replace("'", '"') - conf_new = ( - conf.replace("'", '"').replace("True", "true").replace("False", "false") - ) - - cursorObj.execute( - "UPDATE stave_backend_project SET ontology ='" - + onto_new - + f"' WHERE id = {project_id_base}" - ) - cursorObj.execute( - "UPDATE stave_backend_project SET config ='" - + conf_new - + f"' WHERE id = {project_id_base}" - ) - con.commit() return project_id_base From b9af6dfa9b3d6eca7af55a28ac16cec492a762f4 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:03:46 +0800 Subject: [PATCH 31/38] wrap search engine code in method --- examples/label_example/search_engine.py | 159 ++++++++++++------------ 1 file changed, 82 insertions(+), 77 deletions(-) diff --git a/examples/label_example/search_engine.py b/examples/label_example/search_engine.py index b1be57c2..4c9bc98f 100644 --- a/examples/label_example/search_engine.py +++ b/examples/label_example/search_engine.py @@ -9,80 +9,85 @@ import templates -st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide") - -es = Elasticsearch(hosts=["http://localhost:9200/"]) -INDEX = "elastic_indexer" - -config = yaml.safe_load(open("stave_config.yml", "r")) -config = Config(config, default_hparams=None) - -default_project_json = get_json("default_onto_project.json") - -base_project_id = update_stave_db(default_project_json, config) - -st.title("Search the MIMIC III Data...") -search = st.text_input("Enter search words:") - -if not search: - records = {} - results = 
all_search(es, INDEX) - hits = results["hits"]["hits"] - - conn = sqlite3.connect(config.Stave.stave_db_path) - answers = [] - for idx, hit in enumerate(hits): - source = hit["_source"] - # The raw pack string and pack id (not database id) - raw_pack_str: str = source["pack_info"] - pack_id: str = source["doc_id"] - - # Now you can write the pack into the database and generate url. - item = { - "name": f"clinical_results_{idx}", - "textPack": raw_pack_str, - "project_id": base_project_id, - } - - db_id = sqlite_insert(conn, "stave_backend_document", item) - answers += [db_id] - print(pack_id, db_id) - - links: List[str] = create_links(config.Stave.url, answers) - - for link in links: - st.write(link, unsafe_allow_html=True) - -if search: - results = index_search(es, INDEX, search) - hits = results["hits"]["hits"] - - conn = sqlite3.connect(config.Stave.stave_db_path) - answers = [] - docs = [] - for idx, hit in enumerate(hits): - source = hit["_source"] - # The raw pack string and pack id (not database id) - raw_pack_str: str = source["pack_info"] - pack_id: str = source["doc_id"] - highlight = "...".join(hit["highlight"]["content"]) - # Now you can write the pack into the database and generate url. 
- item = { - "name": f"clinical_results_{idx}", - "textPack": raw_pack_str, - "project_id": base_project_id, - } - - db_id = sqlite_insert(conn, "stave_backend_document", item) - answers += [db_id] - - docs.append(highlight) - - links: List[str] = create_links(config.Stave.url, answers) - - for i, _ in enumerate(links): - st.write(links[i], unsafe_allow_html=True) - st.write( - templates.search_result(docs[i].replace("\n", " ")), - unsafe_allow_html=True, - ) +def main(): + st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide") + + es = Elasticsearch(hosts=["http://localhost:9200/"]) + INDEX = "elastic_indexer" + + config = yaml.safe_load(open("stave_config.yml", "r")) + config = Config(config, default_hparams=None) + + default_project_json = get_json("default_onto_project.json") + + base_project_id = update_stave_db(default_project_json, config) + + st.title("Search the MIMIC III Data...") + search = st.text_input("Enter search words:") + + if not search: + records = {} + results = all_search(es, INDEX) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + + # Now you can write the pack into the database and generate url. 
+ item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + print(pack_id, db_id) + + links: List[str] = create_links(config.Stave.url, answers) + + for link in links: + st.write(link, unsafe_allow_html=True) + + if search: + results = index_search(es, INDEX, search) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(config.Stave.stave_db_path) + answers = [] + docs = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + highlight = "...".join(hit["highlight"]["content"]) + # Now you can write the pack into the database and generate url. + item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": base_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + + docs.append(highlight) + + links: List[str] = create_links(config.Stave.url, answers) + + for i, _ in enumerate(links): + st.write(links[i], unsafe_allow_html=True) + st.write( + templates.search_result(docs[i].replace("\n", " ")), + unsafe_allow_html=True, + ) + + +if __name__ == '__main__': + main() From 8ffa0fac22e5434bd5eba2e6db918f48fcb09d51 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:04:49 +0800 Subject: [PATCH 32/38] remove unnecessary comments --- examples/label_example/search_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/label_example/search_utils.py b/examples/label_example/search_utils.py index eb85da10..35f1e6c4 100644 --- a/examples/label_example/search_utils.py +++ b/examples/label_example/search_utils.py @@ -47,8 +47,6 @@ def index_search(es, index: str, keywords: str) -> dict: "post_tags": [""], "fields": {"content": {}}, }, - # "from": from_i, - # "size": size, "aggs": 
{"match_count": {"value_count": {"field": "_id"}}}, } From 9532d136408752cb991c3fbb1c873a606d887ce4 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:20:08 +0800 Subject: [PATCH 33/38] remove extra empty lines --- examples/label_example/templates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/label_example/templates.py b/examples/label_example/templates.py index 63bf9aa2..5278a33a 100644 --- a/examples/label_example/templates.py +++ b/examples/label_example/templates.py @@ -15,7 +15,6 @@ def number_of_results(total_hits: int, duration: float) -> str: def search_result(highlights: str) -> str: """HTML scripts to display search results.""" return f""" -
{highlights}
From 24dca3e8ab6b9ce7e57ea9b835dcaf38f082db77 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:40:09 +0800 Subject: [PATCH 34/38] add lang model as config --- fortex/health/processors/ner_label_processor.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 54ebe68e..cbb938c0 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -93,7 +93,7 @@ def __init__(self): def initialize(self, resources: Resources, configs: Config): super().initialize(resources, configs) - self.nlp = load_lang_model("en_ner_bc5cdr_md") + self.nlp = load_lang_model(configs.lang) def _process(self, input_pack: DataPack): r""" @@ -111,15 +111,13 @@ def _process(self, input_pack: DataPack): ) result = self.nlp(doc) - if "disease" in labels: - for ent in result.ents: + for ent in result.ents: + if "disease" in labels: if ent.label_ == "DISEASE": Disease( pack=input_pack, begin=ent.start_char, end=ent.end_char ) - - if "chemical" in labels: - for ent in result.ents: + if "chemical" in labels: if ent.label_ == "CHEMICAL": Chemical( pack=input_pack, begin=ent.start_char, end=ent.end_char @@ -135,7 +133,10 @@ def default_configs(cls): Returns: A dictionary with the default config for this processor. 
""" - return {"labels": ["disease", "chemical"]} + return { + "labels": ["disease", "chemical"], + "lang": "en_ner_bc5cdr_md" + } def record(self, record_meta: Dict[str, Set[str]]): r""" From 9d98979e980925867f96ce05db7bc670d8fa92dc Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:41:08 +0800 Subject: [PATCH 35/38] fix dependency --- examples/label_example/clinical_pipeline.py | 5 ++++- tests/fortex/health/processors/ner_label_processor_test.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/label_example/clinical_pipeline.py b/examples/label_example/clinical_pipeline.py index 3d4939b3..24df20de 100644 --- a/examples/label_example/clinical_pipeline.py +++ b/examples/label_example/clinical_pipeline.py @@ -26,7 +26,10 @@ def main( else: pl.set_reader(PlainTextReader()) - config_for_ner = {"labels": ["disease", "chemical"]} + config_for_ner = { + "labels": ["disease", "chemical"], + "lang": "en_ner_bc5cdr_md" + } pl.add(NERLabelProcessor(), config=config_for_ner) pl.add( diff --git a/tests/fortex/health/processors/ner_label_processor_test.py b/tests/fortex/health/processors/ner_label_processor_test.py index 23a7bbe0..4271f31f 100644 --- a/tests/fortex/health/processors/ner_label_processor_test.py +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -31,7 +31,10 @@ class TestNERLabelProcessor(unittest.TestCase): def test_ner_label_processor(self, input_data): self.nlp = Pipeline[DataPack]() self.nlp.set_reader(StringReader()) - config = {"labels": ["disease", "chemical"]} + config = { + "labels": ["disease", "chemical"], + "lang": "en_ner_bc5cdr_md" + } self.nlp.add(NERLabelProcessor(), config=config) self.nlp.initialize() From 1b2263ed627225b895214c4292065631cdc09356 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:46:43 +0800 Subject: [PATCH 36/38] black reformat --- fortex/health/processors/ner_label_processor.py | 17 ++++------------- 1 file changed, 
4 insertions(+), 13 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index cbb938c0..30c8e1bd 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -62,9 +62,7 @@ def load_lang_model(lang_model): download_url = CUSTOM_SPACYMODEL_URL[lang_model] command = [sys.executable, "-m", "pip", "install"] + [download_url] - subprocess.run( - command, env=os.environ.copy(), encoding="utf8", check=False - ) + subprocess.run(command, env=os.environ.copy(), encoding="utf8", check=False) cls = importlib.import_module(lang_model) return cls.load() else: @@ -114,14 +112,10 @@ def _process(self, input_pack: DataPack): for ent in result.ents: if "disease" in labels: if ent.label_ == "DISEASE": - Disease( - pack=input_pack, begin=ent.start_char, end=ent.end_char - ) + Disease(pack=input_pack, begin=ent.start_char, end=ent.end_char) if "chemical" in labels: if ent.label_ == "CHEMICAL": - Chemical( - pack=input_pack, begin=ent.start_char, end=ent.end_char - ) + Chemical(pack=input_pack, begin=ent.start_char, end=ent.end_char) @classmethod def default_configs(cls): @@ -133,10 +127,7 @@ def default_configs(cls): Returns: A dictionary with the default config for this processor. 
""" - return { - "labels": ["disease", "chemical"], - "lang": "en_ner_bc5cdr_md" - } + return {"labels": ["disease", "chemical"], "lang": "en_ner_bc5cdr_md"} def record(self, record_meta: Dict[str, Set[str]]): r""" From 2694aec7462d7c0686dff0c602cf69a3b636e55c Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:47:27 +0800 Subject: [PATCH 37/38] black reformat --- tests/fortex/health/processors/ner_label_processor_test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/fortex/health/processors/ner_label_processor_test.py b/tests/fortex/health/processors/ner_label_processor_test.py index 4271f31f..aba91a09 100644 --- a/tests/fortex/health/processors/ner_label_processor_test.py +++ b/tests/fortex/health/processors/ner_label_processor_test.py @@ -31,10 +31,7 @@ class TestNERLabelProcessor(unittest.TestCase): def test_ner_label_processor(self, input_data): self.nlp = Pipeline[DataPack]() self.nlp.set_reader(StringReader()) - config = { - "labels": ["disease", "chemical"], - "lang": "en_ner_bc5cdr_md" - } + config = {"labels": ["disease", "chemical"], "lang": "en_ner_bc5cdr_md"} self.nlp.add(NERLabelProcessor(), config=config) self.nlp.initialize() From a8494cf826ea4ef72089d94ebbf2db52c5b8a5e3 Mon Sep 17 00:00:00 2001 From: Leolty <569359974@qq.com> Date: Sun, 14 Aug 2022 02:52:04 +0800 Subject: [PATCH 38/38] black line len 80 format --- fortex/health/processors/ner_label_processor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fortex/health/processors/ner_label_processor.py b/fortex/health/processors/ner_label_processor.py index 30c8e1bd..55932867 100644 --- a/fortex/health/processors/ner_label_processor.py +++ b/fortex/health/processors/ner_label_processor.py @@ -62,7 +62,9 @@ def load_lang_model(lang_model): download_url = CUSTOM_SPACYMODEL_URL[lang_model] command = [sys.executable, "-m", "pip", "install"] + [download_url] - subprocess.run(command, env=os.environ.copy(), 
encoding="utf8", check=False) + subprocess.run( + command, env=os.environ.copy(), encoding="utf8", check=False + ) cls = importlib.import_module(lang_model) return cls.load() else: @@ -112,10 +114,14 @@ def _process(self, input_pack: DataPack): for ent in result.ents: if "disease" in labels: if ent.label_ == "DISEASE": - Disease(pack=input_pack, begin=ent.start_char, end=ent.end_char) + Disease( + pack=input_pack, begin=ent.start_char, end=ent.end_char + ) if "chemical" in labels: if ent.label_ == "CHEMICAL": - Chemical(pack=input_pack, begin=ent.start_char, end=ent.end_char) + Chemical( + pack=input_pack, begin=ent.start_char, end=ent.end_char + ) @classmethod def default_configs(cls):