Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b06d54d
NRE: example of using spacy
SepidehAlassi Jan 18, 2021
e4970db
test_data: mozartItalienReise (de)
SepidehAlassi Jan 18, 2021
85f3c13
test_data: english pure text
SepidehAlassi Jan 18, 2021
0ac7f4f
install prerequisites
SepidehAlassi Jan 18, 2021
bdb055c
extract location from german text
SepidehAlassi Jan 18, 2021
58b1d6c
more test data
SepidehAlassi Jan 18, 2021
2911ae4
italian text parser and refactor
SepidehAlassi Jan 18, 2021
fdcdc34
Merge branch 'main' into geoEntityRecognition
SepidehAlassi Jan 18, 2021
61dd0b9
Added gitignore, prettied printing, fixed typo
mjkanji Jan 19, 2021
eca20b5
Small changes to printing.
mjkanji Jan 19, 2021
67a743f
Testing lxml + added some TEI data.
mjkanji Jan 19, 2021
83de2ce
more attributes
SepidehAlassi Jan 19, 2021
4b373b7
Cleanup for pull request to main repo.
mjkanji Jan 19, 2021
3a1fcbe
Merge pull request #16 from mjkanji/geoEntityRecognition
mjkanji Jan 19, 2021
56d337b
Added lat and lon args to GeoEntity constructor.
mjkanji Jan 19, 2021
38f7177
get wikidata links
SepidehAlassi Jan 19, 2021
0862880
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
SepidehAlassi Jan 19, 2021
0bbd532
minor updates.
mjkanji Jan 19, 2021
7e5259c
fix import
SepidehAlassi Jan 19, 2021
626c204
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
SepidehAlassi Jan 19, 2021
c536a84
fix typo
SepidehAlassi Jan 19, 2021
e27cc00
introduce sleep
SepidehAlassi Jan 19, 2021
86343c3
added code for generating HTML preview.
mjkanji Jan 19, 2021
20bfda9
write html
SepidehAlassi Jan 19, 2021
0bc1a0e
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
mjkanji Jan 19, 2021
38ac8aa
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
mjkanji Jan 19, 2021
82e0358
refactor
SepidehAlassi Jan 19, 2021
663f0a2
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
SepidehAlassi Jan 19, 2021
7b574f8
refactor
SepidehAlassi Jan 19, 2021
9d2351d
import locations to knora api as RDF resources
SepidehAlassi Jan 19, 2021
ea89057
Added parsing logic for TEI files.
mjkanji Jan 20, 2021
858233f
minor changes to parse_tei.py
mjkanji Jan 20, 2021
b57c4dc
parse_tei now works whether a namespace is defined or not
mjkanji Jan 20, 2021
429678f
parse_tei now pulls geoIDs from the WikiData API.
mjkanji Jan 20, 2021
39a02e4
create document resource
SepidehAlassi Jan 20, 2021
5f8dbce
import document to knora
SepidehAlassi Jan 21, 2021
d63e751
Merge branch 'geoEntityRecognition' of https://github.com/ORD-Hackath…
SepidehAlassi Jan 21, 2021
a06c689
refactor
SepidehAlassi Jan 29, 2021
2fe4d99
clean up
SepidehAlassi Jan 29, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.venv/
**/__pycache__/
*.pyc
47 changes: 47 additions & 0 deletions geoEntityRecognition/createDocumentResources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from writeElement import writeToXMLForImport
import json
import codecs
import requests

def importDocument(textContent, name):
    """POST a geo:Document resource holding *textContent* to the local Knora API.

    textContent: standoff XML string for the knora-api:textValueAsXml property.
    name: suffix used both for the resource IRI and as rdfs:label.
    Prints a confirmation on success, the API response body on failure.
    """
    url = "http://localhost:3333/v2/resources"
    # NOTE(review): fixed creation date copied across all resources — confirm intended.
    creationDate = {
        "@type": "xsd:dateTimeStamp",
        "@value": "2020-01-21T12:58:54.502951Z"
    }
    payload = {
        "@id": "http://rdfh.ch/0001/document_" + name,
        "@type": "geo:Document",
        "knora-api:attachedToProject": {"@id": "http://rdfh.ch/projects/0001"},
        "geo:hasTextWithLocation": {
            "@type": "knora-api:TextValue",
            "knora-api:textValueAsXml": textContent,
            "knora-api:textValueHasMapping": {"@id": "http://rdfh.ch/standoff/mappings/StandardMapping"},
            "knora-api:creationDate": creationDate
        },
        "rdfs:label": name,
        "knora-api:creationDate": creationDate,
        "@context": {
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "knora-api": "http://api.knora.org/ontology/knora-api/v2#",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "anything": "http://0.0.0.0:3333/ontology/0001/anything/v2#"
        }
    }
    json_object = json.dumps(payload, indent=4)
    headers = {
        'authorization': "Basic cm9vdEBleGFtcGxlLmNvbTp0ZXN0",
        'cache-control': "no-cache",
        'postman-token': "19f719be-45ed-fa95-487d-e5510794e9b2"
    }
    response = requests.request("POST", url, data=json_object, headers=headers)
    if response.ok:
        # fixed missing spaces in the success message
        print("Document " + name + " is stored!")
    else:
        # surface the API error instead of failing silently (matches importLocations.py)
        print(response.content)

def createDocumentResource(text, updatedLocations, name, outputFile):
    """Render the plain text at path *text* plus its located entities to
    standoff XML in *outputFile*.

    updatedLocations: GeoEntity list already enriched with Wikidata info.
    name: document name (only used by the currently disabled Knora import).
    """
    # context manager so the file handle is closed deterministically
    with open(text, "r") as sourceFile:
        pureText = sourceFile.read()
    writeToXMLForImport(pureText, updatedLocations, outputFile)
    # Knora import is currently disabled:
    # textContent = codecs.open(outputFile, 'r', encoding='utf8')
    # importDocument(textContent, name)
41 changes: 41 additions & 0 deletions geoEntityRecognition/createLocationResources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from getIdentifier import addWikiInfo
from writeElement import writeToHTML
from importLocations import importLocation
import os
from extractLocationFromText import ExtractLocationFromText

# one shared spaCy-based extractor instance for all languages
parser = ExtractLocationFromText()

def _createLocations(extract, importedLocations, testData):
    """Shared pipeline for all languages: read the file, extract locations
    with *extract*, enrich them with Wikidata info, import them into Knora.

    Returns (updatedLocations, importedLocations).
    """
    # close the test-data file deterministically
    with open(testData, "r") as testFile:
        text = testFile.read()
    foundLocations = extract(text)
    updatedLocations = addWikiInfo(foundLocations=foundLocations)
    importedLocations = importLocation(foundLocations=updatedLocations, importedLocations=importedLocations)
    return updatedLocations, importedLocations

def createLocationsInEnglishText(importedLocations, testData):
    """Extract, enrich and import locations from an English text file."""
    return _createLocations(parser.getLocationsEnglish, importedLocations, testData)

def createLocationsInGermanText(importedLocations, testData):
    """Extract, enrich and import locations from a German text file."""
    return _createLocations(parser.getLocationsGerman, importedLocations, testData)

def createLocationsInFrenchText(importedLocations, testData):
    """Extract, enrich and import locations from a French text file."""
    return _createLocations(parser.getLocationsFrench, importedLocations, testData)

def createLocationsInItalianText(importedLocations, testData):
    """Extract, enrich and import locations from an Italian text file."""
    return _createLocations(parser.getLocationsItalian, importedLocations, testData)
18 changes: 18 additions & 0 deletions geoEntityRecognition/download_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os

# Download Spacy pretrained models
# Download the pretrained spaCy models used by the extractors.
if __name__ == '__main__':
    pretrained_models = [
        'en_core_web_sm',   # English
        'de_core_news_sm',  # German
        'fr_core_news_sm',  # French
        'it_core_news_sm',  # Italian
        'xx_ent_wiki_sm',   # multi-language
    ]
    for model_name in pretrained_models:
        os.system('python3 -m spacy download ' + model_name)
76 changes: 76 additions & 0 deletions geoEntityRecognition/extractLocationFromText.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import spacy
import os
from geoEntity import GeoEntity

class ExtractLocationFromText:
    """Extract location-like named entities from text with spaCy pretrained models."""

    # spaCy pipeline name per supported language code
    _MODELS = {
        'en': 'en_core_web_sm',
        'de': 'de_core_news_sm',
        'fr': 'fr_core_news_sm',
        'it': 'it_core_news_sm',
    }

    def __init__(self):
        # spaCy NER labels considered geographic
        self.locationLabels = ['GPE', 'LOC', 'FAC', 'ORG']
        # cache of loaded language models, so each model is loaded at most once
        self._loadedModels = {}

    def extractLocationEntities(self, entities, lang):
        """Convert spaCy entity spans with a geographic label into GeoEntity objects."""
        return [
            GeoEntity(ent.text, ent.label_, ent.start_char, ent.end_char, lang)
            for ent in entities
            if ent.label_ in self.locationLabels
        ]

    def getLocations(self, text, lang):
        """Extract geo entities from *text* in language *lang* ('en'/'de'/'fr'/'it').

        Generalizes the per-language methods below; loads each spaCy model
        lazily and caches it instead of reloading on every call.
        """
        model = self._loadedModels.get(lang)
        if model is None:
            model = spacy.load(self._MODELS[lang])
            self._loadedModels[lang] = model
        doc = model(text)
        return self.extractLocationEntities(doc.ents, lang)

    def getLocationsEnglish(self, text):
        """Extract geo entities from an English text."""
        return self.getLocations(text, 'en')

    def getLocationsGerman(self, text):
        """Extract geo entities from a German text."""
        return self.getLocations(text, 'de')

    def getLocationsFrench(self, text):
        """Extract geo entities from a French text."""
        return self.getLocations(text, 'fr')

    def getLocationsItalian(self, text):
        """Extract geo entities from an Italian text."""
        return self.getLocations(text, 'it')


def pretty_print(locations):
    """Render the locations one per line, using each entity's string representation."""
    lines = [str(location) for location in locations]
    return '\n'.join(lines)

if __name__ == '__main__':
    testDataPath = "test_data/"
    parser = ExtractLocationFromText()
    # (language label, test file, extraction method) for each supported language
    runs = [
        ("English", "en_magellan_voyage.txt", parser.getLocationsEnglish),
        ("German", "de_mozartItalienReise.txt", parser.getLocationsGerman),
        ("French", "fr_paris_a_Jerusalem.txt", parser.getLocationsFrench),
        ("Italian", "it_marcOPollo.txt", parser.getLocationsItalian),
    ]
    for label, fileName, getLocations in runs:
        sampleText = open(os.path.join(testDataPath, fileName), "r").read()
        foundLocations = getLocations(sampleText)
        print(f"---{label}---\nLocations found: {len(foundLocations)}\nResults:\n{pretty_print(foundLocations)}\n\n")
11 changes: 11 additions & 0 deletions geoEntityRecognition/geoEntity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
class GeoEntity:
    """A geographic named entity found in a text, optionally enriched with
    GeoNames/Wikidata identifiers and coordinates (see getIdentifier.addWikiInfo)."""

    def __init__(self, text, label, startChar=None, endChar=None, lang="en", lon=None, lat=None):
        self.label = label          # spaCy entity label, e.g. GPE for geo-political entities
        self.text = text            # name of the location as it appears in the text
        self.startChar = startChar  # start character offset of the entity in the source text
        self.endChar = endChar      # end character offset of the entity in the source text
        self.language = lang        # language code of the source text
        self.geoNameID = ""         # GeoNames id, filled in later by addWikiInfo
        self.wikiID = ""            # Wikidata item IRI, filled in later by addWikiInfo
        self.longitude = lon
        self.latitude = lat

    def __repr__(self):
        # readable representation; also fixes pretty_print(), which relies on str()
        # and previously printed the default "<... object at 0x...>" form
        return (f"GeoEntity(text={self.text!r}, label={self.label!r}, "
                f"lang={self.language!r}, wikiID={self.wikiID!r}, "
                f"geoNameID={self.geoNameID!r}, lon={self.longitude!r}, lat={self.latitude!r})")
57 changes: 57 additions & 0 deletions geoEntityRecognition/getIdentifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from qwikidata.sparql import return_sparql_query_results
import warnings
import time

def getPostition(coordinate):
    """Split a WKT 'Point(<lon> <lat>)' literal (Wikidata property P625 value)
    into its (longitude, latitude) string components.

    Replaces str.strip('Point('), which strips any character from the set
    {P,o,i,n,t,(} rather than the literal prefix and is therefore fragile.
    """
    inner = coordinate[coordinate.index("(") + 1:coordinate.rindex(")")]
    longitude, latitude = inner.split(" ", 1)
    return longitude, latitude

def getWikiRecord(name, lang):
    """Query Wikidata for a place labelled *name* in language *lang*.

    Returns (wikiId, geonameID, longitude, latitude) as strings; all four are
    empty when no record matches. When several items match, the first result
    is used.
    """
    # escape double quotes so a label cannot break out of the string literal
    # (the original concatenated the raw name straight into the query)
    safeName = name.replace('"', '\\"')
    sparql_statement = """
        select ?wikiItem ?geonameID ?coordinate
        where {
            ?wikiItem rdfs:label "%s"@%s;
                      wdt:P1566 ?geonameID;
                      wdt:P625 ?coordinate
        }
    """ % (safeName, lang)

    res = return_sparql_query_results(sparql_statement)
    results = res["results"]["bindings"]
    if not results:
        # no Wikidata record for this label/language combination
        return "", "", "", ""
    record = results[0]
    wikiId = record['wikiItem']['value']
    geonameID = record['geonameID']['value']
    longitude, latitude = getPostition(record['coordinate']['value'])
    return wikiId, geonameID, longitude, latitude


def addWikiInfo(foundLocations):
    """Enrich each GeoEntity in *foundLocations* in place with its Wikidata IRI,
    GeoNames id and coordinates, and return the same list.

    Sleeps 90 s after every 9th lookup — presumably to stay below the Wikidata
    SPARQL endpoint's rate limit (TODO confirm the exact limit).
    """
    for lookupCount, loc in enumerate(foundLocations, start=1):
        loc.wikiID, loc.geoNameID, loc.longitude, loc.latitude = getWikiRecord(loc.text, loc.language)
        # same cadence as the original counter-and-reset logic
        if lookupCount % 9 == 0:
            time.sleep(90)
    return foundLocations

if __name__ == '__main__':
    # smoke test: look up a well-known place
    record = getWikiRecord("Berlin", "de")
    print(record)
109 changes: 109 additions & 0 deletions geoEntityRecognition/importLocations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import requests
import json

ontoName = "geo"

def createNewResourceForLocation(loc, importedLocations):
    """Create a new geo:Location resource in Knora for *loc*.

    On success, registers loc.geoNameID in *importedLocations* with the
    language of this occurrence; on failure, prints the API response body.
    Returns the (mutated) importedLocations dict.
    """
    url = "http://localhost:3333/v2/resources"
    # NOTE(review): fixed creation date reused for every value — confirm intended.
    creationDate = {
        "@type": "xsd:dateTimeStamp",
        "@value": "2020-01-21T12:58:54.502951Z"
    }
    payload = {
        "@id": "http://rdfh.ch/0001/" + loc.geoNameID,
        "@type": ontoName + ":Location",
        "knora-api:attachedToProject": {"@id": "http://rdfh.ch/projects/0001"},
        ontoName + ":hasName": {
            "@type": "knora-api:TextValue",
            "knora-api:valueAsString": loc.text,
            "knora-api:valueHasLanguage": loc.language,
            "knora-api:creationDate": creationDate
        },
        ontoName + ":hasGeoNameID": {
            "@type": "knora-api:GeonameValue",
            "knora-api:geonameValueAsGeonameCode": loc.geoNameID,
            "knora-api:creationDate": creationDate
        },
        ontoName + ":hasWikiLink": {
            "@type": "knora-api:UriValue",
            "knora-api:uriValueAsUri": {"@type": "xsd:anyURI", "@value": loc.wikiID},
            "knora-api:creationDate": creationDate
        },
        "rdfs:label": loc.text,
        "knora-api:creationDate": creationDate,
        "@context": {
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "knora-api": "http://api.knora.org/ontology/knora-api/v2#",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "geo": "http://0.0.0.0:3333/ontology/0001/geo/v2#"
        }
    }
    json_object = json.dumps(payload, indent=4)
    headers = {
        'authorization': "Basic cm9vdEBleGFtcGxlLmNvbTp0ZXN0",
        'cache-control': "no-cache",
        'postman-token': "19f719be-45ed-fa95-487d-e5510794e9b2"
    }
    response = requests.request("POST", url, data=json_object, headers=headers)
    if response.ok:
        importedLocations[loc.geoNameID] = [loc.language]
        # fixed missing spaces in the success message
        print("Location " + loc.text + " stored!")
    else:
        print(response.content)
    return importedLocations

def createNewNameValue(loc, importedLocations):
    """Attach an additional name value (in another language) to an existing
    geo:Location resource in Knora.

    On success, appends loc.language to importedLocations[loc.geoNameID];
    on failure, prints the API response body. Returns the (mutated)
    importedLocations dict.
    """
    url = "http://localhost:3333/v2/values"
    payload = {
        "@id": "http://rdfh.ch/0001/" + loc.geoNameID,
        "@type": ontoName + ":Location",
        ontoName + ":hasName": {
            "@type": "knora-api:TextValue",
            "knora-api:valueAsString": loc.text,
            "knora-api:valueHasLanguage": loc.language,
            "knora-api:creationDate": {
                "@type": "xsd:dateTimeStamp",
                "@value": "2020-01-21T12:58:54.502951Z"
            }
        },
        "@context": {
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "knora-api": "http://api.knora.org/ontology/knora-api/v2#",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "geo": "http://0.0.0.0:3333/ontology/0001/geo/v2#"
        }
    }
    json_object = json.dumps(payload, indent=4)
    headers = {
        'authorization': "Basic cm9vdEBleGFtcGxlLmNvbTp0ZXN0",
        'cache-control': "no-cache",
        'postman-token': "19f719be-45ed-fa95-487d-e5510794e9b2"
    }
    response = requests.request("POST", url, data=json_object, headers=headers)
    if response.ok:
        importedLocations[loc.geoNameID].append(loc.language)
        # fixed missing spaces in the success message
        print("Location " + loc.text + " stored!")
    else:
        # report failures like createNewResourceForLocation does
        print(response.content)
    return importedLocations

def importLocation(foundLocations, importedLocations):
    """Import each location that has a GeoNames id into Knora.

    A location not seen before becomes a new resource; one already imported
    but occurring in a new language gets an additional name value. Locations
    without a geoNameID (no Wikidata match) are skipped. Returns the
    (mutated) importedLocations dict mapping geoNameID -> list of languages.
    """
    for loc in foundLocations:
        if not loc.geoNameID:
            # no GeoNames id — nothing to import
            continue
        if loc.geoNameID not in importedLocations:
            createNewResourceForLocation(loc, importedLocations)
        elif loc.language not in importedLocations[loc.geoNameID]:
            createNewNameValue(loc, importedLocations)
    return importedLocations

21 changes: 21 additions & 0 deletions geoEntityRecognition/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from createLocationResources import *
from createDocumentResources import createDocumentResource
import os
import time
importedLocations = {}

if __name__ == '__main__':
    testDataPath = "test_data/"
    # (test file, extraction function, document name, output XML, language label)
    jobs = [
        ("en_swiss.txt", createLocationsInEnglishText, 'wiki_swiss_en', 'xmlEnglish.xml', "English"),
        ("de_swiss.txt", createLocationsInGermanText, 'wiki_swiss_de', 'xmlDeutsch.xml', "German"),
    ]
    for jobIndex, (testFile, createLocations, docName, outputFile, label) in enumerate(jobs):
        if jobIndex > 0:
            # pause between languages, as in the original flow
            time.sleep(90)
        testData = os.path.join(testDataPath, testFile)
        foundLocations, importedLocations = createLocations(importedLocations, testData)
        createDocumentResource(testData, foundLocations, docName, outputFile)
        print(f"Locations in {label} text imported")

Loading