API Documentation
This part of the project documentation focuses on
an information-oriented approach. Use it as a
reference for the technical implementation of the
acdh_tei_pyutils
project code.
acdh_tei_pyutils.utils
a bunch of helper functions for either
- processing (TEI)/XML nodes like extracting information, or to
- query/parse (TEI)/XML nodes/trees
add_graphic_url_to_pb(doc)
writes url attributes into tei:pb elements fetched from matching tei:surface//tei:graphic[1] elements
Source code in acdh_tei_pyutils/utils.py
def add_graphic_url_to_pb(doc: TeiReader) -> TeiReader:
    """Copy facsimile image URLs onto the page breaks that point at them.

    For every ``tei:pb`` with a ``@facs`` pointer, the document is searched for
    a ``tei:surface`` whose ``@xml:id`` equals that pointer (without a leading
    ``#``). The first ``tei:graphic[1]/@url`` result found below it is written
    into a ``url`` attribute of the ``tei:pb``. Page breaks without a matching
    graphic are left untouched.

    Args:
        doc (TeiReader): the document to update in place

    Returns:
        TeiReader: the very same ``doc`` object that was passed in
    """
    page_breaks = doc.any_xpath(".//tei:pb[@facs]")
    for page_break in page_breaks:
        facs_id = check_for_hash(page_break.attrib["facs"])
        xpath_expr = f'.//tei:surface[@xml:id="{facs_id}"]//tei:graphic[1]/@url'
        try:
            facs_url = doc.any_xpath(xpath_expr)[0]
        except IndexError:
            # no surface/graphic for this pointer: leave the page break as it is
            continue
        else:
            page_break.attrib["url"] = facs_url
    return doc
check_for_hash(value)
checks if value starts with '#' and if so removes the '#' from the returned value
Source code in acdh_tei_pyutils/utils.py
def check_for_hash(value: str) -> str:
    """Return ``value`` without its leading ``#``, if it has one.

    Only a single leading ``#`` is dropped; every other value is returned unchanged.
    """
    return value[1:] if value.startswith("#") else value
crate_tag_whitelist(element, tag_blacklist)
lists all unique elements from a given node and returns only those not in the given blacklist
Source code in acdh_tei_pyutils/utils.py
def crate_tag_whitelist(element: ET.Element, tag_blacklist: list) -> list:
    """Collect the distinct tags used in ``element`` that are not blacklisted.

    The node itself and all of its descendants are inspected.

    Args:
        element (ET.Element): the node whose subtree is scanned
        tag_blacklist (list): tags that must not show up in the result

    Returns:
        list: the unique, non-blacklisted tags (in no particular order)
    """
    allowed_tags = set()
    # NOTE(review): ET is presumably lxml.etree, where tag=ET.Element limits
    # the iteration to real elements (no comments or processing instructions).
    for node in element.iter(tag=ET.Element):
        if node.tag not in tag_blacklist:
            allowed_tags.add(node.tag)
    return list(allowed_tags)
extract_fulltext(root_node, tag_blacklist=[])
extracts all fulltext from given element and its children, except from blacklisted elements
Source code in acdh_tei_pyutils/utils.py
def extract_fulltext(
    root_node: ET.Element, tag_blacklist: Union[list, None] = None
) -> str:
    """Extract the whitespace-normalized text of a node, skipping blacklisted elements.

    Args:
        root_node (ET.Element): the node whose text should be extracted
        tag_blacklist (list, optional): tags of elements whose text is ignored.
            Defaults to None, meaning nothing is blacklisted.

    Returns:
        str: the collected text with every run of whitespace collapsed into a
            single blank, or an empty string if every element is blacklisted
    """
    tags = crate_tag_whitelist(root_node, tag_blacklist or [])
    if not tags:
        # itertext() without any tag filter would yield *all* text, which is
        # the opposite of what a fully blacklisted subtree should return.
        return ""
    full_text = " ".join("".join(root_node.itertext(*tags)).split())
    return full_text
get_birth_death_year(person_node, xpath_part='@when', birth=True)
tries to extract birth and death years from person nodes and returns either None or the year as Integer
Source code in acdh_tei_pyutils/utils.py
def get_birth_death_year(
    person_node: ET.Element, xpath_part: str = "@when", birth: bool = True
) -> Union[int, None]:
    """Try to extract the year of birth or death from a person node.

    Args:
        person_node (ET.Element): the node holding ``tei:birth``/``tei:death`` children
        xpath_part (str, optional): XPath step, relative to ``tei:birth`` or
            ``tei:death``, that selects the date string. Defaults to "@when".
        birth (bool, optional): read ``tei:birth`` if True, ``tei:death``
            otherwise. Defaults to True.

    Returns:
        Union[int, None]: the year as integer, or None if no date was found or
            its leading characters are not a number
    """
    event = "birth" if birth else "death"
    year_xpath = f"./tei:{event}/{xpath_part}"
    matches = person_node.xpath(
        year_xpath, namespaces={"tei": "http://www.tei-c.org/ns/1.0"}
    )
    try:
        date_str = matches[0]
    except IndexError:
        return None
    # surrounding whitespace would otherwise eat into the four year digits
    date_str = date_str.strip()
    # a negative (BCE) year such as "-0044-03-15" needs one more character
    year_length = 5 if date_str.startswith("-") else 4
    try:
        return int(date_str[:year_length])
    except ValueError:
        return None
get_xmlid(element)
returns an @xml:id of the given node
Source code in acdh_tei_pyutils/utils.py
def get_xmlid(element: ET.Element) -> str:
    """Return the ``@xml:id`` of the given node.

    Raises:
        KeyError: if the node has no ``@xml:id`` attribute
    """
    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
    return element.attrib[xml_id_key]
make_bibl_label(node, no_author='o.A.', no_title='o.T.', year='o.J.', editor_abbr='(Hg.)', max_title_length=75)
creates a nice, bibliographically useful label from the passed in tei:biblStruct element
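Parameters:
- node (ET.Element): a tei:biblStruct element
- no_author (str, optional): used if no author name can be extracted. Defaults to 'o.A.'
- no_title (str, optional): used if no title can be extracted. Defaults to 'o.T.'
- year (str, optional): used if no year can be extracted. Defaults to 'o.J.'
- editor_abbr (str, optional): how to mark the 'author' being an editor. Defaults to '(Hg.)'
- max_title_length (int, optional): max length for the title before it gets truncated. Defaults to 75
Returns:
- str: the generated label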
Source code in acdh_tei_pyutils/utils.py
def _first_match_text(node, xpath_expr):
    """Return the ``.text`` of the first node matched by ``xpath_expr``, or None."""
    matches = node.xpath(xpath_expr, namespaces=nsmap)
    return matches[0].text if matches else None


def make_bibl_label(
    node: ET.Element,
    no_author="o.A.",
    no_title="o.T.",
    year="o.J.",
    editor_abbr="(Hg.)",
    max_title_length=75,
) -> str:
    """Create a short, bibliographically useful label from a tei:biblStruct element.

    The label has the form ``"<author>, <title>, <year>"``. The author part is
    taken from the first of these that provides text: author surname, author
    name, editor surname, editor name. Editors are marked with ``editor_abbr``.
    Missing or empty values fall back to the given defaults instead of raising
    or showing up as "None".

    Args:
        node (ET.Element): a tei:biblStruct element
        no_author (str, optional): Used if no author name can be extracted. Defaults to "o.A.".
        no_title (str, optional): Used if no title can be extracted. Defaults to "o.T.".
        year (str, optional): Used if no year can be extracted. Defaults to "o.J.".
        editor_abbr (str, optional): how to mark the 'author' being an editor. Defaults to "(Hg.)".
        max_title_length (int, optional): max length of the title before it gets
            truncated and suffixed with "...". Defaults to 75.

    Returns:
        str: the assembled label
    """
    # (xpath, is_editor) in order of preference
    name_candidates = (
        (".//tei:author[1]/tei:surname[1]", False),
        (".//tei:author[1]/tei:name[1]", False),
        (".//tei:editor[1]/tei:surname[1]", True),
        (".//tei:editor[1]/tei:name[1]", True),
    )
    author = no_author
    for name_xpath, is_editor in name_candidates:
        name = _first_match_text(node, name_xpath)
        if name:
            author = f"{name} {editor_abbr}" if is_editor else name
            break

    # keep the caller's fallback if there is no tei:date or it holds no text
    year = _first_match_text(node, ".//tei:date[1]") or year

    title = _first_match_text(node, ".//tei:title[1]")
    if not title:
        title = no_title
    elif len(title) > max_title_length:
        title = f"{title[:max_title_length]}..."
    return f"{author}, {title}, {year}"
make_entity_label(name_node, default_msg='no label provided', default_lang='en')
Extracts a label and a lang tag from the passed-in name node
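Parameters:
- name_node (ET.Element): a tei:persName|placeName|orgName element
- default_msg (str, optional): some default value for the label. Defaults to 'no label provided'
- default_lang (str, optional): some default lang tag if the node does not provide an xml:lang attribute. Defaults to 'en'
Returns:
- tuple[str, str]: the extracted label and a lang tag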
Source code in acdh_tei_pyutils/utils.py
def make_entity_label(
    name_node: ET.Element, default_msg="no label provided", default_lang="en"
) -> tuple[str, str]:
    """Extract a label and a lang tag from the passed-in name node.

    If forenames and/or a surname are tagged, the label is built from them
    ("Surname, Forename Forename"); otherwise the whole text of the node is
    used. Text nodes consisting only of whitespace are ignored.

    Args:
        name_node (ET.Element): A tei:persName|placeName|orgName element
        default_msg (str, optional): some default value for the label. Defaults to "no label provided".
        default_lang (str, optional): some default lang tag if the node does not provide an xml:lang attribute.
            Defaults to "en".

    Returns:
        tuple[str, str]: returns the extracted label and a lang tag
    """
    lang_tag = name_node.get("{http://www.w3.org/XML/1998/namespace}lang", default_lang)

    # Whitespace-only text nodes (e.g. indentation around child elements)
    # normalize to "" and must not end up as name parts.
    forenames = [
        text
        for text in (
            normalize_string(x)
            for x in name_node.xpath(".//tei:forename//text()", namespaces=nsmap)
        )
        if text
    ]
    surnames = [
        text
        for text in (
            normalize_string(x)
            for x in name_node.xpath(".//tei:surname//text()", namespaces=nsmap)
        )
        if text
    ]

    if surnames and forenames:
        label = f"{surnames[0]}, {' '.join(forenames)}"
    elif forenames:
        label = " ".join(forenames)
    elif surnames:
        label = surnames[0]
    else:
        name_node_text = " ".join(name_node.xpath(".//text()", namespaces=nsmap))
        label = normalize_string(name_node_text)
    if not label:
        label = default_msg
    return label, lang_tag
normalize_string(string)
removes any superfluous whitespace from a given string
Source code in acdh_tei_pyutils/utils.py
def normalize_string(string: str) -> str:
    """Remove any superfluous whitespace from a given string.

    Leading and trailing whitespace is dropped and every inner run of
    whitespace is collapsed into a single blank.
    """
    words = string.split()
    return " ".join(words)
previous_and_next(some_iterable)
taken from https://stackoverflow.com/a/1012089
Source code in acdh_tei_pyutils/utils.py
def previous_and_next(some_iterable):  # pragma: no cover
    """Iterate over ``(previous, current, next)`` triples of an iterable.

    ``previous`` is None for the first item and ``next`` is None for the last one.
    Taken from https://stackoverflow.com/a/1012089
    """
    behind, current, ahead = tee(some_iterable, 3)
    return zip(
        chain([None], behind),  # lags one step behind
        current,
        chain(islice(ahead, 1, None), [None]),  # runs one step ahead
    )
acdh_tei_pyutils.tei
TeiEnricher
Bases: TeiReader
a class to enrich tei-documents
Source code in acdh_tei_pyutils/tei.py
class TeiEnricher(TeiReader):
    """a class to enrich tei-documents"""

    def add_base_and_id(self, base_value, id_value, prev_value, next_value):
        """adds @xml:base and @xml:id and next and prev to root element

        Falsy arguments are skipped, i.e. the matching attribute is not written.

        :param base_value: The value of the @xml:base, also used as prefix of @prev and @next
        :type base_value: str
        :param id_value: The value of the @xml:id
        :type id_value: str
        :param prev_value: id of the previous document, written to @prev
        :type prev_value: str
        :param next_value: id of the next document, written to @next
        :type next_value: str

        :return: the updated tree
        """

        def with_base(value):
            # Without a base there is nothing to prefix; writing "None/<id>"
            # would only produce a broken pointer.
            if not base_value:
                return value
            # same joining rule as get_full_id(): never a double slash
            separator = "" if base_value.endswith("/") else "/"
            return f"{base_value}{separator}{value}"

        base = self.any_xpath("//tei:TEI")[0]
        if base_value:
            base.set(f"{{{self.ns_xml['xml']}}}base", base_value)
        if id_value:
            base.set(f"{{{self.ns_xml['xml']}}}id", id_value)
        if prev_value:
            base.set("prev", with_base(prev_value))
        if next_value:
            base.set("next", with_base(next_value))
        return self.tree

    def get_full_id(self):
        """returns the combination of @xml:base and @xml:id

        :return: combination of @xml:base and @xml:id, or None if one of them is missing
        :rtype: str, None
        """
        base = self.any_xpath("//tei:TEI")[0]
        try:
            base_base = base.xpath("./@xml:base", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        try:
            base_id = base.xpath("./@xml:id", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        if base_base.endswith("/"):
            return f"{base_base}{base_id}"
        else:
            return f"{base_base}/{base_id}"

    def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
        """checks if a handle is already assigned

        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str

        :return: the text of the first matching node or None if nothing matches
        :rtype: str, None
        """
        try:
            return self.any_xpath(handle_xpath)[0].text
        except IndexError:
            return None

    def add_handle(
        self,
        handle,
        handle_xpath='.//tei:idno[@type="handle"]',
        insert_xpath=".//tei:publicationStmt/tei:p",
    ):
        """adds an idno @type=handle element to the first node matched by `insert_xpath`

        :param handle: the handle
        :type handle: str
        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str
        :param insert_xpath: an xpath expression selecting the node the idno is appended to
        :type insert_xpath: str

        :raises: `HandleAlreadyExist` Error
        :raises: `IndexError` if `insert_xpath` does not match any node

        :returns: the idno node
        """
        tei_ns = f"{self.ns_tei['tei']}"
        # Look the handle up once, with the caller's xpath, so that the error
        # message reports the handle that actually blocks the registration.
        existing_handle = self.handle_exist(handle_xpath=handle_xpath)
        if existing_handle:
            raise HandleAlreadyExist(
                f"a handle: {existing_handle} is already registered"
            )
        idno_node = ET.Element(f"{{{tei_ns}}}idno")
        idno_node.set("type", "handle")
        idno_node.text = handle
        insert_node = self.any_xpath(insert_xpath)[0]
        insert_node.append(idno_node)
        return idno_node

    def create_mention_list(self, mentions, event_title=""):
        """creates a tei:noteGrp element with one note per mentioning document

        Mentions whose slugified `doc_id` was already seen are skipped.

        :param mentions: a list of dicts with keys `doc_id` and `doc_title`;
            `doc_date` and `doc_title_sec` are optional
        :type mentions: list
        :param event_title: text put in front of the note text; only used for
            mentions that provide a `doc_title_sec`
        :type event_title: str

        :return: a etree.element
        """
        tei_ns = f"{self.ns_tei['tei']}"
        node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
        mentions_added = set()
        for x in mentions:
            mention_key = slugify(x["doc_id"])
            if mention_key in mentions_added:
                continue
            note = ET.Element(f"{{{tei_ns}}}note")
            note.attrib["target"] = x["doc_id"]
            note.attrib["type"] = "mentions"
            # .get(): the optional keys may be left out entirely
            doc_date = x.get("doc_date")
            if doc_date is not None:
                note.attrib["corresp"] = doc_date
            doc_title_sec = x.get("doc_title_sec")
            if doc_title_sec is not None:
                note.text = event_title + f"{x['doc_title']} {doc_title_sec}"
            else:
                # NOTE(review): event_title is not applied here - confirm this is intended
                note.text = x["doc_title"]
            node_root.append(note)
            mentions_added.add(mention_key)
        return node_root
add_base_and_id(base_value, id_value, prev_value, next_value)
adds @xml:base and @xml:id and next and prev to root element
:param base_value: The value of the @xml:base :type base_value: str
:return: the updated tree
Source code in acdh_tei_pyutils/tei.py
def add_base_and_id(self, base_value, id_value, prev_value, next_value):
    """adds @xml:base and @xml:id and next and prev to root element

    Falsy arguments are skipped, i.e. the matching attribute is not written.

    :param base_value: The value of the @xml:base, also used as prefix of @prev and @next
    :type base_value: str
    :param id_value: The value of the @xml:id
    :type id_value: str
    :param prev_value: id of the previous document, written to @prev
    :type prev_value: str
    :param next_value: id of the next document, written to @next
    :type next_value: str

    :return: the updated tree
    """

    def with_base(value):
        # Without a base there is nothing to prefix; writing "None/<id>"
        # would only produce a broken pointer.
        if not base_value:
            return value
        # same joining rule as get_full_id(): never a double slash
        separator = "" if base_value.endswith("/") else "/"
        return f"{base_value}{separator}{value}"

    base = self.any_xpath("//tei:TEI")[0]
    if base_value:
        base.set(f"{{{self.ns_xml['xml']}}}base", base_value)
    if id_value:
        base.set(f"{{{self.ns_xml['xml']}}}id", id_value)
    if prev_value:
        base.set("prev", with_base(prev_value))
    if next_value:
        base.set("next", with_base(next_value))
    return self.tree
add_handle(handle, handle_xpath='.//tei:idno[@type="handle"]', insert_xpath='.//tei:publicationStmt/tei:p')
adds an idno @type=handle element into tei:publicationStmt
:param handle: the handle :type handle: str
:param handle_xpath: an xpath expression where to look for an handle :type handle_xpath: str
:raises: HandleAlreadyExist error
:returns: the idno node
Source code in acdh_tei_pyutils/tei.py
def add_handle(
    self,
    handle,
    handle_xpath='.//tei:idno[@type="handle"]',
    insert_xpath=".//tei:publicationStmt/tei:p",
):
    """adds an idno @type=handle element to the first node matched by `insert_xpath`

    :param handle: the handle
    :type handle: str
    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str
    :param insert_xpath: an xpath expression selecting the node the idno is appended to
    :type insert_xpath: str

    :raises: `HandleAlreadyExist` Error
    :raises: `IndexError` if `insert_xpath` does not match any node

    :returns: the idno node
    """
    tei_ns = f"{self.ns_tei['tei']}"
    # Look the handle up once, with the caller's xpath, so that the error
    # message reports the handle that actually blocks the registration.
    existing_handle = self.handle_exist(handle_xpath=handle_xpath)
    if existing_handle:
        raise HandleAlreadyExist(
            f"a handle: {existing_handle} is already registered"
        )
    idno_node = ET.Element(f"{{{tei_ns}}}idno")
    idno_node.set("type", "handle")
    idno_node.text = handle
    insert_node = self.any_xpath(insert_xpath)[0]
    insert_node.append(idno_node)
    return idno_node
create_mention_list(mentions, event_title='')
creates a tei element with notes of mentions
:param mentions: a list of dicts with keys doc_id and doc_title (doc_date and doc_title_sec are read as well)
:type mentions: list
:return: a tei:noteGrp etree.element
Source code in acdh_tei_pyutils/tei.py
def create_mention_list(self, mentions, event_title=""):
    """creates a tei:noteGrp element with one note per mentioning document

    Mentions whose slugified `doc_id` was already seen are skipped.

    :param mentions: a list of dicts with keys `doc_id` and `doc_title`;
        `doc_date` and `doc_title_sec` are optional
    :type mentions: list
    :param event_title: text put in front of the note text; only used for
        mentions that provide a `doc_title_sec`
    :type event_title: str

    :return: a etree.element
    """
    tei_ns = f"{self.ns_tei['tei']}"
    node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
    mentions_added = set()
    for x in mentions:
        mention_key = slugify(x["doc_id"])
        if mention_key in mentions_added:
            continue
        note = ET.Element(f"{{{tei_ns}}}note")
        note.attrib["target"] = x["doc_id"]
        note.attrib["type"] = "mentions"
        # .get(): the optional keys may be left out entirely
        doc_date = x.get("doc_date")
        if doc_date is not None:
            note.attrib["corresp"] = doc_date
        doc_title_sec = x.get("doc_title_sec")
        if doc_title_sec is not None:
            note.text = event_title + f"{x['doc_title']} {doc_title_sec}"
        else:
            # NOTE(review): event_title is not applied here - confirm this is intended
            note.text = x["doc_title"]
        node_root.append(note)
        mentions_added.add(mention_key)
    return node_root
get_full_id()
returns the combination of @xml:base and @xml:id
:return: combination of @xml:base and @xml:id :rtype: str
Source code in acdh_tei_pyutils/tei.py
def get_full_id(self):
    """builds the full id of the document out of @xml:base and @xml:id

    Both attributes are read from the first tei:TEI element and joined with
    exactly one slash.

    :return: "<xml:base>/<xml:id>", or None if one of the attributes is missing
    :rtype: str, None
    """
    root = self.any_xpath("//tei:TEI")[0]
    parts = []
    for attribute_xpath in ("./@xml:base", "./@xml:id"):
        values = root.xpath(attribute_xpath, namespaces=self.ns_xml)
        try:
            parts.append(values[0])
        except IndexError:
            return None
    base_part, id_part = parts
    separator = "" if base_part.endswith("/") else "/"
    return f"{base_part}{separator}{id_part}"
handle_exist(handle_xpath='.//tei:idno[@type="handle"]')
checks if a handle is already assigned
:return: the registered handle or None :rtype: str, None
Source code in acdh_tei_pyutils/tei.py
def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
    """looks up an already assigned handle

    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str

    :return: the text of the first matching node or None if nothing matches
    :rtype: str, None
    """
    try:
        first_match = self.any_xpath(handle_xpath)[0]
    except IndexError:
        return None
    return first_match.text
TeiReader
Bases: XMLReader
a class to read and process tei-documents
Source code in acdh_tei_pyutils/tei.py
class TeiReader(XMLReader):
    """a class to read and process tei-documents"""

    def any_xpath(self, any_xpath="//tei:rs"):
        """Runs any xpath expressions against the parsed document

        :param any_xpath: Any XPath expression.
        :return: The result of the xpath
        """
        return self.tree.xpath(any_xpath, namespaces=self.ns_tei)

    def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
        """extract elements tagged as named entities

        :param parent_node: The node the XPath is evaluated against.
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        :return: A list of elements
        """
        ne_elements = parent_node.xpath(ne_xpath, namespaces=self.ns_tei)
        return ne_elements

    def extract_ne_dicts(
        self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
    ):
        """extract strings tagged as named entities

        :param parent_node: The node the XPath is evaluated against.
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of NE-dicts containing the 'text' and the 'ne_type'
        """
        ne_elements = self.extract_ne_elements(parent_node, ne_xpath)
        ne_dicts = []
        for x in ne_elements:
            item = {}
            text = "".join(x.xpath(".//text()"))
            item["text"] = re.sub(r"\s+", " ", text).strip()
            try:
                # prefer the @type attribute ...
                ne_type = NER_TAG_MAP.get("{}".format(x.xpath("./@type")[0]), "MISC")
            except IndexError:
                # ... and fall back to the element name if there is none
                ne_type = NER_TAG_MAP.get("{}".format(x.xpath("name()")), "MISC")
            item["ne_type"] = ne_type
            ne_dicts.append(item)
        return ne_dicts

    def create_plain_text(self, node):
        """extracts all text nodes from given element

        :param node: The element which text nodes should be extracted
        :return: A normalized, cleaned plain text
        """
        result = re.sub(r"\s+", " ", "".join(node.xpath(".//text()"))).strip()
        return result

    def get_text_nes_list(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts all text nodes from given elements and their NE

        :param parent_nodes: An XPath expression pointing to
            those elements which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",
            "ne_type": "LOC"}]}]
        """
        parents = self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
        result = []
        for node in parents:
            text = self.create_plain_text(node)
            ner_dicts = self.extract_ne_dicts(node, ne_xpath, NER_TAG_MAP)
            result.append({"text": text, "ner_dicts": ner_dicts})
        return result

    def extract_ne_offsets(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts offsets of NEs and the NE-type

        The offsets are found by searching the text of every NE in the plain
        text of its parent node, so each occurrence of that text is reported.
        If several spans start at the same offset only one is kept: the one
        with the greatest end offset (ties are broken by NE-type).

        :param parent_nodes: An XPath expression pointing to
            those element which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of spacy-like NER Tuples [('some text', {'entities': [(15, 19, 'place')]})]
        """
        result = []
        for paragraph in self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP):
            plain_text = paragraph["text"]
            spans = set()
            for ne in paragraph["ner_dicts"]:
                if ne["text"] != "":
                    for m in re.finditer(re.escape(ne["text"]), plain_text):
                        spans.add((m.start(), m.end(), ne["ne_type"]))
            # Sorting on the whole span (not just its start) makes the choice
            # between spans with the same start offset reproducible.
            ordered = sorted(spans, key=lambda span: (span[0], span[1], str(span[2])))
            # remove entities with the same start offset: a later span
            # replaces an earlier one with the same start
            by_start = {}
            for span in ordered:
                by_start[span[0]] = span
            train_data = (plain_text, {"entities": list(by_start.values())})
            result.append(train_data)
        return result
any_xpath(any_xpath='//tei:rs')
Runs any xpath expressions against the parsed document :param any_xpath: Any XPath expression. :return: The result of the xpath
Source code in acdh_tei_pyutils/tei.py
def any_xpath(self, any_xpath="//tei:rs"):
    """Evaluates an XPath expression against the parsed document

    The `tei` namespace mapping of the reader is passed to the evaluation.

    :param any_xpath: Any XPath expression.
    :return: The result of the xpath
    """
    document = self.tree
    return document.xpath(any_xpath, namespaces=self.ns_tei)
create_plain_text(node)
extracts all text nodes from given element :param node: The element which text nodes should be extracted :return: A normalized, cleaned plain text
Source code in acdh_tei_pyutils/tei.py
def create_plain_text(self, node):
    """joins and cleans all text nodes of the given element

    :param node: The element which text nodes should be extracted
    :return: A normalized, cleaned plain text
    """
    raw_text = "".join(node.xpath(".//text()"))
    collapsed = re.sub(r"\s+", " ", raw_text)
    return collapsed.strip()
extract_ne_dicts(parent_node, ne_xpath='//tei:rs', NER_TAG_MAP=NER_TAG_MAP)
extract strings tagged as named entities :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of NE-dicts containing the 'text' and the 'ne_type'
Source code in acdh_tei_pyutils/tei.py
def extract_ne_dicts(
    self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
):
    """collects text and type of everything tagged as named entity

    The type is looked up in `NER_TAG_MAP` by the element's @type attribute or,
    if there is none, by the element's name; unmapped values become "MISC".

    :param parent_node: The node the XPath is evaluated against.
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags
    :return: A list of NE-dicts containing the 'text' and the 'ne_type'
    """
    ne_dicts = []
    for ne_node in self.extract_ne_elements(parent_node, ne_xpath):
        raw_text = "".join(ne_node.xpath(".//text()"))
        clean_text = re.sub(r"\s+", " ", raw_text).strip()
        try:
            ne_type = NER_TAG_MAP.get("{}".format(ne_node.xpath("./@type")[0]), "MISC")
        except IndexError:
            # no @type attribute: map the element name instead
            ne_type = NER_TAG_MAP.get("{}".format(ne_node.xpath("name()")), "MISC")
        ne_dicts.append({"text": clean_text, "ne_type": ne_type})
    return ne_dicts
extract_ne_elements(parent_node, ne_xpath='//tei:rs')
extract elements tagged as named entities :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. :return: A list of elements
Source code in acdh_tei_pyutils/tei.py
def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
    """selects the elements tagged as named entities

    :param parent_node: The node the XPath is evaluated against.
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
    :return: A list of elements
    """
    return parent_node.xpath(ne_xpath, namespaces=self.ns_tei)
extract_ne_offsets(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)
extracts offsets of NEs and the NE-type :param parent_nodes: An XPath expressione pointing to those element which text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of spacy-like NER Tuples [('some text'), {'entities': [(15, 19, 'place')]}]
Source code in acdh_tei_pyutils/tei.py
def extract_ne_offsets(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """extracts offsets of NEs and the NE-type

    The offsets are found by searching the text of every NE in the plain
    text of its parent node, so each occurrence of that text is reported.
    If several spans start at the same offset only one is kept: the one
    with the greatest end offset (ties are broken by NE-type).

    :param parent_nodes: An XPath expression pointing to
        those element which text nodes should be extracted
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        Takes the parent node(s) as context
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags
    :return: A list of spacy-like NER Tuples [('some text', {'entities': [(15, 19, 'place')]})]
    """
    result = []
    for paragraph in self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP):
        plain_text = paragraph["text"]
        spans = set()
        for ne in paragraph["ner_dicts"]:
            if ne["text"] != "":
                for m in re.finditer(re.escape(ne["text"]), plain_text):
                    spans.add((m.start(), m.end(), ne["ne_type"]))
        # Sorting on the whole span (not just its start) makes the choice
        # between spans with the same start offset reproducible.
        ordered = sorted(spans, key=lambda span: (span[0], span[1], str(span[2])))
        # remove entities with the same start offset: a later span
        # replaces an earlier one with the same start
        by_start = {}
        for span in ordered:
            by_start[span[0]] = span
        train_data = (plain_text, {"entities": list(by_start.values())})
        result.append(train_data)
    return result
get_text_nes_list(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)
extracts all text nodes from given elements and their NE :param parent_nodes: An XPath expressione pointing to those elements which text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien", "ne_type": "LOC"}]}]
Source code in acdh_tei_pyutils/tei.py
def get_text_nes_list(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """pairs the plain text of each selected element with the NEs found in it

    :param parent_nodes: An XPath expression pointing to
        those elements which text nodes should be extracted
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        Takes the parent node(s) as context
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags
    :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",
        "ne_type": "LOC"}]}]
    """
    selected = self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
    return [
        {
            "text": self.create_plain_text(parent),
            "ner_dicts": self.extract_ne_dicts(parent, ne_xpath, NER_TAG_MAP),
        }
        for parent in selected
    ]
command line interface
Console scripts for acdh_tei_pyutils.
add_base_id_next_prev(glob_pattern, base_value)
Console script add @xml:base, @xml:id and @prev @next attributes to root element
Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-g", "--glob-pattern", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option("-b", "--base-value")  # pragma: no cover
def add_base_id_next_prev(glob_pattern, base_value):  # pragma: no cover
    """Console script add @xml:base, @xml:id and @prev @next attributes to root element"""
    # The docstring above is kept verbatim: click shows it as the command's help text.
    # Every file matching the glob gets its own file name as id; the file names
    # of its neighbours in the sorted file list become prev and next.
    paths = sorted(glob.glob(glob_pattern))
    neighbours = previous_and_next(paths)
    for prev_path, current_path, next_path in tqdm.tqdm(neighbours, total=len(paths)):
        doc = TeiEnricher(current_path)
        id_value = os.path.basename(current_path)
        # the first file has no predecessor, the last one no successor
        prev_id = os.path.basename(prev_path) if prev_path else None
        next_id = os.path.basename(next_path) if next_path else None
        doc.add_base_and_id(base_value, id_value, prev_id, next_id)
        doc.tree_to_file(file=current_path)
add_handles(glob_pattern, hdl_user, hdl_pw, hdl_provider, hdl_prefix, hdl_resolver, hdl_xpath, hdlinsert_xpath)
Console script to register handles based on the values of @xml:id and @xml:base
Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-g", "--glob-pattern", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option("-user", "--hdl-user")  # pragma: no cover
@click.option("-pw", "--hdl-pw")  # pragma: no cover
@click.option(
    "-provider",
    "--hdl-provider",
    default="http://pid.gwdg.de/handles/",
    show_default=True,
)  # pragma: no cover
@click.option(
    "-prefix", "--hdl-prefix", default="21.11115", show_default=True
)  # pragma: no cover
@click.option(
    "-resolver", "--hdl-resolver", default="https://hdl.handle.net/", show_default=True
)  # pragma: no cover
@click.option(
    "-hxpath", "--hdl-xpath", default=".//tei:idno[@type='handle']", show_default=True
)  # pragma: no cover
@click.option(
    "-hixpath",
    "--hdlinsert-xpath",
    default=".//tei:publicationStmt/tei:p",
    show_default=True,
)  # pragma: no cover
def add_handles(
    glob_pattern,
    hdl_user,
    hdl_pw,
    hdl_provider,
    hdl_prefix,
    hdl_resolver,
    hdl_xpath,
    hdlinsert_xpath,
):  # pragma: no cover
    """Console script to register handels base on the values of @xml:id and @xml:base"""
    # The docstring above is kept verbatim: click shows it as the command's help text.
    files = sorted(glob.glob(glob_pattern))
    hdl_client = HandleClient(
        hdl_user,
        hdl_pw,
        hdl_provider=hdl_provider,
        hdl_prefix=hdl_prefix,
        hdl_resolver=hdl_resolver,
    )
    for x in tqdm.tqdm(files, total=len(files)):
        doc = TeiEnricher(x)
        # Check with the very xpath add_handle() uses below. Otherwise a
        # document that already has a handle at a custom --hdl-xpath would get
        # a new handle registered and then abort the run with HandleAlreadyExist.
        if doc.handle_exist(handle_xpath=hdl_xpath):
            continue
        # documents without @xml:base or @xml:id cannot be registered
        full_id = doc.get_full_id()
        if full_id is None:
            continue
        hdl = hdl_client.register_handle(full_id)
        print(hdl)
        doc.add_handle(hdl, handle_xpath=hdl_xpath, insert_xpath=hdlinsert_xpath)
        doc.tree_to_file(x)
denormalize_indices(files, indices, mention_xpath, title_xpath, title_sec_xpath, date_xpath, blacklist_ids=[])
Write pointers to mentions in index-docs and copy index entries into docs
Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-x", "--title-xpath", default=".//tei:title/text()", show_default=True
)  # pragma: no cover
@click.option("-xs", "--title-sec-xpath", required=False)  # pragma: no cover
@click.option("-d", "--date-xpath", required=False)  # pragma: no cover
@click.option(
    "-b", "--blacklist-ids", default=[], multiple=True, show_default=True
)  # pragma: no cover
def denormalize_indices(
    files,
    indices,
    mention_xpath,
    title_xpath,
    title_sec_xpath,
    date_xpath,
    blacklist_ids=[],
):  # pragma: no cover
    """Write pointers to mentions in index-docs and copy index entries into docs"""
    # The docstring above is kept verbatim: click shows it as the command's help text.
    #
    # The command works in three passes:
    #   1. read every document and collect, per referenced entity id, the
    #      documents mentioning it (ref_doc_dict)
    #   2. write a noteGrp with those mentions into each entry of the index
    #      files and save the index files
    #   3. re-read the index files and replace the tei:back of every document
    #      with lists holding the index entries the document refers to
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    # filled below but never read inside this function
    doc_ref_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    # --- pass 1: collect the mentions ---
    for x in tqdm.tqdm(files):
        filename = os.path.split(x)[1]
        # files with "list" in their name are not treated as documents
        if "list" in filename:
            continue
        doc = TeiEnricher(x)
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        # A failing title/date xpath does not stop the run; an error text is
        # stored in place of the value and printed.
        try:
            doc_title = doc.any_xpath(title_xpath)[0]
        except IndexError:
            doc_title = f"ERROR in title xpath of file: {doc_id}"
            print(f"ERROR in -x title xpath of file: {doc_id}")
        if title_sec_xpath:
            try:
                doc_title_sec = doc.any_xpath(title_sec_xpath)[0]
            except IndexError:
                doc_title_sec = f"ERROR in -xs secondary title xpath of file: {doc_id}"
                print(f"ERROR in secondary title xpath of file: {doc_id}")
        else:
            doc_title_sec = None
        if date_xpath:
            try:
                doc_date = doc.any_xpath(date_xpath)[0]
            except IndexError:
                doc_date = f"ERROR in date xpath of file: {doc_id}"
                print(f"ERROR in -d date xpath of file: {doc_id}")
        else:
            doc_date = None
        refs = doc.any_xpath(mention_xpath)
        for ref in set(refs):
            # single pointer like "#id": drop the leading hash
            if ref.startswith("#") and len(ref.split(" ")) == 1:
                ref = ref[1:]
            # several blank separated pointers like "#id1 #id2": the first one
            # is handled by the shared code below, the others right here.
            # Rebinding `refs` does not affect the loop, set(refs) was built before.
            if ref.startswith("#") and len(ref.split(" ")) > 1:
                refs = ref.split(" ")
                ref = refs[0]
                ref = ref[1:]
                for r in refs[1:]:
                    # r[1:] drops the first character of each further pointer
                    ref_doc_dict[r[1:]].append(
                        {
                            "doc_uri": doc_uri,
                            "doc_id": doc_id,
                            "doc_path": x,
                            "doc_title": doc_title,
                            "doc_title_sec": doc_title_sec,
                            "doc_date": doc_date,
                        }
                    )
            # a ref without leading hash is used as key unchanged
            ref_doc_dict[ref].append(
                {
                    "doc_uri": doc_uri,
                    "doc_id": doc_id,
                    "doc_path": x,
                    "doc_title": doc_title,
                    "doc_title_sec": doc_title_sec,
                    "doc_date": doc_date,
                }
            )
            doc_ref_dict[filename].append(ref)
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict.keys())} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    # --- pass 2: write the mentions into the index files ---
    for x in index_files:
        doc = TeiEnricher(x)
        ent_nodes = doc.any_xpath(".//tei:body//*[@xml:id]")
        for ent in ent_nodes:
            ent_id = ent.xpath("@xml:id")[0]
            mention = ref_doc_dict[ent_id]
            if ent_id in blacklist_ids:
                continue
            ent_name = ent.tag
            note_grp = doc.create_mention_list(mention)
            try:
                # raises IndexError if the noteGrp has no children, i.e. the
                # entity is not mentioned anywhere; nothing is inserted then
                list(note_grp[0])
                # TEI schema does not allow noteGrp in event after e.g. listPerson, ... so we need to insert it before
                if ent_name == "{http://www.tei-c.org/ns/1.0}event":
                    ent.insert(1, note_grp)
                else:
                    ent.append(note_grp)
            except IndexError:
                pass
        doc.tree_to_file(file=x)
    # --- pass 3: copy the index entries into the documents ---
    # the index files are read again, now including the notes written in pass 2
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        ent_nodes = doc.any_xpath(".//tei:body//*[@xml:id]")
        for ent in ent_nodes:
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent
    click.echo(
        click.style(
            f"writing {len(all_ent_nodes)} index entries into {len(files)} files",
            fg="green",
        )
    )
    for x in tqdm.tqdm(files):
        # best effort: a failing document is reported and the run continues
        try:
            filename = os.path.split(x)[1]
            doc = TeiEnricher(x)
            root_node = doc.any_xpath(".//tei:text")[0]
            # drop any existing tei:back, a new one is built below
            for bad in doc.any_xpath(".//tei:back"):
                bad.getparent().remove(bad)
            refs = doc.any_xpath(mention_xpath)
            # index entries referenced by this document, grouped by element tag
            ent_dict = defaultdict(list)
            for ref in set(refs):
                # same pointer handling as in pass 1
                if ref.startswith("#") and len(ref.split(" ")) == 1:
                    ent_id = ref[1:]
                elif ref.startswith("#") and len(ref.split(" ")) > 1:
                    refs = ref.split(" ")
                    ref = refs[0]
                    ent_id = ref[1:]
                    for r in refs[1:]:
                        try:
                            index_ent = all_ent_nodes[r[1:]]
                            ent_dict[index_ent.tag].append(index_ent)
                        except KeyError:
                            # pointer to an id that is in none of the index files
                            continue
                else:
                    ent_id = ref
                try:
                    index_ent = all_ent_nodes[ent_id]
                    ent_dict[index_ent.tag].append(index_ent)
                except KeyError:
                    continue
            back_node = ET.Element("{http://www.tei-c.org/ns/1.0}back")
            # one list element per kind of entity, chosen by the end of the tag name
            # NOTE(review): the appended entries are the very objects stored in
            # all_ent_nodes, not copies - presumably they are moved from tree
            # to tree by each append; confirm this is intended.
            for key in ent_dict.keys():
                if key.endswith("person"):
                    list_person = ET.Element("{http://www.tei-c.org/ns/1.0}listPerson")
                    back_node.append(list_person)
                    for ent in ent_dict[key]:
                        list_person.append(ent)
                if key.endswith("place"):
                    list_place = ET.Element("{http://www.tei-c.org/ns/1.0}listPlace")
                    back_node.append(list_place)
                    for ent in ent_dict[key]:
                        list_place.append(ent)
                if key.endswith("org"):
                    list_org = ET.Element("{http://www.tei-c.org/ns/1.0}listOrg")
                    back_node.append(list_org)
                    for ent in ent_dict[key]:
                        list_org.append(ent)
                if key.endswith("bibl") or key.endswith("biblStruct"):
                    list_bibl = ET.Element("{http://www.tei-c.org/ns/1.0}listBibl")
                    back_node.append(list_bibl)
                    for ent in ent_dict[key]:
                        list_bibl.append(ent)
                if key.endswith("item"):
                    list_item = ET.Element("{http://www.tei-c.org/ns/1.0}list")
                    back_node.append(list_item)
                    for ent in ent_dict[key]:
                        list_item.append(ent)
                if key.endswith("event"):
                    list_eve = ET.Element("{http://www.tei-c.org/ns/1.0}listEvent")
                    back_node.append(list_eve)
                    for ent in ent_dict[key]:
                        list_eve.append(ent)
            root_node.append(back_node)
            doc.tree_to_file(file=x)
        except Exception as e:
            print(f"failed to process {x} due to {e}")
    click.echo(click.style("DONE", fg="green"))
mentions_to_indices(files, indices, mention_xpath, event_title, title_xpath)
Console script write pointers to mentions in index-docs
Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-t", "--event-title", default="erwähnt in ", show_default=True
)  # pragma: no cover
@click.option(
    "-x",
    "--title-xpath",
    default='.//tei:title[@type="main"]/text()',
    show_default=True,
)  # pragma: no cover
def mentions_to_indices(
    files, indices, mention_xpath, event_title, title_xpath
):  # pragma: no cover
    """Console script that writes pointers to mentions into the index docs.

    First pass: every file matching ``files`` is parsed and the values
    returned by ``mention_xpath`` are collected per referenced entity id.
    Second pass: every file matching ``indices`` is parsed, and for each
    element below ``tei:body`` carrying an ``@xml:id`` the node returned by
    ``doc.create_mention_list`` is attached (if it has any children); the
    index file is then written back in place.

    Args:
        files: glob pattern of the documents to scan for mentions.
        indices: glob pattern of the index documents to update.
        mention_xpath: XPath returning the reference values (e.g. ``@ref``).
        event_title: passed through to ``create_mention_list``.
        title_xpath: XPath returning the document title; its first hit is used.

    Raises:
        IndexError: if a scanned document lacks ``@xml:base``, ``@xml:id`` or
            a title matching ``title_xpath``.
    """
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    for x in tqdm.tqdm(files):
        doc = TeiEnricher(x)
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        doc_title = doc.any_xpath(title_xpath)[0]
        # A single attribute value may hold several whitespace separated
        # pointers ("#a #b"); split them so each entity gets its mention.
        # The set keeps one mention per entity and document.
        ent_ids = set()
        for ref in doc.any_xpath(mention_xpath):
            for pointer in ref.split():
                ent_ids.add(pointer[1:] if pointer.startswith("#") else pointer)
        for ent_id in ent_ids:
            ref_doc_dict[ent_id].append(
                {
                    "doc_uri": doc_uri,
                    "doc_path": x,
                    "doc_title": doc_title,
                    "doc_id": doc_id,
                    "doc_date": None,
                    "doc_title_sec": None,
                }
            )
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict)} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            ent_id = ent.xpath("@xml:id", namespaces=doc.nsmap)[0]
            # .get() avoids growing the defaultdict with an empty entry for
            # every index entity that is never mentioned.
            mentions = ref_doc_dict.get(ent_id, [])
            note_grp = doc.create_mention_list(mentions, event_title)
            if len(note_grp) == 0:
                # nothing to attach for this entity
                continue
            # TEI schema does not allow noteGrp in event after e.g. listPerson,
            # ... so we need to insert it before
            if ent.tag == "{http://www.tei-c.org/ns/1.0}event":
                ent.insert(1, note_grp)
            else:
                ent.append(note_grp)
        doc.tree_to_file(file=x)
    click.echo(click.style("DONE", fg="green"))
schnitzler(files, indices, doc_person, doc_work)
Console script that rebuilds the tei:back of each edition file from the index entries (persons, works, places) linked to it
Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./data/editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./data/indices/list*.xml", show_default=True
)  # pragma: no cover
# `--doc-person` used to declare the same short flag `-t` as `--doc-work`;
# one short flag cannot address two options, so it gets its own (`-p`).
# `-t` is left on `--doc-work`.
@click.option(
    "-p",
    "--doc-person",
    default="./data/indices/index_person_day.xml",
    show_default=True,
)  # pragma: no cover
@click.option(
    "-t", "--doc-work", default="./data/indices/index_work_day.xml", show_default=True
)  # pragma: no cover
def schnitzler(files, indices, doc_person, doc_work):  # pragma: no cover
    """Console script that rebuilds ``tei:back`` of every edition file.

    For each file matching ``files`` any existing ``tei:back`` is removed and
    a new one is assembled from index entries:

    * ``listPerson`` / ``listBibl``: ids found in ``doc_person`` /
      ``doc_work`` under ``item[@target=<day>]/ref``, where ``<day>`` is the
      file name without ``entry__`` and ``.xml``;
    * ``listPlace``: ids referenced by ``tei:rs[@type="place"]/@ref`` in the
      edition file itself.

    The new ``tei:back`` is appended to ``tei:text`` only if it has content;
    the file is written back in place either way. Ids that cannot be resolved
    against the index files are printed as a set at the end.

    Args:
        files: glob pattern of the edition files to update.
        indices: glob pattern of the index files providing the entity nodes.
        doc_person: path of the day-to-person index document.
        doc_work: path of the day-to-work index document.

    Raises:
        IndexError: if an edition file has no ``tei:text`` element.
    """
    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    doc_person = TeiEnricher(doc_person)
    doc_work = TeiEnricher(doc_work)

    # xml:id -> entity node, across all index files
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent

    # day index document -> name of the list element its entries go into
    day_indices = ((doc_person, "listPerson"), (doc_work, "listBibl"))
    no_matches = []
    for x in tqdm.tqdm(files, total=len(files)):
        # os.path keeps this working with any path separator
        day = os.path.basename(x).replace("entry__", "").replace(".xml", "")
        doc = TeiEnricher(x)
        root_node = doc.any_xpath(".//tei:text")[0]
        back_node = ET.Element(f"{tei_ns}back")
        for bad in doc.any_xpath(".//tei:back"):
            bad.getparent().remove(bad)

        xpath = f".//item[@target='{day}']/ref/text()"
        for day_index, list_tag in day_indices:
            list_node = ET.Element(f"{tei_ns}{list_tag}")
            for ent_id in day_index.any_xpath(xpath):
                try:
                    ent_node = all_ent_nodes[ent_id]
                except KeyError:
                    no_matches.append(ent_id)
                    continue
                list_node.append(ent_node)
            if len(list_node) > 0:
                back_node.append(list_node)

        list_place_node = ET.Element(f"{tei_ns}listPlace")
        for pl in doc.any_xpath('.//tei:rs[@ref and @type="place"]/@ref'):
            # only strip a leading '#'; a bare id must not lose its first char
            place_id = pl[1:] if pl.startswith("#") else pl
            try:
                pl_node = all_ent_nodes[place_id]
            except KeyError:
                no_matches.append(pl)
                continue
            list_place_node.append(pl_node)
        if len(list_place_node) > 0:
            back_node.append(list_place_node)

        if len(back_node) > 0:
            root_node.append(back_node)
        doc.tree_to_file(file=x)
    distinct_no_match = set(no_matches)
    print(distinct_no_match)