API Documentation

This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the acdh_tei_pyutils project code.

acdh_tei_pyutils.utils

a bunch of helper functions for either

  • processing (TEI)/XML nodes like extracting information, or to
  • query/parse (TEI)/XML nodes/trees

add_graphic_url_to_pb(doc)

writes url attributes into tei:pb elements fetched from matching tei:surface//tei:graphic[1] elements

Source code in acdh_tei_pyutils/utils.py
def add_graphic_url_to_pb(doc: TeiReader) -> TeiReader:
    """Copy graphic URLs onto the tei:pb elements that reference them.

    Every ``tei:pb`` carrying a ``@facs`` attribute is processed: the value of
    ``@facs`` (a leading ``#`` is stripped) is used as ``@xml:id`` to look up a
    ``tei:surface``; the first ``tei:graphic[1]/@url`` found below it is written
    into the ``url`` attribute of the ``tei:pb``. Page breaks without a matching
    graphic are left untouched.

    Args:
        doc (TeiReader): the parsed document; it is modified in place.

    Returns:
        TeiReader: the very same ``doc`` object.
    """
    for page_break in doc.any_xpath(".//tei:pb[@facs]"):
        surface_id = check_for_hash(page_break.attrib["facs"])
        urls = doc.any_xpath(
            f'.//tei:surface[@xml:id="{surface_id}"]//tei:graphic[1]/@url'
        )
        if not urls:
            # no surface/graphic for this page break: nothing to write
            continue
        page_break.attrib["url"] = urls[0]
    return doc

check_for_hash(value)

checks if value starts with '#' and if so removes the '#' from the returned value

Source code in acdh_tei_pyutils/utils.py
def check_for_hash(value: str) -> str:
    """Return ``value`` without its leading '#'; values not starting with '#' pass through unchanged.

    Only a single leading '#' is removed.
    """
    return value[1:] if value.startswith("#") else value

crate_tag_whitelist(element, tag_blacklist)

lists all unique elements from a given node and returns only those not in the given blacklist

Source code in acdh_tei_pyutils/utils.py
def crate_tag_whitelist(element: ET.Element, tag_blacklist: list) -> list:
    """Collect the distinct tags found in ``element`` that are not blacklisted.

    Args:
        element (ET.Element): the node whose tree is walked via ``element.iter``
        tag_blacklist (list): tags that must not show up in the result

    Returns:
        list: the unique, non-blacklisted tags (built from a set, so the order
        is not meaningful)
    """
    # NOTE(review): iter(tag=ET.Element) presumably restricts the walk to real
    # elements (no comments / processing instructions) -- confirm against lxml
    allowed = {
        node.tag
        for node in element.iter(tag=ET.Element)
        if node.tag not in tag_blacklist
    }
    return list(allowed)

extract_fulltext(root_node, tag_blacklist=[])

extracts all fulltext from given element and its children, except from blacklisted elements

Source code in acdh_tei_pyutils/utils.py
def extract_fulltext(root_node: ET.Element, tag_blacklist: list = []) -> str:
    """Extract the whitespace-normalized text of a node, skipping blacklisted elements.

    Args:
        root_node (ET.Element): the element whose text should be extracted
        tag_blacklist (list, optional): tags whose text must be ignored. The
            default list is only read, never mutated. Defaults to [].

    Returns:
        str: the text of all non-blacklisted elements, joined and with any run
        of whitespace collapsed into a single blank; an empty string if every
        element is blacklisted.
    """
    tags = crate_tag_whitelist(root_node, tag_blacklist)
    if not tags:
        # itertext() without any tag argument applies no filter at all and would
        # hand back the complete text, the opposite of "everything blacklisted"
        return ""
    raw_text = "".join(root_node.itertext(*tags))
    return " ".join(raw_text.split())

get_birth_death_year(person_node, xpath_part='@when', birth=True)

tries to extract birth and death years from person nodes and returns either None or the year as Integer

Source code in acdh_tei_pyutils/utils.py
def get_birth_death_year(
    person_node: ET.Element, xpath_part: str = "@when", birth: bool = True
) -> Union[int, None]:
    """Try to extract the year of birth or death from a person node.

    Args:
        person_node (ET.Element): the node holding ``tei:birth`` / ``tei:death`` children
        xpath_part (str, optional): XPath step appended to ``./tei:birth/`` or
            ``./tei:death/`` to reach the date string. Defaults to "@when".
        birth (bool, optional): ``True`` looks at ``tei:birth``, ``False`` at
            ``tei:death``. Defaults to True.

    Returns:
        Union[int, None]: the first four characters of the first match as an
        integer, or ``None`` if nothing matched or those characters are not a
        number.
    """
    event = "birth" if birth else "death"
    matches = person_node.xpath(
        f"./tei:{event}/{xpath_part}",
        namespaces={"tei": "http://www.tei-c.org/ns/1.0"},
    )
    if not matches:
        return None
    try:
        # only the leading "YYYY" of the date string is of interest
        return int(matches[0][:4])
    except ValueError:
        return None

get_xmlid(element)

returns an @xml:id of the given node

Source code in acdh_tei_pyutils/utils.py
def get_xmlid(element: ET.Element) -> str:
    """Return the value of the @xml:id attribute of the given node.

    Raises a ``KeyError`` if the node has no @xml:id.
    """
    xml_id_attribute = "{http://www.w3.org/XML/1998/namespace}id"
    return element.attrib[xml_id_attribute]

make_bibl_label(node, no_author='o.A.', no_title='o.T.', year='o.J.', editor_abbr='(Hg.)', max_title_length=75)

creates a nice, bibliographically useful label from the passed-in tei:biblStruct element

Parameters:
  • node (Element) –

    a tei:biblStruct element

  • no_author (str, default: 'o.A.' ) –

    Used if no author name can be extracted. Defaults to "o.A.".

  • no_title (str, default: 'o.T.' ) –

    Used if no title can be extracted. Defaults to "o.T.".

  • year (str, default: 'o.J.' ) –

    Used if no year can be extracted. Defaults to "o.J.".

  • editor_abbr (str, default: '(Hg.)' ) –

    how to mark the 'author' being an editor. Defaults to "(Hg.)".

  • max_title_length (int, default: 75 ) –

    max length for the title before it gets truncated. Defaults to 75.

Returns:
  • str –

    the label in the form "author, title, year"

Source code in acdh_tei_pyutils/utils.py
def make_bibl_label(
    node: ET.Element,
    no_author="o.A.",
    no_title="o.T.",
    year="o.J.",
    editor_abbr="(Hg.)",
    max_title_length=75,
) -> str:
    """creates a nice, bibliographically useful label from the passed-in tei:biblStruct element

    The "author" part is taken from the first of these that yields some text:
    ``tei:author/tei:surname``, ``tei:author/tei:name``,
    ``tei:editor/tei:surname``, ``tei:editor/tei:name``. Editor names get
    ``editor_abbr`` appended.

    Args:
        node (ET.Element): a tei:biblStruct element
        no_author (str, optional): Used if no author name can be extracted. Defaults to "o.A.".
        no_title (str, optional): Used if no title can be extracted. Defaults to "o.T.".
        year (str, optional): Used if no year can be extracted. Defaults to "o.J.".
        editor_abbr (str, optional): how to mark the 'author' being an editor. Defaults to "(Hg.)".
        max_title_length (int, optional): max length for the title before it gets truncated
            (three dots are appended to a truncated title). Defaults to 75.

    Returns:
        str: the label in the form "author, title, year"
    """
    # (xpath, is_editor) pairs in order of preference
    author_sources = (
        (".//tei:author[1]/tei:surname[1]", False),
        (".//tei:author[1]/tei:name[1]", False),
        (".//tei:editor[1]/tei:surname[1]", True),
        (".//tei:editor[1]/tei:name[1]", True),
    )
    author = no_author
    for xpath_expr, is_editor in author_sources:
        hits = node.xpath(xpath_expr, namespaces=nsmap)
        # an element without text must not end up as the string "None"
        if hits and hits[0].text:
            name = hits[0].text
            author = f"{name} {editor_abbr}" if is_editor else name
            break

    dates = node.xpath(".//tei:date[1]", namespaces=nsmap)
    if dates and dates[0].text:
        year = dates[0].text

    # a missing tei:title falls back to no_title instead of raising IndexError
    titles = node.xpath(".//tei:title[1]", namespaces=nsmap)
    title = titles[0].text if titles else None
    if not title:
        title = no_title
    elif len(title) > max_title_length:
        title = f"{title[:max_title_length]}..."
    return f"{author}, {title}, {year}"

make_entity_label(name_node, default_msg='no label provided', default_lang='en')

Extracts a label and a lang tag from the passed-in name node

Parameters:
  • name_node (Element) –

    A tei:persName|placeName|orgName element

  • default_msg (str, default: 'no label provided' ) –

    some default value for the label. Defaults to "no label provided".

  • default_lang (str, default: 'en' ) –

    some default lang tag if the node does not provide an xml:lang attribute. Defaults to "en".

Returns:
  • tuple[str, str]

    tuple[str, str]: returns the extracted label and a lang tag

Source code in acdh_tei_pyutils/utils.py
def make_entity_label(
    name_node: ET.Element, default_msg="no label provided", default_lang="en"
) -> tuple[str, str]:
    """Build a label and a lang tag from the passed-in name node

    The label is "surname, forename(s)" when both are present, the forenames or
    the first surname alone when only one kind is present, and otherwise the
    normalized text of the whole node.

    Args:
        name_node (ET.Element): A tei:persName|placeName|orgName element
        default_msg (str, optional): label used when no text could be extracted.
            Defaults to "no label provided".
        default_lang (str, optional): lang tag used when the node has no xml:lang attribute.
            Defaults to "en".

    Returns:
        tuple[str, str]: the extracted label and the lang tag
    """
    lang_tag = name_node.get("{http://www.w3.org/XML/1998/namespace}lang", default_lang)
    forenames = [
        normalize_string(text)
        for text in name_node.xpath(".//tei:forename//text()", namespaces=nsmap)
    ]
    surnames = [
        normalize_string(text)
        for text in name_node.xpath(".//tei:surname//text()", namespaces=nsmap)
    ]
    if forenames:
        given = " ".join(forenames)
        # only the first surname text node is used
        label = f"{surnames[0]}, {given}" if surnames else given
    elif surnames:
        label = surnames[0]
    else:
        # neither forename nor surname: fall back to all text of the node
        all_text = " ".join(name_node.xpath(".//text()", namespaces=nsmap))
        label = normalize_string(all_text)
    if not label:
        label = default_msg
    return label, lang_tag

normalize_string(string)

removes any superfluous whitespace from a given string

Source code in acdh_tei_pyutils/utils.py
def normalize_string(string: str) -> str:
    """Collapse every run of whitespace into one blank and trim both ends."""
    tokens = string.split()
    return " ".join(tokens)

previous_and_next(some_iterable)

taken from https://stackoverflow.com/a/1012089

Source code in acdh_tei_pyutils/utils.py
def previous_and_next(some_iterable):  # pragma: no cover
    """Iterate over (previous, current, next) triples of an iterable.

    The first triple has ``None`` as previous item, the last one ``None`` as
    next item. Taken from https://stackoverflow.com/a/1012089
    """
    behind, current, ahead = tee(some_iterable, 3)
    return zip(
        chain([None], behind),  # shifted one to the right
        current,
        chain(islice(ahead, 1, None), [None]),  # shifted one to the left
    )

acdh_tei_pyutils.tei

TeiEnricher

Bases: TeiReader

a class to enrich tei-documents

Source code in acdh_tei_pyutils/tei.py
class TeiEnricher(TeiReader):
    """a class to enrich tei-documents"""

    def add_base_and_id(self, base_value, id_value, prev_value, next_value):
        """adds @xml:base and @xml:id and next and prev to root element

        Every attribute is only written if its value is truthy. ``prev`` and
        ``next`` are written as ``"{base_value}/{value}"``.

        :param base_value: The value of the @xml:base
        :type base_value: str

        :param id_value: The value of the @xml:id
        :type id_value: str

        :param prev_value: the id of the previous document
        :type prev_value: str

        :param next_value: the id of the next document
        :type next_value: str

        :return: the updated tree
        """

        base = self.any_xpath("//tei:TEI")[0]
        if base_value:
            base.set(f"{{{self.ns_xml['xml']}}}base", base_value)
        if id_value:
            base.set(f"{{{self.ns_xml['xml']}}}id", id_value)
        if prev_value:
            base.set("prev", f"{base_value}/{prev_value}")
        if next_value:
            base.set("next", f"{base_value}/{next_value}")
        return self.tree

    def get_full_id(self):
        """returns the combination of @xml:base and @xml:id

        Both values are joined with exactly one slash.

        :return: combination of @xml:base and @xml:id, None if one of them is missing
        :rtype: str, None

        """
        base = self.any_xpath("//tei:TEI")[0]
        try:
            base_base = base.xpath("./@xml:base", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        try:
            base_id = base.xpath("./@xml:id", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        if base_base.endswith("/"):
            return f"{base_base}{base_id}"
        else:
            return f"{base_base}/{base_id}"

    def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
        """checks if a handle is already assigned

        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str

        :return: the text of the first matching node or None if nothing matches
        :rtype: str, None
        """
        try:
            return self.any_xpath(handle_xpath)[0].text
        except IndexError:
            return None

    def add_handle(
        self,
        handle,
        handle_xpath='.//tei:idno[@type="handle"]',
        insert_xpath=".//tei:publicationStmt/tei:p",
    ):
        """adds an idno @type=handle element to the first node matching insert_xpath

        :param handle: the handle
        :type handle: str

        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str

        :param insert_xpath: an xpath expression pointing to the node the idno is appended to
        :type insert_xpath: str

        :raises: `HandleAlreadyExist` Error
        :raises: `IndexError` if insert_xpath matches nothing

        :returns: the idno node
        """
        tei_ns = f"{self.ns_tei['tei']}"
        # look the handle up once and with the caller's xpath, so that the
        # error message reports the handle that actually blocked the insert
        existing_handle = self.handle_exist(handle_xpath=handle_xpath)
        if existing_handle:
            raise HandleAlreadyExist(
                f"a handle: {existing_handle} is already registered"
            )
        idno_node = ET.Element(f"{{{tei_ns}}}idno")
        idno_node.set("type", "handle")
        idno_node.text = handle
        insert_node = self.any_xpath(insert_xpath)[0]
        insert_node.append(idno_node)
        return idno_node

    def create_mention_list(self, mentions, event_title=""):
        """creates a tei:noteGrp element with one tei:note per mentioning document

        Documents are de-duplicated on the slugified `doc_id`; only the first
        mention of a document is kept.

        :param mentions: a list of dicts with the keys `doc_id` and `doc_title`;
            the keys `doc_date` and `doc_title_sec` are optional
        :type mentions: list

        :param event_title: prefix for the note text, only used for mentions
            providing a `doc_title_sec`
        :type event_title: str

        :return: a etree.element
        """
        tei_ns = f"{self.ns_tei['tei']}"
        node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
        seen_ids = set()
        for mention in mentions:
            doc_key = slugify(mention["doc_id"])
            if doc_key in seen_ids:
                continue
            seen_ids.add(doc_key)
            note = ET.Element(f"{{{tei_ns}}}note")
            note.attrib["target"] = mention["doc_id"]
            note.attrib["type"] = "mentions"
            # .get(): dicts carrying only the documented keys must not fail
            doc_date = mention.get("doc_date")
            if doc_date is not None:
                note.attrib["corresp"] = doc_date
            title_sec = mention.get("doc_title_sec")
            if title_sec is not None:
                note.text = event_title + f"{mention['doc_title']} {title_sec}"
            else:
                note.text = mention["doc_title"]
            node_root.append(note)
        return node_root

add_base_and_id(base_value, id_value, prev_value, next_value)

adds @xml:base and @xml:id and next and prev to root element

:param base_value: The value of the @xml:base :type base_value: str

:return: the updated tree

Source code in acdh_tei_pyutils/tei.py
def add_base_and_id(self, base_value, id_value, prev_value, next_value):
    """sets @xml:base, @xml:id, @prev and @next on the root tei:TEI element

    An attribute is only written if its value is truthy. ``prev`` and ``next``
    are stored as ``"{base_value}/{value}"``.

    :param base_value: The value of the @xml:base
    :type base_value: str

    :param id_value: The value of the @xml:id
    :type id_value: str

    :param prev_value: the id of the previous document
    :type prev_value: str

    :param next_value: the id of the next document
    :type next_value: str

    :return: the updated tree
    """
    root = self.any_xpath("//tei:TEI")[0]
    # attributes living in the xml namespace
    for local_name, value in (("base", base_value), ("id", id_value)):
        if value:
            root.set(f"{{{self.ns_xml['xml']}}}{local_name}", value)
    # un-namespaced pointers to the neighbouring documents
    for attr_name, value in (("prev", prev_value), ("next", next_value)):
        if value:
            root.set(attr_name, f"{base_value}/{value}")
    return self.tree

add_handle(handle, handle_xpath='.//tei:idno[@type="handle"]', insert_xpath='.//tei:publicationStmt/tei:p')

adds an idno @type=handle element into tei:publicationStmt

:param handle: the handle :type handle: str

:param handle_xpath: an xpath expression where to look for an handle :type handle_xpath: str

:raises: HandleAlreadyExist Error

:returns: the idno node

Source code in acdh_tei_pyutils/tei.py
def add_handle(
    self,
    handle,
    handle_xpath='.//tei:idno[@type="handle"]',
    insert_xpath=".//tei:publicationStmt/tei:p",
):
    """adds an idno @type=handle element to the first node matching insert_xpath

    :param handle: the handle
    :type handle: str

    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str

    :param insert_xpath: an xpath expression pointing to the node the idno is appended to
    :type insert_xpath: str

    :raises: `HandleAlreadyExist` Error
    :raises: `IndexError` if insert_xpath matches nothing

    :returns: the idno node
    """
    tei_ns = f"{self.ns_tei['tei']}"
    # look the handle up once and with the caller's xpath, so that the error
    # message reports the handle that actually blocked the insert
    existing_handle = self.handle_exist(handle_xpath=handle_xpath)
    if existing_handle:
        raise HandleAlreadyExist(
            f"a handle: {existing_handle} is already registered"
        )
    idno_node = ET.Element(f"{{{tei_ns}}}idno")
    idno_node.set("type", "handle")
    idno_node.text = handle
    insert_node = self.any_xpath(insert_xpath)[0]
    insert_node.append(idno_node)
    return idno_node

create_mention_list(mentions, event_title='')

creates a tei element with notes of mentions

:param mentions: a list of dicts with the keys doc_id, doc_title, doc_date and doc_title_sec :type mentions: list

:return: a etree.element

Source code in acdh_tei_pyutils/tei.py
def create_mention_list(self, mentions, event_title=""):
    """creates a tei:noteGrp element with one tei:note per mentioning document

    Documents are de-duplicated on the slugified `doc_id`; only the first
    mention of a document is kept.

    :param mentions: a list of dicts with the keys `doc_id` and `doc_title`;
        the keys `doc_date` and `doc_title_sec` are optional
    :type mentions: list

    :param event_title: prefix for the note text, only used for mentions
        providing a `doc_title_sec`
    :type event_title: str

    :return: a etree.element
    """
    tei_ns = f"{self.ns_tei['tei']}"
    node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
    seen_ids = set()
    for mention in mentions:
        doc_key = slugify(mention["doc_id"])
        if doc_key in seen_ids:
            continue
        seen_ids.add(doc_key)
        note = ET.Element(f"{{{tei_ns}}}note")
        note.attrib["target"] = mention["doc_id"]
        note.attrib["type"] = "mentions"
        # .get(): dicts carrying only the documented keys must not fail
        doc_date = mention.get("doc_date")
        if doc_date is not None:
            note.attrib["corresp"] = doc_date
        title_sec = mention.get("doc_title_sec")
        if title_sec is not None:
            note.text = event_title + f"{mention['doc_title']} {title_sec}"
        else:
            note.text = mention["doc_title"]
        node_root.append(note)
    return node_root

get_full_id()

returns the combination of @xml:base and @xml:id

:return: combination of @xml:base and @xml:id :rtype: str

Source code in acdh_tei_pyutils/tei.py
def get_full_id(self):
    """returns the combination of @xml:base and @xml:id of the root tei:TEI element

    Both values are joined with exactly one slash.

    :return: combination of @xml:base and @xml:id, None if one of them is missing
    :rtype: str, None

    """
    root = self.any_xpath("//tei:TEI")[0]
    parts = []
    for attr in ("base", "id"):
        found = root.xpath(f"./@xml:{attr}", namespaces=self.ns_xml)
        if not found:
            return None
        parts.append(found[0])
    base_part, id_part = parts
    # avoid a double slash if the base already ends with one
    separator = "" if base_part.endswith("/") else "/"
    return f"{base_part}{separator}{id_part}"

handle_exist(handle_xpath='.//tei:idno[@type="handle"]')

checks if a handle is already assigned

:return: the registered handle or None :rtype: str, None

Source code in acdh_tei_pyutils/tei.py
def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
    """checks if a handle is already assigned

    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str

    :return: the text of the first matching node or None if nothing matches
    :rtype: str, None
    """
    matches = self.any_xpath(handle_xpath)
    return matches[0].text if matches else None

TeiReader

Bases: XMLReader

a class to read and process tei-documents

Source code in acdh_tei_pyutils/tei.py
class TeiReader(XMLReader):
    """a class to read and process tei-documents"""

    def any_xpath(self, any_xpath="//tei:rs"):
        """Runs any xpath expressions against the parsed document
        :param any_xpath: Any XPath expression.
        :return: The result of the xpath
        """
        return self.tree.xpath(any_xpath, namespaces=self.ns_tei)

    def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
        """extract elements tagged as named entities
        :param parent_node: The node the XPath expression is evaluated on
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        :return: A list of elements
        """

        ne_elements = parent_node.xpath(ne_xpath, namespaces=self.ns_tei)
        return ne_elements

    def extract_ne_dicts(
        self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
    ):
        """extract strings tagged as named entities
        :param parent_node: The node the XPath expression is evaluated on
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags; keys not found are mapped to 'MISC'
        :return: A list of NE-dicts containing the 'text' and the 'ne_type'
        """

        ne_elements = self.extract_ne_elements(parent_node, ne_xpath)
        ne_dicts = []
        for x in ne_elements:
            item = {}
            text = "".join(x.xpath(".//text()"))
            item["text"] = re.sub(r"\s+", " ", text).strip()
            try:
                # the @type attribute wins over the element name
                ne_type = NER_TAG_MAP.get("{}".format(x.xpath("./@type")[0]), "MISC")
            except IndexError:
                ne_type = NER_TAG_MAP.get("{}".format(x.xpath("name()")), "MISC")
            item["ne_type"] = ne_type
            ne_dicts.append(item)

        return ne_dicts

    def create_plain_text(self, node):
        """extracts all text nodes from given element
        :param node: The element whose text nodes should be extracted
        :return: A normalized, cleaned plain text
        """
        result = re.sub(r"\s+", " ", "".join(node.xpath(".//text()"))).strip()

        return result

    def get_text_nes_list(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts all text nodes from given elements and their NE
        :param parent_nodes: An XPath expression pointing to
            those elements which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",
            "ne_type": "LOC"}]}]
        """

        parents = self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
        result = []
        for node in parents:
            text = self.create_plain_text(node)
            ner_dicts = self.extract_ne_dicts(node, ne_xpath, NER_TAG_MAP)
            result.append({"text": text, "ner_dicts": ner_dicts})
        return result

    def extract_ne_offsets(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts offsets of NEs and the NE-type

        Every occurrence of an entity text inside the plain text of its parent
        node is reported. If several entities start at the same offset only one
        is kept: the one with the largest end offset.

        :param parent_nodes: An XPath expression pointing to
            those elements which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of spacy-like NER Tuples [('some text', {'entities': [(15, 19, 'place')]})]
        """

        text_nes_list = self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP)
        result = []
        for sample in text_nes_list:
            plain_text = sample["text"]
            found = set()
            for ne in sample["ner_dicts"]:
                if ne["text"] == "":
                    continue
                for m in re.finditer(re.escape(ne["text"]), plain_text):
                    found.add((m.start(), m.end(), ne["ne_type"]))
            # Sort on the whole tuple: ordering by start offset alone would leave
            # the winner among equal starts to the iteration order of the set.
            ordered = sorted(found, key=lambda ent: (ent[0], ent[1], str(ent[2])))
            # remove entities with the same start offset (the last one wins)
            by_start = {}
            for ent in ordered:
                by_start[ent[0]] = ent
            result.append((plain_text, {"entities": list(by_start.values())}))
        return result

any_xpath(any_xpath='//tei:rs')

Runs any xpath expressions against the parsed document :param any_xpath: Any XPath expression. :return: The result of the xpath

Source code in acdh_tei_pyutils/tei.py
def any_xpath(self, any_xpath="//tei:rs"):
    """Evaluate an XPath expression on the parsed document using the TEI namespace map
    :param any_xpath: Any XPath expression.
    :return: The result of the xpath
    """
    document = self.tree
    return document.xpath(any_xpath, namespaces=self.ns_tei)

create_plain_text(node)

extracts all text nodes from given element :param node: the element whose text nodes should be extracted :return: A normalized, cleaned plain text

Source code in acdh_tei_pyutils/tei.py
def create_plain_text(self, node):
    """joins all text nodes below the given element into one string
    :param node: The element whose text nodes should be extracted
    :return: A normalized, cleaned plain text
    """
    raw_text = "".join(node.xpath(".//text()"))
    # collapse every run of whitespace into one blank, then trim
    collapsed = re.sub(r"\s+", " ", raw_text)
    return collapsed.strip()

extract_ne_dicts(parent_node, ne_xpath='//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extract strings tagged as named entities :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of NE-dicts containing the 'text' and the 'ne_type'

Source code in acdh_tei_pyutils/tei.py
def extract_ne_dicts(
    self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
):
    """collects text and type of the elements tagged as named entities
    :param parent_node: The node the XPath expression is evaluated on
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags; keys not found are mapped to 'MISC'
    :return: A list of NE-dicts containing the 'text' and the 'ne_type'
    """
    ne_dicts = []
    for element in self.extract_ne_elements(parent_node, ne_xpath):
        joined = "".join(element.xpath(".//text()"))
        normalized = re.sub(r"\s+", " ", joined).strip()
        # the @type attribute wins over the element name
        type_attrs = element.xpath("./@type")
        map_key = type_attrs[0] if type_attrs else element.xpath("name()")
        ne_dicts.append(
            {"text": normalized, "ne_type": NER_TAG_MAP.get(f"{map_key}", "MISC")}
        )
    return ne_dicts

extract_ne_elements(parent_node, ne_xpath='//tei:rs')

extract elements tagged as named entities :param ne_xpath: An XPath expression pointing to elements used to tagged NEs. :return: A list of elements

Source code in acdh_tei_pyutils/tei.py
def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
    """evaluates ne_xpath on parent_node using the TEI namespace map
    :param parent_node: The node the XPath expression is evaluated on
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
    :return: A list of elements
    """
    return parent_node.xpath(ne_xpath, namespaces=self.ns_tei)

extract_ne_offsets(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extracts offsets of NEs and the NE-type :param parent_nodes: An XPath expression pointing to those elements which text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tag NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of spacy-like NER Tuples [('some text', {'entities': [(15, 19, 'place')]})]

Source code in acdh_tei_pyutils/tei.py
def extract_ne_offsets(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """extracts offsets of NEs and the NE-type

    Every occurrence of an entity text inside the plain text of its parent node
    is reported. If several entities start at the same offset only one is kept:
    the one with the largest end offset.

    :param parent_nodes: An XPath expression pointing to
        those elements which text nodes should be extracted
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        Takes the parent node(s) as context
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags
    :return: A list of spacy-like NER Tuples [('some text', {'entities': [(15, 19, 'place')]})]
    """

    text_nes_list = self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP)
    result = []
    for sample in text_nes_list:
        plain_text = sample["text"]
        found = set()
        for ne in sample["ner_dicts"]:
            if ne["text"] == "":
                continue
            for m in re.finditer(re.escape(ne["text"]), plain_text):
                found.add((m.start(), m.end(), ne["ne_type"]))
        # Sort on the whole tuple: ordering by start offset alone would leave
        # the winner among equal starts to the iteration order of the set.
        ordered = sorted(found, key=lambda ent: (ent[0], ent[1], str(ent[2])))
        # remove entities with the same start offset (the last one wins)
        by_start = {}
        for ent in ordered:
            by_start[ent[0]] = ent
        result.append((plain_text, {"entities": list(by_start.values())}))
    return result

get_text_nes_list(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extracts all text nodes from given elements and their NE :param parent_nodes: An XPath expression pointing to those elements which text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tag NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to spacy-tags :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien", "ne_type": "LOC"}]}]

Source code in acdh_tei_pyutils/tei.py
def get_text_nes_list(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """pairs the plain text of each parent node with the NEs found inside it
    :param parent_nodes: An XPath expression pointing to
        those elements which text nodes should be extracted
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        Takes the parent node(s) as context
    :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
        spacy-tags
    :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",
        "ne_type": "LOC"}]}]
    """
    return [
        {
            "text": self.create_plain_text(parent),
            "ner_dicts": self.extract_ne_dicts(parent, ne_xpath, NER_TAG_MAP),
        }
        for parent in self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
    ]

command line interface

Console script for acdh_tei_pyutils.

add_base_id_next_prev(glob_pattern, base_value)

Console script add @xml:base, @xml:id and @prev @next attributes to root element

Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-g", "--glob-pattern", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option("-b", "--base-value")  # pragma: no cover
def add_base_id_next_prev(glob_pattern, base_value):  # pragma: no cover
    """Console script add @xml:base, @xml:id and @prev @next attributes to root element"""
    # The files matching the glob are processed in sorted order; each file is
    # rewritten in place with its own file name as @xml:id and the file names
    # of its neighbours as prev/next.
    paths = sorted(glob.glob(glob_pattern))
    neighbours = previous_and_next(paths)
    for before, current, after in tqdm.tqdm(neighbours, total=len(paths)):
        doc = TeiEnricher(current)
        id_value = os.path.basename(current)
        # the first file has no predecessor, the last one no successor
        prev_id = os.path.basename(before) if before else None
        next_id = os.path.basename(after) if after else None
        doc.add_base_and_id(base_value, id_value, prev_id, next_id)
        doc.tree_to_file(file=current)

add_handles(glob_pattern, hdl_user, hdl_pw, hdl_provider, hdl_prefix, hdl_resolver, hdl_xpath, hdlinsert_xpath)

Console script to register handles based on the values of @xml:id and @xml:base

Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-g", "--glob-pattern", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option("-user", "--hdl-user")  # pragma: no cover
@click.option("-pw", "--hdl-pw")  # pragma: no cover
@click.option(
    "-provider",
    "--hdl-provider",
    default="http://pid.gwdg.de/handles/",
    show_default=True,
)  # pragma: no cover
@click.option(
    "-prefix", "--hdl-prefix", default="21.11115", show_default=True
)  # pragma: no cover
@click.option(
    "-resolver", "--hdl-resolver", default="https://hdl.handle.net/", show_default=True
)  # pragma: no cover
@click.option(
    "-hxpath", "--hdl-xpath", default=".//tei:idno[@type='handle']", show_default=True
)  # pragma: no cover
@click.option(
    "-hixpath",
    "--hdlinsert-xpath",
    default=".//tei:publicationStmt/tei:p",
    show_default=True,
)  # pragma: no cover
def add_handles(
    glob_pattern,
    hdl_user,
    hdl_pw,
    hdl_provider,
    hdl_prefix,
    hdl_resolver,
    hdl_xpath,
    hdlinsert_xpath,
):  # pragma: no cover
    """Console script to register handels base on the values of @xml:id and @xml:base"""
    # For every file matching the glob a handle is registered for the
    # combination of @xml:base and @xml:id and written back into the file.
    # Files that already carry a handle, or that lack a full id, are skipped.
    files = sorted(glob.glob(glob_pattern))
    hdl_client = HandleClient(
        hdl_user,
        hdl_pw,
        hdl_provider=hdl_provider,
        hdl_prefix=hdl_prefix,
        hdl_resolver=hdl_resolver,
    )
    for x in tqdm.tqdm(files, total=len(files)):
        doc = TeiEnricher(x)
        # Check with the same xpath add_handle() uses below; otherwise a handle
        # would be registered remotely before add_handle() refuses to insert it.
        if doc.handle_exist(handle_xpath=hdl_xpath):
            continue
        parsed_data = doc.get_full_id()
        if parsed_data is None:
            continue
        hdl = hdl_client.register_handle(parsed_data)
        print(hdl)
        doc.add_handle(hdl, handle_xpath=hdl_xpath, insert_xpath=hdlinsert_xpath)
        doc.tree_to_file(x)

denormalize_indices(files, indices, mention_xpath, title_xpath, title_sec_xpath, date_xpath, blacklist_ids=[])

Write pointers to mentions in index-docs and copy index entries into docs

Source code in acdh_tei_pyutils/cli.py
def _split_ref_ids(ref):
    """Split a (possibly multi-valued) pointer attribute value into entity ids.

    The value is split on whitespace. A leading "#" is removed from every token;
    tokens without it are returned unchanged.
    """
    return [token[1:] if token.startswith("#") else token for token in ref.split()]


def _back_list_tag(entity_tag):
    """Return the qualified TEI list tag used to wrap entities of `entity_tag`.

    Returns None when the tag matches none of the supported entity kinds.
    """
    for suffixes, list_name in (
        (("person",), "listPerson"),
        (("place",), "listPlace"),
        (("org",), "listOrg"),
        (("bibl", "biblStruct"), "listBibl"),
        (("item",), "list"),
        (("event",), "listEvent"),
    ):
        if entity_tag.endswith(suffixes):
            return f"{{http://www.tei-c.org/ns/1.0}}{list_name}"
    return None


@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-x", "--title-xpath", default=".//tei:title/text()", show_default=True
)  # pragma: no cover
@click.option("-xs", "--title-sec-xpath", required=False)  # pragma: no cover
@click.option("-d", "--date-xpath", required=False)  # pragma: no cover
@click.option(
    "-b", "--blacklist-ids", default=[], multiple=True, show_default=True
)  # pragma: no cover
def denormalize_indices(
    files,
    indices,
    mention_xpath,
    title_xpath,
    title_sec_xpath,
    date_xpath,
    blacklist_ids=[],
):  # pragma: no cover
    """Write pointers to mentions in index-docs and copy index entries into docs"""
    # The docstring is the click help text, so details live in comments.
    #
    # The command runs three passes:
    #   1. collect, per entity id, the documents mentioning it (mention_xpath);
    #   2. attach the resulting mention list to each entry of the index files;
    #   3. rebuild tei:back of every document from the index entries it mentions.
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    for x in tqdm.tqdm(files):
        filename = os.path.split(x)[1]
        # files whose name contains "list" are not scanned for mentions
        if "list" in filename:
            continue
        doc = TeiEnricher(x)
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        # A non-matching xpath does not abort the run: the error text is stored
        # in place of the value and also printed.
        try:
            doc_title = doc.any_xpath(title_xpath)[0]
        except IndexError:
            doc_title = f"ERROR in title xpath of file: {doc_id}"
            print(f"ERROR in -x title xpath of file: {doc_id}")
        if title_sec_xpath:
            try:
                doc_title_sec = doc.any_xpath(title_sec_xpath)[0]
            except IndexError:
                doc_title_sec = f"ERROR in -xs secondary title xpath of file: {doc_id}"
                print(f"ERROR in secondary title xpath of file: {doc_id}")
        else:
            doc_title_sec = None
        if date_xpath:
            try:
                doc_date = doc.any_xpath(date_xpath)[0]
            except IndexError:
                doc_date = f"ERROR in date xpath of file: {doc_id}"
                print(f"ERROR in -d date xpath of file: {doc_id}")
        else:
            doc_date = None
        refs = doc.any_xpath(mention_xpath)
        # dict.fromkeys de-duplicates while keeping document order, so each
        # document is recorded at most once per entity and runs are reproducible.
        mentioned_ids = dict.fromkeys(
            ent_id for ref in refs for ent_id in _split_ref_ids(ref)
        )
        for ent_id in mentioned_ids:
            # a fresh dict per entity so the entries never share state
            ref_doc_dict[ent_id].append(
                {
                    "doc_uri": doc_uri,
                    "doc_id": doc_id,
                    "doc_path": x,
                    "doc_title": doc_title,
                    "doc_title_sec": doc_title_sec,
                    "doc_date": doc_date,
                }
            )
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict.keys())} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            ent_id = ent.xpath("@xml:id")[0]
            # blacklisted entries get no mention list
            if ent_id in blacklist_ids:
                continue
            note_grp = doc.create_mention_list(ref_doc_dict.get(ent_id, []))
            try:
                note_grp[0]  # probe only: IndexError means there is nothing to attach
            except IndexError:
                continue
            # TEI schema does not allow noteGrp in event after e.g. listPerson, ... so we need to insert it before
            if ent.tag == "{http://www.tei-c.org/ns/1.0}event":
                ent.insert(1, note_grp)
            else:
                ent.append(note_grp)
        doc.tree_to_file(file=x)

    # Re-read the index files just written, so the entries copied into the
    # documents below are taken from what is on disk.
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent

    click.echo(
        click.style(
            f"writing {len(all_ent_nodes)} index entries into {len(files)} files",
            fg="green",
        )
    )
    for x in tqdm.tqdm(files):
        # best effort per file: a failure is reported and the run continues
        try:
            doc = TeiEnricher(x)
            root_node = doc.any_xpath(".//tei:text")[0]
            # drop any existing tei:back first, so it is rebuilt from scratch and
            # its content does not contribute mentions
            for bad in doc.any_xpath(".//tei:back"):
                bad.getparent().remove(bad)
            refs = doc.any_xpath(mention_xpath)
            mentioned_ids = dict.fromkeys(
                ent_id for ref in refs for ent_id in _split_ref_ids(ref)
            )
            # group the mentioned index entries by their element tag
            ent_dict = defaultdict(list)
            for ent_id in mentioned_ids:
                if ent_id in all_ent_nodes:
                    index_ent = all_ent_nodes[ent_id]
                    ent_dict[index_ent.tag].append(index_ent)
            back_node = ET.Element("{http://www.tei-c.org/ns/1.0}back")
            for entity_tag, entities in ent_dict.items():
                list_tag = _back_list_tag(entity_tag)
                if list_tag is None:
                    # entries of an unsupported kind are not copied
                    continue
                list_node = ET.Element(list_tag)
                back_node.append(list_node)
                # NOTE(review): if ET is lxml.etree (getparent() above suggests so),
                # append() re-parents the node rather than copying it -- confirm.
                for ent in entities:
                    list_node.append(ent)
            root_node.append(back_node)
            doc.tree_to_file(file=x)
        except Exception as e:
            print(f"failed to process {x} due to {e}")
    click.echo(click.style("DONE", fg="green"))

mentions_to_indices(files, indices, mention_xpath, event_title, title_xpath)

Console script to write pointers to mentions into the index documents

Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-t", "--event-title", default="erwähnt in ", show_default=True
)  # pragma: no cover
@click.option(
    "-x",
    "--title-xpath",
    default='.//tei:title[@type="main"]/text()',
    show_default=True,
)  # pragma: no cover
def mentions_to_indices(
    files, indices, mention_xpath, event_title, title_xpath
):  # pragma: no cover
    """Console script write pointers to mentions in index-docs"""
    # The docstring is the click help text, so details live in comments.
    #
    # Pass 1 collects, per entity id, the documents that mention it.
    # Pass 2 attaches the resulting mention list to each entry of the index files.
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    for x in tqdm.tqdm(files):
        doc = TeiEnricher(x)
        # a document without @xml:base, @xml:id or a matching title aborts the
        # run with IndexError before any index file has been modified
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        doc_title = doc.any_xpath(title_xpath)[0]
        refs = doc.any_xpath(mention_xpath)
        # A pointer attribute may hold several whitespace-separated values; each
        # token is one entity id (leading "#" removed). dict.fromkeys keeps
        # document order and records the document once per entity.
        mentioned_ids = dict.fromkeys(
            token[1:] if token.startswith("#") else token
            for ref in refs
            for token in ref.split()
        )
        for ent_id in mentioned_ids:
            ref_doc_dict[ent_id].append(
                {
                    "doc_uri": doc_uri,
                    "doc_path": x,
                    "doc_title": doc_title,
                    "doc_id": doc_id,
                    "doc_date": None,
                    "doc_title_sec": None,
                }
            )
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict.keys())} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            ent_id = ent.xpath("@xml:id", namespaces=doc.nsmap)[0]
            note_grp = doc.create_mention_list(
                ref_doc_dict.get(ent_id, []), event_title
            )
            try:
                note_grp[0]  # probe only: IndexError means there is nothing to attach
            except IndexError:
                continue
            # TEI schema does not allow noteGrp in event after e.g. listPerson, ... so we need to insert it before
            if ent.tag == "{http://www.tei-c.org/ns/1.0}event":
                ent.insert(1, note_grp)
            else:
                ent.append(note_grp)
        doc.tree_to_file(file=x)
    click.echo(click.style("DONE", fg="green"))

schnitzler(files, indices, doc_person, doc_work)

Console script to copy the index entries (persons, works, places) belonging to each document into that document's tei:back

Source code in acdh_tei_pyutils/cli.py
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./data/editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./data/indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-p",
    "--doc-person",
    default="./data/indices/index_person_day.xml",
    show_default=True,
)  # pragma: no cover
@click.option(
    "-w",
    "-t",
    "--doc-work",
    default="./data/indices/index_work_day.xml",
    show_default=True,
)  # pragma: no cover
def schnitzler(files, indices, doc_person, doc_work):  # pragma: no cover
    """Console script to copy index entries (persons, works, places) into tei:back of each document"""
    # The docstring is the click help text, so details live in comments.
    #
    # Short flags: --doc-person and --doc-work used to both declare "-t", which
    # can only ever address one of them. They now have "-p" and "-w".
    # NOTE(review): "-t" is kept as an alias of --doc-work on the assumption that
    # click resolved the duplicate to the later declaration -- confirm.
    #
    # files       glob of the documents to enrich
    # indices     glob of the index files providing the entries (by @xml:id)
    # doc_person  file listing, per day, the ids of the persons for that day
    # doc_work    same as doc_person, for works
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    doc_person = TeiEnricher(doc_person)
    doc_work = TeiEnricher(doc_work)
    # every element with an @xml:id inside tei:body of the index files, by id
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent

    no_matches = []
    for x in tqdm.tqdm(files, total=len(files)):
        # the day key is the file name without the "entry__" marker and ".xml"
        day = os.path.basename(x).replace("entry__", "").replace(".xml", "")
        doc = TeiEnricher(x)
        root_node = doc.any_xpath(".//tei:text")[0]
        back_node = ET.Element("{http://www.tei-c.org/ns/1.0}back")
        for bad in doc.any_xpath(".//tei:back"):
            bad.getparent().remove(bad)

        # persons and works are looked up the same way in their per-day index
        xpath = f".//item[@target='{day}']/ref/text()"
        for day_index, list_tag in (
            (doc_person, "{http://www.tei-c.org/ns/1.0}listPerson"),
            (doc_work, "{http://www.tei-c.org/ns/1.0}listBibl"),
        ):
            list_node = ET.Element(list_tag)
            for ent_id in day_index.any_xpath(xpath):
                ent_node = all_ent_nodes.get(ent_id)
                if ent_node is None:
                    no_matches.append(ent_id)
                    continue
                list_node.append(ent_node)
            # len() counts child elements; empty lists are not added
            if len(list_node) > 0:
                back_node.append(list_node)

        # places come from the document itself; the first character of each
        # @ref (expected to be "#") is dropped to obtain the id
        list_place_node = ET.Element("{http://www.tei-c.org/ns/1.0}listPlace")
        for pl in doc.any_xpath('.//tei:rs[@ref and @type="place"]/@ref'):
            pl_node = all_ent_nodes.get(pl[1:])
            if pl_node is None:
                no_matches.append(pl)
                continue
            list_place_node.append(pl_node)
        if len(list_place_node) > 0:
            back_node.append(list_place_node)

        # the file is only rewritten when at least one list was produced
        if len(back_node) > 0:
            root_node.append(back_node)
            doc.tree_to_file(file=x)
    # report every id that could not be resolved against the index files
    distinct_no_match = set(no_matches)
    print(distinct_no_match)