API Documentation

This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the acdh_tei_pyutils project code.

acdh_tei_pyutils.utils

a collection of helper functions for either

  • processing (TEI)/XML nodes, e.g. extracting information from them, or
  • querying/parsing (TEI)/XML nodes/trees

add_graphic_url_to_pb(doc)

writes url attributes into tei:pb elements fetched from matching tei:surface//tei:graphic[1] elements

Source code in src/acdh_tei_pyutils/utils.py
50
51
52
53
54
55
56
57
58
59
60
def add_graphic_url_to_pb(doc: TeiReader) -> TeiReader:
    """Copy graphic URLs onto the page breaks that point at them.

    For every ``tei:pb`` carrying a ``@facs`` attribute, the ``tei:surface``
    whose ``@xml:id`` equals the (hash-stripped) ``@facs`` value is looked up
    and the ``@url`` of its first ``tei:graphic`` is written to ``tei:pb/@url``.
    Page breaks without a matching graphic are left untouched.

    :param doc: the document to process; it is modified in place
    :return: the same ``doc`` object
    """
    for page_break in doc.any_xpath(".//tei:pb[@facs]"):
        surface_id = check_for_hash(page_break.attrib["facs"])
        graphic_urls = doc.any_xpath(
            f'.//tei:surface[@xml:id="{surface_id}"]//tei:graphic[1]/@url'
        )
        # no surface/graphic for this page break: nothing to write
        if graphic_urls:
            page_break.attrib["url"] = graphic_urls[0]
    return doc

any_xpath(node, any_xpath)

Runs any XPath expression against the passed-in node and provides common namespace prefixes like tei:, xml: or skos:. :param node: An lxml.etree Element :param any_xpath: Any XPath expression, e.g. .//tei:rs :return: The result of the xpath

Source code in src/acdh_tei_pyutils/utils.py
12
13
14
15
16
17
18
19
def any_xpath(node: Element, any_xpath: str) -> list:
    """Evaluate an XPath expression against ``node``.

    The module-level ``NSMAP`` is passed as namespace mapping, so the
    expression may use its common prefixes such as `tei:`, `xml:` or `skos:`.

    :param node: An lxml.etree Element
    :param any_xpath: Any XPath expression, e.g. .//tei:rs
    :return: The result of the xpath
    """
    xpath_result = node.xpath(any_xpath, namespaces=NSMAP)
    return xpath_result

check_for_hash(value)

checks if value starts with '#' and if so removes the '#' from the returned value

Source code in src/acdh_tei_pyutils/utils.py
42
43
44
45
46
47
def check_for_hash(value: str) -> str:
    """Return ``value`` without a single leading '#'; other values pass through unchanged."""
    return value[1:] if value.startswith("#") else value

crate_tag_whitelist(element, tag_blacklist)

lists all unique elements from a given node and returns only those not in the given blacklist

Source code in src/acdh_tei_pyutils/utils.py
27
28
29
30
31
32
def crate_tag_whitelist(element: ET.Element, tag_blacklist: list) -> list:
    """Collect the distinct tags found below ``element`` that are not blacklisted.

    :param element: the node whose element tree is inspected (the node itself included)
    :param tag_blacklist: tags that must not appear in the result
    :return: a list of unique tags, in no particular order
    """
    allowed_tags = set()
    for node in element.iter(tag=ET.Element):
        if node.tag not in tag_blacklist:
            allowed_tags.add(node.tag)
    return list(allowed_tags)

extract_fulltext(root_node, tag_blacklist=[])

extracts all fulltext from given element and its children, except from blacklisted elements

Source code in src/acdh_tei_pyutils/utils.py
35
36
37
38
39
def extract_fulltext(root_node: ET.Element, tag_blacklist: list = []) -> str:
    """Extract the whitespace-normalized full text of an element and its children.

    Text sitting inside blacklisted elements is left out.

    :param root_node: the element to extract the text from
    :param tag_blacklist: tags whose text must be skipped; the default list is
        only read, never modified
    :return: the extracted text with all whitespace runs collapsed to single spaces
    """
    tags = crate_tag_whitelist(root_node, tag_blacklist)
    if not tags:
        # every tag is blacklisted; calling itertext() without any tag filter
        # would return *all* text instead of none
        return ""
    raw_text = "".join(root_node.itertext(*tags))
    return " ".join(raw_text.split())

extract_fulltext_with_spacing(root_node, tag_blacklist=None, block_elements=['p', 'salute', 'dateline', 'closer', 'seg', 'opener', 'div', 'head'])

Extract full text content from an XML element tree with proper spacing. This function recursively traverses an XML element tree and extracts all text content while preserving logical spacing around block-level elements. It handles XML namespaces and respects a blacklist of elements to exclude from extraction. Taken from https://github.com/arthur-schnitzler/schnitzler-briefe-static/blob/main/python/make_typesense_index.py Args: root_node: The root XML element from which to extract text. tag_blacklist (list, optional): A list of element tag names to exclude from text extraction. Elements with tags in this list will be skipped entirely. Defaults to None (empty list). block_elements (tuple, optional): A tuple of tag names that should have spaces added around them. Defaults to ('p', 'salute', 'dateline', 'closer', 'seg', 'opener', 'div', 'head'). Returns: str: The extracted text with normalized spacing. Multiple consecutive whitespace characters are collapsed into a single space, and the result is stripped of leading/trailing whitespace. Notes: - Handles XML namespaced tags by extracting the local name (part after '}'). - Special handling for 'space' elements with unit='chars' attribute. - Preserves tail text from child elements. - Automatically collapses multiple spaces into single spaces using regex.

Source code in src/acdh_tei_pyutils/utils.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
def extract_fulltext_with_spacing(
    root_node,
    tag_blacklist=None,
    block_elements=[
        "p",
        "salute",
        "dateline",
        "closer",
        "seg",
        "opener",
        "div",
        "head",
    ],
):
    """
    Extract full text content from an XML element tree with proper spacing.
    This function recursively traverses an XML element tree and extracts all text
    content while preserving logical spacing around block-level elements. It handles
    XML namespaces and respects a blacklist of elements to exclude from extraction.
    Taken from https://github.com/arthur-schnitzler/schnitzler-briefe-static/blob/main/python/make_typesense_index.py
    Args:
        root_node: The root XML element from which to extract text.
        tag_blacklist (list, optional): A list of element tag names to exclude from
            text extraction. Elements with tags in this list will be skipped entirely.
            Defaults to None (empty list).
        block_elements (tuple, optional): A tuple of tag names that should have spaces
            added around them. Defaults to ('p', 'salute', 'dateline', 'closer', 'seg',
            'opener', 'div', 'head').
    Returns:
        str: The extracted text with normalized spacing. Multiple consecutive
            whitespace characters are collapsed into a single space, and the
            result is stripped of leading/trailing whitespace.
    Notes:
        - Handles XML namespaced tags by extracting the local name (part after '}').
        - Special handling for 'space' elements with unit='chars' attribute.
        - Preserves tail text from child elements.
        - Automatically collapses multiple spaces into single spaces using regex.
    """

    if tag_blacklist is None:
        tag_blacklist = []

    def extract_text_recursive(element):
        try:
            if hasattr(element.tag, "split"):
                element_tag_name = element.tag.split("}")[-1]
            else:
                element_tag_name = str(element.tag).split("}")[-1]
        except (AttributeError, TypeError):
            element_tag_name = ""

        if element_tag_name in tag_blacklist:
            return ""

        text_parts = []

        if element.text:
            text_parts.append(element.text)

        # Process children
        for child in element:
            try:
                if hasattr(child.tag, "split"):
                    tag_name = child.tag.split("}")[-1]  # Remove namespace
                else:
                    tag_name = str(child.tag).split("}")[-1]
            except (AttributeError, TypeError):
                # Skip if we can't determine the tag name
                if hasattr(child, "tail") and child.tail:
                    text_parts.append(child.tail)
                continue

            # Handle space elements
            if tag_name == "space":
                unit = child.get("unit", "")
                if unit == "chars":
                    # Add space for char-based spacing elements
                    text_parts.append(" ")
                # Add tail text before continuing
                if child.tail:
                    text_parts.append(child.tail)
                continue

            # Add space before block elements
            if tag_name in block_elements:
                text_parts.append(" ")

            # Process child recursively
            child_text = extract_text_recursive(child)
            if child_text:
                text_parts.append(child_text)

            # Add space after block elements
            if tag_name in block_elements:
                text_parts.append(" ")

            # Add tail text
            if child.tail:
                text_parts.append(child.tail)

        return "".join(text_parts)

    result = extract_text_recursive(root_node)
    result = re.sub(r"\s+", " ", result).strip()
    return result

get_birth_death_year(person_node, xpath_part='@when', birth=True)

tries to extract birth and death years from person nodes and returns either None or the year as Integer

Source code in src/acdh_tei_pyutils/utils.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def get_birth_death_year(
    person_node: ET.Element, xpath_part: str = "@when", birth: bool = True
) -> Union[int, None]:
    """Extract the birth or death year from a person node.

    :param person_node: the element containing ``tei:birth`` / ``tei:death`` children
    :param xpath_part: XPath step, relative to ``tei:birth`` / ``tei:death``,
        selecting the date value (by default the ``@when`` attribute)
    :param birth: ``True`` reads ``tei:birth``, ``False`` reads ``tei:death``
    :return: the year as integer, or ``None`` if no date value is found or
        its leading part is not a number
    """
    event = "birth" if birth else "death"
    try:
        date_str = any_xpath(person_node, f"./tei:{event}/{xpath_part}")[0]
    except IndexError:
        return None
    date_str = date_str.strip()
    # a leading minus sign (years BCE) is part of the year, e.g. "-0044-03-15"
    year_length = 5 if date_str.startswith("-") else 4
    try:
        return int(date_str[:year_length])
    except ValueError:
        return None

get_xmlid(element)

returns an @xml:id of the given node

Source code in src/acdh_tei_pyutils/utils.py
22
23
24
def get_xmlid(element: ET.Element) -> str:
    """Return the value of the node's @xml:id attribute (KeyError if it has none)."""
    xml_id_key = "{http://www.w3.org/XML/1998/namespace}id"
    return element.attrib[xml_id_key]

make_bibl_label(node, no_author='o.A.', no_title='o.T.', year='o.J.', editor_abbr='(Hg.)', max_title_length=75)

creates a nice, bibliographically useful label from the passed-in tei:biblStruct element

Parameters:
  • node (Element) –

    a tei:biblStruct element

  • no_author (str, default: 'o.A.' ) –

    Used if no author name can be extracted. Defaults to "o.A.".

  • no_title (str, default: 'o.T.' ) –

    Used if no title can be extracted. Defaults to "o.T.".

  • year (str, default: 'o.J.' ) –

    Used if no year can be extracted. Defaults to "o.J.".

  • editor_abbr (str, default: '(Hg.)' ) –

    how to mark the 'author' being an editor. Defaults to "(Hg.)".

  • max_title_length (int, default: 75 ) –

    max length of the title before it gets truncated. Defaults to 75.

Returns:
  • str( str ) –

    A nice, bibliographically useful label

Source code in src/acdh_tei_pyutils/utils.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def make_bibl_label(
    node: ET.Element,
    no_author="o.A.",
    no_title="o.T.",
    year="o.J.",
    editor_abbr="(Hg.)",
    max_title_length=75,
) -> str:
    """creates a nice, bibliographically useful label from the passed in tei:biblStruct element

    The author part is taken from the first of these that holds text:
    author surname, author name, editor surname, editor name. Editor names
    get ``editor_abbr`` appended.

    Args:
        node (ET.Element): a tei:biblStruct element
        no_author (str, optional): Used if no author name can be extracted. Defaults to "o.A.".
        no_title (str, optional): Used if no title can be extracted. Defaults to "o.T.".
        year (str, optional): Used if no year can be extracted. Defaults to "o.J.".
        editor_abbr (str, optional): how to mark the 'author' being an editor. Defaults to "(Hg.)".
        max_title_length (int, optional): max length of the title before it gets
            truncated (and suffixed with "..."). Defaults to 75.

    Returns:
        str: A label of the form "<author>, <title>, <year>"
    """
    # (xpath, is_editor) in order of preference
    name_candidates = (
        (".//tei:author[1]/tei:surname[1]", False),
        (".//tei:author[1]/tei:name[1]", False),
        (".//tei:editor[1]/tei:surname[1]", True),
        (".//tei:editor[1]/tei:name[1]", True),
    )
    author = no_author
    for name_xpath, is_editor in name_candidates:
        hits = any_xpath(node, name_xpath)
        # an element without text must not end up as "None" in the label
        if hits and hits[0].text:
            name = hits[0].text
            author = f"{name} {editor_abbr}" if is_editor else name
            break

    dates = any_xpath(node, ".//tei:date[1]")
    if dates and dates[0].text is not None:
        source_year = dates[0].text
    else:
        source_year = year

    titles = any_xpath(node, ".//tei:title[1]")
    title = titles[0].text if titles else no_title
    if title:
        if len(title) > max_title_length:
            title = f"{title[:max_title_length]}..."
    else:
        title = no_title
    return f"{author}, {title}, {source_year}"

make_entity_label(name_node, default_msg='no label provided', default_lang='en')

Extracts a label and a lang tag from the passed-in name node

Parameters:
  • name_node (Element) –

    A tei:persName|placeName|orgName element

  • default_msg (str, default: 'no label provided' ) –

    some default value for the label.

  • default_lang (str, default: 'en' ) –

    some default lang tag if the node does not provide an xml:lang attribute.

Returns:
  • tuple[str, str]

    tuple[str, str]: returns the extracted label and a lang tag

Source code in src/acdh_tei_pyutils/utils.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def make_entity_label(
    name_node: ET.Element, default_msg="no label provided", default_lang="en"
) -> tuple[str, str]:
    """Build a display label and a language tag from the passed-in name node

    The label is "<first surname>, <all forenames>" when both kinds of name
    parts exist, only the forenames or only the first surname when one kind
    is missing, and otherwise the normalized text of the whole node.

    Args:
        name_node (ET.Element): A tei:persName|placeName|orgName element
        default_msg (str, optional): label to use if the extracted label is empty.
        default_lang (str, optional): lang tag to use if the node has no xml:lang attribute.

    Returns:
        tuple[str, str]: returns the extracted label and a lang tag
    """

    def normalized_texts(xpath_expr):
        return [normalize_string(text) for text in any_xpath(name_node, xpath_expr)]

    lang_tag = name_node.get("{http://www.w3.org/XML/1998/namespace}lang", default_lang)
    fornames = normalized_texts(".//tei:forename//text()")
    surnames = normalized_texts(".//tei:surname//text()")

    if fornames and surnames:
        label = f"{surnames[0]}, {' '.join(fornames)}"
    elif fornames:
        label = " ".join(fornames)
    elif surnames:
        label = surnames[0]
    else:
        # no structured name parts: fall back to all text of the node
        label = normalize_string(" ".join(any_xpath(name_node, ".//text()")))

    if not label:
        label = default_msg
    return label, lang_tag

normalize_string(string)

removes any superfluous whitespace from a given string

Source code in src/acdh_tei_pyutils/utils.py
90
91
92
def normalize_string(string: str) -> str:
    """Collapse every whitespace run into one space and trim both ends."""
    words = string.split()
    return " ".join(words)

previous_and_next(some_iterable)

taken from https://stackoverflow.com/a/1012089

Source code in src/acdh_tei_pyutils/utils.py
82
83
84
85
86
87
def previous_and_next(some_iterable):  # pragma: no cover
    """Iterate over (previous, current, next) triples of ``some_iterable``.

    ``previous`` is None for the first item and ``next`` is None for the last.
    taken from https://stackoverflow.com/a/1012089
    """
    behind, current, ahead = tee(some_iterable, 3)
    # shift one copy a step back and one a step forward, padding with None
    lagging = chain([None], behind)
    leading = chain(islice(ahead, 1, None), [None])
    return zip(lagging, current, leading)

acdh_tei_pyutils.tei

TeiEnricher

Bases: TeiReader

a class to enrich tei-documents

Source code in src/acdh_tei_pyutils/tei.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
class TeiEnricher(TeiReader):
    """a class to enrich tei-documents"""

    def add_base_and_id(self, base_value, id_value, prev_value, next_value):
        """adds @xml:base, @xml:id, @prev and @next to the root element

        Arguments that are empty or None are skipped.

        :param base_value: The value of the @xml:base
        :type base_value: str

        :param id_value: The value of the @xml:id
        :type id_value: str

        :param prev_value: id of the previous document; written to @prev,
            prefixed with `base_value` if there is one
        :type prev_value: str

        :param next_value: id of the next document; written to @next,
            prefixed with `base_value` if there is one
        :type next_value: str

        :return: the updated tree
        """

        base = self.any_xpath("//tei:TEI")[0]
        if base_value:
            base.set(f"{{{self.ns_xml['xml']}}}base", base_value)
        if id_value:
            base.set(f"{{{self.ns_xml['xml']}}}id", id_value)
        # join base and id with exactly one slash (as get_full_id does) and
        # never emit a literal "None/" prefix when no base is given
        prefix = f"{str(base_value).rstrip('/')}/" if base_value else ""
        if prev_value:
            base.set("prev", f"{prefix}{prev_value}")
        if next_value:
            base.set("next", f"{prefix}{next_value}")
        return self.tree

    def get_full_id(self):
        """returns the combination of @xml:base and @xml:id

        :return: @xml:base and @xml:id of the root element joined by a single
            slash, or None if one of the two attributes is missing
        :rtype: str, None

        """
        base = self.any_xpath("//tei:TEI")[0]
        try:
            base_base = base.xpath("./@xml:base", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        try:
            base_id = base.xpath("./@xml:id", namespaces=self.ns_xml)[0]
        except IndexError:
            return None
        if base_base.endswith("/"):
            return f"{base_base}{base_id}"
        else:
            return f"{base_base}/{base_id}"

    def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
        """checks if a handle is already assigned

        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str

        :return: the text of the first matching node, or None if nothing matches
        :rtype: str, None
        """
        try:
            return self.any_xpath(handle_xpath)[0].text
        except IndexError:
            return None

    def add_handle(
        self,
        handle,
        handle_xpath='.//tei:idno[@type="handle"]',
        insert_xpath=".//tei:publicationStmt/tei:p",
    ):
        """adds an idno @type=handle element into tei:publicationStmt

        :param handle: the handle
        :type handle: str

        :param handle_xpath: an xpath expression where to look for a handle
        :type handle_xpath: str

        :param insert_xpath: an xpath expression selecting the node the new
            idno element gets appended to (the first match is used)
        :type insert_xpath: str

        :raises: `HandleAlreadyExist` Error

        :returns: the idno node
        """
        tei_ns = f"{self.ns_tei['tei']}"
        # look the handle up once, with the caller's xpath, so the error
        # message reports the handle that actually triggered it
        existing_handle = self.handle_exist(handle_xpath=handle_xpath)
        if existing_handle:
            raise HandleAlreadyExist(
                f"a handle: {existing_handle} is already registered"
            )
        idno_node = ET.Element(f"{{{tei_ns}}}idno")
        idno_node.set("type", "handle")
        idno_node.text = handle
        insert_node = self.any_xpath(insert_xpath)[0]
        insert_node.append(idno_node)
        return idno_node

    def create_mention_list(self, mentions, event_title=""):
        """creates a tei:noteGrp element with one tei:note per mentioning document

        Documents are de-duplicated by their slugified `doc_id`; only the
        first mention of each document is kept.

        :param mentions: a list of dicts with keys `doc_id` and `doc_title`,
            optionally `doc_date` (written to @corresp) and `doc_title_sec`
            (appended to the title)
        :type mentions: list

        :param event_title: text put in front of the note text of mentions
            that have a `doc_title_sec`
        :type event_title: str

        :return: an etree element (tei:noteGrp)
        """
        tei_ns = f"{self.ns_tei['tei']}"
        node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
        seen_ids = set()
        for mention in mentions:
            mention_key = slugify(mention["doc_id"])
            if mention_key in seen_ids:
                continue
            seen_ids.add(mention_key)
            note = ET.Element(f"{{{tei_ns}}}note")
            note.attrib["target"] = mention["doc_id"]
            note.attrib["type"] = "mentions"
            doc_date = mention.get("doc_date")
            if doc_date is not None:
                note.attrib["corresp"] = doc_date
            title_sec = mention.get("doc_title_sec")
            if title_sec is not None:
                # NOTE(review): event_title is only used in this branch —
                # confirm this is intended
                note.text = event_title + f"{mention['doc_title']} {title_sec}"
            else:
                note.text = mention["doc_title"]
            node_root.append(note)
        return node_root

add_base_and_id(base_value, id_value, prev_value, next_value)

adds @xml:base and @xml:id and next and prev to root element

:param base_value: The value of the @xml:base :type base_value: str

:return: the updated tree

Source code in src/acdh_tei_pyutils/tei.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def add_base_and_id(self, base_value, id_value, prev_value, next_value):
    """adds @xml:base, @xml:id, @prev and @next to the root element

    Arguments that are empty or None are skipped.

    :param base_value: The value of the @xml:base
    :type base_value: str

    :param id_value: The value of the @xml:id
    :type id_value: str

    :param prev_value: id of the previous document; written to @prev,
        prefixed with `base_value` if there is one
    :type prev_value: str

    :param next_value: id of the next document; written to @next,
        prefixed with `base_value` if there is one
    :type next_value: str

    :return: the updated tree
    """

    base = self.any_xpath("//tei:TEI")[0]
    if base_value:
        base.set(f"{{{self.ns_xml['xml']}}}base", base_value)
    if id_value:
        base.set(f"{{{self.ns_xml['xml']}}}id", id_value)
    # join base and id with exactly one slash (as get_full_id does) and
    # never emit a literal "None/" prefix when no base is given
    prefix = f"{str(base_value).rstrip('/')}/" if base_value else ""
    if prev_value:
        base.set("prev", f"{prefix}{prev_value}")
    if next_value:
        base.set("next", f"{prefix}{next_value}")
    return self.tree

add_handle(handle, handle_xpath='.//tei:idno[@type="handle"]', insert_xpath='.//tei:publicationStmt/tei:p')

adds an idno @type=handle element into tei:publicationStmt

:param handle: the handle :type handle: str

:param handle_xpath: an xpath expression where to look for an handle :type handle_xpath: str

:raises: HandleAlreadyExist Error

:returns: the idno node

Source code in src/acdh_tei_pyutils/tei.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
def add_handle(
    self,
    handle,
    handle_xpath='.//tei:idno[@type="handle"]',
    insert_xpath=".//tei:publicationStmt/tei:p",
):
    """adds an idno @type=handle element into tei:publicationStmt

    :param handle: the handle
    :type handle: str

    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str

    :param insert_xpath: an xpath expression selecting the node the new
        idno element gets appended to (the first match is used)
    :type insert_xpath: str

    :raises: `HandleAlreadyExist` Error

    :returns: the idno node
    """
    tei_ns = f"{self.ns_tei['tei']}"
    # look the handle up once, with the caller's xpath, so the error
    # message reports the handle that actually triggered it
    existing_handle = self.handle_exist(handle_xpath=handle_xpath)
    if existing_handle:
        raise HandleAlreadyExist(
            f"a handle: {existing_handle} is already registered"
        )
    idno_node = ET.Element(f"{{{tei_ns}}}idno")
    idno_node.set("type", "handle")
    idno_node.text = handle
    insert_node = self.any_xpath(insert_xpath)[0]
    insert_node.append(idno_node)
    return idno_node

create_mention_list(mentions, event_title='')

creates a tei element with notes of mentions

:param mentions: a list of dicts with keys doc_id, doc_title, doc_date and doc_title_sec :type mentions: list

:return: a etree.element

Source code in src/acdh_tei_pyutils/tei.py
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
def create_mention_list(self, mentions, event_title=""):
    """creates a tei:noteGrp element with one tei:note per mentioning document

    Documents are de-duplicated by their slugified `doc_id`; only the
    first mention of each document is kept.

    :param mentions: a list of dicts with keys `doc_id` and `doc_title`,
        optionally `doc_date` (written to @corresp) and `doc_title_sec`
        (appended to the title)
    :type mentions: list

    :param event_title: text put in front of the note text of mentions
        that have a `doc_title_sec`
    :type event_title: str

    :return: an etree element (tei:noteGrp)
    """
    tei_ns = f"{self.ns_tei['tei']}"
    node_root = ET.Element(f"{{{tei_ns}}}noteGrp")
    seen_ids = set()
    for mention in mentions:
        mention_key = slugify(mention["doc_id"])
        if mention_key in seen_ids:
            continue
        seen_ids.add(mention_key)
        note = ET.Element(f"{{{tei_ns}}}note")
        note.attrib["target"] = mention["doc_id"]
        note.attrib["type"] = "mentions"
        doc_date = mention.get("doc_date")
        if doc_date is not None:
            note.attrib["corresp"] = doc_date
        title_sec = mention.get("doc_title_sec")
        if title_sec is not None:
            # NOTE(review): event_title is only used in this branch —
            # confirm this is intended
            note.text = event_title + f"{mention['doc_title']} {title_sec}"
        else:
            note.text = mention["doc_title"]
        node_root.append(note)
    return node_root

get_full_id()

returns the combination of @xml:base and @xml:id

:return: combination of @xml:base and @xml:id :rtype: str

Source code in src/acdh_tei_pyutils/tei.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def get_full_id(self):
    """returns the combination of @xml:base and @xml:id

    :return: @xml:base and @xml:id of the root element joined by a single
        slash, or None if one of the two attributes is missing
    :rtype: str, None

    """
    root = self.any_xpath("//tei:TEI")[0]
    values = []
    for attribute_xpath in ("./@xml:base", "./@xml:id"):
        try:
            values.append(root.xpath(attribute_xpath, namespaces=self.ns_xml)[0])
        except IndexError:
            return None
    xml_base, xml_id = values
    # avoid a double slash when the base already ends with one
    separator = "" if xml_base.endswith("/") else "/"
    return f"{xml_base}{separator}{xml_id}"

handle_exist(handle_xpath='.//tei:idno[@type="handle"]')

checks if a handle is already assigned

:return: the registered handle or None :rtype: str, None

Source code in src/acdh_tei_pyutils/tei.py
198
199
200
201
202
203
204
205
206
207
def handle_exist(self, handle_xpath='.//tei:idno[@type="handle"]'):
    """checks if a handle is already assigned

    :param handle_xpath: an xpath expression where to look for a handle
    :type handle_xpath: str

    :return: the text of the first matching node, or None if nothing matches
    :rtype: str, None
    """
    try:
        first_match = self.any_xpath(handle_xpath)[0]
    except IndexError:
        return None
    return first_match.text

TeiReader

Bases: XMLReader

a class to read and process tei-documents

Source code in src/acdh_tei_pyutils/tei.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class TeiReader(XMLReader):
    """a class to read and process tei-documents"""

    def any_xpath(self, any_xpath="//tei:rs"):
        """Runs any xpath expressions against the parsed document
        :param any_xpath: Any XPath expression.
        :return: The result of the xpath
        """
        return self.tree.xpath(any_xpath, namespaces=self.ns_tei)

    def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
        """extract elements tagged as named entities
        :param parent_node: The node the XPath expression is evaluated on.
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            An expression starting with `//` (like the default) is absolute, i.e. it
            selects from the whole document and not only from below `parent_node`.
        :return: A list of elements
        """
        return parent_node.xpath(ne_xpath, namespaces=self.ns_tei)

    def extract_ne_dicts(
        self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
    ):
        """extract strings tagged as named entities
        :param parent_node: The node the XPath expression is evaluated on.
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags. It is looked up with the element's @type or, if there is no
            @type, with the element's name; unknown keys map to 'MISC'.
        :return: A list of NE-dicts containing the 'text' and the 'ne_type'
        """
        ne_dicts = []
        for element in self.extract_ne_elements(parent_node, ne_xpath):
            text = "".join(element.xpath(".//text()"))
            type_values = element.xpath("./@type")
            if type_values:
                lookup_key = "{}".format(type_values[0])
            else:
                lookup_key = "{}".format(element.xpath("name()"))
            ne_dicts.append(
                {
                    "text": re.sub(r"\s+", " ", text).strip(),
                    "ne_type": NER_TAG_MAP.get(lookup_key, "MISC"),
                }
            )
        return ne_dicts

    def create_plain_text(self, node):
        """extracts all text nodes from given element
        :param node: The element which text nodes should be extracted
        :return: A normalized, cleaned plain text
        """
        return re.sub(r"\s+", " ", "".join(node.xpath(".//text()"))).strip()

    def get_text_nes_list(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts all text nodes from given elements and their NE
        :param parent_nodes: An XPath expression pointing to
            those elements which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",
            "ne_type": "LOC"}]}]
        """
        return [
            {
                "text": self.create_plain_text(node),
                "ner_dicts": self.extract_ne_dicts(node, ne_xpath, NER_TAG_MAP),
            }
            for node in self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
        ]

    def extract_ne_offsets(
        self,
        parent_nodes=".//tei:body//tei:p",
        ne_xpath=".//tei:rs",
        NER_TAG_MAP=NER_TAG_MAP,
    ):
        """extracts offsets of NEs and the NE-type
        :param parent_nodes: An XPath expression pointing to
            those elements which text nodes should be extracted
        :param ne_xpath: An XPath expression pointing to elements used to tag NEs.
            Takes the parent node(s) as context
        :param NER_TAG_MAP: A dictionary providing mapping from TEI tags used to tag NEs to
            spacy-tags
        :return: A list of spacy-like NER tuples
            [('some text', {'entities': [(15, 19, 'place')]})]. Every occurrence of an NE
            string in the text is reported; of several entities starting at the same
            offset only one (the longest) is kept.
        """
        result = []
        for text_item in self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP):
            plain_text = text_item["text"]
            spans = set()
            for ne in text_item["ner_dicts"]:
                if ne["text"] != "":
                    for match in re.finditer(re.escape(ne["text"]), plain_text):
                        spans.add((match.start(), match.end(), ne["ne_type"]))
            # sort on the complete span: sorting on the start offset alone left
            # the order of same-start entities to the set's iteration order
            ordered = sorted(spans, key=lambda span: (span[0], span[1], str(span[2])))
            # keep an entity only if its successor starts somewhere else
            ents = [
                span
                for index, span in enumerate(ordered)
                if index + 1 == len(ordered) or span[0] != ordered[index + 1][0]
            ]
            result.append((plain_text, {"entities": ents}))
        return result

any_xpath(any_xpath='//tei:rs')

Runs any xpath expressions against the parsed document :param any_xpath: Any XPath expression. :return: The result of the xpath

Source code in src/acdh_tei_pyutils/tei.py
30
31
32
33
34
35
def any_xpath(self, any_xpath="//tei:rs"):
    """Evaluate an arbitrary XPath expression on the parsed document.

    :param any_xpath: The XPath expression to evaluate, `//tei:rs` by default.
    :return: Whatever `self.tree.xpath` yields for the expression
    """
    tree = self.tree
    # the namespace map stored on the instance is handed to every query
    namespaces = self.ns_tei
    return tree.xpath(any_xpath, namespaces=namespaces)

create_plain_text(node)

extracts all text nodes from the given element :param node: The element whose text nodes should be extracted :return: A normalized, cleaned plain text

Source code in src/acdh_tei_pyutils/tei.py
71
72
73
74
75
76
77
78
79
def create_plain_text(self, node):
    """Join all text below an element into one whitespace-normalized string.

    :param node: The element whose descendant text nodes are collected; it is
    queried with `node.xpath(".//text()")`
    :return: The concatenated text with every whitespace run collapsed to a
    single space and without leading/trailing whitespace
    """
    text_nodes = node.xpath(".//text()")
    joined = "".join(text_nodes)
    collapsed = re.sub(r"\s+", " ", joined)
    return collapsed.strip()

extract_ne_dicts(parent_node, ne_xpath='//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extract strings tagged as named entities :param parent_node: The element used as context for ne_xpath :param ne_xpath: An XPath expression pointing to elements used to tag NEs. :param NER_TAG_MAP: A dictionary providing a mapping from TEI tags used to tag NEs to spacy-tags :return: A list of NE-dicts containing the 'text' and the 'ne_type'

Source code in src/acdh_tei_pyutils/tei.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def extract_ne_dicts(
    self, parent_node, ne_xpath="//tei:rs", NER_TAG_MAP=NER_TAG_MAP
):
    """Describe every named-entity element found via `ne_xpath` as a dict.

    :param parent_node: The node handed to `self.extract_ne_elements` as context
    :param ne_xpath: An XPath expression selecting the elements that tag NEs.
    :param NER_TAG_MAP: A dictionary mapping the TEI `@type` value (or, when the
    element has no `@type`, its element name) to the label to use; unknown keys
    fall back to "MISC"
    :return: A list of dicts with the keys 'text' (whitespace-normalized element
    text) and 'ne_type'
    """

    ne_dicts = []
    for element in self.extract_ne_elements(parent_node, ne_xpath):
        raw_text = "".join(element.xpath(".//text()"))
        normalized = re.sub(r"\s+", " ", raw_text).strip()
        try:
            # prefer the @type attribute as lookup key ...
            lookup_key = element.xpath("./@type")[0]
        except IndexError:
            # ... and fall back to the element name if there is none
            lookup_key = element.xpath("name()")
        label = NER_TAG_MAP.get(f"{lookup_key}", "MISC")
        ne_dicts.append({"text": normalized, "ne_type": label})

    return ne_dicts

extract_ne_elements(parent_node, ne_xpath='//tei:rs')

extract elements tagged as named entities :param parent_node: The element the XPath is evaluated against :param ne_xpath: An XPath expression pointing to elements used to tag NEs. :return: A list of elements

Source code in src/acdh_tei_pyutils/tei.py
37
38
39
40
41
42
43
44
def extract_ne_elements(self, parent_node, ne_xpath="//tei:rs"):
    """Select the elements that tag named entities.

    :param parent_node: The node whose `xpath` method is called with `ne_xpath`
    :param ne_xpath: An XPath expression selecting the elements that tag NEs.
    :return: The result of the XPath query (a list of elements)
    """

    return parent_node.xpath(ne_xpath, namespaces=self.ns_tei)

extract_ne_offsets(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extracts offsets of NEs and the NE-type :param parent_nodes: An XPath expression pointing to those elements whose text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tag NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing a mapping from TEI tags used to tag NEs to spacy-tags :return: A list of spacy-like NER Tuples [('some text'), {'entities': [(15, 19, 'place')]}]

Source code in src/acdh_tei_pyutils/tei.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def extract_ne_offsets(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """ extracts offsets of NEs and the NE-type

    Every NE text is searched literally in the plain text of its parent node, so
    each occurrence of the string is reported, not only the tagged one. When
    several matches start at the same offset only one is kept: the longest one,
    ties being broken by the label, so the result is reproducible between runs.

    :param parent_nodes: An XPath expression pointing to\
    those elements whose text nodes should be extracted
    :param ne_xpath: An XPath expression pointing to elements used to tag NEs.\
    Takes the parent node(s) as context
    :param NER_TAG_MAP: A dictionary providing a mapping from TEI tags used to tag NEs to\
    spacy-tags
    :return: A list of spacy-like NER Tuples [('some text'), {'entities': [(15, 19, 'place')]}]
    """

    result = []
    for item in self.get_text_nes_list(parent_nodes, ne_xpath, NER_TAG_MAP):
        plain_text = item["text"]
        # a set drops identical (start, end, label) hits produced by repeated NEs
        matches = set()
        for ne in item["ner_dicts"]:
            if ne["text"] == "":
                continue
            for m in re.finditer(re.escape(ne["text"]), plain_text):
                matches.add((m.start(), m.end(), ne["ne_type"]))
        # sort on the whole tuple (not only the start offset): set iteration
        # order is not stable between runs, so a start-only sort would leave
        # the survivor of a same-start group to chance
        ordered = sorted(matches, key=lambda ent: (ent[0], ent[1], str(ent[2])))
        ents = []
        for ent in ordered:
            if ents and ents[-1][0] == ent[0]:
                # same start offset: the later (longer) match replaces the earlier
                ents[-1] = ent
            else:
                ents.append(ent)

        result.append((plain_text, {"entities": ents}))
    return result

get_text_nes_list(parent_nodes='.//tei:body//tei:p', ne_xpath='.//tei:rs', NER_TAG_MAP=NER_TAG_MAP)

extracts all text nodes from given elements and their NE :param parent_nodes: An XPath expression pointing to those elements whose text nodes should be extracted :param ne_xpath: An XPath expression pointing to elements used to tag NEs. Takes the parent node(s) as context :param NER_TAG_MAP: A dictionary providing a mapping from TEI tags used to tag NEs to spacy-tags :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien", "ne_type": "LOC"}]}]

Source code in src/acdh_tei_pyutils/tei.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
def get_text_nes_list(
    self,
    parent_nodes=".//tei:body//tei:p",
    ne_xpath=".//tei:rs",
    NER_TAG_MAP=NER_TAG_MAP,
):
    """Pair the plain text of each selected node with the NEs tagged inside it.

    :param parent_nodes: An XPath expression, evaluated on `self.tree`, selecting\
    the nodes whose text should be extracted
    :param ne_xpath: An XPath expression selecting the elements that tag NEs;\
    passed on to `extract_ne_dicts` together with each selected node
    :param NER_TAG_MAP: A dictionary mapping TEI tags used to tag NEs to\
    spacy-tags; passed on to `extract_ne_dicts`
    :return: A list of dicts like [{"text": "Wien ist schön", "ner_dicts": [{"text": "Wien",\
    "ne_type": "LOC"}]}]
    """

    return [
        {
            "text": self.create_plain_text(parent),
            "ner_dicts": self.extract_ne_dicts(parent, ne_xpath, NER_TAG_MAP),
        }
        for parent in self.tree.xpath(parent_nodes, namespaces=self.ns_tei)
    ]

command line interface

Console scripts for acdh_tei_pyutils.

add_base_id_next_prev(glob_pattern, base_value)

Console script add @xml:base, @xml:id and @prev @next attributes to root element

Source code in src/acdh_tei_pyutils/cli.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
@click.command()  # pragma: no cover
@click.option(
    "-g", "--glob-pattern", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option("-b", "--base-value")  # pragma: no cover
def add_base_id_next_prev(glob_pattern, base_value):  # pragma: no cover
    """Console script add @xml:base, @xml:id and @prev @next attributes to root element"""
    # Files are processed in sorted order; each one is rewritten in place.
    paths = sorted(glob.glob(glob_pattern))
    neighbours = previous_and_next(paths)

    for before, path, after in tqdm.tqdm(neighbours, total=len(paths)):
        enricher = TeiEnricher(path)
        # ids are the bare file names; a missing neighbour is passed on as None
        own_id = os.path.basename(path)
        prev_id = os.path.basename(before) if before else None
        next_id = os.path.basename(after) if after else None
        enricher.add_base_and_id(base_value, own_id, prev_id, next_id)
        enricher.tree_to_file(file=path)

denormalize_indices(files, indices, mention_xpath, title_xpath, title_sec_xpath, date_xpath, standoff, blacklist_ids=[])

Write pointers to mentions in index-docs and copy index entries into docs

Source code in src/acdh_tei_pyutils/cli.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
def _mention_target_ids(ref):
    """Return the entity ids addressed by one mention value.

    A value not starting with '#' is returned unchanged as the only id. A value
    starting with '#' is split on single spaces and the first character of every
    token is dropped; the ids of the trailing tokens come first, the id of the
    leading token last.
    """
    if not ref.startswith("#"):
        return [ref]
    head, *tail = ref.split(" ")
    return [token[1:] for token in tail] + [head[1:]]


@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-x", "--title-xpath", default=".//tei:title/text()", show_default=True
)  # pragma: no cover
@click.option("-xs", "--title-sec-xpath", required=False)  # pragma: no cover
@click.option("-d", "--date-xpath", required=False)  # pragma: no cover
@click.option(
    "-b", "--blacklist-ids", default=[], multiple=True, show_default=True
)  # pragma: no cover
@click.option(
    "--standoff", is_flag=True, help="write entity-lists into tei:standoff element"
)
def denormalize_indices(
    files,
    indices,
    mention_xpath,
    title_xpath,
    title_sec_xpath,
    date_xpath,
    standoff,
    blacklist_ids=[],
):  # pragma: no cover
    """Write pointers to mentions in index-docs and copy index entries into docs"""
    # Three passes:
    #   1. collect, per entity id, the documents that mention it
    #   2. write those mentions into the index files (skipping blacklisted ids)
    #   3. copy the index entries a document mentions into that document
    # `blacklist_ids` is only read, never modified.
    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    # tag-name suffixes of index entries -> TEI list element that wraps them
    list_tag_by_suffix = (
        (("person",), "listPerson"),
        (("place",), "listPlace"),
        (("org",), "listOrg"),
        (("bibl", "biblStruct"), "listBibl"),
        (("item",), "list"),
        (("event",), "listEvent"),
    )
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    for x in tqdm.tqdm(files):
        # files whose name contains "list" are not scanned for mentions
        if "list" in os.path.split(x)[1]:
            continue
        doc = TeiEnricher(x)
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        try:
            doc_title = doc.any_xpath(title_xpath)[0]
        except IndexError:
            doc_title = f"ERROR in title xpath of file: {doc_id}"
            print(f"ERROR in -x title xpath of file: {doc_id}")
        if title_sec_xpath:
            try:
                doc_title_sec = doc.any_xpath(title_sec_xpath)[0]
            except IndexError:
                doc_title_sec = f"ERROR in -xs secondary title xpath of file: {doc_id}"
                print(f"ERROR in secondary title xpath of file: {doc_id}")
        else:
            doc_title_sec = None
        if date_xpath:
            try:
                doc_date = doc.any_xpath(date_xpath)[0]
            except IndexError:
                doc_date = f"ERROR in date xpath of file: {doc_id}"
                print(f"ERROR in -d date xpath of file: {doc_id}")
        else:
            doc_date = None
        mention = {
            "doc_uri": doc_uri,
            "doc_id": doc_id,
            "doc_path": x,
            "doc_title": doc_title,
            "doc_title_sec": doc_title_sec,
            "doc_date": doc_date,
        }
        for ref in set(doc.any_xpath(mention_xpath)):
            for ent_id in _mention_target_ids(ref):
                # every entry gets its own dict so the lists share no objects
                ref_doc_dict[ent_id].append(dict(mention))
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict)} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            ent_id = ent.xpath("@xml:id")[0]
            if ent_id in blacklist_ids:
                continue
            note_grp = doc.create_mention_list(ref_doc_dict[ent_id])
            try:
                # raises IndexError when the mention list has no first child
                list(note_grp[0])
            except IndexError:
                continue
            # TEI schema does not allow noteGrp in event after e.g. listPerson, ... so we need to insert it before
            if ent.tag == f"{tei_ns}event":
                ent.insert(1, note_grp)
            else:
                ent.append(note_grp)
        doc.tree_to_file(file=x)

    # re-read the index files just written so the copied entries carry their mention lists
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent

    click.echo(
        click.style(
            f"writing {len(all_ent_nodes)} index entries into {len(files)} files",
            fg="green",
        )
    )
    for x in tqdm.tqdm(files):
        # best effort: a file that cannot be processed is reported and skipped
        try:
            doc = TeiEnricher(x)

            if standoff:
                # NOTE(review): only tei:back is removed below; an existing
                # tei:standOff is left in place in this mode - confirm intended
                root_node = doc.any_xpath("//tei:TEI")[0]
            else:
                root_node = doc.any_xpath(".//tei:text")[0]
                for bad in doc.any_xpath(".//tei:back"):
                    bad.getparent().remove(bad)
            # index entries mentioned in this doc, grouped by their tag name
            ent_dict = defaultdict(list)
            for ref in set(doc.any_xpath(mention_xpath)):
                for ent_id in _mention_target_ids(ref):
                    try:
                        index_ent = all_ent_nodes[ent_id]
                    except KeyError:
                        # mention without a matching index entry
                        continue
                    ent_dict[index_ent.tag].append(index_ent)
            wrapper_tag = "standOff" if standoff else "back"
            back_node = ET.Element(f"{tei_ns}{wrapper_tag}")
            for key, ents in ent_dict.items():
                for suffixes, list_tag in list_tag_by_suffix:
                    if key.endswith(suffixes):
                        list_node = ET.Element(f"{tei_ns}{list_tag}")
                        back_node.append(list_node)
                        for ent in ents:
                            list_node.append(ent)
            if len(back_node) > 0:
                if standoff:
                    root_node.insert(1, back_node)
                else:
                    root_node.append(back_node)
            doc.tree_to_file(file=x)
        except Exception as e:
            print(f"failed to process {x} due to {e}")
    click.echo(click.style("DONE", fg="green"))

mentions_to_indices(files, indices, mention_xpath, event_title, title_xpath)

Console script write pointers to mentions in index-docs

Source code in src/acdh_tei_pyutils/cli.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-m", "--mention-xpath", default=".//tei:rs[@ref]/@ref", show_default=True
)  # pragma: no cover
@click.option(
    "-t", "--event-title", default="erwähnt in ", show_default=True
)  # pragma: no cover
@click.option(
    "-x",
    "--title-xpath",
    default='.//tei:title[@type="main"]/text()',
    show_default=True,
)  # pragma: no cover
def mentions_to_indices(
    files, indices, mention_xpath, event_title, title_xpath
):  # pragma: no cover
    """Console script write pointers to mentions in index-docs"""
    # Two passes: collect, per entity id, the documents mentioning it; then
    # write those mentions into every index file and save it in place.
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    ref_doc_dict = defaultdict(list)
    click.echo(
        click.style(f"collecting list of mentions from {len(files)} docs", fg="green")
    )
    for x in tqdm.tqdm(files):
        doc = TeiEnricher(x)
        doc_base = doc.any_xpath("./@xml:base")[0]
        doc_id = doc.any_xpath("./@xml:id")[0]
        doc_uri = f"{doc_base}/{doc_id}"
        doc_title = doc.any_xpath(title_xpath)[0]
        for ref in set(doc.any_xpath(mention_xpath)):
            # a leading '#' is dropped to get the bare entity id
            if ref.startswith("#"):
                ref = ref[1:]
            ref_doc_dict[ref].append(
                {
                    "doc_uri": doc_uri,
                    "doc_path": x,
                    "doc_title": doc_title,
                    "doc_id": doc_id,
                    "doc_date": None,
                    "doc_title_sec": None,
                }
            )
    click.echo(
        click.style(
            f"collected {len(ref_doc_dict)} of mentioned entities from {len(files)} docs",
            fg="green",
        )
    )
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            ent_id = ent.xpath("@xml:id", namespaces=doc.nsmap)[0]
            note_grp = doc.create_mention_list(ref_doc_dict[ent_id], event_title)
            try:
                # raises IndexError when the mention list has no first child
                list(note_grp[0])
            except IndexError:
                continue
            # TEI schema does not allow noteGrp in event after e.g. listPerson, ... so we need to insert it before
            if ent.tag == "{http://www.tei-c.org/ns/1.0}event":
                ent.insert(1, note_grp)
            else:
                ent.append(note_grp)
        doc.tree_to_file(file=x)
    click.echo(click.style("DONE", fg="green"))

schnitzler(files, indices, doc_person, doc_work)

Console script write pointers to mentions in index-docs

Source code in src/acdh_tei_pyutils/cli.py
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
@click.command()  # pragma: no cover
@click.option(
    "-f", "--files", default="./data/editions/*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-i", "--indices", default="./data/indices/list*.xml", show_default=True
)  # pragma: no cover
@click.option(
    "-p",
    "--doc-person",
    default="./data/indices/index_person_day.xml",
    show_default=True,
)  # pragma: no cover
@click.option(
    "-w",
    "-t",
    "--doc-work",
    default="./data/indices/index_work_day.xml",
    show_default=True,
)  # pragma: no cover
def schnitzler(files, indices, doc_person, doc_work):  # pragma: no cover
    """Console script copy person, work and place index entries into tei:back of each doc"""
    # Both --doc-person and --doc-work used to declare the short flag -t, so
    # only one of them could be reached through it. --doc-person now uses -p;
    # --doc-work gets -w and keeps -t as an alias.
    tei_ns = "{http://www.tei-c.org/ns/1.0}"
    files = sorted(glob.glob(files))
    index_files = sorted(glob.glob(indices))
    doc_person = TeiEnricher(doc_person)
    doc_work = TeiEnricher(doc_work)
    # every index entry carrying an @xml:id, keyed by that id
    all_ent_nodes = {}
    for x in index_files:
        doc = TeiEnricher(x)
        for ent in doc.any_xpath(".//tei:body//*[@xml:id]"):
            all_ent_nodes[ent.xpath("@xml:id")[0]] = ent

    no_matches = []
    for x in tqdm.tqdm(files, total=len(files)):
        # the lookup key is the file name without "entry__" and ".xml"
        day = os.path.basename(x).replace("entry__", "").replace(".xml", "")
        doc = TeiEnricher(x)
        root_node = doc.any_xpath(".//tei:text")[0]
        back_node = ET.Element(f"{tei_ns}back")
        for bad in doc.any_xpath(".//tei:back"):
            bad.getparent().remove(bad)

        xpath = f".//item[@target='{day}']/ref/text()"
        # persons and works are looked up in the per-day index documents
        for day_index, list_tag in ((doc_person, "listPerson"), (doc_work, "listBibl")):
            list_node = ET.Element(f"{tei_ns}{list_tag}")
            for ent_id in day_index.any_xpath(xpath):
                try:
                    ent = all_ent_nodes[ent_id]
                except KeyError:
                    no_matches.append(ent_id)
                    continue
                list_node.append(ent)
            if len(list_node) > 0:
                back_node.append(list_node)

        # places are taken from the references inside the document itself
        list_place_node = ET.Element(f"{tei_ns}listPlace")
        for pl in doc.any_xpath('.//tei:rs[@ref and @type="place"]/@ref'):
            try:
                # the first character of the reference is dropped to get the id
                pl_node = all_ent_nodes[pl[1:]]
            except KeyError:
                no_matches.append(pl)
                continue
            list_place_node.append(pl_node)
        if len(list_place_node) > 0:
            back_node.append(list_place_node)

        # the file is only rewritten when at least one list was produced
        if len(back_node) > 0:
            root_node.append(back_node)
            doc.tree_to_file(file=x)
    distinct_no_match = set(no_matches)
    print(distinct_no_match)