API Documentation

This part of the project documentation focuses on an information-oriented approach. Use it as a reference for the technical implementation of the acdh_xml_pyutils project code.

acdh_xml_pyutils.xml

XMLReader

Class to parse, preprocess and save XML/TEI

:param xml: An XML Document, either a File Path, a URL to an XML or an XML string :type xml: str

:return: An XMLReader instance :rtype: xml.XMLReader

Source code in src/acdh_xml_pyutils/xml.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class XMLReader:
    """Class to parse, preprocess and save XML/TEI

    :param xml: An XML Document, either a File Path, a URL to an XML or an XML string
    :type xml: str

    :return: An XMLReader instance
    :rtype: `xml.XMLReader`

    """

    def __init__(self, xml=None, xsl=None):
        """initializes the class

        The source is picked from the (stripped) value of `xml`: a value
        starting with ``http`` is downloaded, a value starting with ``<`` is
        treated as an XML string, anything else is parsed as a file path.
        If `xsl` is given, the stylesheet is applied once and the result is
        stored in `self.tree`; the untransformed document stays available as
        `self.original`.

        :param xml: An XML Document, either a File Path, a URL to an XML or an XML string
        :type xml: str

        :param xsl: Path to an XSL Stylesheet
        :type xsl: str

        :raises TypeError: if `xml` is not a string (e.g. left at its default)

        :return: An XMLReader instance
        :rtype: `xml.XMLReader`

        """
        # `xml` only defaults to None for signature compatibility; fail early
        # with a readable message instead of an AttributeError on `.strip()`.
        if not isinstance(xml, str):
            raise TypeError(
                "xml must be a file path, a URL or an XML string, got {}".format(
                    type(xml).__name__
                )
            )
        self.ns_tei = {"tei": "http://www.tei-c.org/ns/1.0"}
        self.ns_xml = {"xml": "http://www.w3.org/XML/1998/namespace"}
        self.ns_tcf = {"tcf": "http://www.dspin.de/data/textcorpus"}
        self.nsmap = NSMAP
        self.file = xml.strip()
        self.xsl = ET.parse(xsl) if xsl else None
        if self.file.startswith("http"):
            r = requests.get(
                self.file,
                headers={
                    "Content-type": "application/xml; charset=utf-8",
                    "Accept-Charset": "utf-8",
                },
            )
            # Parse the raw bytes so the parser can honour the document's own
            # encoding declaration. Decoding to str first made every
            # non-UTF-8 payload fail, and needed a ValueError fallback that
            # re-encoded the text to bytes anyway.
            self.original = ET.fromstring(r.content)
        elif self.file.startswith("<"):
            try:
                # Kept for compatibility: a path may technically start with "<".
                self.original = ET.parse(self.file)
            except OSError:
                self.original = ET.fromstring(self.file.encode("utf8"))
        else:
            self.original = ET.parse(self.file)
        self.tree = self.original
        if self.xsl:
            transform = ET.XSLT(self.xsl)
            self.tree = transform(self.tree)

    def get_elements(self):
        """returns a list of all element names of the current tree

        Nodes whose ``tag`` is not a string (comments, processing
        instructions and the like) are skipped, so the result only
        contains names.

        :return: A list of all element names, in document order
        :rtype: list

        """
        return [
            element.tag
            for element in self.tree.iter()
            if isinstance(element.tag, str)
        ]

    def get_element_stats(self):
        """returns a `collections.Counter` object holding element count

        :return: A counter mapping each element name to its number of occurrences
        :rtype: `collections.Counter`
        """
        return Counter(self.get_elements())

    def return_byte_like_object(self):
        """returns current doc as byte like object"""

        return ET.tostring(self.tree, encoding="utf-8")

    def return_string(self):
        """
        returns current doc as string

        :rtype: str

        """
        return self.return_byte_like_object().decode("utf-8")

    def tree_to_file(self, file=None, xml_declaration=True):
        """
        saves current tree to file

        :param file: A filename/location to save the current doc; when
            omitted, a timestamped name (``YYYY-MM-DD-HH-MM-SS.xml``) is used
        :type file: str

        :param xml_declaration: should XML declaration be added; a falsy
            value is handed to the serializer as ``None``
        :type xml_declaration: bool

        :return: The save-location
        :rtype: str

        """
        if not file:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            file = "{}.xml".format(timestamp)

        # Serialize before opening the target: opening with "wb" truncates
        # it, so a serialization error must not leave an empty file behind.
        payload = ET.tostring(
            self.tree, xml_declaration=xml_declaration or None, encoding="UTF-8"
        )
        with open(file, "wb") as f:
            f.write(payload)
        return file

__init__(xml=None, xsl=None)

initializes the class

:param xml: An XML Document, either a File Path, a URL to an XML or an XML string :type xml: str

:param xsl: Path to an XSL Stylesheet :type xsl: str

:return: An XMLReader instance :rtype: xml.XMLReader

Source code in src/acdh_xml_pyutils/xml.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(self, xml=None, xsl=None):
    """initializes the class

    The source is picked from the (stripped) value of `xml`: a value
    starting with ``http`` is downloaded, a value starting with ``<`` is
    treated as an XML string, anything else is parsed as a file path.
    If `xsl` is given, the stylesheet is applied once and the result is
    stored in `self.tree`; the untransformed document stays available as
    `self.original`.

    :param xml: An XML Document, either a File Path, a URL to an XML or an XML string
    :type xml: str

    :param xsl: Path to an XSL Stylesheet
    :type xsl: str

    :raises TypeError: if `xml` is not a string (e.g. left at its default)

    :return: An XMLReader instance
    :rtype: `xml.XMLReader`

    """
    # `xml` only defaults to None for signature compatibility; fail early
    # with a readable message instead of an AttributeError on `.strip()`.
    if not isinstance(xml, str):
        raise TypeError(
            "xml must be a file path, a URL or an XML string, got {}".format(
                type(xml).__name__
            )
        )
    self.ns_tei = {"tei": "http://www.tei-c.org/ns/1.0"}
    self.ns_xml = {"xml": "http://www.w3.org/XML/1998/namespace"}
    self.ns_tcf = {"tcf": "http://www.dspin.de/data/textcorpus"}
    self.nsmap = NSMAP
    self.file = xml.strip()
    self.xsl = ET.parse(xsl) if xsl else None
    if self.file.startswith("http"):
        r = requests.get(
            self.file,
            headers={
                "Content-type": "application/xml; charset=utf-8",
                "Accept-Charset": "utf-8",
            },
        )
        # Parse the raw bytes so the parser can honour the document's own
        # encoding declaration. Decoding to str first made every non-UTF-8
        # payload fail, and needed a ValueError fallback that re-encoded
        # the text to bytes anyway.
        self.original = ET.fromstring(r.content)
    elif self.file.startswith("<"):
        try:
            # Kept for compatibility: a path may technically start with "<".
            self.original = ET.parse(self.file)
        except OSError:
            self.original = ET.fromstring(self.file.encode("utf8"))
    else:
        self.original = ET.parse(self.file)
    self.tree = self.original
    if self.xsl:
        transform = ET.XSLT(self.xsl)
        self.tree = transform(self.tree)

get_element_stats()

returns a collections.Counter object holding element count

:return: A counter mapping each element name to its number of occurrences :rtype: collections.Counter

Source code in src/acdh_xml_pyutils/xml.py
91
92
93
94
95
96
97
def get_element_stats(self):
    """counts how often each element name occurs in the current tree

    :return: A counter mapping each element name to its number of occurrences
    :rtype: `collections.Counter`
    """
    element_names = self.get_elements()
    stats = Counter(element_names)
    return stats

get_elements()

returns a list of all element names of the current tree

:return: A list of all element names :rtype: list

Source code in src/acdh_xml_pyutils/xml.py
81
82
83
84
85
86
87
88
89
def get_elements(self):
    """returns a list of all element names of the current tree

    Nodes whose ``tag`` is not a string (comments, processing
    instructions and the like) are skipped, so the result only
    contains names.

    :return: A list of all element names, in document order
    :rtype: list

    """
    return [
        element.tag
        for element in self.tree.iter()
        if isinstance(element.tag, str)
    ]

return_byte_like_object()

returns current doc as byte like object

Source code in src/acdh_xml_pyutils/xml.py
 99
100
101
102
def return_byte_like_object(self):
    """serializes the current doc into a UTF-8 encoded byte like object"""
    serialized = ET.tostring(self.tree, encoding="utf-8")
    return serialized

return_string()

returns current doc as string

:rtype: str

Source code in src/acdh_xml_pyutils/xml.py
104
105
106
107
108
109
110
111
def return_string(self):
    """
    decodes the serialized current doc (UTF-8 bytes) into text

    :rtype: str

    """
    raw = self.return_byte_like_object()
    text = raw.decode("utf-8")
    return text

tree_to_file(file=None, xml_declaration=True)

saves current tree to file

:param file: A filename/location to save the current doc :type file: str

:param xml_declaration: should XML declaration be added :type xml_declaration: bool

:return: The save-location :rtype: str

Source code in src/acdh_xml_pyutils/xml.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def tree_to_file(self, file=None, xml_declaration=True):
    """
    saves current tree to file

    :param file: A filename/location to save the current doc; when
        omitted, a timestamped name (``YYYY-MM-DD-HH-MM-SS.xml``) is used
    :type file: str

    :param xml_declaration: should XML declaration be added; a falsy
        value is handed to the serializer as ``None``
    :type xml_declaration: bool

    :return: The save-location
    :rtype: str

    """
    if not file:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        file = "{}.xml".format(timestamp)

    # Serialize before opening the target: opening with "wb" truncates it,
    # so a serialization error must not leave an empty file behind.
    payload = ET.tostring(
        self.tree, xml_declaration=xml_declaration or None, encoding="UTF-8"
    )
    with open(file, "wb") as f:
        f.write(payload)
    return file