Class to parse, preprocess and save XML/TEI
:param xml: An XML document, given as a file path, a URL or an XML string
:type xml: str
:return: An XMLReader instance
:rtype: xml.XMLReader
Source code in src/acdh_xml_pyutils/xml.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class XMLReader:
    """Class to parse, preprocess and save XML/TEI

    After construction the instance exposes:

    * ``ns_tei``, ``ns_xml``, ``ns_tcf`` -- single-entry prefix/namespace dicts
    * ``nsmap`` -- the module-level ``NSMAP``
    * ``file`` -- the stripped ``xml`` argument
    * ``xsl`` -- the parsed stylesheet, or ``None``
    * ``original`` -- the document as it was loaded
    * ``tree`` -- the working document (``original``, or the XSLT result)

    :param xml: An XML document, given as a file path, a URL or an XML string
    :type xml: str
    :param xsl: Path to an XSL stylesheet
    :type xsl: str
    :return: An XMLReader instance
    :rtype: `xml.XMLReader`
    """

    def __init__(self, xml=None, xsl=None):
        """initializes the class

        The kind of source is detected from the stripped value of ``xml``:
        text starting with ``http://`` or ``https://`` is downloaded, text
        starting with ``<`` is parsed as XML markup, anything else is handed
        to the parser as a file path.

        :param xml: An XML document, given as a file path, a URL or an XML string
        :type xml: str
        :param xsl: Path to an XSL stylesheet
        :type xsl: str
        :raises TypeError: if ``xml`` is not a string
        :return: An XMLReader instance
        :rtype: `xml.XMLReader`
        """
        # Fail early with a clear message; the ``None`` default used to end in
        # an opaque AttributeError on ``.strip()``.
        if not isinstance(xml, str):
            raise TypeError(
                "xml must be a file path, a URL or an XML string, got {}".format(
                    type(xml).__name__
                )
            )
        self.ns_tei = {"tei": "http://www.tei-c.org/ns/1.0"}
        self.ns_xml = {"xml": "http://www.w3.org/XML/1998/namespace"}
        self.ns_tcf = {"tcf": "http://www.dspin.de/data/textcorpus"}
        self.nsmap = NSMAP
        self.file = xml.strip()
        self.xsl = ET.parse(xsl) if xsl else None
        # Require a real scheme so a local file such as "http_dump.xml" is
        # still treated as a path.
        if self.file.startswith(("http://", "https://")):
            # NOTE(review): no timeout and no status check on the request --
            # left unchanged to keep the existing failure behaviour.
            r = requests.get(
                self.file,
                headers={
                    "Content-type": "application/xml; charset=utf-8",
                    "Accept-Charset": "utf-8",
                },
            )
            # Hand the raw bytes to the parser: it honours the document's own
            # encoding declaration, which makes the former decode/re-encode
            # round trip unnecessary.
            self.original = ET.fromstring(r.content)
        elif self.file.startswith("<"):
            # The value is markup, not a path, so parse it directly instead
            # of first trying (and failing) to open it as a file.
            self.original = ET.fromstring(self.file.encode("utf8"))
        else:
            self.original = ET.parse(self.file)
        self.tree = self.original
        if self.xsl is not None:
            transform = ET.XSLT(self.xsl)
            self.tree = transform(self.tree)

    def get_elements(self):
        """returns a list of all element names of the current tree

        :return: The ``tag`` of every node yielded by ``self.tree.iter()``
        :rtype: list
        """
        return [element.tag for element in self.tree.iter()]

    def get_element_stats(self):
        """returns a `collections.Counter` object holding element count

        :return: A counter mapping each element name to its number of occurrences
        :rtype: `collections.Counter`
        """
        return Counter(self.get_elements())

    def return_byte_like_object(self):
        """returns current doc as byte like object

        :return: The current tree serialized with ``encoding="utf-8"``
        """
        return ET.tostring(self.tree, encoding="utf-8")

    def return_string(self):
        """returns current doc as string

        :return: The UTF-8 decoded result of ``return_byte_like_object``
        :rtype: str
        """
        return self.return_byte_like_object().decode("utf-8")

    def tree_to_file(self, file=None, xml_declaration=True):
        """saves current tree to file

        :param file: A filename/location to save the current doc; when empty,
            a timestamped ``<YYYY-mm-dd-HH-MM-SS>.xml`` name is used
        :type file: str
        :param xml_declaration: should XML declaration be added
        :type xml_declaration: bool
        :return: The save-location
        :rtype: str
        """
        if not file:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            file = "{}.xml".format(timestamp)
        # Serialize before opening the target so that a serialization error
        # cannot leave an existing file truncated.  A falsy flag is passed on
        # as None, exactly as before.
        serialized = ET.tostring(
            self.tree, xml_declaration=xml_declaration or None, encoding="UTF-8"
        )
        with open(file, "wb") as f:
            f.write(serialized)
        return file
|
__init__(xml=None, xsl=None)
initializes the class
:param xml: An XML document, given as a file path, a URL or an XML string
:type xml: str
:param xsl: Path to an XSL Stylesheet
:type xsl: str
:return: An XMLReader instance
:rtype: xml.XMLReader
Source code in src/acdh_xml_pyutils/xml.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79 | def __init__(self, xml=None, xsl=None):
"""initializes the class
:param xml: An XML Document, either a File Path, an URL to an XML or an XML string
:type xml: str
:param xsl: Path to an XSL Stylesheet
:type xsl: str
:return: A XMLReader instance
:rtype: `xml.XMLReader`
"""
self.ns_tei = {"tei": "http://www.tei-c.org/ns/1.0"}
self.ns_xml = {"xml": "http://www.w3.org/XML/1998/namespace"}
self.ns_tcf = {"tcf": "http://www.dspin.de/data/textcorpus"}
self.nsmap = NSMAP
self.file = xml.strip()
if xsl:
self.xsl = ET.parse(xsl)
else:
self.xsl = None
if self.file.startswith("http"):
r = requests.get(
self.file,
headers={
"Content-type": "application/xml; charset=utf-8",
"Accept-Charset": "utf-8",
},
)
try:
self.original = ET.fromstring(r.content.decode("utf-8"))
except ValueError:
self.original = ET.fromstring(r.content.decode("utf-8").encode("utf-8"))
elif self.file.startswith("<"):
try:
self.original = ET.parse(self.file)
except OSError:
self.original = ET.fromstring(self.file.encode("utf8"))
else:
self.original = ET.parse(self.file)
self.tree = self.original
if self.xsl:
transform = ET.XSLT(self.xsl)
self.tree = transform(self.tree)
|
get_element_stats()
returns a collections.Counter object holding element count
:return: A counter mapping each element name to the number of times it occurs
:rtype: collections.Counter
Source code in src/acdh_xml_pyutils/xml.py
def get_element_stats(self):
    """Count how often each element name occurs in the current tree.

    :return: A counter keyed by the names returned from ``get_elements``
    :rtype: `collections.Counter`
    """
    element_names = self.get_elements()
    return Counter(element_names)
|
get_elements()
returns a list of all element names of the current tree
:return: A list of all element names
:rtype: list
Source code in src/acdh_xml_pyutils/xml.py
81
82
83
84
85
86
87
88
def get_elements(self):
    """Collect the name of every element in the current tree.

    :return: The ``tag`` of each node yielded by ``self.tree.iter()``,
        in iteration order
    :rtype: list
    """
    names = []
    for node in self.tree.iter():
        names.append(node.tag)
    return names
|
return_byte_like_object()
returns current doc as byte like object
Source code in src/acdh_xml_pyutils/xml.py
def return_byte_like_object(self):
    """Serialize the current doc to a byte like object.

    :return: The result of ``ET.tostring`` for ``self.tree`` with
        ``encoding="utf-8"``
    """
    serialized = ET.tostring(self.tree, encoding="utf-8")
    return serialized
|
return_string()
returns current doc as string
:rtype: str
Source code in src/acdh_xml_pyutils/xml.py
104
105
106
107
108
109
110
def return_string(self):
    """
    Return the current doc as text.

    :return: The bytes from ``return_byte_like_object`` decoded as UTF-8
    :rtype: str
    """
    raw = self.return_byte_like_object()
    return raw.decode("utf-8")
|
tree_to_file(file=None, xml_declaration=True)
saves current tree to file
:param file: A filename/location to save the current doc
:type file: str
:param xml_declaration: should XML declaration be added
:type xml_declaration: bool
:return: The save-location
:rtype: str
Source code in src/acdh_xml_pyutils/xml.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def tree_to_file(self, file=None, xml_declaration=True):
    """
    saves current tree to file

    :param file: A filename/location to save the current doc; when empty,
        a timestamped ``<YYYY-mm-dd-HH-MM-SS>.xml`` name is used
    :type file: str
    :param xml_declaration: should XML declaration be added
    :type xml_declaration: bool
    :return: The save-location
    :rtype: str
    """
    if not file:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        file = "{}.xml".format(timestamp)
    # Serialize before opening the target so that a serialization error
    # cannot leave an existing file truncated.  A falsy flag is passed on
    # as None, exactly as before.
    serialized = ET.tostring(
        self.tree, xml_declaration=xml_declaration or None, encoding="UTF-8"
    )
    with open(file, "wb") as f:
        f.write(serialized)
    return file
|