Coverage for apis_core/utils/rdf.py: 88%
86 statements
« prev ^ index » next coverage.py v7.5.3, created at 2025-12-04 11:32 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2025-12-04 11:32 +0000
1# SPDX-FileCopyrightText: 2025 Birger Schacht
2# SPDX-License-Identifier: MIT
4import logging
5import tomllib
6from collections import defaultdict
7from pathlib import Path
9from AcdhArcheAssets.uri_norm_rules import get_normalized_uri
10from django.template.exceptions import TemplateDoesNotExist
11from django.template.loader import render_to_string
12from django.template.utils import get_app_template_dirs
13from rdflib import RDF, BNode, Graph, URIRef
14from rdflib.exceptions import ParserError
16logger = logging.getLogger(__name__)
def resolve(obj, graph):
    """
    Convert a value taken from a config into its parsed form.

    A string that starts and ends with angle brackets (``<...>``) is
    turned into a ``URIRef`` of the enclosed text. Any other string is
    treated as a CURIE and expanded with the namespace manager of
    ``graph``. The boolean ``True`` is replaced by ``None``. Every
    other value is returned unchanged.
    """
    if not isinstance(obj, str):
        # Only the literal ``True`` is replaced; ``False`` and all other
        # non-string values pass through untouched.
        return None if obj is True else obj
    is_bracketed = obj.startswith("<") and obj.endswith(">")
    if is_bracketed:
        return URIRef(obj[1:-1])
    return graph.namespace_manager.expand_curie(obj)
def load_path(path: str | Path) -> dict:
    """
    Load a TOML config and return its parsed content.

    ``path`` is first tried as a template name: if a template of that
    name can be rendered, the rendered output is parsed as TOML.
    Otherwise, if ``path`` is a string, the first existing file of that
    name inside a ``triple_configs`` directory of the app directories is
    read. A ``Path`` is read directly from the file system.

    Raises:
        ValueError: if ``path`` is a string and no file of that name
            exists in any ``triple_configs`` directory.
    """
    try:
        return tomllib.loads(render_to_string(path))
    except TemplateDoesNotExist:
        logger.debug("Tried to load template %s but it does not exist", path)
    if isinstance(path, str):
        candidates = (
            directory / path for directory in get_app_template_dirs("triple_configs")
        )
        found = next((candidate for candidate in candidates if candidate.exists()), None)
        if found is None:
            raise ValueError(f"Could not find {path}")
        path = found
    # TOML documents are UTF-8 by specification, so decode explicitly
    # instead of relying on the platform's locale encoding.
    return tomllib.loads(Path(path).read_text(encoding="utf-8"))
def graph_matches_config(graph: Graph, configfile: Path) -> dict:
    """
    Return the config stored in ``configfile`` if it applies to ``graph``.

    The ``filters`` entry of the config is a list of mappings from
    predicate to object. For every entry of a filter a triple pattern
    ``(None, predicate, object)`` is built from the resolved values, and
    the filter applies when all of its patterns are contained in the
    graph. The first filter that applies makes the config match. A
    config without ``filters`` is checked against the single filter
    ``{None: None}``.

    Returns the config dict on a match and an empty dict otherwise.
    """
    config = load_path(configfile)
    for candidate in config.get("filters", [{None: None}]):
        try:
            patterns = [
                (None, resolve(predicate, graph), resolve(obj, graph))
                for predicate, obj in candidate.items()
            ]
            hits = [pattern in graph for pattern in patterns]
            if not all(hits):
                continue
            logger.debug("Using %s for parsing graph", configfile)
            return config
        except ValueError as e:
            # A filter that cannot be resolved is skipped; the remaining
            # filters are still tried.
            logger.debug("Filter %s does not match: %s", candidate, e)
    return {}
def build_sparql_query(curie: str) -> str:
    """
    Build a SPARQL query with language preferences.

    Args:
        curie: predicate to filter on as defined in the toml.
               needs to include the predicate and optionally
               a lang tag to filter for separated with a comma.
               Eg "wdt:P122,en". Whitespace around the predicate
               and the lang tag is ignored. A value that already
               starts with "select" or "prefix" (case-insensitive)
               is treated as a complete query and returned as is.

    Returns:
        A SPARQL query string
    """
    if curie.lower().strip().startswith(("select", "prefix")):
        return curie
    lang_filter = ""
    if "," in curie:
        curie, lang_tag = curie.split(",", 1)
        # Strip so that "wdt:P122, en" filters on "en" rather than " en".
        lang_filter = f'FILTER LANGMATCHES(LANG(?object), "{lang_tag.strip()}")'
    curie = curie.strip()
    # Literal braces of the WHERE clause must be doubled inside an f-string.
    query = f"""
    SELECT ?object
    WHERE {{
        ?subject {curie} ?object {lang_filter}
    }}
    """

    logger.debug("Generated SPARQL query: %s", query)
    return query
def get_value_graph(graph: Graph, curies: str | list[str]) -> list:
    """
    Collect the string values that ``curies`` select from ``graph``.

    ``curies`` may be a single entry or a list of entries; each one is
    turned into a SPARQL query and run against the graph. The first
    column of every result row is taken as the value. If that value is
    a blank node, the objects of all triples having the blank node as
    subject are used instead, leaving out ``RDF.Seq``.

    Returns the values as strings, without duplicates, in the order in
    which they were first encountered.
    """
    selectors = [curies] if isinstance(curies, str) else curies
    collected = []
    for selector in selectors:
        nodes = [row[0] for row in graph.query(build_sparql_query(selector))]
        for node in nodes:
            if not isinstance(node, BNode):
                collected.append(str(node))
                continue
            collected.extend(
                str(member)
                for member in graph.objects(subject=node)
                if member != RDF.Seq
            )
    # dict keys keep insertion order, which removes duplicates while
    # preserving the order of first occurrence.
    return list(dict.fromkeys(collected))
def load_uri_using_path(uri, configfile: Path) -> dict | None:
    """
    Parse the RDF data behind ``uri`` and extract values using a config.

    The URI is normalized and parsed into a graph. If the config in
    ``configfile`` matches that graph, the values selected by the
    ``attributes`` and ``relations`` sections of the config are
    collected.

    Returns:
        A dict with one list of values per configured attribute and a
        ``relations`` entry that maps each relation name to its config
        details, where ``curies`` is replaced by the values found in
        the graph. ``None`` if the config does not match the graph.
    """
    uri = get_normalized_uri(uri)
    graph = Graph()
    # workaround for a bug in d-nb: with the default list of accept
    # headers of rdflib, d-nb sometimes returns json-ld and sometimes turtle
    # with json-ld, rdflib has problems finding the namespaces
    rdf_format = "turtle" if uri.startswith("https://d-nb.info/gnd/") else None
    try:
        graph.parse(uri, format=rdf_format)
    except ParserError as e:
        # Best effort: a parser error is only logged and processing
        # continues with whatever the graph contains.
        logger.info(e)

    config = graph_matches_config(graph, configfile)
    if not config:
        return None
    result = defaultdict(list)
    result["relations"] = defaultdict(list)
    for attribute, curies in config.get("attributes", {}).items():
        result[attribute].extend(get_value_graph(graph, curies))
    for relation, details in config.get("relations", {}).items():
        details["curies"] = get_value_graph(graph, details.get("curies", []))
        result["relations"][relation] = details
    return dict(result)