Coverage for apis_core/utils/rdf.py: 85%
110 statements
« prev ^ index » next coverage.py v7.5.3, created at 2026-03-06 11:42 +0000
« prev ^ index » next coverage.py v7.5.3, created at 2026-03-06 11:42 +0000
1# SPDX-FileCopyrightText: 2025 Birger Schacht
2# SPDX-License-Identifier: MIT
4import inspect
5import logging
6import tomllib
7from collections import defaultdict
8from dataclasses import dataclass
9from pathlib import Path
10from typing import Dict, List, Tuple, Union
11from warnings import deprecated
13from AcdhArcheAssets.uri_norm_rules import get_normalized_uri
14from django.template.utils import get_app_template_dirs
15from rdflib import RDF, BNode, Graph, URIRef
16from rdflib.exceptions import ParserError
18logger = logging.getLogger(__name__)
21@dataclass
22class Attribute:
23 value: Union[str | list[str]]
26@dataclass
27class Relation:
28 name: str
29 value: Dict[str, str]
@dataclass
class Filter:
    """
    Config entry listing the triples a graph must contain to match.

    `graph_matches_config` iterates over `value` as (predicate, object)
    pairs and checks that every resulting triple is present in the graph.
    """

    # (predicate, object) pairs. The consumer also accepts a mapping and
    # iterates over its items instead.
    value: list[tuple[str, str]]
def resolve(obj, graph):
    """
    Parse a value taken from a config and return the parsed value.

    - A string that starts and ends with angle brackets is interpreted
      as a URI; the brackets are removed and a `URIRef` is returned.
    - Any other string is interpreted as a curie and expanded using the
      namespace manager of `graph`.
    - The boolean `True` is turned into `None`. The caller places the
      result in a triple pattern, presumably as a wildcard.
    - Every other value (including `False` and `None`) is returned
      unchanged.

    NOTE(review): `graph_matches_config` catches `ValueError` around this
    call, presumably raised by `expand_curie` for a curie it cannot
    expand -- confirm against rdflib.
    """
    if isinstance(obj, str):
        if obj.startswith("<") and obj.endswith(">"):
            return URIRef(obj[1:-1])
        return graph.namespace_manager.expand_curie(obj)
    # `True` is a singleton, so the identity check alone is sufficient.
    if obj is True:
        return None
    return obj
55@deprecated("Please switch to using objects instead of toml files.")
56def load_path(path: str | Path) -> dict:
57 """
58 Load a tomlfile either from a path or from the directory
59 `triple_configs` in any of the app directories.
60 """
61 if isinstance(path, str):
62 files = [
63 directory / path for directory in get_app_template_dirs("triple_configs")
64 ]
65 files = list(filter(lambda file: file.exists(), files))
66 if files:
67 path = files[0]
68 else:
69 raise ValueError(f"Could not find {path}")
70 return tomllib.loads(Path(path).read_text())
73def graph_matches_config(graph: Graph, configfile: Path) -> dict:
74 """
75 Check if a file contains a config that matches this
76 graph and if so, return the config as dict. Otherwise
77 return False
78 """
79 if inspect.isclass(configfile):
80 config = {"relations": {}, "filters": [], "attributes": {}}
81 for key in [att for att in dir(configfile) if not att.startswith("__")]:
82 value = getattr(configfile, key)
83 match value:
84 case Filter():
85 config["filters"].append(value.value)
86 case Attribute():
87 config["attributes"][key.lower()] = value.value
88 case Relation():
89 config["relations"][value.name] = value.value
90 else:
91 config = load_path(configfile)
92 for _filter in config.get("filters", [[(None, None)]]):
93 if isinstance(_filter, dict):
94 _filter = _filter.items()
95 try:
96 triples = []
97 for predicate, obj in _filter:
98 triples.append((None, resolve(predicate, graph), resolve(obj, graph)))
99 triples = [triple in graph for triple in triples]
100 if all(triples):
101 logger.debug("Using %s for parsing graph", configfile)
102 return config
103 except ValueError as e:
104 logger.debug("Filter %s does not match: %s", _filter, e)
105 return {}
def build_sparql_query(curie: str) -> str:
    """
    Build a SPARQL query with language preferences.

    Args:
        curie: predicate to filter on as defined in the toml.
               needs to include the predicate and optionally
               a lang tag to filter for separated with a comma.
               Eg "wdt:P122,en". Whitespace around the predicate
               and the lang tag is ignored. A value that already
               starts with "select" or "prefix" (case-insensitive)
               is treated as a complete query and returned as is.

    Returns:
        A SPARQL query string
    """
    if curie.lower().strip().startswith(("select", "prefix")):
        return curie

    predicate, separator, lang_tag = curie.partition(",")
    lang_filter = ""
    if separator:
        lang_filter = f'FILTER LANGMATCHES(LANG(?object), "{lang_tag.strip()}")'

    # Literal braces of the WHERE group have to be doubled inside an
    # f-string, otherwise they are parsed as replacement fields.
    query = f"""
    SELECT ?object
    WHERE {{
        ?subject {predicate.strip()} ?object {lang_filter}
    }}
    """

    logger.debug("Generated SPARQL query: %s", query)
    return query
def get_value_graph(graph: Graph, curies: str | list[str] | None) -> list:
    """
    Collect the values found in `graph` for the given curies.

    Every curie is turned into a query using `build_sparql_query` and
    run against the graph. If a result is a blank node, the objects
    attached to that node are collected instead (skipping `RDF.Seq`),
    otherwise the result itself is used.

    Args:
        graph: the graph to query.
        curies: a single curie, a list of curies or `None`.

    Returns:
        The values as strings, without duplicates, in the order they
        were first found. An empty list if `curies` is `None`.
    """
    if curies is None:
        return []
    if isinstance(curies, str):
        curies = [curies]

    values = []
    for curie in curies:
        for row in graph.query(build_sparql_query(curie)):
            obj = row[0]
            if isinstance(obj, BNode):
                values.extend(
                    str(value)
                    for value in graph.objects(subject=obj)
                    if value != RDF.Seq
                )
            else:
                values.append(str(obj))
    # dict.fromkeys drops duplicates while keeping the first-seen order
    return list(dict.fromkeys(values))
def load_uri_using_path(uri, configfile: Path | str) -> dict | None:
    """
    Fetch the RDF data behind `uri` and extract values using a config.

    The uri is normalized, parsed into a graph and the graph is checked
    against `configfile` using `graph_matches_config`. If the config
    matches, the attributes and relations listed in the config are
    looked up in the graph.

    Returns:
        A dict mapping every attribute name to the list of values found
        plus a `relations` entry mapping every relation name to its
        details, where `curies` is replaced by the values found.
        `None` if the config does not match the graph.
    """
    uri = get_normalized_uri(uri)
    graph = Graph()
    # workaround for a bug in d-nb: with the default list of accept
    # headers of rdflib, d-nb sometimes returns json-ld and sometimes turtle
    # with json-ld, rdflib has problems finding the namespaces
    rdf_format = "turtle" if uri.startswith("https://d-nb.info/gnd/") else None
    try:
        graph.parse(uri, format=rdf_format)
    except ParserError as e:
        # best effort: continue with whatever ended up in the graph
        logger.info(e)

    config = graph_matches_config(graph, configfile)
    if not config:
        return None

    result = defaultdict(list)
    result["relations"] = defaultdict(list)
    for attribute, curies in config.get("attributes", {}).items():
        result[attribute].extend(get_value_graph(graph, curies))
    for relation, details in config.get("relations", {}).items():
        # Build a new dict instead of assigning to `details`: for class
        # based configs `details` is the dict stored on the shared
        # `Relation` instance, and overwriting its "curies" would replace
        # the configured curies with lookup results for every later call.
        result["relations"][relation] = {
            **details,
            "curies": get_value_graph(graph, details.get("curies", [])),
        }
    return dict(result)