Coverage for apis_core/utils/rdf.py: 85%

110 statements  

« prev     ^ index     » next       coverage.py v7.5.3, created at 2026-03-06 11:42 +0000

1# SPDX-FileCopyrightText: 2025 Birger Schacht 

2# SPDX-License-Identifier: MIT 

3 

4import inspect 

5import logging 

6import tomllib 

7from collections import defaultdict 

8from dataclasses import dataclass 

9from pathlib import Path 

10from typing import Dict, List, Tuple, Union 

11from warnings import deprecated 

12 

13from AcdhArcheAssets.uri_norm_rules import get_normalized_uri 

14from django.template.utils import get_app_template_dirs 

15from rdflib import RDF, BNode, Graph, URIRef 

16from rdflib.exceptions import ParserError 

17 

18logger = logging.getLogger(__name__) 

19 

20 

21@dataclass 

22class Attribute: 

23 value: Union[str | list[str]] 

24 

25 

@dataclass
class Relation:
    """
    Config entry describing one relation that should be extracted.

    In `graph_matches_config` the `value` dict is stored in the
    `relations` section of the config, keyed by `name`.
    """

    # Key under which this relation ends up in the config and the result.
    name: str
    # Details of the relation; `load_uri_using_path` reads the optional
    # "curies" entry of this dict to look up values in the graph.
    value: Dict[str, str]

30 

31 

@dataclass
class Filter:
    """
    Config entry describing one filter a graph has to satisfy.

    In `graph_matches_config` the value is appended to the `filters`
    section of the config and later iterated as `(predicate, object)`
    pairs.
    """

    # A list of (predicate, object) pairs. The annotation uses a proper
    # two-element tuple type; `Tuple[str:str]` would be a slice, not a type.
    value: List[Tuple[str, str]]

35 

36 

def resolve(obj, graph):
    """
    Turn a raw config value into the value used for graph lookups.

    * A string that starts and ends with angle brackets is interpreted
      as a URI: the brackets are removed and a `URIRef` is returned.
    * Any other string is interpreted as a curie and expanded using the
      namespace manager of `graph`.
    * The boolean `True` is turned into `None`.
    * Every other value is returned as it is.
    """
    if not isinstance(obj, str):
        # Only the literal `True` is replaced; `False`, `None`, numbers
        # and so on pass through untouched.
        return None if obj is True else obj

    is_bracketed = obj.startswith("<") and obj.endswith(">")
    if is_bracketed:
        return URIRef(obj[1:-1])
    return graph.namespace_manager.expand_curie(obj)

53 

54 

55@deprecated("Please switch to using objects instead of toml files.") 

56def load_path(path: str | Path) -> dict: 

57 """ 

58 Load a tomlfile either from a path or from the directory 

59 `triple_configs` in any of the app directories. 

60 """ 

61 if isinstance(path, str): 

62 files = [ 

63 directory / path for directory in get_app_template_dirs("triple_configs") 

64 ] 

65 files = list(filter(lambda file: file.exists(), files)) 

66 if files: 

67 path = files[0] 

68 else: 

69 raise ValueError(f"Could not find {path}") 

70 return tomllib.loads(Path(path).read_text()) 

71 

72 

def graph_matches_config(graph: Graph, configfile: Path) -> dict:
    """
    Return the config described by `configfile` if it matches `graph`.

    `configfile` is either a class whose attributes are `Filter`,
    `Attribute` and `Relation` instances, or a value that is passed on
    to `load_path`. The config matches if, for at least one of its
    filters, every (predicate, object) pair of that filter is contained
    in the graph. A loaded config without a `filters` key is checked
    against a single filter consisting of the pair `(None, None)`.

    Returns the config as dict if it matches, otherwise an empty dict.
    """
    if not inspect.isclass(configfile):
        config = load_path(configfile)
    else:
        config = {"relations": {}, "filters": [], "attributes": {}}
        member_names = [name for name in dir(configfile) if not name.startswith("__")]
        for name in member_names:
            member = getattr(configfile, name)
            if isinstance(member, Filter):
                config["filters"].append(member.value)
            elif isinstance(member, Attribute):
                config["attributes"][name.lower()] = member.value
            elif isinstance(member, Relation):
                config["relations"][member.name] = member.value

    fallback_filters = [[(None, None)]]
    for candidate in config.get("filters", fallback_filters):
        if isinstance(candidate, dict):
            candidate = candidate.items()
        try:
            # Resolve all pairs first; a ValueError raised while resolving
            # skips this filter before any lookup in the graph happens.
            patterns = [
                (None, resolve(predicate, graph), resolve(obj, graph))
                for predicate, obj in candidate
            ]
            hits = [pattern in graph for pattern in patterns]
        except ValueError as e:
            logger.debug("Filter %s does not match: %s", candidate, e)
        else:
            if all(hits):
                logger.debug("Using %s for parsing graph", configfile)
                return config
    return {}

106 

107 

def build_sparql_query(curie: str) -> str:
    """
    Build a SPARQL query with language preferences.

    Args:
        curie: predicate to filter on as defined in the toml.
               needs to include the predicate and optionally
               a lang tag to filter for separated with a comma.
               Eg "wdt:P122,en". Whitespace around the predicate
               and the lang tag is ignored. A value that already
               starts with "select" or "prefix" (case-insensitive)
               is treated as a complete query and returned as is.

    Returns:
        A SPARQL query string
    """
    if curie.lower().strip().startswith(("select", "prefix")):
        return curie

    predicate, comma, language = curie.partition(",")
    predicate = predicate.strip()
    lang_filter = ""
    if comma:
        # Strip the tag so that "wdt:P122, en" filters for "en", not " en".
        lang_filter = f'FILTER LANGMATCHES(LANG(?object), "{language.strip()}")'

    # The braces of the WHERE clause are doubled: inside an f-string a
    # single brace would start a replacement field.
    query = f"""
    SELECT ?object
    WHERE {{
        ?subject {predicate} ?object {lang_filter}
    }}
    """

    logger.debug("Generated SPARQL query: %s", query)
    return query

136 

137 

def get_value_graph(graph: Graph, curies: str | list[str]) -> list:
    """
    Collect the values the given curies lead to in `graph`.

    Every curie is turned into a query with `build_sparql_query` and run
    against the graph; the first column of each result row is used. A
    result that is a blank node is replaced by the objects attached to
    that node (except `RDF.Seq`). All values are converted to strings.

    Returns a list without duplicates, in order of first occurrence;
    an empty list if `curies` is `None`.
    """
    if curies is None:
        return []
    if isinstance(curies, str):
        curies = [curies]

    # A dict keeps insertion order and drops repeated keys, which gives
    # the de-duplicated, first-seen ordering of the result.
    unique = {}
    for curie in curies:
        rows = graph.query(build_sparql_query(curie))
        first_column = [row[0] for row in rows]
        for node in first_column:
            if not isinstance(node, BNode):
                unique.setdefault(str(node))
                continue
            for member in graph.objects(subject=node):
                if member != RDF.Seq:
                    unique.setdefault(str(member))
    return list(unique)

159 

160 

def load_uri_using_path(uri, configfile: Path | str) -> dict | None:
    """
    Fetch the RDF data behind `uri` and extract values as described
    by `configfile`.

    The uri is normalized with `get_normalized_uri` and parsed into a
    fresh graph; a `ParserError` during parsing is logged and the (then
    possibly empty) graph is used anyway.

    Returns:
        A dict mapping every configured attribute to the list of values
        found in the graph, plus a `relations` entry mapping every
        configured relation to its details with `curies` replaced by the
        values found. `None` if the config does not match the graph.
    """
    uri = get_normalized_uri(uri)
    graph = Graph()
    # workaround for a bug in d-nb: with the default list of accept
    # headers of rdflib, d-nb sometimes returns json-ld and sometimes turtle
    # with json-ld, rdflib has problems finding the namespaces
    rdf_format = "turtle" if uri.startswith("https://d-nb.info/gnd/") else None
    try:
        graph.parse(uri, format=rdf_format)
    except ParserError as e:
        logger.info(e)

    config = graph_matches_config(graph, configfile)
    if not config:
        return None

    result = defaultdict(list)
    result["relations"] = defaultdict(list)
    for attribute, curies in config.get("attributes", {}).items():
        result[attribute].extend(get_value_graph(graph, curies))
    for relation, details in config.get("relations", {}).items():
        # Build a new dict instead of assigning into `details`: for class
        # based configs `details` is the dict stored on the `Relation`
        # instance, and overwriting its "curies" would leak the values of
        # this graph into every later call.
        result["relations"][relation] = {
            **details,
            "curies": get_value_graph(graph, details.get("curies", [])),
        }
    return dict(result)