Coverage for apis_core/utils/rdf.py: 88%

86 statements  

« prev     ^ index     » next       coverage.py v7.5.3, created at 2025-12-04 11:32 +0000

1# SPDX-FileCopyrightText: 2025 Birger Schacht 

2# SPDX-License-Identifier: MIT 

3 

4import logging 

5import tomllib 

6from collections import defaultdict 

7from pathlib import Path 

8 

9from AcdhArcheAssets.uri_norm_rules import get_normalized_uri 

10from django.template.exceptions import TemplateDoesNotExist 

11from django.template.loader import render_to_string 

12from django.template.utils import get_app_template_dirs 

13from rdflib import RDF, BNode, Graph, URIRef 

14from rdflib.exceptions import ParserError 

15 

16logger = logging.getLogger(__name__) 

17 

18 

def resolve(obj, graph):
    """
    Convert a raw config value into the term used when matching triples.

    A string wrapped in angle brackets (``<...>``) becomes a ``URIRef``
    of the enclosed text. Any other string is treated as a CURIE and is
    expanded with the namespace manager of ``graph``. The boolean ``True``
    is mapped to ``None``. Every other value is returned unchanged.
    """
    if not isinstance(obj, str):
        # Only the literal ``True`` is replaced by ``None``; ``False`` and
        # all remaining non-string values pass through untouched.
        return None if obj is True else obj
    is_bracketed = obj.startswith("<") and obj.endswith(">")
    if is_bracketed:
        # Drop the surrounding angle brackets.
        return URIRef(obj[1:-1])
    return graph.namespace_manager.expand_curie(obj)

35 

36 

def load_path(path: str | Path) -> dict:
    """
    Load a TOML file and return its parsed content as a dict.

    The path is first passed to Django's ``render_to_string``; if that
    succeeds, the rendered text is parsed as TOML. If no such template
    exists and ``path`` is a string, the file is looked up in the
    directories returned by ``get_app_template_dirs("triple_configs")``
    and the first existing candidate is used. A ``Path`` that is not a
    template is read directly.

    Raises:
        ValueError: if ``path`` is a string and no matching file exists
            in any of the ``triple_configs`` directories.
    """
    try:
        return tomllib.loads(render_to_string(path))
    except TemplateDoesNotExist:
        logger.debug("Tried to load template %s but it does not exist", path)
    if isinstance(path, str):
        candidates = (
            directory / path for directory in get_app_template_dirs("triple_configs")
        )
        # Stop at the first candidate that exists on disk.
        found = next(
            (candidate for candidate in candidates if candidate.exists()), None
        )
        if found is None:
            raise ValueError(f"Could not find {path}")
        path = found
    # TOML documents are UTF-8 by specification, so do not rely on the
    # locale's default encoding when reading the file.
    return tomllib.loads(Path(path).read_text(encoding="utf-8"))

56 

57 

def graph_matches_config(graph: Graph, configfile: Path) -> dict:
    """
    Load the config in ``configfile`` and test it against ``graph``.

    Every entry of the config's ``filters`` list maps predicates to
    objects. Each pair is resolved and turned into a triple pattern whose
    subject is ``None``; an entry matches when all of its patterns are
    contained in the graph. The config is returned as soon as one entry
    matches. If no entry matches, an empty dict is returned.

    A ``ValueError`` raised while handling an entry is logged and the
    entry is treated as not matching.
    """
    config = load_path(configfile)
    for _filter in config.get("filters", [{None: None}]):
        try:
            patterns = [
                (None, resolve(predicate, graph), resolve(obj, graph))
                for predicate, obj in _filter.items()
            ]
            hits = [pattern in graph for pattern in patterns]
            if not all(hits):
                continue
            logger.debug("Using %s for parsing graph", configfile)
            return config
        except ValueError as e:
            logger.debug("Filter %s does not match: %s", _filter, e)
    return {}

77 

78 

def build_sparql_query(curie: str) -> str:
    """
    Build a SPARQL query with language preferences.

    Args:
        curie: predicate to filter on as defined in the toml.
               Needs to include the predicate and optionally
               a lang tag to filter for, separated with a comma.
               Eg "wdt:P122,en". Whitespace around either part is
               ignored. A value that already starts with ``select``
               or ``prefix`` (case-insensitive, leading whitespace
               ignored) is taken to be a complete query.

    Returns:
        A SPARQL query string. A value that is already a complete
        query is returned unchanged.
    """
    if curie.lower().strip().startswith(("select", "prefix")):
        return curie
    lang_filter = ""
    if "," in curie:
        curie, lang_tag = curie.split(",", 1)
        # Strip the tag: a stray space inside the quoted literal would
        # keep LANGMATCHES from ever matching.
        lang_filter = f'FILTER LANGMATCHES(LANG(?object), "{lang_tag.strip()}")'
    # Literal braces of the WHERE group are doubled because this is an
    # f-string.
    query = f"""
    SELECT ?object
    WHERE {{
        ?subject {curie.strip()} ?object {lang_filter}
    }}
    """

    logger.debug("Generated SPARQL query: %s", query)
    return query

107 

108 

109def get_value_graph(graph: Graph, curies: str | list[str]) -> list: 

110 values = [] 

111 if isinstance(curies, str): 

112 curies = [curies] 

113 for curie in curies: 

114 results = graph.query(build_sparql_query(curie)) 

115 objects = [result[0] for result in results] 

116 for obj in objects: 

117 if isinstance(obj, BNode): 

118 values.extend( 

119 [ 

120 str(value) 

121 for value in graph.objects(subject=obj) 

122 if value != RDF.Seq 

123 ] 

124 ) 

125 else: 

126 values.append(str(obj)) 

127 return list(dict.fromkeys(values)) 

128 

129 

def load_uri_using_path(uri, configfile: Path) -> dict | None:
    """
    Parse the RDF data behind ``uri`` and extract values using ``configfile``.

    The URI is normalized with ``get_normalized_uri`` and parsed into a
    new graph. If the config in ``configfile`` matches the graph (see
    ``graph_matches_config``), the values for its ``attributes`` and
    ``relations`` sections are looked up in the graph.

    Returns:
        A dict with one list of values per configured attribute and a
        ``relations`` entry that maps each relation name to its config
        details, where ``curies`` has been replaced by the values found
        in the graph. ``None`` if the config does not match the graph.
    """
    uri = get_normalized_uri(uri)
    graph = Graph()
    # workaround for a bug in d-nb: with the default list of accept
    # headers of rdflib, d-nb sometimes returns json-ld and sometimes turtle
    # with json-ld, rdflib has problems finding the namespaces
    rdf_format = "turtle" if uri.startswith("https://d-nb.info/gnd/") else None
    try:
        graph.parse(uri, format=rdf_format)
    except ParserError as e:
        # Best effort: carry on with whatever the graph contains.
        logger.info(e)

    config = graph_matches_config(graph, configfile)
    if not config:
        return None

    result = defaultdict(list)
    result["relations"] = defaultdict(list)
    for attribute, curies in config.get("attributes", {}).items():
        result[attribute].extend(get_value_graph(graph, curies))
    for relation, details in config.get("relations", {}).items():
        # The relation's config entry is reused as the result entry, with
        # its curies swapped for the values they select.
        details["curies"] = get_value_graph(graph, details.get("curies", []))
        result["relations"][relation] = details
    return dict(result)