Coverage for src / crawler / zbmath.py: 15%

152 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-03-19 14:59 +0000

1import logging 

2import time 

3from datetime import datetime, timedelta 

4from typing import TYPE_CHECKING, Literal 

5 

6import langcodes 

7import requests 

8from django.conf import settings 

9from matching.matching import get_article_params, zbmath_matching_query 

10from ptf import model_data 

11from ptf.cmds.xml.ckeditor.utils import ( 

12 build_jats_data_from_html_field, 

13 get_html_and_xml_from_text_with_formulas, 

14) 

15from ptf.cmds.xml.xml_utils import escape 

16from ptf.model_data_converter import get_contrib_xml 

17from requests import HTTPError, ReadTimeout 

18 

19from crawler.utils import add_pdf_link_to_xarticle, get_session 

20 

if TYPE_CHECKING:
    # Import only for type annotations to avoid a runtime dependency cycle.
    from ptf.models import Article

logger = logging.getLogger(__name__)

# Throttling state for zbMATH requests.
# NOTE(review): `last_zbmath_request` is declared `global` in _zbmath_query_retry
# but never assigned anywhere in this file — presumably meant to hold the time of
# the last request; confirm before relying on it.
last_zbmath_request = 0
request_delay = 5
# NOTE(review): `_logger` is the exact same logger object as `logger` above
# (both getLogger(__name__)); the two names look redundant — consider unifying.
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
# {query} is either a zbMATH document id or a "_structured_search?..." query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)

33 

34 

35def parse_zbmath_article(zbmath_article: dict): 

36 """ 

37 Parse the json response of the zbMATH OPEN article fetched given its zblid 

38 zbmath_article: a json storing article data 

39 returns an ArticleData, that has no pid 

40 TODO: Move in ptf-back 

41 """ 

42 xarticle = model_data.create_articledata() 

43 for json_author in zbmath_article["contributors"]["authors"]: 

44 author = model_data.create_contributor(role="author", string_name=json_author["name"]) 

45 author["contrib_xml"] = get_contrib_xml(author) 

46 xarticle.contributors.append(author) 

47 

48 # extids 

49 zbmath_id = zbmath_article["identifier"] 

50 if zbmath_id is None: 

51 zbmath_id = zbmath_article["id"] 

52 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip())) 

53 

54 # Lang 

55 languages = zbmath_article["language"]["languages"] 

56 if len(languages) > 0: 

57 xarticle.lang = zbmath_article["language"]["languages"][0] 

58 

59 # Title 

60 xarticle.title_tex = zbmath_article["title"]["title"] 

61 

62 ckeditor_data = build_jats_data_from_html_field( 

63 xarticle.title_tex, 

64 tag="article-title", 

65 text_lang=xarticle.lang, 

66 delimiter_inline="$", 

67 delimiter_disp="$", 

68 ) 

69 xarticle.title_html = ckeditor_data["value_html"] 

70 xarticle.title_xml = ckeditor_data["value_xml"] 

71 

72 # Abstract 

73 zbl_abstract = next( 

74 ( 

75 c 

76 for c in zbmath_article["editorial_contributions"] 

77 if c["contribution_type"] in "summary" 

78 ), 

79 None, 

80 ) 

81 if zbl_abstract: 

82 abstract_data = model_data.create_abstract(value_tex=escape(zbl_abstract["text"])) 

83 if zbl_abstract["language"]: 

84 lang = langcodes.Language.find(zbl_abstract["language"]).language 

85 if lang and xarticle.lang and lang != xarticle.lang: 

86 abstract_data["lang"] = lang 

87 # Zbmath abstracts are sometimes impossible to parse 

88 # (They do not escape html-reserved characters like < >) 

89 try: 

90 abstract_data["value_html"], abstract_data["value_xml"] = ( 

91 get_html_and_xml_from_text_with_formulas( 

92 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\[" 

93 ) 

94 ) 

95 xarticle.abstracts.append(abstract_data) 

96 except BaseException as e: 

97 logger.error(f"Got exception while parsing abstract for {zbmath_id} : {e}") 

98 

99 # Keywords 

100 for kwd in zbmath_article["keywords"]: 

101 if not kwd: 

102 continue 

103 xarticle.kwds.append(model_data.create_subj(value=kwd)) 

104 # MSC 

105 for msc in zbmath_article["msc"]: 

106 if msc["scheme"] != "msc2020": 

107 continue 

108 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc")) 

109 

110 # Pages 

111 pages: str = zbmath_article["source"]["pages"] 

112 if pages: 

113 pages.split("-") 

114 if len(pages) > 0: 

115 xarticle.fpage = pages[0] 

116 if len(pages) > 1: 

117 xarticle.lpage = pages[1] 

118 

119 pdf_link: str | None = None 

120 for link in zbmath_article["links"]: 

121 match link["type"]: 

122 case "doi": 

123 xarticle.doi = link["identifier"].strip() 

124 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

125 xarticle.extids.append(("doi", link["identifier"].strip())) 

126 case "eudml": 

127 xarticle.extids.append(("eudml-item-id", link["identifier"].strip())) 

128 case "arxiv": 

129 xarticle.extids.append(("arxiv", link["identifier"].strip())) 

130 case "emis_ft": 

131 pdf_link = link["url"].strip() 

132 

133 case "emis": 

134 pdf_link = ( 

135 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/") 

136 ) 

137 if pdf_link: 

138 from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

139 

140 BaseCollectionCrawler.session = get_session() 

141 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link) 

142 if not isok: 

143 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}") 

144 else: 

145 add_pdf_link_to_xarticle(xarticle, pdf_link) 

146 

147 return xarticle 

148 

149 

def zbmath_request_article(zblid, force_refresh=False):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    Returns an ArticleData (with no pid), or None when the HTTP request failed.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    # LOTs of cache-specific code here.
    # Maybe handle the caching logic elsewhere
    # so this code can be reused without the requests_cache module

    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**session.headers, "Content-Type": "text/json"},
        force_refresh=force_refresh,  # requests_cache-specific keyword
    )
    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return

    try:
        payload = response.json()
    except requests.exceptions.JSONDecodeError as e:
        # Cannot invalidate: response is not taken from cache
        if force_refresh or not response.from_cache:
            # BUGFIX: JSONDecodeError requires (msg, doc, pos) positionally; the
            # original passed only a message plus a `response=` keyword, which made
            # the underlying json.JSONDecodeError.__init__ raise a TypeError.
            raise requests.exceptions.JSONDecodeError(
                f"Couldn't decode JSON from ZBMATH at address {response.url}",
                e.doc,
                e.pos,
                response=response,
            ) from e
        logger.error(f"Couldn't decode JSON from ZBMATH at address {response.url}.")
        logger.info(f"Retrying in {60}s ({(datetime.now() + timedelta(minutes=1)).time()})")
        time.sleep(60)
        # The cached response was corrupt: retry exactly once, bypassing the cache
        # (with force_refresh=True a second failure raises instead of recursing).
        return zbmath_request_article(zblid, force_refresh=True)

    return parse_zbmath_article(payload["result"])

187 

188 

# Maps our id_type names to the zbMATH structured-search field names
# (already URL-encoded: "arXiv%20ID" is "arXiv ID").
id_type_mapping = {"doi": "DOI", "arXiv": "arXiv%20ID"}

190 

191 

def _zbmath_request_article_by_extid(extid: str, id_type: Literal["doi", "arXiv"]):
    """
    Look up a single zbMATH document by an external identifier (DOI or arXiv id).

    Returns an ArticleData, or None when the id is not found or matches
    more than one document.
    """
    query_search = id_type_mapping[id_type]
    session = get_session()
    # NOTE(review): `extid` is interpolated verbatim; an id containing '&', '#'
    # or '%' would corrupt the query string — consider urllib.parse.quote().
    search = f"_structured_search?results_per_page=1&{query_search}={extid}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # CONSISTENCY: use the module logger (the original called the root
        # `logging.debug`, bypassing this module's logger configuration).
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return

    response.raise_for_status()
    payload = response.json()

    if payload["status"]["nr_total_results"] > 1:
        logger.error(f"ZBMATH API found multiple candidates for {id_type} {extid}")
        return
        # raise ValueError(f"ZBMATH API found multiple candidates for doi {doi}")
    if payload["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return
    return parse_zbmath_article(payload["result"][0])

215 

216 

def zbmath_request_article_by_doi(doi: str):
    """Fetch a zbMATH article by its DOI; returns an ArticleData or None."""
    return _zbmath_request_article_by_extid(doi, "doi")

219 

220 

def zbmath_request_article_by_arxivId(id: str):
    """Fetch a zbMATH article by its arXiv id; returns an ArticleData or None."""
    # NOTE(review): the parameter name `id` shadows the builtin, but it is part
    # of the public signature (keyword callers), so it is kept as-is.
    return _zbmath_request_article_by_extid(id, "arXiv")

223 

224 

def _zbmath_query_retry(params: dict, timeout: int):
    """
    Run a zbMATH matching query, retrying up to 3 times on ReadTimeout.

    params: query parameters (as built by get_article_params)
    timeout: per-request timeout in seconds
    Returns the response from zbmath_matching_query.
    Raises ReadTimeout when the final attempt also times out.
    """
    # BUGFIX: removed the dead `global last_zbmath_request` declaration — the
    # variable was never assigned in this function.
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if attempt >= RETRIES - 1:
                raise  # bare re-raise preserves the original traceback
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

    # Unreachable: the last iteration either returns or re-raises.
    # (Was `assert False`, which is stripped under `python -O`.)
    raise AssertionError("unreachable")

239 

240 

def match_zbl_article(article: "Article"):
    """Finds article using matching score"""
    # Build the matching query from the article metadata and send it to zbMATH.
    query_params = get_article_params(article)
    zbl_response = _zbmath_query_retry(query_params, timeout=30)
    payload = zbl_response.json()

    # The best candidate (if any) sits at results[0][0]; missing keys or empty
    # result lists simply mean "no match".
    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return

254 

255 

def zbmath_get_pdfurl(zbmathid: str):
    """Return the direct PDF download URL on zbmath.org for the given zbMATH id."""
    url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return url