Coverage for src / crawler / zbmath.py: 0%

131 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

1# import shutil 

2# import tempfile 

3# import urllib 

4# import xml.etree.ElementTree as ET 

5 

6 

7import logging 

8import time 

9from typing import TYPE_CHECKING 

10 

11import langcodes 

12from matching.matching import get_article_params, zbmath_matching_query 

13from ptf import model_data 

14from ptf.cmds.xml.ckeditor.utils import ( 

15 build_jats_data_from_html_field, 

16 get_html_and_xml_from_text_with_formulas, 

17) 

18from ptf.model_data_converter import get_contrib_xml 

19from requests import HTTPError, ReadTimeout 

20 

21from crawler.utils import add_pdf_link_to_xarticle, get_session 

22 

23if TYPE_CHECKING: 

24 from ptf.models import Article 

25 

26 

# Timestamp (time.time()) of the last live (non-cached) zbMATH request;
# used by _zbmath_query_retry to throttle consecutive requests.
last_zbmath_request = 0
# Minimum number of seconds to wait between two live zbMATH requests.
request_delay = 5
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
# API endpoint template; {query} is either a document id or a search query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
# Template for a document's PDF download URL, keyed by its zbMATH id.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"

33 

34 

35def parse_zbmath_article(zbmath_article): 

36 """ 

37 Parse the json response of the zbMATH OPEN article fetched given its zblid 

38 zbmath_article: a json storing article data 

39 returns an ArticleData, that has no pid 

40 TODO: Move in ptf-back 

41 """ 

42 xarticle = model_data.create_articledata() 

43 for json_author in zbmath_article["contributors"]["authors"]: 

44 author = model_data.create_contributor(role="author", string_name=json_author["name"]) 

45 author["contrib_xml"] = get_contrib_xml(author) 

46 xarticle.contributors.append(author) 

47 

48 # Lang 

49 languages = zbmath_article["language"]["languages"] 

50 if len(languages) > 0: 

51 xarticle.lang = zbmath_article["language"]["languages"][0] 

52 

53 # Title 

54 xarticle.title_tex = zbmath_article["title"]["title"] 

55 

56 ckeditor_data = build_jats_data_from_html_field( 

57 xarticle.title_tex, 

58 tag="article-title", 

59 text_lang=xarticle.lang, 

60 delimiter_inline="$", 

61 delimiter_disp="$", 

62 ) 

63 xarticle.title_html = ckeditor_data["value_html"] 

64 xarticle.title_xml = ckeditor_data["value_xml"] 

65 

66 # Abstract 

67 zbl_abstract = next( 

68 ( 

69 c 

70 for c in zbmath_article["editorial_contributions"] 

71 if c["contribution_type"] in "summary" 

72 ), 

73 None, 

74 ) 

75 if zbl_abstract: 

76 abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"]) 

77 if zbl_abstract["language"]: 

78 lang = langcodes.Language.find(zbl_abstract["language"]).language 

79 if lang and xarticle.lang and lang != xarticle.lang: 

80 abstract_data["lang"] = lang 

81 

82 abstract_data["value_html"], abstract_data["value_xml"] = ( 

83 get_html_and_xml_from_text_with_formulas( 

84 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\[" 

85 ) 

86 ) 

87 xarticle.abstracts.append(abstract_data) 

88 

89 # Keywords 

90 for kwd in zbmath_article["keywords"]: 

91 if not kwd: 

92 continue 

93 xarticle.kwds.append(model_data.create_subj(value=kwd)) 

94 # MSC 

95 for msc in zbmath_article["msc"]: 

96 if msc["scheme"] != "msc2020": 

97 continue 

98 xarticle.kwds.append(model_data.create_subj(value=msc["code"])) 

99 

100 # Pages 

101 pages: str = zbmath_article["source"]["pages"] 

102 if pages: 

103 pages.split("-") 

104 if len(pages) > 0: 

105 xarticle.fpage = pages[0] 

106 if len(pages) > 1: 

107 xarticle.lpage = pages[1] 

108 

109 # extids 

110 zbmath_id = zbmath_article["identifier"] 

111 if zbmath_id is None: 

112 zbmath_id = zbmath_article["id"] 

113 xarticle.extids.append(("zbl-item-id", zbmath_id.strip())) 

114 

115 for link in zbmath_article["links"]: 

116 match link["type"]: 

117 case "doi": 

118 xarticle.doi = link["identifier"].strip() 

119 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

120 xarticle.extids.append(("doi", link["identifier"].strip())) 

121 case "eudml": 

122 xarticle.extids.append(("eudml-item-id", link["identifier"].strip())) 

123 case "arxiv": 

124 xarticle.extids.append(("arxiv", link["identifier"].strip())) 

125 case "emis_ft": 

126 add_pdf_link_to_xarticle(xarticle, link["url"].strip()) 

127 return xarticle 

128 

129 

def zbmath_request_article(zblid):
    """
    Fetch the article from zbMATH OPEN given its zblid
    returns an ArticleData, that has no pid
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    headers = {**session.headers, "Content-Type": "text/json"}
    response = session.get(ZBMATH_API_URL.format(query=zblid), headers=headers)

    # Bail out silently on any HTTP error status (404, 5xx, ...)
    try:
        response.raise_for_status()
    except HTTPError:
        return

    # NOTE(review): sleeping when the response *is* cached looks inverted —
    # throttling usually applies to live requests. Confirm intent.
    if response.from_cache:
        time.sleep(10)

    payload = response.json()
    return parse_zbmath_article(payload["result"])

151 

152 

def zbmath_request_article_by_doi(doi: str):
    """
    Fetch an article from the zbMATH OPEN API given its DOI.

    Returns an ArticleData (without pid) or None when the DOI is unknown to
    zbMATH or resolves to more than one document.
    """
    session = get_session()
    search = f"_structured_search?results_per_page=1&DOI={doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # Consistency fix: use the module logger (_logger) instead of the root logger.
        _logger.debug(f"ZBMATH API {doi} not found")
        return

    response.raise_for_status()

    # NOTE(review): sleeping when the response *is* cached looks inverted —
    # throttling usually applies to live requests. Confirm intent.
    if response.from_cache:
        time.sleep(10)

    response = response.json()
    if response["status"]["nr_total_results"] > 1:
        _logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return
        # raise ValueError(f"ZBMATH API found multiple candidates for doi {doi}")
    if response["status"]["nr_total_results"] == 0:
        _logger.debug(f"ZBMATH API {doi} not found")
        return
    return parse_zbmath_article(response["result"][0])

178 

179 

def zbmath_request_article_by_extid(extid: str):
    """Fetch an article from zbMATH OPEN given an external id.

    Not implemented yet: this stub always returns None.
    """
    pass

182 

183 

def _zbmath_query_retry(params: dict, timeout: int):
    """Run a zbMATH matching query, throttled and retried on read timeouts.

    Ensures at least `request_delay` seconds separate live requests (tracked
    via the module-global `last_zbmath_request`), and retries up to three
    times on ReadTimeout, pausing 60 seconds between attempts. The final
    ReadTimeout is re-raised to the caller.
    """
    global last_zbmath_request

    max_attempts = 3
    attempt = 0
    while True:
        try:
            # Throttle: wait out the remainder of the delay window, if any.
            wait_for = (last_zbmath_request + request_delay) - time.time()
            if wait_for > 0:
                _logger.info(f"Waiting {round(wait_for)}s before making another request")
                time.sleep(wait_for)

            response = zbmath_matching_query(params, timeout, get_session())
            # Only live (non-cached) requests count against the rate limit.
            if not getattr(response, "from_cache", False):
                last_zbmath_request = time.time()
            return response
        except ReadTimeout:
            attempt += 1
            if attempt >= max_attempts:
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

205 

206 

def match_zbl_article(article: "Article"):
    """Finds article using matching score.

    Queries zbMATH with the article's matching parameters and returns the
    best-scored result, or None when no match is found.
    """
    response = _zbmath_query_retry(get_article_params(article), timeout=30)
    results = response.json()

    try:
        # Best candidate: first item of the first result group.
        return results["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return

220 

221 

222def zbmath_get_pdfurl(zbmathid: str): 

223 return ZBMATH_PDF_URL.format(zbmathid=zbmathid)