Coverage for src / crawler / zbmath.py: 14%

138 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-02-17 12:56 +0000

1import logging 

2import time 

3from typing import TYPE_CHECKING 

4 

5import langcodes 

6from django.conf import settings 

7from matching.matching import get_article_params, zbmath_matching_query 

8from ptf import model_data 

9from ptf.cmds.xml.ckeditor.utils import ( 

10 build_jats_data_from_html_field, 

11 get_html_and_xml_from_text_with_formulas, 

12) 

13from ptf.cmds.xml.xml_utils import escape 

14from ptf.model_data_converter import get_contrib_xml 

15from requests import HTTPError, ReadTimeout 

16 

17from crawler.utils import add_pdf_link_to_xarticle, get_session 

18 

19if TYPE_CHECKING: 

20 from ptf.models import Article 

21 

logger = logging.getLogger(__name__)

# NOTE(review): last_zbmath_request / request_delay look like rate-limiting
# state, but neither is read anywhere in this file (last_zbmath_request only
# appears in a `global` declaration) — confirm they are used elsewhere before
# removing.
last_zbmath_request = 0
request_delay = 5
# NOTE(review): duplicate of `logger` above (same __name__); consider
# consolidating on a single module logger.
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
# API endpoint template; `query` is either a document id or a search string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
# Minimum interval between zbMATH requests, overridable via Django settings.
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)

31 

32 

33def parse_zbmath_article(zbmath_article: dict): 

34 """ 

35 Parse the json response of the zbMATH OPEN article fetched given its zblid 

36 zbmath_article: a json storing article data 

37 returns an ArticleData, that has no pid 

38 TODO: Move in ptf-back 

39 """ 

40 xarticle = model_data.create_articledata() 

41 for json_author in zbmath_article["contributors"]["authors"]: 

42 author = model_data.create_contributor(role="author", string_name=json_author["name"]) 

43 author["contrib_xml"] = get_contrib_xml(author) 

44 xarticle.contributors.append(author) 

45 

46 # extids 

47 zbmath_id = zbmath_article["identifier"] 

48 if zbmath_id is None: 

49 zbmath_id = zbmath_article["id"] 

50 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip())) 

51 

52 # Lang 

53 languages = zbmath_article["language"]["languages"] 

54 if len(languages) > 0: 

55 xarticle.lang = zbmath_article["language"]["languages"][0] 

56 

57 # Title 

58 xarticle.title_tex = zbmath_article["title"]["title"] 

59 

60 ckeditor_data = build_jats_data_from_html_field( 

61 xarticle.title_tex, 

62 tag="article-title", 

63 text_lang=xarticle.lang, 

64 delimiter_inline="$", 

65 delimiter_disp="$", 

66 ) 

67 xarticle.title_html = ckeditor_data["value_html"] 

68 xarticle.title_xml = ckeditor_data["value_xml"] 

69 

70 # Abstract 

71 zbl_abstract = next( 

72 ( 

73 c 

74 for c in zbmath_article["editorial_contributions"] 

75 if c["contribution_type"] in "summary" 

76 ), 

77 None, 

78 ) 

79 if zbl_abstract: 

80 abstract_data = model_data.create_abstract(value_tex=escape(zbl_abstract["text"])) 

81 if zbl_abstract["language"]: 

82 lang = langcodes.Language.find(zbl_abstract["language"]).language 

83 if lang and xarticle.lang and lang != xarticle.lang: 

84 abstract_data["lang"] = lang 

85 # Zbmath abstracts are sometimes impossible to parse 

86 # (They do not escape html-reserved characters like < >) 

87 try: 

88 abstract_data["value_html"], abstract_data["value_xml"] = ( 

89 get_html_and_xml_from_text_with_formulas( 

90 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\[" 

91 ) 

92 ) 

93 xarticle.abstracts.append(abstract_data) 

94 except BaseException as e: 

95 logger.error(f"Got exception while parsing abstract for {zbmath_id} : {e}") 

96 

97 # Keywords 

98 for kwd in zbmath_article["keywords"]: 

99 if not kwd: 

100 continue 

101 xarticle.kwds.append(model_data.create_subj(value=kwd)) 

102 # MSC 

103 for msc in zbmath_article["msc"]: 

104 if msc["scheme"] != "msc2020": 

105 continue 

106 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc")) 

107 

108 # Pages 

109 pages: str = zbmath_article["source"]["pages"] 

110 if pages: 

111 pages.split("-") 

112 if len(pages) > 0: 

113 xarticle.fpage = pages[0] 

114 if len(pages) > 1: 

115 xarticle.lpage = pages[1] 

116 

117 pdf_link: str | None = None 

118 for link in zbmath_article["links"]: 

119 match link["type"]: 

120 case "doi": 

121 xarticle.doi = link["identifier"].strip() 

122 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

123 xarticle.extids.append(("doi", link["identifier"].strip())) 

124 case "eudml": 

125 xarticle.extids.append(("eudml-item-id", link["identifier"].strip())) 

126 case "arxiv": 

127 xarticle.extids.append(("arxiv", link["identifier"].strip())) 

128 case "emis_ft": 

129 pdf_link = link["url"].strip() 

130 

131 case "emis": 

132 pdf_link = ( 

133 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/") 

134 ) 

135 if pdf_link: 

136 from crawler.base_crawler import BaseCollectionCrawler 

137 

138 BaseCollectionCrawler.session = get_session() 

139 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link) 

140 if not isok: 

141 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}") 

142 else: 

143 add_pdf_link_to_xarticle(xarticle, pdf_link) 

144 

145 return xarticle 

146 

147 

def zbmath_request_article(zblid):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    Returns an ArticleData (without pid), or None when the API replies with an
    HTTP error status.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    url = ZBMATH_API_URL.format(query=zblid)
    headers = dict(session.headers)
    headers["Content-Type"] = "text/json"
    response = session.get(url, headers=headers)

    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return None

    payload = response.json()
    return parse_zbmath_article(payload["result"])

167 

168 

def zbmath_request_article_by_doi(doi: str):
    """
    Fetch an article from zbMATH OPEN by its DOI via the structured search API.

    doi: the article DOI
    Returns an ArticleData (without pid), or None when the DOI is unknown to
    zbMATH or resolves to more than one document.
    Raises HTTPError for non-404 HTTP failures.
    """
    session = get_session()
    search = f"_structured_search?results_per_page=1&DOI={doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # BUGFIX: use the module logger (was the root logger via logging.debug)
        logger.debug(f"ZBMATH API {doi} not found")
        return None

    response.raise_for_status()
    results = response.json()

    if results["status"]["nr_total_results"] > 1:
        # Ambiguous match: refuse to guess which document is the right one
        logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return None
    if results["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {doi} not found")
        return None
    return parse_zbmath_article(results["result"][0])

191 

192 

def zbmath_request_article_by_extid(extid: str):
    """Placeholder: fetch an article from zbMATH OPEN by an external id.

    Not implemented yet; currently always returns None.
    """
    pass

195 

196 

def _zbmath_query_retry(params: dict, timeout: int):
    """
    Run a zbMATH matching query, retrying on read timeouts.

    params: query parameters forwarded to zbmath_matching_query
    timeout: per-request timeout in seconds
    Returns the response from zbmath_matching_query.
    Raises ReadTimeout once all retries are exhausted.
    """
    # BUGFIX: removed unused `global last_zbmath_request` declaration — the
    # name was never read or assigned in this function.
    RETRIES = 3
    for i in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if i >= RETRIES - 1:
                # Last attempt failed: bare raise preserves the original traceback
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

    # Defensive: the loop always returns or raises. Use a real exception
    # instead of `assert False`, which is stripped under `python -O`.
    raise RuntimeError("unreachable")

211 

212 

def match_zbl_article(article: "Article"):
    """Find the best zbMATH match for *article* using the matching score.

    Returns the top-scored result item, or None when nothing matches.
    """
    # query zbl
    query_params = get_article_params(article)
    response = _zbmath_query_retry(query_params, timeout=30)
    payload = response.json()

    try:
        best = payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
    return best

226 

227 

def zbmath_get_pdfurl(zbmathid: str):
    """Return the download url of the PDF hosted on zbMATH for *zbmathid*."""
    pdf_url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return pdf_url