Coverage for src / crawler / zbmath.py: 14%

134 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

1import logging 

2import time 

3from typing import TYPE_CHECKING 

4 

5import langcodes 

6from django.conf import settings 

7from matching.matching import get_article_params, zbmath_matching_query 

8from ptf import model_data 

9from ptf.cmds.xml.ckeditor.utils import ( 

10 build_jats_data_from_html_field, 

11 get_html_and_xml_from_text_with_formulas, 

12) 

13from ptf.model_data_converter import get_contrib_xml 

14from requests import HTTPError, ReadTimeout 

15 

16from crawler.utils import add_pdf_link_to_xarticle, get_session 

17 

18if TYPE_CHECKING: 

19 from ptf.models import Article 

20 

# Module-wide logger.
# NOTE(review): `_logger` below is a second handle to the exact same logger
# (both use __name__); consider consolidating on a single name.
logger = logging.getLogger(__name__)

# Epoch timestamp of the last request sent to zbMATH.
# NOTE(review): never updated in the visible code (only declared `global` in
# _zbmath_query_retry) — possibly vestigial. TODO confirm.
last_zbmath_request = 0
# Intended minimum delay (seconds) between zbMATH requests.
# NOTE(review): not read anywhere in the visible code — possibly vestigial.
request_delay = 5
_logger = logging.getLogger(__name__)  # duplicate of `logger` above
# Base URL of the zbMATH OPEN website.
ZBMATH_URL = "https://zbmath.org"
# REST API endpoint; `query` is either a zbl id or a search query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
# Direct full-text PDF URL template.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
# Interval (seconds) between API requests; overridable via Django settings.
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)

30 

31 

32def parse_zbmath_article(zbmath_article): 

33 """ 

34 Parse the json response of the zbMATH OPEN article fetched given its zblid 

35 zbmath_article: a json storing article data 

36 returns an ArticleData, that has no pid 

37 TODO: Move in ptf-back 

38 """ 

39 xarticle = model_data.create_articledata() 

40 for json_author in zbmath_article["contributors"]["authors"]: 

41 author = model_data.create_contributor(role="author", string_name=json_author["name"]) 

42 author["contrib_xml"] = get_contrib_xml(author) 

43 xarticle.contributors.append(author) 

44 

45 # Lang 

46 languages = zbmath_article["language"]["languages"] 

47 if len(languages) > 0: 

48 xarticle.lang = zbmath_article["language"]["languages"][0] 

49 

50 # Title 

51 xarticle.title_tex = zbmath_article["title"]["title"] 

52 

53 ckeditor_data = build_jats_data_from_html_field( 

54 xarticle.title_tex, 

55 tag="article-title", 

56 text_lang=xarticle.lang, 

57 delimiter_inline="$", 

58 delimiter_disp="$", 

59 ) 

60 xarticle.title_html = ckeditor_data["value_html"] 

61 xarticle.title_xml = ckeditor_data["value_xml"] 

62 

63 # Abstract 

64 zbl_abstract = next( 

65 ( 

66 c 

67 for c in zbmath_article["editorial_contributions"] 

68 if c["contribution_type"] in "summary" 

69 ), 

70 None, 

71 ) 

72 if zbl_abstract: 

73 abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"]) 

74 if zbl_abstract["language"]: 

75 lang = langcodes.Language.find(zbl_abstract["language"]).language 

76 if lang and xarticle.lang and lang != xarticle.lang: 

77 abstract_data["lang"] = lang 

78 

79 abstract_data["value_html"], abstract_data["value_xml"] = ( 

80 get_html_and_xml_from_text_with_formulas( 

81 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\[" 

82 ) 

83 ) 

84 xarticle.abstracts.append(abstract_data) 

85 

86 # Keywords 

87 for kwd in zbmath_article["keywords"]: 

88 if not kwd: 

89 continue 

90 xarticle.kwds.append(model_data.create_subj(value=kwd)) 

91 # MSC 

92 for msc in zbmath_article["msc"]: 

93 if msc["scheme"] != "msc2020": 

94 continue 

95 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc")) 

96 

97 # Pages 

98 pages: str = zbmath_article["source"]["pages"] 

99 if pages: 

100 pages.split("-") 

101 if len(pages) > 0: 

102 xarticle.fpage = pages[0] 

103 if len(pages) > 1: 

104 xarticle.lpage = pages[1] 

105 

106 # extids 

107 zbmath_id = zbmath_article["identifier"] 

108 if zbmath_id is None: 

109 zbmath_id = zbmath_article["id"] 

110 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip())) 

111 

112 pdf_link: str | None = None 

113 for link in zbmath_article["links"]: 

114 match link["type"]: 

115 case "doi": 

116 xarticle.doi = link["identifier"].strip() 

117 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

118 xarticle.extids.append(("doi", link["identifier"].strip())) 

119 case "eudml": 

120 xarticle.extids.append(("eudml-item-id", link["identifier"].strip())) 

121 case "arxiv": 

122 xarticle.extids.append(("arxiv", link["identifier"].strip())) 

123 case "emis_ft": 

124 pdf_link = link["url"].strip() 

125 

126 case "emis": 

127 pdf_link = ( 

128 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/") 

129 ) 

130 if pdf_link: 

131 from crawler.base_crawler import BaseCollectionCrawler 

132 

133 BaseCollectionCrawler.session = get_session() 

134 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link) 

135 if not isok: 

136 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}") 

137 else: 

138 add_pdf_link_to_xarticle(xarticle, pdf_link) 

139 

140 return xarticle 

141 

142 

def zbmath_request_article(zblid):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    returns an ArticleData, that has no pid, or None when the API
    answers with an HTTP error status.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    url = ZBMATH_API_URL.format(query=zblid)
    request_headers = {**session.headers, "Content-Type": "text/json"}
    response = session.get(url, headers=request_headers)

    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return None

    payload = response.json()
    return parse_zbmath_article(payload["result"])

162 

163 

def zbmath_request_article_by_doi(doi: str):
    """
    Fetch an article from the zbMATH OPEN API given its DOI.

    Returns an ArticleData (without pid), or None when the DOI is unknown
    to zbMATH or matches more than one document.
    """
    session = get_session()
    # NOTE(review): the DOI is interpolated verbatim; a DOI containing '&' or
    # '#' would corrupt the query string — consider urllib.parse.quote. TODO confirm.
    search = f"_structured_search?results_per_page=1&DOI={doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # CONSISTENCY FIX: use the module logger instead of the root `logging`
        # module (the rest of this file logs through `logger`/`_logger`).
        logger.debug(f"ZBMATH API {doi} not found")
        return

    response.raise_for_status()
    response = response.json()

    # More than one hit means the DOI match is ambiguous: bail out.
    if response["status"]["nr_total_results"] > 1:
        logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return
        # raise ValueError(f"ZBMATH API found multiple candidates for doi {doi}")
    if response["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {doi} not found")
        return
    return parse_zbmath_article(response["result"][0])

186 

187 

def zbmath_request_article_by_extid(extid: str):
    """Look up a zbMATH article by an external id.

    Not implemented yet: this stub always returns None.
    """
    pass

190 

191 

def _zbmath_query_retry(params: dict, timeout: int):
    """
    Run a zbMATH matching query, retrying on read timeouts.

    params: query parameters forwarded to zbmath_matching_query
    timeout: per-request timeout in seconds
    Returns the query response; re-raises ReadTimeout after 3 failed attempts.

    Cleanups vs. the original: removed an unused `global last_zbmath_request`
    declaration (the variable was never assigned here), switched `raise e` to a
    bare `raise` (keeps the full traceback), and replaced the trailing
    `assert False` with an AssertionError so it survives `python -O`.
    """
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if attempt >= RETRIES - 1:
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

    raise AssertionError("unreachable")

206 

207 

def match_zbl_article(article: "Article"):
    """Finds article using matching score"""
    # Build the query from the article's metadata, then ask zbMATH to rank
    # candidate documents against it.
    query_params = get_article_params(article)
    matching_response = _zbmath_query_retry(query_params, timeout=30)
    payload = matching_response.json()

    # The best candidate sits at results[0][0]; any missing level means no match.
    try:
        best_match = payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
    return best_match

221 

222 

def zbmath_get_pdfurl(zbmathid: str):
    """Return the zbMATH full-text PDF url for the given zbMATH id."""
    pdf_url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return pdf_url