Coverage for src/crawler/zbmath.py: 0%

108 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-11-21 14:41 +0000

1# import shutil 

2# import tempfile 

3# import urllib 

4# import xml.etree.ElementTree as ET 

5 

6 

7import logging 

8import time 

9from typing import TYPE_CHECKING 

10 

11import langcodes 

12from matching.matching import get_article_params, zbmath_matching_query 

13from ptf import model_data 

14from ptf.cmds.xml.ckeditor.utils import ( 

15 build_jats_data_from_html_field, 

16 get_html_and_xml_from_text_with_formulas, 

17) 

18from ptf.external import get_session 

19from ptf.model_data_converter import get_contrib_xml 

20from requests import ReadTimeout 

21 

22if TYPE_CHECKING: 

23 from ptf.models import Article 

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# Base URL of the zbMATH website.
ZBMATH_URL = "https://zbmath.org"

# Template for zbMATH REST API document requests. ``query`` is filled either
# with a document identifier or with a ``_structured_search?...`` string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"

# Template for the PDF URL of a zbMATH item; ``zbmathid`` is the item id.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"

28 

29 

def parse_zbmath_article(zbmath_article):
    """Convert one zbMATH API document record into an article data object.

    Args:
        zbmath_article: Mapping describing a single document. The keys read
            here are ``contributors.authors[].name``, ``language.languages``,
            ``title.title``, ``editorial_contributions``, ``keywords``,
            ``msc``, ``source.pages``, ``identifier`` / ``id`` and ``links``.

    Returns:
        The object created by ``model_data.create_articledata()``, populated
        with contributors, language, title (TeX/HTML/XML), an optional
        abstract, keywords (plain keywords plus MSC 2020 codes), first/last
        page, DOI and external ids.

    Raises:
        KeyError: If one of the keys listed above is missing.
        IndexError: If ``language.languages`` is empty.
    """
    xarticle = model_data.create_articledata()

    # Authors
    for json_author in zbmath_article["contributors"]["authors"]:
        author = model_data.create_contributor(role="author", string_name=json_author["name"])
        author["contrib_xml"] = get_contrib_xml(author)
        xarticle.contributors.append(author)

    # Lang: only the first listed language is kept.
    xarticle.lang = zbmath_article["language"]["languages"][0]

    # Title: formulas are delimited with "$" for both inline and display math.
    xarticle.title_tex = zbmath_article["title"]["title"]
    ckeditor_data = build_jats_data_from_html_field(
        xarticle.title_tex,
        tag="article-title",
        text_lang=xarticle.lang,
        delimiter_inline="$",
        delimiter_disp="$",
    )
    xarticle.title_html = ckeditor_data["value_html"]
    xarticle.title_xml = ckeditor_data["value_xml"]

    # Abstract: first editorial contribution of type "summary", if any.
    zbl_abstract = next(
        (
            c
            for c in zbmath_article["editorial_contributions"]
            if c["contribution_type"] == "summary"
        ),
        None,
    )
    if zbl_abstract:
        abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"])
        if zbl_abstract["language"]:
            lang = langcodes.Language.find(zbl_abstract["language"]).language
            # Only record the abstract language when it differs from the article's.
            if lang and xarticle.lang and lang != xarticle.lang:
                abstract_data["lang"] = lang

        # Unlike the title, abstracts use \( ... \) and \[ ... \] delimiters.
        abstract_data["value_html"], abstract_data["value_xml"] = (
            get_html_and_xml_from_text_with_formulas(
                abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
            )
        )
        xarticle.abstracts.append(abstract_data)

    # Keywords
    for kwd in zbmath_article["keywords"]:
        xarticle.kwds.append(model_data.create_subj(value=kwd))

    # MSC: only codes from the "msc2020" scheme are kept.
    for msc in zbmath_article["msc"]:
        if msc["scheme"] != "msc2020":
            continue
        xarticle.kwds.append(model_data.create_subj(value=msc["code"]))

    # Pages
    # str.split() returns a new list and leaves the string untouched, so the
    # split result must be kept; indexing the raw string would yield single
    # characters instead of page numbers.
    pages = zbmath_article["source"]["pages"] or ""
    page_parts = [part.strip() for part in pages.split("-") if part.strip()]
    if page_parts:
        xarticle.fpage = page_parts[0]
    if len(page_parts) > 1:
        xarticle.lpage = page_parts[1]

    # extids: fall back on "id" when "identifier" is null.
    zbmath_id = zbmath_article["identifier"]
    if zbmath_id is None:
        zbmath_id = zbmath_article["id"]
    xarticle.extids.append(("zbl-item-id", zbmath_id))

    # Links: "doi" sets the DOI, "eudml" adds an external id, others are ignored.
    for link in zbmath_article["links"]:
        if link["type"] == "http":
            continue
        if link["type"] == "doi":
            xarticle.doi = link["identifier"]
            continue
        if link["type"] == "eudml":
            xarticle.extids.append(("eudml-item-id", link["identifier"]))
    return xarticle

108 

109 

def zbmath_request_article(zblid):
    """Fetch a single zbMATH document by identifier and parse it.

    Args:
        zblid: Value substituted for ``query`` in ``ZBMATH_API_URL``.

    Returns:
        The result of ``parse_zbmath_article`` applied to the ``"result"``
        entry of the JSON response.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an HTTP error status.
    """
    http = get_session()
    reply = http.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**http.headers, "Content-Type": "text/json"},
    )
    reply.raise_for_status()

    payload = reply.json()
    document = payload["result"]
    return parse_zbmath_article(document)

120 

121 

def zbmath_request_article_by_doi(doi: str):
    """Look up a zbMATH document by DOI and parse it.

    Args:
        doi: DOI to search for. It is percent-encoded before being placed in
            the query string, so reserved characters (``&``, ``#``, ``+``,
            ``<``, ...) cannot alter the request.

    Returns:
        The result of ``parse_zbmath_article`` for the single match, or
        ``None`` when the API answers 404, reports no result, or reports more
        than one candidate.

    Raises:
        Whatever ``response.raise_for_status()`` raises on a non-404 HTTP
        error status.
    """
    # Local import keeps this block self-contained; the module does not
    # import urllib at file level.
    from urllib.parse import quote

    session = get_session()
    encoded_doi = quote(doi, safe="")
    search = f"_structured_search?results_per_page=1&DOI={encoded_doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        _logger.debug("ZBMATH API %s not found", doi)
        return None

    response.raise_for_status()

    payload = response.json()
    total = payload["status"]["nr_total_results"]
    if total > 1:
        # Ambiguous match: deliberately logged and skipped instead of raising.
        _logger.error("ZBMATH API found multiple candidates for doi %s", doi)
        return None
    if total == 0:
        _logger.debug("ZBMATH API %s not found", doi)
        return None
    return parse_zbmath_article(payload["result"][0])

144 

145 

def zbmath_request_article_by_extid(extid: str):
    """Placeholder for a lookup by external id; not implemented.

    Args:
        extid: Currently ignored.

    Returns:
        Always ``None``.
    """
    return None

148 

149 

def _zbmath_query_retry(params: dict, timeout: int):
    """Run ``zbmath_matching_query``, retrying when the read times out.

    Up to three attempts are made, sleeping 60 seconds between attempts.

    Args:
        params: Passed through to ``zbmath_matching_query``.
        timeout: Passed through to ``zbmath_matching_query``.

    Returns:
        Whatever ``zbmath_matching_query`` returns on the first successful
        attempt.

    Raises:
        ReadTimeout: If the last attempt also times out.
    """
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout)
        except ReadTimeout:
            if attempt >= RETRIES - 1:
                # Out of attempts: re-raise with the original traceback.
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

    # Every iteration either returns or raises. An explicit raise is used
    # rather than ``assert False`` because assert statements are removed
    # when Python runs with -O.
    raise AssertionError("unreachable")

162 

163 

def match_zbl_article(article: "Article"):
    """Find the zbMATH item matching ``article`` using the matching query.

    Args:
        article: Article whose parameters (from ``get_article_params``) are
            sent to the matching query; its ``pid`` is used in the log message.

    Returns:
        ``results["results"][0][0]`` from the JSON response, or ``None`` when
        that entry is missing.
    """
    # query zbl
    query_params = get_article_params(article)
    payload = _zbmath_query_retry(query_params, timeout=30).json()

    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None

177 

178 

def zbmath_get_pdfurl(zbmathid: str):
    """Build the PDF URL for a zbMATH item.

    Args:
        zbmathid: Identifier substituted into ``ZBMATH_PDF_URL``.

    Returns:
        The formatted URL string.
    """
    pdf_url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return pdf_url