Coverage for src/crawler/by_source/mathnetru_crawler.py: 85%

178 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import time
from datetime import datetime, timedelta
from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    issue_regex = regex.compile(
        r"(?P<year>\d{4})(?:, ?(?:Volume) (?P<volume>\d+))?(?:, ?(?:Number|Issue) (?P<number>\d+)\((?P<volume>\d+)\)?)?"
    )
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"

    issue_regex_col = regex.compile(
        r"[\w \-,\.]+, (?P<year>\d+)(?:, Volume[  ](?P<volume>\d+))?(?:, Issue[  ](?P<number>[\d\-]+))?"
    )

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    ).build()
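
    # Illustrative examples (not from the original source) of archive titles these patterns
    # are written for, assuming Math-Net.Ru's English listing format:
    #   issue_regex_col: "Algebra i Analiz, 2020" -> year="2020"
    #                    (a trailing ", Volume 32, Issue 3" would additionally fill volume/number)
    #   issue_regex:     "2021, Volume 57" -> year="2021", volume="57"
    #   issue_regex_alt: "«Some Collection», Volume 5 (2019)" -> volume="5", year="2019"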

    def parse_collection_content(self, content):
        xissues = []
        # Some mathnetru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for index, link_tag in enumerate(issue_tags):
            href = link_tag.get("href")

            if not isinstance(href, str):  # coverage: condition was never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):  # coverage: condition was never true
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex_col.search(title)
            if not issue_search:  # coverage: condition was never true
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()
            year = issue_dict["year"]
            volume = issue_dict.get("volume", None)
            number = issue_dict.get("number", None)

            # Use another method to parse issue metadata if the first one is not successful
            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
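            # Illustrative example (not from the original source): for an issue URL such as
            # ".../php/archive.phtml?jrnid=al&wshow=issue&year=2020&volume=26&issue=2&option_lang=eng"
            # (hypothetical), parse_qs yields {"year": ["2020"], "volume": ["26"], "issue": ["2"], ...},
            # so next(iter(...), None) falls back to "2020", "26" and "2" when the title regex
            # captured nothing.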

            if not year:  # coverage: condition was never true
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(
                url,
                year,
                volume,
                number,
            )

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        # Parse issue title (if exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:  # coverage: condition was never true
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):  # coverage: condition was never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:  # coverage: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:  # coverage: condition was always true
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:  # coverage: condition was never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:  # coverage: condition was always true
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":  # coverage: condition was always true
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: condition was always true
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: condition was always true
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:  # coverage: condition was never true
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:  # coverage: condition was always true
            href = pdf_tag.get("href")
            if isinstance(href, str):  # coverage: condition was always true
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )
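        # Illustrative example (not from the original source): for an article URL ending in
        # "eng/mais123" (hypothetical id), the alpha/numeric split gives jrnid="mais" and
        # paperid="123", i.e. ".../php/getRefFromDB.phtml?jrnid=mais&paperid=123&output=htm&option_lang=eng".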

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:  # coverage: condition was always true
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

    def parse_ref(self, tag: Tag):
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):  # coverage: condition was never true
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link : {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))
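    # Illustrative example (not from the original source): a reference row containing
    # <a href="https://doi.org/10.1234/abcd"><img alt="crossref"/></a> (hypothetical DOI)
    # is recorded as a "doi" external link with id "10.1234/abcd"; MathSciNet and zbMATH
    # icons are handled the same way with "mr-item-id" and "zbl-item-id".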

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type headers from the sources advertise something
        other than the actual content. SASA needs this."""
        if "charset=" in response.headers["Content-Type"]:  # coverage: condition was never true
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)
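    # Illustrative example (not from the original source): for a response with
    # Content-Type "text/html; charset=windows-1251", splitting on "charset=" yields
    # "windows-1251", and the body is decoded with that encoding instead of the default "utf-8".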

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition was always true
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e} ")
                    attempts += 1
                    print(
                        f"Retrying in {attempts * 15} mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

        if parsed_xarticle is None:  # coverage: condition was never true
            raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return self.process_resource_metadata(parsed_xarticle)