Coverage for src/crawler/by_source/mathnetru_crawler.py: 84%

175 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

  1 import time
  2 from datetime import datetime, timedelta
  3 from urllib.parse import parse_qs, urljoin, urlparse
  4
  5 import langcodes
  6 import lingua
  7 import regex
  8 import requests
  9 from bs4 import BeautifulSoup, Tag
 10 from lingua import LanguageDetectorBuilder
 11 from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
 12 from ptf.cmds.xml.jats.jats_parser import JatsBase
 13 from ptf.cmds.xml.xml_utils import escape
 14 from ptf.model_data import (
 15     ArticleData,
 16     IssueData,
 17     create_abstract,
 18     create_articledata,
 19     create_contributor,
 20 )
 21
 22 from crawler.base_crawler import BaseCollectionCrawler
 23 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 24
 25
 26 class MathnetruCrawler(BaseCollectionCrawler):
 27     source_domain = "MATHNETRU"
 28     source_name = "Math-Net.Ru"
 29     source_website = "https://www.mathnet.ru"
 30
 31     issue_regex = regex.compile(
 32         r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[  ](?P<volume>\d+))?(?:, +Issue[  ](?P<number>[\d\-]+))?"
 33     )
 34
 35     language_detector = LanguageDetectorBuilder.from_languages(
 36         lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
 37     ).build()
 38
 39     def parse_collection_content(self, content):
 40         xissues = []
 41         # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
 42         soup = BeautifulSoup(content, "html5lib")
 43
 44         # Issues without names
 45         issue_tags = soup.select(
 46             "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
 47             "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
 48             "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
 49         )
 50         for index, link_tag in enumerate(issue_tags):
 51             href = link_tag.get("href")
 52
 53             if not isinstance(href, str):  # ↛ 54: condition was never true
 54                 raise ValueError(
 55                     f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
 56                 )
 57             url = urljoin(self.source_website, href) + "&bshow=contents"
 58
 59             title = link_tag.get("title")
 60             if not isinstance(title, str):  # ↛ 61: condition was never true
 61                 raise ValueError("Couldn't find issue data")
 62             issue_search = self.issue_regex.search(cleanup_str(title))
 63             if not issue_search:  # ↛ 64: condition was never true
 64                 raise ValueError("Couldn't parse issue data")
 65             issue_dict = issue_search.groupdict()
 66             year = issue_dict["year"]
 67             volume = issue_dict.get("volume", None)
 68             number = issue_dict.get("number", None)
 69
 70             # Use another method to parse issue metadata if the first one is not successful
 71             parsed_url = urlparse(url)
 72             query_args = parse_qs(parsed_url.query)
 73
 74             year = year or next(iter(query_args.get("year") or []), None)
 75             volume = volume or next(iter(query_args.get("volume") or []), None)
 76             number = number or next(iter(query_args.get("issue") or []), None)
 77             if not year:  # ↛ 78: condition was never true
 78                 raise ValueError("Couldn't parse issue year")
 79             xissue = self.create_xissue(
 80                 url,
 81                 year,
 82                 volume,
 83                 number,
 84             )
 85
 86             xissues.append(xissue)
 87
 88         return xissues
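
A quick illustration of the two parsing paths above (this sketch is not part of the crawler module; the heading and URL strings are made-up examples, and the import assumes the package is importable as crawler.by_source.mathnetru_crawler): issue_regex pulls year, volume and issue number out of the link title, and when the title lacks them the loop falls back to the year/volume/issue query parameters of the issue URL.

    from urllib.parse import parse_qs, urlparse

    from crawler.by_source.mathnetru_crawler import MathnetruCrawler

    # Path 1: metadata taken from the link's title attribute (sample heading is made up)
    match = MathnetruCrawler.issue_regex.search("Algebra i Logika, 2023, Volume 62, Issue 1")
    print(match.groupdict() if match else None)
    # expected, assuming the character class after "Volume"/"Issue" accepts a plain space:
    # {'year': '2023', 'volume': '62', 'number': '1'}

    # Path 2: fallback to the query string of the issue URL (sample URL is made up)
    query_args = parse_qs(urlparse(
        "https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=issue&year=2023&volume=62&issue=1&option_lang=eng"
    ).query)
    print(next(iter(query_args.get("year") or []), None))  # expected: '2023'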

 89
 90     def parse_issue_content(self, content, xissue):
 91         soup = BeautifulSoup(content, "html.parser")
 92         # Parse issue title (if exists)
 93         issue_title_tag = soup.select_one("td[valign='top'] div.red font")
 94         if issue_title_tag:
 95             title_str = cleanup_str(issue_title_tag.text)
 96             part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
 97             if part_search:  # ↛ 98: condition was never true
 98                 title_str = title_str.removesuffix(f"Part {xissue.number}")
 99
100             xissue.title_tex = title_str
101
102         # Parse Articles
103         # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
104         articles_tags = soup.select(
105             "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
106         )
107         for i, a in enumerate(articles_tags):
108             article = create_articledata()
109             href = a.get("href")
110             if not isinstance(href, str):  # ↛ 111: condition was never true
111                 raise ValueError(
112                     f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
113                 )
114
115             article.url = self.source_website + href
116             article.pid = "a" + str(i)
117             xissue.articles.append(article)
118
119     def parse_article_content(self, content, xissue, xarticle, url):
120         soup = BeautifulSoup(content, "html.parser")
121
122         # Language
123         language_candidates = soup.select("div.around-button > div.msc")
124         language_span = next(
125             (
126                 span
127                 for span in language_candidates
128                 if cleanup_str(span.text).startswith("Language: ")
129             ),
130             None,
131         )
132
133         if not language_span:  # ↛ 134: condition was never true
134             raise ValueError(
135                 f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
136             )
137
138         language_b = language_span.select_one("b")
139         if language_b:  # ↛ 142: condition was always true
140             language_b.decompose()
141
142         long_lang = cleanup_str(language_span.text)
143         xarticle.lang = str(langcodes.find(long_lang))
144
145         # Title
146         title_tag = soup.select_one("span.red font")
147         if not title_tag:  # ↛ 148: condition was never true
148             raise ValueError(
149                 f"[{self.source_domain}] {self.collection_id} : Article title not found"
150             )
151         xarticle.title_tex = title_tag.text
152
153         amsbib_tag = soup.select_one("div.showamsbib")
154
155         if amsbib_tag:  # ↛ 183: condition was always true
156             amsbib = amsbib_tag.text
157             authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
158             if authors_match:
159                 authors = authors_match.group(1).split(",")
160                 for author_text in authors:
161                     if author_text != "":  # ↛ 160: condition was always true
162                         author_text = self.latext_parser.latex_to_text(author_text)
163                         author = create_contributor()
164                         author["role"] = "author"
165                         author["string_name"] = cleanup_str(author_text)
166                         xarticle.contributors.append(author)
167
168             title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
169             if title_match:  # ↛ 172: condition was always true
170                 xarticle.title_tex = title_match.group(1)
171
172             title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
173             if title_match:  # ↛ 183: condition was always true
174                 page_range = title_match.group(1)
175                 pages = page_range.split("--")
176                 if len(pages) == 2:
177                     xarticle.fpage = pages[0].replace(",", "")
178                     xarticle.lpage = pages[1].replace(",", "")
179                 else:
180                     xarticle.page_range = page_range
181
182         # Pdf
183         pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
184         if not pdf_tag:  # ↛ 185: condition was never true
185             pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
186         if pdf_tag:  # ↛ 192: condition was always true
187             href = pdf_tag.get("href")
188             if isinstance(href, str):  # ↛ 192: condition was always true
189                 add_pdf_link_to_xarticle(xarticle, self.source_website + href)
190
191         # References
192         a_id = url.split("/")[-1]
193         ref_url = (
194             self.source_website
195             + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
196         )
197
198         self.parse_references(self.download_file(ref_url), xarticle)
199
200         # Keywords
201         keyword_tag = next(
202             iter(
203                 [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
204             ),
205             None,
206         )
207         if keyword_tag:
208             keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
209             for kwd in keywords:
210                 xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})
211
212         abstract_tag = next(
213             iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
214             None,
215         )
216         if abstract_tag:
217             abstract_tag_b = abstract_tag.select_one("b")
218             if abstract_tag_b:  # ↛ 220: condition was always true
219                 abstract_tag_b.decompose()
220             xabstract = create_abstract(
221                 tag="abstract",
222                 value_tex=abstract_tag.text,
223                 lang=self.detect_language(abstract_tag.text),
224             )
225             xarticle.abstracts.append(xabstract)
226         return xarticle
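
The reference list is fetched separately: parse_article_content splits the last path segment of the article URL into its alphabetic journal id and numeric paper id to build the getRefFromDB URL. A minimal sketch of that derivation (the article URL is a hypothetical example of the /eng/<jrnid><paperid> form, not a verified Math-Net.Ru page):

    source_website = "https://www.mathnet.ru"
    url = "https://www.mathnet.ru/eng/al2654"       # hypothetical article URL
    a_id = url.split("/")[-1]                       # "al2654"
    jrnid = "".join(filter(str.isalpha, a_id))      # "al"
    paperid = "".join(filter(str.isnumeric, a_id))  # "2654"
    ref_url = (
        source_website
        + f"/php/getRefFromDB.phtml?jrnid={jrnid}&paperid={paperid}&output=htm&option_lang=eng"
    )
    print(ref_url)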

227
228     def parse_references(self, content: str, xarticle: ArticleData):
229         soup = BeautifulSoup(content, "html.parser")
230         references = soup.select('tr:has(td[valign="top"])')
231
232         bibitems = [self.parse_ref(item) for item in references]
233         if len(bibitems) > 0:
234             xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
235
236     def parse_ref(self, tag: Tag):
237         links_xml = ""
238         for a_tag in tag.select("a"):
239             a_href = a_tag.get("href")
240             if not isinstance(a_href, str):  # ↛ 241: condition was never true
241                 continue
242             a_href = escape(a_href)
243             if a_tag.select_one("img[alt='crossref']"):
244                 links_xml += get_ext_link_xml(
245                     a_href, a_href.removeprefix("https://doi.org/"), "doi"
246                 )
247             elif a_tag.select_one("img[alt='mathscinet']"):
248                 links_xml += get_ext_link_xml(
249                     a_href,
250                     a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
251                     "mr-item-id",
252                 )
253             elif a_tag.select_one("img[alt='zmath']"):
254                 links_xml += get_ext_link_xml(
255                     a_href,
256                     a_href.removeprefix("https://zbmath.org/?q=an:"),
257                     "zbl-item-id",
258                 )
259             elif a_tag.select_one("img"):
260                 print(f"Unimplemented reference link : {a_tag.get('href', '')}")
261             else:
262                 links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
263             a_tag.decompose()
264
265         return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))
266
267     def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
268         """Override this if the Content-Type headers from the source advertise something other than the actual content.
269         SASA needs this."""
270         if "charset=" in response.headers["Content-Type"]:  # ↛ 271: condition was never true
271             encoding = response.headers["Content-Type"].split("charset=")[1]
272         return response.content.decode(encoding)
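
The charset branch on line 270 is reported as never taken, so every crawled page was decoded with the utf-8 default. A minimal test sketch that would exercise it, assuming a requests.Response can be stubbed by setting headers and _content directly (the windows-1251 payload is illustrative, and the import path is an assumption):

    import requests

    from crawler.by_source.mathnetru_crawler import MathnetruCrawler

    # Fake response whose Content-Type advertises a non-default charset.
    response = requests.Response()
    response.headers["Content-Type"] = "text/html; charset=windows-1251"
    response._content = "Журнал".encode("windows-1251")

    # decode_response never touches self, so it can be called unbound for a quick check.
    print(MathnetruCrawler.decode_response(None, response))  # expected: Журнал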

273
274     def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
275         # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
276         parsed_xarticle = xarticle
277         if hasattr(xarticle, "url") and xarticle.url:  # ↛ 295: condition was always true
278             parsed_xarticle = None
279             attempts = 0
280             while parsed_xarticle is None and attempts < 3:
281                 try:
282                     parsed_xarticle = super().crawl_article(xarticle, xissue)
283                 except ValueError as e:
284                     print(f"{xarticle.pid} : Caught error : {e} ")
285                     attempts += 1
286                     print(
287                         f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
288                     )
289                     # 15 mins, 30 mins, 45 mins
290                     time.sleep(attempts * 15 * 60)
291                     self.download_file(xarticle.url, force_refresh=True)
292
293         if parsed_xarticle is None:  # ↛ 294: condition was never true
294             raise ValueError(f"Couldn't parse article {xarticle.pid}")
295         return self.process_article_metadata(parsed_xarticle)
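
For reference, the retry loop above sleeps attempts * 15 minutes between its three attempts, and the ValueError on line 294 would only be raised once all of them fail (a path the report shows was never hit). A small arithmetic sketch of the schedule printed by the except block:

    from datetime import datetime, timedelta

    for attempts in range(1, 4):
        eta = (datetime.now() + timedelta(minutes=attempts * 15)).time()
        print(f"attempt {attempts}: retrying in {attempts * 15} mins (around {eta})")
    # expected delays: 15, 30, then 45 minutes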