Coverage for src/crawler/by_source/mathnetru_crawler.py: 81%

171 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

  1 import time
  2 from datetime import datetime, timedelta
  3 from urllib.parse import urljoin

  5 import langcodes
  6 import lingua
  7 import regex
  8 import requests
  9 from bs4 import BeautifulSoup, Tag
 10 from lingua import LanguageDetectorBuilder
 11 from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
 12 from ptf.cmds.xml.xml_utils import escape
 13 from ptf.model_data import (
 14     ArticleData,
 15     IssueData,
 16     create_abstract,
 17     create_articledata,
 18     create_contributor,
 19     create_issuedata,
 20 )

 22 from crawler.base_crawler import BaseCollectionCrawler
 23 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


 26 class MathnetruCrawler(BaseCollectionCrawler):
 27     source_domain = "MATHNETRU"
 28     source_name = "Math-Net.Ru"
 29     source_website = "https://www.mathnet.ru"

 31     issue_regex = regex.compile(
 32         r"(?P<year>\d{4})(?:, ?(?:Volume) (?P<volume>\d+))?(?:, ?(?:Number|Issue) (?P<number>\d+)\((?P<volume_2>\d+)\)?)?"
 33     )
 34     issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
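    # Illustrative note (not from the source): judging from the pattern alone, issue_regex
    # targets headings such as "2012, Volume 19" or "2012, Number 1(19)" (hypothetical
    # examples), capturing "year", "volume", "number" and the parenthesised "volume_2",
    # while issue_regex_alt targets headings like "«Journal title», Volume 5 (2012)".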

 36     language_detector = LanguageDetectorBuilder.from_languages(
 37         lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
 38     ).build()
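    # Illustrative note (assumption, not from the source): this lingua detector presumably
    # backs self.detect_language() in the base crawler; lingua would be queried as, e.g.,
    # language_detector.detect_language_of("Дифференциальные уравнения") -> Language.RUSSIAN.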

 40     def parse_collection_content(self, content):
 41         xissues = []
 42         # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
 43         soup = BeautifulSoup(content, "html5lib")

 45         # Issues without names
 46         issue_tags = soup.select(
 47             "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
 48             "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
 49             "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
 50         )
 51         for index, link_tag in enumerate(issue_tags):
 52             href = link_tag.get("href")

 54             if not isinstance(href, str):  # coverage: 54 ↛ 55 (condition was never true)
 55                 raise ValueError(
 56                     f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
 57                 )
 58             xissue = create_issuedata()
 59             xissue.pid = f"{self.collection_id}_TEMP_{index}"
 60             xissue.url = urljoin(self.source_website, href) + "&bshow=contents"
 61             xissues.append(xissue)

 63         return xissues
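    # Illustrative note (not from the source): in parse_collection_content above, a relative
    # link such as "/php/archive.phtml?jrnid=al&wshow=issue&..." (hypothetical query string)
    # is joined with source_website and suffixed with "&bshow=contents", giving e.g.
    # "https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=issue&...&bshow=contents".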

 65     def parse_issue_content(self, content, xissue):
 66         soup = BeautifulSoup(content, "html.parser")

 68         # Parse issue PID
 69         volume_tag = soup.select_one("td[valign='top'] span.red font")
 70         if not volume_tag:  # coverage: 70 ↛ 71 (condition was never true)
 71             raise ValueError("Couldn't parse volume number")
 72         volume_str = cleanup_str(volume_tag.text)
 73         volume_re = self.issue_regex.search(volume_str)
 74         if not volume_re:  # coverage: 74 ↛ 75 (condition was never true)
 75             raise ValueError(
 76                 f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
 77             )

 79         volume_data = volume_re.groupdict()
 80         if volume_data.get("volume", None):  # coverage: 80 ↛ 82 (condition was always true)
 81             volume_data["volume"] = volume_data["volume"].strip()
 82         elif volume_data.get("volume_2", None):
 83             volume_data["volume"] = volume_data["volume_2"].strip()

 85         xissue.pid = self.get_issue_pid(
 86             self.collection_id,
 87             volume_data["year"],
 88             volume_data.get("volume", None),
 89             volume_data.get("number", None),
 90         )
 91         xissue.year = volume_data["year"]
 92         xissue.volume = volume_data["volume"]
 93         xissue.number = volume_data["number"]
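        # Illustrative note (not from the source): for a heading like "2012, Volume 19"
        # (hypothetical), groupdict() would give
        # {"year": "2012", "volume": "19", "number": None, "volume_2": None}.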

 95         # Parse issue title (if exists)
 96         issue_title_tag = soup.select_one("td[valign='top'] div.red font")
 97         if issue_title_tag:
 98             xissue.title_tex = cleanup_str(issue_title_tag.text)

100         # Parse Articles
101         # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
102         articles_tags = soup.select(
103             "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
104         )
105         for i, a in enumerate(articles_tags):
106             article = create_articledata()
107             href = a.get("href")
108             if not isinstance(href, str):  # coverage: 108 ↛ 109 (condition was never true)
109                 raise ValueError(
110                     f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
111                 )

113             article.url = self.source_website + href
114             article.pid = "a" + str(i)
115             xissue.articles.append(article)

117     def parse_article_content(self, content, xissue, xarticle, url):
118         soup = BeautifulSoup(content, "html.parser")

120         # Language
121         language_candidates = soup.select("div.around-button > div.msc")
122         language_span = next(
123             (
124                 span
125                 for span in language_candidates
126                 if cleanup_str(span.text).startswith("Language: ")
127             ),
128             None,
129         )

131         if not language_span:  # coverage: 131 ↛ 132 (condition was never true)
132             raise ValueError(
133                 f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
134             )

136         language_b = language_span.select_one("b")
137         if language_b:  # coverage: 137 ↛ 140 (condition was always true)
138             language_b.decompose()

140         long_lang = cleanup_str(language_span.text)
141         xarticle.lang = str(langcodes.find(long_lang))
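        # Illustrative note (not from the source): langcodes.find() resolves a language name
        # to a tag, e.g. str(langcodes.find("English")) == "en" and
        # str(langcodes.find("Russian")) == "ru".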

143         # Title
144         title_tag = soup.select_one("span.red font")
145         if not title_tag:  # coverage: 145 ↛ 146 (condition was never true)
146             raise ValueError(
147                 f"[{self.source_domain}] {self.collection_id} : Article title not found"
148             )
149         xarticle.title_tex = title_tag.text

151         amsbib_tag = soup.select_one("div.showamsbib")

153         if amsbib_tag:  # coverage: 153 ↛ 181 (condition was always true)
154             amsbib = amsbib_tag.text
155             authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
156             if authors_match:  # coverage: 156 ↛ 166 (condition was always true)
157                 authors = authors_match.group(1).split(",")
158                 for author_text in authors:
159                     if author_text != "":  # coverage: 159 ↛ 158 (condition was always true)
160                         author_text = self.latext_parser.latex_to_text(author_text)
161                         author = create_contributor()
162                         author["role"] = "author"
163                         author["string_name"] = cleanup_str(author_text)
164                         xarticle.contributors.append(author)

166             title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
167             if title_match:  # coverage: 167 ↛ 170 (condition was always true)
168                 xarticle.title_tex = title_match.group(1)

170             title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
171             if title_match:  # coverage: 171 ↛ 181 (condition was always true)
172                 page_range = title_match.group(1)
173                 pages = page_range.split("--")
174                 if len(pages) == 2:  # coverage: 174 ↛ 178 (condition was always true)
175                     xarticle.fpage = pages[0].replace(",", "")
176                     xarticle.lpage = pages[1].replace(",", "")
177                 else:
178                     xarticle.page_range = page_range
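            # Illustrative note (not from the source): an AMSBIB line such as
            # "\pages 123--145" (hypothetical) yields fpage == "123" and lpage == "145";
            # anything that does not split into exactly two parts on "--" is kept verbatim
            # in page_range.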

180         # Pdf
181         pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
182         if not pdf_tag:  # coverage: 182 ↛ 183 (condition was never true)
183             pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
184         if pdf_tag:  # coverage: 184 ↛ 190 (condition was always true)
185             href = pdf_tag.get("href")
186             if isinstance(href, str):  # coverage: 186 ↛ 190 (condition was always true)
187                 add_pdf_link_to_xarticle(xarticle, self.source_website + href)

189         # References
190         a_id = url.split("/")[-1]
191         ref_url = (
192             self.source_website
193             + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
194         )

196         self.parse_references(self.download_file(ref_url), xarticle)
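        # Illustrative note (not from the source): the last URL segment mixes the journal id
        # and the paper id, e.g. an id of "mais123" (hypothetical) is split by the filters
        # above into jrnid="mais" and paperid="123" for the getRefFromDB.phtml request.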

198         # Keywords
199         keyword_tag = next(
200             iter(
201                 [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
202             ),
203             None,
204         )
205         if keyword_tag:
206             keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
207             for kwd in keywords:
208                 xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

210         abstract_tag = next(
211             iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
212             None,
213         )
214         if abstract_tag:
215             abstract_tag_b = abstract_tag.select_one("b")
216             if abstract_tag_b:  # coverage: 216 ↛ 218 (condition was always true)
217                 abstract_tag_b.decompose()
218             xabstract = create_abstract(
219                 tag="abstract",
220                 value_tex=abstract_tag.text,
221                 lang=self.detect_language(abstract_tag.text),
222             )
223             xarticle.abstracts.append(xabstract)
224         return xarticle

226     def parse_references(self, content: str, xarticle: ArticleData):
227         soup = BeautifulSoup(content, "html.parser")
228         references = soup.select('tr:has(td[valign="top"])')

230         bibitems = [self.parse_ref(item) for item in references]
231         if len(bibitems) > 0:
232             xarticle.abstracts.append(self.create_bibliography(bibitems))

234     def parse_ref(self, tag: Tag):
235         links_xml = ""
236         for a_tag in tag.select("a"):
237             a_href = a_tag.get("href")
238             if not isinstance(a_href, str):  # coverage: 238 ↛ 239 (condition was never true)
239                 continue
240             a_href = escape(a_href)
241             if a_tag.select_one("img[alt='crossref']"):
242                 links_xml += get_ext_link_xml(
243                     a_href, a_href.removeprefix("https://doi.org/"), "doi"
244                 )
245             elif a_tag.select_one("img[alt='mathscinet']"):
246                 links_xml += get_ext_link_xml(
247                     a_href,
248                     a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
249                     "mr-item-id",
250                 )
251             elif a_tag.select_one("img[alt='zmath']"):
252                 links_xml += get_ext_link_xml(
253                     a_href,
254                     a_href.removeprefix("https://zbmath.org/?q=an:"),
255                     "zbl-item-id",
256                 )
257             elif a_tag.select_one("img"):  # coverage: 257 ↛ 260 (condition was always true)
258                 print(f"Unimplemented reference link : {a_tag.get('href', '')}")
259             else:
260                 links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
261             a_tag.decompose()

263         return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))
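    # Illustrative note (not from the source): in parse_ref above, a link decorated with a
    # crossref icon and pointing at e.g. "https://doi.org/10.1234/example" (hypothetical DOI)
    # is emitted as an external link of type "doi" with the prefix stripped, i.e. "10.1234/example".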

265     def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
266         """Override this if the content-type headers from the sources advertise something
267         other than the actual content. SASA needs this."""
268         if "charset=" in response.headers["Content-Type"]:  # coverage: 268 ↛ 269 (condition was never true)
269             encoding = response.headers["Content-Type"].split("charset=")[1]
270         return response.content.decode(encoding)
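        # Illustrative note (not from the source): a header such as
        # "Content-Type: text/html; charset=windows-1251" (hypothetical) would override the
        # default encoding and the body would be decoded as windows-1251.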

272     def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
273         # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
274         parsed_xarticle = xarticle
275         if hasattr(xarticle, "url") and xarticle.url:  # coverage: 275 ↛ 293 (condition was always true)
276             parsed_xarticle = None
277             attempts = 0
278             while parsed_xarticle is None and attempts < 3:
279                 try:
280                     parsed_xarticle = super().crawl_article(xarticle, xissue)
281                 except ValueError as e:
282                     print(f"{xarticle.pid} : Caught error : {e}")
283                     attempts += 1
284                     print(
285                         f"Retrying in {attempts * 15} mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
286                     )
287                     # 15 mins, 30 mins, 45 mins
288                     time.sleep(attempts * 15 * 60)
289                     self.download_file(xarticle.url, force_refresh=True)

291         if parsed_xarticle is None:  # coverage: 291 ↛ 292 (condition was never true)
292             raise ValueError(f"Couldn't parse article {xarticle.pid}")
293         return self.process_resource_metadata(parsed_xarticle)