Coverage for src/crawler/by_source/mathnetru_crawler.py: 84% (171 statements)

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    issue_regex = regex.compile(
        r"(?P<year>\d{4})(?:, ?(?:Volume) (?P<volume>\d+))?(?:, ?(?:Number|Issue) (?P<number>\d+)\((?P<volume>\d+)\)?)?"
    )
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
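    # Based on its named groups, issue_regex pulls the year, volume and issue number
    # out of issue headers shaped like "<year>, Volume <n>" or
    # "<year>, Number <n>(<volume>)"; the second alternative reuses the group name
    # "volume", which the third-party `regex` module permits (stdlib `re` would not).
    # issue_regex_alt covers headers shaped like «<journal title>», Volume <n> (<year>).
    # Note: parse_issue_content below also checks a "volume_2" group that neither
    # pattern defines, so that branch looks like dead code (the coverage report marks
    # it as never taken).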

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    ).build()
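    # Detection is restricted to the three languages Math-Net.Ru serves (English,
    # Russian, French); self.detect_language(), used for keywords and abstracts
    # below, presumably relies on this detector via the base crawler.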

    def parse_collection_content(self, content):
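        """Collect the issue links from a collection archive page and return
        IssueData stubs (temporary pid + issue URL); parse_issue_content fills in
        the real pid, year, volume and number later."""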

        xissues = []
        # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php'], table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php'], table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php']"
        )
        for index, link_tag in enumerate(issue_tags):
            href = link_tag.get("href")

            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_TEMP_{index}"
            xissue.url = urljoin(self.source_website, href) + "&bshow=contents"
            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
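        """Read the issue header (matched by issue_regex) to set the issue pid,
        year, volume and number, pick up the issue title when present, and register
        one ArticleData stub per article link found on the page."""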

        soup = BeautifulSoup(content, "html.parser")

        # Parse issue PID
        volume_tag = soup.select_one("td[valign='top'] span.red font")
        if not volume_tag:
            raise ValueError("Couldn't parse volume number")
        volume_str = cleanup_str(volume_tag.text)
        volume_re = self.issue_regex.search(volume_str)
        if not volume_re:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
            )

        volume_data = volume_re.groupdict()
        if volume_data.get("volume", None):
            volume_data["volume"] = volume_data["volume"].strip()
        elif volume_data.get("volume_2", None):
            volume_data["volume"] = volume_data["volume_2"].strip()

        xissue.pid = self.get_issue_pid(
            self.collection_id,
            volume_data["year"],
            volume_data.get("volume", None),
            volume_data.get("number", None),
        )
        xissue.year = volume_data["year"]
        xissue.volume = volume_data["volume"]
        xissue.number = volume_data["number"]

        # Parse issue title (if exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            xissue.title_tex = cleanup_str(issue_title_tag.text)

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
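        """Extract the article metadata from a Math-Net.Ru article page: language,
        title, authors/title/pages from the embedded AMSBIB block, the PDF link,
        the reference list (fetched separately), keywords and abstract."""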

        soup = BeautifulSoup(content, "html.parser")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

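        # Math-Net.Ru article pages appear to embed an AMS-style bibliography export
        # in div.showamsbib; the \by, \paper and \pages fields parsed below provide
        # the authors, the TeX title (which overrides the HTML title scraped above)
        # and the page range.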

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

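        # The last path segment of the article URL mixes the journal code and the
        # paper id (letters + digits); splitting it into its alphabetic and numeric
        # parts rebuilds the jrnid/paperid query for getRefFromDB.phtml, which
        # returns the reference list as a separate HTML fragment.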

        # References
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
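        """Parse the HTML fragment returned by getRefFromDB.phtml and, if any
        reference rows are found, append the built bibliography to
        xarticle.abstracts."""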

        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_ref(self, tag: Tag):
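        """Turn one reference table row into a bibitem: crossref, mathscinet and
        zmath icon links become DOI, MR and Zbl ext-links, other links are kept as
        plain ext-links, and the remaining row text becomes the citation string."""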

        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link : {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type headers from the source advertise
        something other than the actual content encoding. (SASA needs this.)"""
        if "charset=" in response.headers["Content-Type"]:
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
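        """Crawl a single article, retrying up to three times on ValueError with
        growing waits (15, 30, then 45 minutes) and a forced re-download of the
        article page, presumably to ride out temporary blocks by the source."""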

271 # TODO : set pid in xarticle here instead of passing it to `parse_article_content` 

272 parsed_xarticle = xarticle 

273 if hasattr(xarticle, "url") and xarticle.url: 273 ↛ 291line 273 didn't jump to line 291 because the condition on line 273 was always true

274 parsed_xarticle = None 

275 attempts = 0 

276 while parsed_xarticle is None and attempts < 3: 

277 try: 

278 parsed_xarticle = super().crawl_article(xarticle, xissue) 

279 except ValueError as e: 

280 print(f"{xarticle.pid} : Caught error : {e} ") 

281 attempts += 1 

282 print( 

283 f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})" 

284 ) 

285 # 15 mins, 30 mins, 45 mins 

286 time.sleep(attempts * 15 * 60) 

287 self.download_file(xarticle.url, force_refresh=True) 

288 

289 if parsed_xarticle is None: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true

290 raise ValueError(f"Couldn't parse article {xarticle.pid}") 

291 return self.process_resource_metadata(parsed_xarticle)