Coverage for src/crawler/by_source/mathnetru_crawler.py: 86%

170 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

  1  import time
  2  from datetime import datetime, timedelta
  3
  4  import langcodes
  5  import lingua
  6  import regex
  7  import requests
  8  from bs4 import BeautifulSoup, Tag
  9  from lingua import LanguageDetectorBuilder
 10  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
 11  from ptf.cmds.xml.xml_utils import escape
 12  from ptf.model_data import (
 13      ArticleData,
 14      IssueData,
 15      create_abstract,
 16      create_articledata,
 17      create_contributor,
 18  )
 19
 20  from crawler.base_crawler import BaseCollectionCrawler
 21  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 22
 23
 24  class MathnetruCrawler(BaseCollectionCrawler):
 25      source_domain = "MATHNETRU"
 26      source_name = "Math-Net.Ru"
 27      source_website = "https://www.mathnet.ru"
 28      periode_begin = 0
 29      periode_end = 0
 30
 31      issue_regex = r"(?:.+, )?(?P<year>\d{4}), ?(?:Volume|Issue|Number) (?P<volume>\d+)(?:, ?(?:Number|Issue) (?P<number>\d+))?"
 32      issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
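         # Hypothetical examples of headings these patterns are written to match:
         #   issue_regex      → "…, 2015, Volume 54, Number 3"
         #   issue_regex_alt  → "«…», Volume 12 (2015)"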

 33
 34      def __init__(self, *args, **kwargs):
 35          super().__init__(*args, **kwargs)

 36          self.language_detector = LanguageDetectorBuilder.from_languages(
 37              # assumed fix: the original passed ENGLISH twice; Russian is taken to be the
 37              # intended second language, since Math-Net.Ru serves English and Russian text
 37              lingua.Language.ENGLISH, lingua.Language.RUSSIAN
 38          ).build()

 39
 40      def parse_collection_content(self, content):
 41          xissues = []
 42          # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
 43          soup = BeautifulSoup(content, "html5lib")
 44
 45          # Periode
 46          periode_tag = soup.select_one("td.showUDC[title='Coverage']:nth-child(2)")
 47          if periode_tag:
 48              years = periode_tag.text.split("–")
 49              self.periode_begin = int(years[0])
 50              self.periode_end = int(years[1])
 51
 52          self.periode = self.get_or_create_periode()
 53
 54          # Issues
 55          issue_tags = soup.select(
 56              "table.Card td a.SLink[href^='/php'], table.cont td.issue_with_corner a.SLink[href^='/php']"
 57          )
 58          for link_tag in issue_tags:
 59              href = link_tag.get("href")
 60              title = link_tag.get("title", None)
 61              if not isinstance(href, str):  # coverage: 61 ↛ 62, condition never true
 62                  raise ValueError(
 63                      f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
 64                  )
 65              if isinstance(title, str):
 66                  title = cleanup_str(title)
 67                  volume_re = regex.search(self.issue_regex, title)
 68              else:
 69                  if not link_tag.parent:  # coverage: 69 ↛ 70, condition never true
 70                      raise ValueError(
 71                          f"[{self.source_domain}] {self.collection_id} : Title cannot be parsed"
 72                      )
 73                  title = cleanup_str(link_tag.parent.text)
 74                  volume_re = regex.search(self.issue_regex_alt, title)
 75
 76              if not volume_re:  # coverage: 76 ↛ 77, condition never true
 77                  raise ValueError(
 78                      f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
 79                  )
 80
 81              volume_data = volume_re.groupdict()
 82              if volume_data.get("volume", None):
 83                  volume_data["volume"] = volume_data["volume"].strip()
 84              xissues.append(
 85                  self.create_xissue(
 86                      self.source_website + href + "&bshow=contents",
 87                      volume_data["year"],
 88                      volume_data.get("volume", None),
 89                      volume_data.get("number", None),
 90                  )
 91              )
 92          return xissues
 93
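         # parse_issue_content: every article link in the issue's table of contents becomes
         # an ArticleData stub; pids are assigned sequentially as "a0", "a1", … in page order.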

 94      def parse_issue_content(self, content, xissue):
 95          soup = BeautifulSoup(content, "html.parser")
 96
 97          # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
 98          articles_tags = soup.select(
 99              "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
100          )
101          for i, a in enumerate(articles_tags):
102              article = create_articledata()
103              href = a.get("href")
104              if not isinstance(href, str):  # coverage: 104 ↛ 105, condition never true
105                  raise ValueError(
106                      f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
107                  )
108
109              article.url = self.source_website + href
110              article.pid = "a" + str(i)
111              xissue.articles.append(article)
112

113      def parse_article_content(self, content, xissue, xarticle, url, pid):
114          soup = BeautifulSoup(content, "html.parser")
115
116          xarticle.pid = pid
117
118          # Language
119          language_candidates = soup.select("div.around-button > div.msc")
120          language_span = next(
121              (
122                  span
123                  for span in language_candidates
124                  if cleanup_str(span.text).startswith("Language: ")
125              ),
126              None,
127          )
128
129          if not language_span:  # coverage: 129 ↛ 130, condition never true
130              raise ValueError(
131                  f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
132              )
133
134          language_b = language_span.select_one("b")
135          if language_b:  # coverage: 135 ↛ 138, condition always true
136              language_b.decompose()
137
138          long_lang = cleanup_str(language_span.text)
139          xarticle.lang = str(langcodes.find(long_lang))
140
141          # Title
142          title_tag = soup.select_one("span.red font")
143          if not title_tag:  # coverage: 143 ↛ 144, condition never true
144              raise ValueError(
145                  f"[{self.source_domain}] {self.collection_id} : Article title not found"
146              )
147          xarticle.title_tex = title_tag.text
148
149          amsbib_tag = soup.select_one("div.showamsbib")
150
151          if amsbib_tag:  # coverage: 151 ↛ 179, condition always true
152              amsbib = amsbib_tag.text
153              authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
154              if authors_match:
155                  authors = authors_match.group(1).split(",")
156                  for author_text in authors:
157                      if author_text != "":  # coverage: 157 ↛ 156, condition always true
158                          author_text = self.latext_parser.latex_to_text(author_text)
159                          author = create_contributor()
160                          author["role"] = "author"
161                          author["string_name"] = cleanup_str(author_text)
162                          xarticle.contributors.append(author)
163
164              title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
165              if title_match:  # coverage: 165 ↛ 168, condition always true
166                  xarticle.title_tex = title_match.group(1)
167
168              title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
169              if title_match:  # coverage: 169 ↛ 179, condition always true
170                  page_range = title_match.group(1)
171                  pages = page_range.split("--")
172                  if len(pages) == 2:
173                      xarticle.fpage = pages[0].replace(",", "")
174                      xarticle.lpage = pages[1].replace(",", "")
175                  else:
176                      xarticle.page_range = page_range
177
178          # Pdf
179          pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
180          if pdf_tag:  # coverage: 180 ↛ 186, condition always true
181              href = pdf_tag.get("href")
182              if isinstance(href, str):  # coverage: 182 ↛ 186, condition always true
183                  add_pdf_link_to_xarticle(xarticle, self.source_website + href)
184
185          # References
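             # The article id at the end of the URL mixes the journal code and the paper id
             # (e.g. a hypothetical "mais123" → jrnid=mais, paperid=123); the filters below
             # split it back into its alphabetic and numeric parts for the references endpoint.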

186          a_id = url.split("/")[-1]
187          ref_url = (
188              self.source_website
189              + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
190          )
191
192          self.parse_references(self.download_file(ref_url), xarticle)
193
194          # Keywords
195          keyword_tag = next(
196              iter(
197                  [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
198              ),
199              None,
200          )
201          if keyword_tag:
202              keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
203              for kwd in keywords:
204                  xarticle.kwds.append({"type": "", "lang": self.detect_language(kwd), "value": kwd})
205
206          abstract_tag = next(
207              iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
208              None,
209          )
210          if abstract_tag:
211              abstract_tag_b = abstract_tag.select_one("b")
212              if abstract_tag_b:  # coverage: 212 ↛ 214, condition always true
213                  abstract_tag_b.decompose()
214              xabstract = create_abstract(
215                  tag="abstract",
216                  value_tex=abstract_tag.text,
217                  lang=self.detect_language(abstract_tag.text),
218              )
219              xarticle.abstracts.append(xabstract)
220          return xarticle

221
222      def parse_references(self, content: str, xarticle: ArticleData):
223          soup = BeautifulSoup(content, "html.parser")
224          references = soup.select('tr:has(td[valign="top"])')
225
226          bibitems = [self.parse_ref(item) for item in references]
227          if len(bibitems) > 0:
228              xarticle.abstracts.append(self.create_bibliography(bibitems))
229
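         # parse_ref: links carrying a crossref, mathscinet or zmath icon are converted to
         # ext-link XML via get_ext_link_xml (typed "doi", "mr-item-id" or "zbl-item-id"),
         # other icon links are reported as unimplemented, plain links keep their text; the
         # <a> tags are then removed and the XML is appended to the reference text.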

230      def parse_ref(self, tag: Tag):
231          links_xml = ""
232          for a_tag in tag.select("a"):
233              a_href = a_tag.get("href")
234              if not isinstance(a_href, str):  # coverage: 234 ↛ 235, condition never true
235                  continue
236              a_href = escape(a_href)
237              if a_tag.select_one("img[alt='crossref']"):
238                  links_xml += get_ext_link_xml(
239                      a_href, a_href.removeprefix("https://doi.org/"), "doi"
240                  )
241              elif a_tag.select_one("img[alt='mathscinet']"):
242                  links_xml += get_ext_link_xml(
243                      a_href,
244                      a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
245                      "mr-item-id",
246                  )
247              elif a_tag.select_one("img[alt='zmath']"):
248                  links_xml += get_ext_link_xml(
249                      a_href,
250                      a_href.removeprefix("https://zbmath.org/?q=an:"),
251                      "zbl-item-id",
252                  )
253              elif a_tag.select_one("img"):
254                  print(f"Unimplemented reference link : {a_tag.get('href', '')}")
255              else:
256                  links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
257              a_tag.decompose()
258
259          return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))
260

261      def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
262          """Override this if the Content-Type headers from the source advertise something
263          other than the actual content encoding. SASA needs this."""
264          if "charset=" in response.headers["Content-Type"]:  # coverage: 264 ↛ 265, condition never true
265              encoding = response.headers["Content-Type"].split("charset=")[1]
266          return response.content.decode(encoding)

267 
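     # crawl_article: retries a failed article crawl up to three times, forcing a fresh
     # download of the page between attempts and sleeping 15/30/45 minutes before each retry.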

268      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
269          # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
270          parsed_xarticle = xarticle
271          if hasattr(xarticle, "url") and xarticle.url:  # coverage: 271 ↛ 289, condition always true
272              parsed_xarticle = None
273              attempts = 0
274              while parsed_xarticle is None and attempts < 3:
275                  try:
276                      parsed_xarticle = super().crawl_article(xarticle, xissue)
277                  except ValueError as e:

278                      print(f"{xarticle.pid} : Caught error : {e} ")
279                      print(
280                          f"Retrying in {(attempts + 1) * 15} mins ({(datetime.now() + timedelta(minutes=(attempts + 1) * 15)).time()})"
281                      )

282                      attempts += 1
283                      # 15 mins, 30 mins, 45 mins
284                      time.sleep(attempts * 15 * 60)
285                      self.download_file(xarticle.url, force_refresh=True)
286
287          if parsed_xarticle is None:  # coverage: 287 ↛ 288, condition never true
288              raise ValueError(f"Couldn't parse article {xarticle.pid}")
289          return self.process_article_metadata(parsed_xarticle)