Coverage for src/crawler/by_source/mathnetru_crawler.py: 88%

161 statements  

coverage.py v7.9.0, created at 2025-10-29 14:25 +0000

from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    issue_regex = regex.compile(
        r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[  ](?P<volume>\d+))?(?:, +Issue[  ](?P<number>[\d\-]+))?"
    )
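    # Illustrative (hypothetical) example of what issue_regex captures: a contents
    # title such as "Algebra i Logika, 2012, Volume 51, Issue 1" would yield
    # year="2012", volume="51", number="1"; a bare "2008" still matches, leaving
    # volume and number as None.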

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")

    def parse_collection_content(self, content):
        xissues = []
        # Some mathnetru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")

            if not isinstance(href, str):  # coverage: 54 ↛ 55, condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):  # coverage: 61 ↛ 62, condition never true
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex.search(cleanup_str(title))
            year = None
            volume = None
            number = None
            vseries = None
            if issue_search and self.collection_id != "MP":  # coverage: 68 ↛ 75, condition always true
                issue_dict = issue_search.groupdict()
                year = issue_dict["year"]
                volume = issue_dict.get("volume", None)
                number = issue_dict.get("number", None)

            # Fall back to the URL query string if the title could not be parsed successfully
            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            # Query arguments can be lists
            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
            vseries = vseries or next(iter(query_args.get("series") or []), None)
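            # Illustrative example (not from the source): for a URL ending in
            # "?jrnid=al&wshow=issue&year=2008&volume=47&issue=1", parse_qs returns
            # {"year": ["2008"], "volume": ["47"], "issue": ["1"], ...}, so
            # next(iter(...), None) takes the first value, or None when the key is absent.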

            if not year:  # coverage: 84 ↛ 85, condition never true
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(url, year, volume, number, vseries)

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        # Parse the issue title (if present)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:  # coverage: 99 ↛ 100, condition never true
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):  # coverage: 112 ↛ 113, condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language:")
            ),
            None,
        )

        if not language_span:  # coverage: 135 ↛ 136, condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:  # coverage: 141 ↛ 144, condition always true
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:  # coverage: 149 ↛ 150, condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:  # coverage: 157 ↛ 187, condition always true
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":  # coverage: 163 ↛ 162, condition always true
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: 171 ↛ 174, condition always true
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: 175 ↛ 187, condition always true
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Workaround for pylatexenc not supporting latex \href{}{} commands
        # https://github.com/phfaist/pylatexenc/issues/58
        article_title = cleanup_str(xarticle.title_tex)
        article_title = regex.sub(r"\\href\{(.+)\}(?:\{(.+)\})?", r"\1", article_title)

        xarticle.title_tex = self.latex_converter.latex_to_text(article_title)

        # PDF
        pdf_tag = soup.select_one(
            "a.button_green[title='Full-text article is available'], a.button_yellow[title='Full-text article is available']"
        )
        if not pdf_tag:  # coverage: 196 ↛ 197, condition never true
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:  # coverage: 198 ↛ 204, condition always true
            href = pdf_tag.get("href")
            if isinstance(href, str):  # coverage: 200 ↛ 204, condition always true
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )
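        # Illustrative example (not from the source): for an article URL ending in
        # "/eng/al528", a_id is "al528", so the reference request is built with
        # jrnid="al" (the alphabetic part) and paperid="528" (the numeric part).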

209 

210 self.parse_references(self.download_file(ref_url), xarticle) 

211 

212 # Keywords 

213 keyword_tag = next( 

214 iter( 

215 [ 

216 d 

217 for d in soup.select("div.around-button") 

218 if cleanup_str(d.text).startswith("Keywords:") 

219 ] 

220 ), 

221 None, 

222 ) 

223 if keyword_tag: 

224 keywords = cleanup_str(keyword_tag.text).removeprefix("Keywords:").strip().split(", ") 

225 for kwd in keywords: 

226 xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)}) 

227 

228 abstract_tag = next( 

229 iter( 

230 [ 

231 d 

232 for d in soup.select("div.around-button") 

233 if cleanup_str(d.text).startswith("Abstract") 

234 ] 

235 ), 

236 None, 

237 ) 

238 if abstract_tag: 

239 abstract_tag_b = abstract_tag.select_one("b") 

240 if abstract_tag_b: 240 ↛ 242line 240 didn't jump to line 242 because the condition on line 240 was always true

241 abstract_tag_b.decompose() 

242 abstract_text = cleanup_str(escape(abstract_tag.text)) 

243 xabstract = create_abstract( 

244 value_tex=abstract_text, 

245 lang=self.detect_language(abstract_text), 

246 ) 

247 xarticle.abstracts.append(xabstract) 

248 return xarticle 

249 

250 def parse_references(self, content: str, xarticle: ArticleData): 

251 soup = BeautifulSoup(content, "html.parser") 

252 references = soup.select('tr:has(td[valign="top"])') 

253 

254 xarticle.bibitems = [self.parse_ref(item) for item in references] 

255 

256 def parse_ref(self, tag: Tag): 

257 links_xml = "" 

258 for a_tag in tag.select("a"): 

259 a_href = a_tag.get("href") 

260 if not isinstance(a_href, str): 260 ↛ 261line 260 didn't jump to line 261 because the condition on line 260 was never true

261 continue 

262 a_href = escape(a_href) 

263 if a_tag.select_one("img[alt='crossref']"): 

264 links_xml += get_ext_link_xml( 

265 a_href, a_href.removeprefix("https://doi.org/"), "doi" 

266 ) 

267 elif a_tag.select_one("img[alt='mathscinet']"): 

268 links_xml += get_ext_link_xml( 

269 a_href, 

270 a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="), 

271 "mr-item-id", 

272 ) 

273 elif a_tag.select_one("img[alt='zmath']"): 

274 links_xml += get_ext_link_xml( 

275 a_href, 

276 a_href.removeprefix("https://zbmath.org/?q=an:"), 

277 "zbl-item-id", 

278 ) 

279 elif a_tag.select_one("img"): 

280 # self.logger.debug(f"Unimplemented reference link : {a_tag.get('href', '')}") 

281 # alt_text = a_tag.get("alt", "") 

282 # if not isinstance(alt_text, str): 

283 # continue 

284 # links_xml += get_ext_link_xml(a_href, escape(a_tag.text or alt_text)) 

285 pass 

286 else: 

287 links_xml += get_ext_link_xml(a_href, escape(a_tag.text)) 

288 a_tag.decompose() 

289 return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))