Coverage for src/crawler/by_source/mathnetru_crawler.py: 88%

157 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    issue_regex = regex.compile(
        r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[  ](?P<volume>\d+))?(?:, +Issue[  ](?P<number>[\d\-]+))?"
    )
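    # Illustrative examples (not taken from the site) of titles this regex is meant to
    # match: "Some Journal, 2012, Volume 19, Issue 1" or just "2012" -- the leading
    # journal-name prefix, the Volume part and the Issue part are all optional.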

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
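        # math_mode="verbatim" tells pylatexenc to keep math material as raw LaTeX
        # instead of converting it to plain text, so formulas in titles survive intact.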

        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
        self.language_detector = LanguageDetectorBuilder.from_languages(
            lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
        ).build()

    def parse_collection_content(self, content):
        xissues = []
        # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")

            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex.search(cleanup_str(title))
            year = None
            volume = None
            number = None
            if issue_search:
                issue_dict = issue_search.groupdict()
                year = issue_dict["year"]
                volume = issue_dict.get("volume", None)
                number = issue_dict.get("number", None)

            # Fall back to the URL query string if the title did not yield the issue metadata
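            # e.g. an issue URL of the form .../php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1
            # (see the workaround link further down) carries year/volume/issue in its query string.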

            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            # Query arguments can be lists
            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
            if not year:
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(
                url,
                year,
                volume,
                number,
            )

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        # Parse the issue title (if present)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language:")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
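        # langcodes.find() resolves a human-readable language name to a language tag,
        # e.g. "English" -> "en" and "Russian" -> "ru".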

        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")
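        # The showamsbib block presumably holds the article's AMSBIB/TeX record, with
        # lines such as "\by A. Author, B. Author", "\paper Some title" and
        # "\pages 1--10" (illustrative values), which the regexes below pick apart.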

        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            pages_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if pages_match:
                page_range = pages_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        xarticle.title_tex = self.latex_converter.latex_to_text(cleanup_str(xarticle.title_tex))
        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
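        # Article URLs are assumed to end in a slug such as "mais123" (journal id letters
        # followed by the paper id digits); the two filters below split it back into the
        # jrnid and paperid query parameters.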

        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Keywords:")
                ]
            ),
            None,
        )
        if keyword_tag:
            keywords = cleanup_str(keyword_tag.text).removeprefix("Keywords:").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        abstract_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Abstract")
                ]
            ),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            abstract_text = cleanup_str(abstract_tag.text)
            xabstract = create_abstract(
                value_tex=abstract_text,
                lang=self.detect_language(abstract_text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        xarticle.bibitems = [self.parse_ref(item) for item in references]
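    # Each reference is assumed to sit in a table row whose cells hold the citation text
    # plus small icon links (crossref / mathscinet / zmath images) pointing at DOI,
    # MathSciNet and zbMATH records; parse_ref turns those anchors into ext-link XML.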

    def parse_ref(self, tag: Tag):
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                # self.logger.debug(f"Unimplemented reference link : {a_tag.get('href', '')}")
                # alt_text = a_tag.get("alt", "")
                # if not isinstance(alt_text, str):
                #     continue
                # links_xml += get_ext_link_xml(a_href, escape(a_tag.text or alt_text))
                pass
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()
        return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))