Coverage for src/crawler/by_source/mathnetru_crawler.py: 86%

170 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

  1  import time
  2  from datetime import datetime, timedelta
  3
  4  import langcodes
  5  import lingua
  6  import regex
  7  import requests
  8  from bs4 import BeautifulSoup, Tag
  9  from lingua import LanguageDetectorBuilder
 10  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
 11  from ptf.cmds.xml.xml_utils import escape
 12  from ptf.model_data import (
 13      ArticleData,
 14      IssueData,
 15      create_abstract,
 16      create_articledata,
 17      create_contributor,
 18  )
 19
 20  from crawler.base_crawler import BaseCollectionCrawler
 21  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 22
 23
 24  class MathnetruCrawler(BaseCollectionCrawler):
 25      source_domain = "MATHNETRU"
 26      source_name = "Math-Net.Ru"
 27      source_website = "https://www.mathnet.ru"
 28      periode_begin = 0
 29      periode_end = 0
 30
 31      issue_regex = r"(?:.+, )?(?P<year>\d{4}), ?(?:Volume|Issue|Number) (?P<volume>\d+)(?:, ?(?:Number|Issue) (?P<number>\d+))?"
 32      issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
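         # Hypothetical examples of headings these patterns are written to match:
         #   issue_regex      → "…, 2015, Volume 54, Number 3"
         #   issue_regex_alt  → "«…», Volume 12 (2015)"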

 33
 34      def __init__(self, *args, **kwargs):
 35          super().__init__(*args, **kwargs)

 36          self.language_detector = LanguageDetectorBuilder.from_languages(
 37              # assumed fix: the original passed ENGLISH twice; Russian is taken to be the
 37              # intended second language, since Math-Net.Ru serves English and Russian text
 37              lingua.Language.ENGLISH, lingua.Language.RUSSIAN
 38          ).build()

 39
 40      def parse_collection_content(self, content):
 41          xissues = []
 42          # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
 43          soup = BeautifulSoup(content, "html5lib")
 44
 45          # Periode
 46          periode_tag = soup.select_one("td.showUDC[title='Coverage']:nth-child(2)")
 47          if periode_tag:
 48              years = periode_tag.text.split("–")
 49              self.periode_begin = int(years[0])
 50              self.periode_end = int(years[1])
 51
 52          self.periode = self.get_or_create_periode()
 53
 54          # Issues
 55          issue_tags = soup.select(
 56              "table.Card td a.SLink[href^='/php'], table.cont td.issue_with_corner a.SLink[href^='/php']"
 57          )
 58          for link_tag in issue_tags:
 59              href = link_tag.get("href")
 60              title = link_tag.get("title", None)
 61              if not isinstance(href, str):  # coverage: 61 ↛ 62, condition never true
 62                  raise ValueError(
 63                      f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
 64                  )
 65              if isinstance(title, str):
 66                  title = cleanup_str(title)
 67                  volume_re = regex.search(self.issue_regex, title)
 68              else:
 69                  if not link_tag.parent:  # coverage: 69 ↛ 70, condition never true
 70                      raise ValueError(
 71                          f"[{self.source_domain}] {self.collection_id} : Title cannot be parsed"
 72                      )
 73                  title = cleanup_str(link_tag.parent.text)
 74                  volume_re = regex.search(self.issue_regex_alt, title)
 75
 76              if not volume_re:  # coverage: 76 ↛ 77, condition never true
 77                  raise ValueError(
 78                      f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
 79                  )
 80
 81              volume_data = volume_re.groupdict()
 82              if volume_data.get("volume", None):
 83                  volume_data["volume"] = volume_data["volume"].strip()
 84              xissues.append(
 85                  self.create_xissue(
 86                      self.source_website + href + "&bshow=contents",
 87                      volume_data["year"],
 88                      volume_data.get("volume", None),
 89                      volume_data.get("number", None),
 90                  )
 91              )
 92          return xissues
 93
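         # parse_issue_content: every article link in the issue's table of contents becomes
         # an ArticleData stub; pids are assigned sequentially as "a0", "a1", … in page order.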

 94      def parse_issue_content(self, content, xissue):
 95          soup = BeautifulSoup(content, "html.parser")
 96
 97          # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
 98          articles_tags = soup.select(
 99              "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
100          )
101          for i, a in enumerate(articles_tags):
102              article = create_articledata()
103              href = a.get("href")
104              if not isinstance(href, str):  # coverage: 104 ↛ 105, condition never true
105                  raise ValueError(
106                      f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
107                  )
108
109              article.url = self.source_website + href
110              article.pid = "a" + str(i)
111              xissue.articles.append(article)
112

113      def parse_article_content(self, content, xissue, xarticle, url, pid):
114          soup = BeautifulSoup(content, "html.parser")
115
116          xarticle.pid = pid
117
118          # Language
119          language_candidates = soup.select("div.around-button > div.msc")
120          language_span = next(
121              (
122                  span
123                  for span in language_candidates
124                  if cleanup_str(span.text).startswith("Language: ")
125              ),
126              None,
127          )
128
129          if not language_span:  # coverage: 129 ↛ 130, condition never true
130              raise ValueError(
131                  f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
132              )
133
134          language_b = language_span.select_one("b")
135          if language_b:  # coverage: 135 ↛ 138, condition always true
136              language_b.decompose()
137
138          long_lang = cleanup_str(language_span.text)
139          xarticle.lang = str(langcodes.find(long_lang))
140
141          # Title
142          title_tag = soup.select_one("span.red font")
143          if not title_tag:  # coverage: 143 ↛ 144, condition never true
144              raise ValueError(
145                  f"[{self.source_domain}] {self.collection_id} : Article title not found"
146              )
147          xarticle.title_tex = title_tag.text
148
149          amsbib_tag = soup.select_one("div.showamsbib")
150
151          if amsbib_tag:  # coverage: 151 ↛ 179, condition always true
152              amsbib = amsbib_tag.text
153              authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
154              if authors_match:
155                  authors = authors_match.group(1).split(",")
156                  for author_text in authors:
157                      if author_text != "":  # coverage: 157 ↛ 156, condition always true
158                          author_text = self.latext_parser.latex_to_text(author_text)
159                          author = create_contributor()
160                          author["role"] = "author"
161                          author["string_name"] = cleanup_str(author_text)
162                          xarticle.contributors.append(author)
163
164              title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
165              if title_match:  # coverage: 165 ↛ 168, condition always true
166                  xarticle.title_tex = title_match.group(1)
167
168              title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
169              if title_match:  # coverage: 169 ↛ 179, condition always true
170                  page_range = title_match.group(1)
171                  pages = page_range.split("--")
172                  if len(pages) == 2:
173                      xarticle.fpage = pages[0].replace(",", "")
174                      xarticle.lpage = pages[1].replace(",", "")
175                  else:
176                      xarticle.page_range = page_range
177
178          # Pdf
179          pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
180          if pdf_tag:  # coverage: 180 ↛ 186, condition always true
181              href = pdf_tag.get("href")
182              if isinstance(href, str):  # coverage: 182 ↛ 186, condition always true
183                  add_pdf_link_to_xarticle(xarticle, self.source_website + href)
184
185          # References
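             # The article id at the end of the URL mixes the journal code and the paper id
             # (e.g. a hypothetical "mais123" → jrnid=mais, paperid=123); the filters below
             # split it back into its alphabetic and numeric parts for the references endpoint.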

186          a_id = url.split("/")[-1]
187          ref_url = (
188              self.source_website
189              + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
190          )
191
192          self.parse_references(self.download_file(ref_url), xarticle)
193
194          # Keywords
195          keyword_tag = next(
196              iter(
197                  [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
198              ),
199              None,
200          )
201          if keyword_tag:
202              keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
203              for kwd in keywords:
204                  xarticle.kwds.append({"type": "", "lang": self.detect_language(kwd), "value": kwd})
205
206          abstract_tag = next(
207              iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
208              None,
209          )
210          if abstract_tag:
211              abstract_tag_b = abstract_tag.select_one("b")
212              if abstract_tag_b:  # coverage: 212 ↛ 214, condition always true
213                  abstract_tag_b.decompose()
214              xabstract = create_abstract(
215                  tag="abstract",
216                  value_tex=abstract_tag.text,
217                  lang=self.detect_language(abstract_tag.text),
218              )
219              xarticle.abstracts.append(xabstract)
220          return xarticle

221
222      def parse_references(self, content: str, xarticle: ArticleData):
223          soup = BeautifulSoup(content, "html.parser")
224          references = soup.select('tr:has(td[valign="top"])')
225
226          bibitems = [self.parse_ref(item) for item in references]
227          if len(bibitems) > 0:
228              xarticle.abstracts.append(self.create_bibliography(bibitems))
229
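         # parse_ref: links carrying a crossref, mathscinet or zmath icon are converted to
         # ext-link XML via get_ext_link_xml (typed "doi", "mr-item-id" or "zbl-item-id"),
         # other icon links are reported as unimplemented, plain links keep their text; the
         # <a> tags are then removed and the XML is appended to the reference text.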

230      def parse_ref(self, tag: Tag):
231          links_xml = ""
232          for a_tag in tag.select("a"):
233              a_href = a_tag.get("href")
234              if not isinstance(a_href, str):  # coverage: 234 ↛ 235, condition never true
235                  continue
236              a_href = escape(a_href)
237              if a_tag.select_one("img[alt='crossref']"):
238                  links_xml += get_ext_link_xml(
239                      a_href, a_href.removeprefix("https://doi.org/"), "doi"
240                  )
241              elif a_tag.select_one("img[alt='mathscinet']"):
242                  links_xml += get_ext_link_xml(
243                      a_href,
244                      a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
245                      "mr-item-id",
246                  )
247              elif a_tag.select_one("img[alt='zmath']"):
248                  links_xml += get_ext_link_xml(
249                      a_href,
250                      a_href.removeprefix("https://zbmath.org/?q=an:"),
251                      "zbl-item-id",
252                  )
253              elif a_tag.select_one("img"):
254                  print(f"Unimplemented reference link : {a_tag.get('href', '')}")
255              else:
256                  links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
257              a_tag.decompose()
258
259          return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))
260

261      def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
262          """Override this if the Content-Type headers from the source advertise something
263          other than the actual content encoding. SASA needs this."""
264          if "charset=" in response.headers["Content-Type"]:  # coverage: 264 ↛ 265, condition never true
265              encoding = response.headers["Content-Type"].split("charset=")[1]
266          return response.content.decode(encoding)

267 
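     # crawl_article: retries a failed article crawl up to three times, forcing a fresh
     # download of the page between attempts and sleeping 15/30/45 minutes before each retry.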

268      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
269          # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
270          parsed_xarticle = xarticle
271          if hasattr(xarticle, "url") and xarticle.url:  # coverage: 271 ↛ 289, condition always true
272              parsed_xarticle = None
273              attempts = 0
274              while parsed_xarticle is None and attempts < 3:
275                  try:
276                      parsed_xarticle = super().crawl_article(xarticle, xissue)
277                  except ValueError as e:

278                      print(f"{xarticle.pid} : Caught error : {e} ")
279                      print(
280                          f"Retrying in {(attempts + 1) * 15} mins ({(datetime.now() + timedelta(minutes=(attempts + 1) * 15)).time()})"
281                      )

282                      attempts += 1
283                      # 15 mins, 30 mins, 45 mins
284                      time.sleep(attempts * 15 * 60)
285                      self.download_file(xarticle.url, force_refresh=True)
286
287          if parsed_xarticle is None:  # coverage: 287 ↛ 288, condition never true
288              raise ValueError(f"Couldn't parse article {xarticle.pid}")
289          return self.process_article_metadata(parsed_xarticle)