# Coverage for src/crawler/by_source/mathnetru_crawler.py: 85%
# 175 statements — coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
import time
from datetime import datetime, timedelta
from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class MathnetruCrawler(BaseCollectionCrawler):
    """Crawler for the Math-Net.Ru portal (https://www.mathnet.ru).

    Parses collection (journal archive) pages into issues, issue pages into
    article stubs, and article pages into full article metadata (title,
    authors, pages, PDF link, references, keywords, abstract).
    """

    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    # Matches issue labels such as "Journal name, 2012, Volume 19, Issue 1".
    # Volume and Issue are optional; Issue may be a range like "3-4".
    issue_regex = regex.compile(
        r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[ ](?P<volume>\d+))?(?:, +Issue[ ](?P<number>[\d\-]+))?"
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # math_mode="verbatim" keeps $...$ content untouched when converting
        # LaTeX titles to text.
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
        # Math-Net.Ru content is English/Russian (occasionally French).
        self.language_detector = LanguageDetectorBuilder.from_languages(
            lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
        ).build()

    def parse_collection_content(self, content):
        """Parse a journal archive page and return the list of issues found.

        Raises:
            ValueError: if an issue link has no usable href/title, or if the
                issue year cannot be recovered from either the link title or
                the URL query string.
        """
        xissues = []
        # Some mathnetru pages are broken : view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        # html5lib is more tolerant of such markup than html.parser.
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex.search(cleanup_str(title))
            year = None
            volume = None
            number = None
            if issue_search:
                issue_dict = issue_search.groupdict()
                year = issue_dict["year"]
                volume = issue_dict.get("volume", None)
                number = issue_dict.get("number", None)

            # Use another method to parse issue metadata if the first one is not successfull
            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            # Query arguments can be lists; take the first value if present.
            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
            if not year:
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(
                url,
                year,
                volume,
                number,
            )

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        """Parse an issue page: set the issue title (if any) and append one
        article stub (url + pid) per article link found.

        Raises:
            ValueError: if an article link has no string href.
        """
        soup = BeautifulSoup(content, "html.parser")
        # Parse issue title (if exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:
                # NOTE(review): strips "Part {xissue.number}", not the matched
                # group — assumes the trailing part number equals the issue
                # number; if they differ, nothing is removed. Confirm intent.
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page into ``xarticle``.

        Extracts language, title, authors/pages (from the AMSBIB block),
        PDF link, references (fetched from a separate endpoint), keywords
        and abstract.

        Raises:
            ValueError: if the language block or the title cannot be found.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        # Drop the "Language:" label so only the language name remains.
        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        # AMSBIB block: authoritative source for authors, title and pages.
        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        # NOTE(review): `latext_parser` is not defined in this
                        # class (``__init__`` sets ``latex_converter``) —
                        # presumably inherited from BaseCollectionCrawler.
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    # Not a simple "first--last" range; store it verbatim.
                    xarticle.page_range = page_range

        xarticle.title_tex = self.latex_converter.latex_to_text(cleanup_str(xarticle.title_tex))
        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:
            # Some articles only expose an English-version PDF button.
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References: article ids look like "<jrnid><paperid>"; split the id
        # into its alphabetic and numeric parts to build the endpoint URL.
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        # Abstract
        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            # Remove the bold "Abstract" label before reading the text.
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
        """Parse the references HTML fragment and attach bibitems to the article."""
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        xarticle.bibitems = [self.parse_ref(item) for item in references]

    def parse_ref(self, tag: Tag):
        """Convert one reference row into a JATS ref.

        Icon links (crossref/mathscinet/zmath) become typed ext-links; plain
        links become untyped ext-links; unknown icon links are only logged.
        Link tags are removed from the row so the remaining text is the
        citation body.
        """
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                self.logger.debug(
                    "Unimplemented reference link", extra={"url": a_tag.get("href", "")}
                )
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()
        return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        """Crawl one article, retrying up to 3 times on ValueError.

        Each failed attempt waits attempts*15 minutes (15/30/45 min) and then
        force-refreshes the cached article page before retrying.

        Raises:
            ValueError: if all attempts fail.
        """
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    self.logger.debug(f"Caught error : {e}", extra={"pid": xarticle.pid})
                    attempts += 1
                    self.logger.debug(
                        f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})",
                        extra={"pid": xarticle.pid},
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    # Bust the cache so the retry fetches a fresh copy.
                    self.download_file(xarticle.url, force_refresh=True)

            if parsed_xarticle is None:
                raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return self.process_article_metadata(parsed_xarticle)