Coverage for src/crawler/by_source/mathnetru_crawler.py: 84%
175 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
1 import time
2 from datetime import datetime, timedelta
3 from urllib.parse import parse_qs, urljoin, urlparse
5 import langcodes
6 import lingua
7 import regex
8 import requests
9 from bs4 import BeautifulSoup, Tag
10 from lingua import LanguageDetectorBuilder
11 from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
12 from ptf.cmds.xml.jats.jats_parser import JatsBase
13 from ptf.cmds.xml.xml_utils import escape
14 from ptf.model_data import (
15 ArticleData,
16 IssueData,
17 create_abstract,
18 create_articledata,
19 create_contributor,
20)
22 from crawler.base_crawler import BaseCollectionCrawler
23 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
26 class MathnetruCrawler(BaseCollectionCrawler):
27 source_domain = "MATHNETRU"
28 source_name = "Math-Net.Ru"
29 source_website = "https://www.mathnet.ru"
31 issue_regex = regex.compile(
32 r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[ ](?P<volume>\d+))?(?:, +Issue[ ](?P<number>[\d\-]+))?"
33 )
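# Illustrative note (not part of the measured source): an issue title such as
# "Algebra i Logika, 2001, Volume 40, Issue 3" would be captured by issue_regex as
# year="2001", volume="40", number="3"; when the "Volume"/"Issue" parts are missing,
# only "year" is set and the query-string fallback in parse_collection_content fills in the rest.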
35 language_detector = LanguageDetectorBuilder.from_languages(
36 lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
37 ).build()
39 def parse_collection_content(self, content):
40 xissues = []
41 # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
42 soup = BeautifulSoup(content, "html5lib")
44 # Issues without names
45 issue_tags = soup.select(
46 "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
47 "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
48 "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
49 )
50 for index, link_tag in enumerate(issue_tags):
51 href = link_tag.get("href")
53 if not isinstance(href, str):  [53 ↛ 54: the condition on line 53 was never true]
54 raise ValueError(
55 f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
56 )
57 url = urljoin(self.source_website, href) + "&bshow=contents"
59 title = link_tag.get("title")
60 if not isinstance(title, str):  [60 ↛ 61: the condition on line 60 was never true]
61 raise ValueError("Couldn't find issue data")
62 issue_search = self.issue_regex.search(cleanup_str(title))
63 if not issue_search:  [63 ↛ 64: the condition on line 63 was never true]
64 raise ValueError("Couldn't parse issue data")
65 issue_dict = issue_search.groupdict()
66 year = issue_dict["year"]
67 volume = issue_dict.get("volume", None)
68 number = issue_dict.get("number", None)
70 # Fall back to the URL query string if the title regex did not successfully capture the issue metadata
71 parsed_url = urlparse(url)
72 query_args = parse_qs(parsed_url.query)
74 year = year or next(iter(query_args.get("year") or []), None)
75 volume = volume or next(iter(query_args.get("volume") or []), None)
76 number = number or next(iter(query_args.get("issue") or []), None)
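# Illustrative example (URL shape assumed, not taken from the report): for an archive link such as
# /php/archive.phtml?jrnid=al&wshow=issue&year=2001&volume=40&issue=3&option_lang=eng,
# parse_qs returns (among other keys) {"year": ["2001"], "volume": ["40"], "issue": ["3"]},
# so the fallback above recovers the metadata when the title regex captured nothing.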
77 if not year:  [77 ↛ 78: the condition on line 77 was never true]
78 raise ValueError("Couldn't parse issue year")
79 xissue = self.create_xissue(
80 url,
81 year,
82 volume,
83 number,
84 )
86 xissues.append(xissue)
88 return xissues
90 def parse_issue_content(self, content, xissue):
91 soup = BeautifulSoup(content, "html.parser")
92 # Parse the issue title (if present)
93 issue_title_tag = soup.select_one("td[valign='top'] div.red font")
94 if issue_title_tag:
95 title_str = cleanup_str(issue_title_tag.text)
96 part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
97 if part_search:  [97 ↛ 98: the condition on line 97 was never true]
98 title_str = title_str.removesuffix(f"Part {xissue.number}")
100 xissue.title_tex = title_str
102 # Parse Articles
103 # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
104 articles_tags = soup.select(
105 "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
106 )
107 for i, a in enumerate(articles_tags):
108 article = create_articledata()
109 href = a.get("href")
110 if not isinstance(href, str):  [110 ↛ 111: the condition on line 110 was never true]
111 raise ValueError(
112 f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
113 )
115 article.url = self.source_website + href
116 article.pid = "a" + str(i)
117 xissue.articles.append(article)
119 def parse_article_content(self, content, xissue, xarticle, url):
120 soup = BeautifulSoup(content, "html.parser")
122 # Language
123 language_candidates = soup.select("div.around-button > div.msc")
124 language_span = next(
125 (
126 span
127 for span in language_candidates
128 if cleanup_str(span.text).startswith("Language: ")
129 ),
130 None,
131 )
133 if not language_span:  [133 ↛ 134: the condition on line 133 was never true]
134 raise ValueError(
135 f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
136 )
138 language_b = language_span.select_one("b")
139 if language_b:  [139 ↛ 142: the condition on line 139 was always true]
140 language_b.decompose()
142 long_lang = cleanup_str(language_span.text)
143 xarticle.lang = str(langcodes.find(long_lang))
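# For example, langcodes.find("English") resolves to the tag "en" and langcodes.find("Russian")
# to "ru" (assuming the remaining span text is a plain English language name).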
145 # Title
146 title_tag = soup.select_one("span.red font")
147 if not title_tag:  [147 ↛ 148: the condition on line 147 was never true]
148 raise ValueError(
149 f"[{self.source_domain}] {self.collection_id} : Article title not found"
150 )
151 xarticle.title_tex = title_tag.text
153 amsbib_tag = soup.select_one("div.showamsbib")
155 if amsbib_tag:  [155 ↛ 183: the condition on line 155 was always true]
156 amsbib = amsbib_tag.text
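# Illustrative sketch of the AMSBIB block parsed below (field values are hypothetical):
#   \by A. B. Ivanov, C. D. Petrov
#   \paper On a property of something
#   \pages 123--145
# \by feeds the contributors, \paper the title, and \pages the fpage/lpage pair (or page_range).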
157 authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
158 if authors_match:
159 authors = authors_match.group(1).split(",")
160 for author_text in authors:
161 if author_text != "":  [161 ↛ 160: the condition on line 161 was always true]
162 author_text = self.latext_parser.latex_to_text(author_text)
163 author = create_contributor()
164 author["role"] = "author"
165 author["string_name"] = cleanup_str(author_text)
166 xarticle.contributors.append(author)
168 title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
169 if title_match:  [169 ↛ 172: the condition on line 169 was always true]
170 xarticle.title_tex = title_match.group(1)
172 title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
173 if title_match:  [173 ↛ 183: the condition on line 173 was always true]
174 page_range = title_match.group(1)
175 pages = page_range.split("--")
176 if len(pages) == 2:
177 xarticle.fpage = pages[0].replace(",", "")
178 xarticle.lpage = pages[1].replace(",", "")
179 else:
180 xarticle.page_range = page_range
182 # Pdf
183 pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
184 if not pdf_tag:  [184 ↛ 185: the condition on line 184 was never true]
185 pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
186 if pdf_tag:  [186 ↛ 192: the condition on line 186 was always true]
187 href = pdf_tag.get("href")
188 if isinstance(href, str):  [188 ↛ 192: the condition on line 188 was always true]
189 add_pdf_link_to_xarticle(xarticle, self.source_website + href)
191 # References
192 a_id = url.split("/")[-1]
193 ref_url = (
194 self.source_website
195 + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
196 )
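# Illustrative example (article id assumed): for a URL ending in "al123", the alphabetic part
# gives jrnid="al" and the numeric part paperid="123", so references are fetched from
# /php/getRefFromDB.phtml?jrnid=al&paperid=123&output=htm&option_lang=eng.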
198 self.parse_references(self.download_file(ref_url), xarticle)
200 # Keywords
201 keyword_tag = next(
202 iter(
203 [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
204 ),
205 None,
206 )
207 if keyword_tag:
208 keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
209 for kwd in keywords:
210 xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})
212 abstract_tag = next(
213 iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
214 None,
215 )
216 if abstract_tag:
217 abstract_tag_b = abstract_tag.select_one("b")
218 if abstract_tag_b:  [218 ↛ 220: the condition on line 218 was always true]
219 abstract_tag_b.decompose()
220 xabstract = create_abstract(
221 tag="abstract",
222 value_tex=abstract_tag.text,
223 lang=self.detect_language(abstract_tag.text),
224 )
225 xarticle.abstracts.append(xabstract)
226 return xarticle
228 def parse_references(self, content: str, xarticle: ArticleData):
229 soup = BeautifulSoup(content, "html.parser")
230 references = soup.select('tr:has(td[valign="top"])')
232 bibitems = [self.parse_ref(item) for item in references]
233 if len(bibitems) > 0:
234 xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
236 def parse_ref(self, tag: Tag):
237 links_xml = ""
238 for a_tag in tag.select("a"):
239 a_href = a_tag.get("href")
240 if not isinstance(a_href, str):  [240 ↛ 241: the condition on line 240 was never true]
241 continue
242 a_href = escape(a_href)
243 if a_tag.select_one("img[alt='crossref']"):
244 links_xml += get_ext_link_xml(
245 a_href, a_href.removeprefix("https://doi.org/"), "doi"
246 )
247 elif a_tag.select_one("img[alt='mathscinet']"):
248 links_xml += get_ext_link_xml(
249 a_href,
250 a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
251 "mr-item-id",
252 )
253 elif a_tag.select_one("img[alt='zmath']"):
254 links_xml += get_ext_link_xml(
255 a_href,
256 a_href.removeprefix("https://zbmath.org/?q=an:"),
257 "zbl-item-id",
258 )
259 elif a_tag.select_one("img"):
260 print(f"Unimplemented reference link : {a_tag.get('href', '')}")
261 else:
262 links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
263 a_tag.decompose()
265 return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))
267 def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
268 """Override this if the content-type headers from the sources are advertising something else than the actual content
269 SASA needs this"""
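# Example (hypothetical header): a response sent with Content-Type "text/html; charset=windows-1251"
# would be decoded as windows-1251 instead of the utf-8 default.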
270 if "charset=" in response.headers["Content-Type"]: 270 ↛ 271line 270 didn't jump to line 271 because the condition on line 270 was never true
271 encoding = response.headers["Content-Type"].split("charset=")[1]
272 return response.content.decode(encoding)
274 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
275 # TODO: set the pid on xarticle here instead of passing it to `parse_article_content`
276 parsed_xarticle = xarticle
277 if hasattr(xarticle, "url") and xarticle.url:  [277 ↛ 295: the condition on line 277 was always true]
278 parsed_xarticle = None
279 attempts = 0
280 while parsed_xarticle is None and attempts < 3:
281 try:
282 parsed_xarticle = super().crawl_article(xarticle, xissue)
283 except ValueError as e:
284 print(f"{xarticle.pid} : Caught error : {e} ")
285 attempts += 1
286 print(
287 f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
288 )
289 # 15 mins, 30 mins, 45 mins
290 time.sleep(attempts * 15 * 60)
291 self.download_file(xarticle.url, force_refresh=True)
293 if parsed_xarticle is None:  [293 ↛ 294: the condition on line 293 was never true]
294 raise ValueError(f"Couldn't parse article {xarticle.pid}")
295 return self.process_article_metadata(parsed_xarticle)