Coverage for src/crawler/by_source/mathnetru_crawler.py: 84%
171 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
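    """Crawler for the Math-Net.Ru portal (https://www.mathnet.ru).

    Collection pages are scraped for issue links, issue pages for article links,
    and article pages for the article metadata (language, title, authors, pages,
    PDF link, references, keywords and abstract).
    """
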
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"
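
    # Issue headers typically look like "2012, Volume 19" or "2005, Number 3(45)";
    # issue_regex_alt covers headers of the form "«Series title», Volume 5 (2010)".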
    issue_regex = regex.compile(
        r"(?P<year>\d{4})(?:, ?(?:Volume) (?P<volume>\d+))?(?:, ?(?:Number|Issue) (?P<number>\d+)\((?P<volume>\d+)\)?)?"
    )
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    ).build()

    def parse_collection_content(self, content):
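        """Parse a collection archive page and return one IssueData stub per issue link.

        Only temporary pids are assigned here; the definitive pid is computed in
        parse_issue_content once year, volume and number are known.
        """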
        xissues = []
        # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php'], table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php'], table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php']"
        )
        for index, link_tag in enumerate(issue_tags):
            href = link_tag.get("href")

            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_TEMP_{index}"
            xissue.url = urljoin(self.source_website, href) + "&bshow=contents"
            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
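        """Parse an issue page: extract the year/volume/number (used to build the
        issue pid), the optional issue title and the list of article links."""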
        soup = BeautifulSoup(content, "html.parser")

        # Parse issue PID
        volume_tag = soup.select_one("td[valign='top'] span.red font")
        if not volume_tag:
            raise ValueError("Couldn't parse volume number")
        volume_str = cleanup_str(volume_tag.text)
        volume_re = self.issue_regex.search(volume_str)
        if not volume_re:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
            )

        volume_data = volume_re.groupdict()
        if volume_data.get("volume", None):
            volume_data["volume"] = volume_data["volume"].strip()
        elif volume_data.get("volume_2", None):
            volume_data["volume"] = volume_data["volume_2"].strip()

        xissue.pid = self.get_issue_pid(
            self.collection_id,
            volume_data["year"],
            volume_data.get("volume", None),
            volume_data.get("number", None),
        )
        xissue.year = volume_data["year"]
        xissue.volume = volume_data["volume"]
        xissue.number = volume_data["number"]

        # Parse issue title (if exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            xissue.title_tex = cleanup_str(issue_title_tag.text)

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
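        """Parse an article page: language, title, authors and page range (taken from
        the AMSBIB block when present), PDF link, references, keywords and abstract."""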
        soup = BeautifulSoup(content, "html.parser")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
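        # The reference list is served by a separate endpoint. The journal id and the
        # paper id are recovered from the article URL slug: its alphabetic characters
        # are assumed to form jrnid and its digits paperid (e.g. a slug such as
        # "rm1234" would give jrnid=rm and paperid=1234).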
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        # Abstract
        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
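        """Parse the HTML returned by getRefFromDB.phtml and attach the resulting
        bibliography to the article."""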
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_ref(self, tag: Tag):
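        """Convert one reference table row into a bibitem, turning the DOI,
        MathSciNet and zbMATH icon links into ext-link XML."""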
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link : {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type headers from the source advertise
        something other than the actual content encoding. SASA needs this."""
        if "charset=" in response.headers["Content-Type"]:
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
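        # Parsing failures are retried up to three times with increasing delays
        # (15, 30, then 45 minutes), forcing a fresh download on each retry,
        # presumably to work around transient issues on the source side.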
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e} ")
                    attempts += 1
                    print(
                        f"Retrying in {((attempts) * 15)}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

            if parsed_xarticle is None:
                raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return self.process_resource_metadata(parsed_xarticle)