Coverage for src/crawler/by_source/mathnetru_crawler.py: 81% (171 statements)
coverage.py v7.8.0, created at 2025-04-24 10:35 +0000
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"

    issue_regex = regex.compile(
        r"(?P<year>\d{4})(?:, ?Volume (?P<volume>\d+))?(?:, ?(?:Number|Issue) (?P<number>\d+)(?:\((?P<volume_2>\d+)\))?)?"
    )
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
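    # Illustrative examples (not from the source): the compiled pattern is
    # meant to capture issue headers such as "2012, Volume 19" (year plus
    # volume) or "2012, Number 1(19)" (year plus number, with the volume in
    # parentheses captured as volume_2).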

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    ).build()

    def parse_collection_content(self, content):
        xissues = []
        # Some mathnetru pages are broken:
        # view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for index, link_tag in enumerate(issue_tags):
            href = link_tag.get("href")

            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
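            # Illustrative: a relative href such as "/php/archive.phtml?jrnid=al&..."
            # resolves against source_website, and "&bshow=contents" is appended
            # to request the issue's table of contents.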
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_TEMP_{index}"
            xissue.url = urljoin(self.source_website, href) + "&bshow=contents"
            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        # Parse issue PID
        volume_tag = soup.select_one("td[valign='top'] span.red font")
        if not volume_tag:
            raise ValueError("Couldn't parse volume number")
        volume_str = cleanup_str(volume_tag.text)
        volume_re = self.issue_regex.search(volume_str)
        if not volume_re:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
            )

        volume_data = volume_re.groupdict()
        if volume_data.get("volume", None):
            volume_data["volume"] = volume_data["volume"].strip()
        elif volume_data.get("volume_2", None):
            volume_data["volume"] = volume_data["volume_2"].strip()

        xissue.pid = self.get_issue_pid(
            self.collection_id,
            volume_data["year"],
            volume_data.get("volume", None),
            volume_data.get("number", None),
        )
        xissue.year = volume_data["year"]
        xissue.volume = volume_data["volume"]
        xissue.number = volume_data["number"]

        # Parse issue title (if it exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            xissue.title_tex = cleanup_str(issue_title_tag.text)

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
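        # Illustrative: langcodes.find("English") resolves to the language
        # tag "en", which is what gets stored in xarticle.lang below.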
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:
            amsbib = amsbib_tag.text
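            # Illustrative AMSBIB fragment that the regexes below parse
            # (authors, title, and page range, one field per line):
            #   \by A. Uthor, B. Writer
            #   \paper On an illustrative topic
            #   \pages 123--145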
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            pages_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if pages_match:
                page_range = pages_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
        a_id = url.split("/")[-1]
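        # Illustrative: for an article URL ending in ".../mais123", a_id is
        # "mais123", so the filters below yield jrnid="mais" and paperid="123".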
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            (d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        abstract_tag = next(
            (d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_ref(self, tag: Tag):
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
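            # Illustrative: a crossref link "https://doi.org/10.1234/abcd"
            # takes the first branch below and is emitted as an external
            # link of type "doi" labelled "10.1234/abcd".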
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link : {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type header from the source advertises
        something other than the actual content encoding (SASA needs this)."""
268 if "charset=" in response.headers["Content-Type"]: 268 ↛ 269line 268 didn't jump to line 269 because the condition on line 268 was never true
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e}")
                    attempts += 1
                    print(
                        f"Retrying in {attempts * 15} mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

        if parsed_xarticle is None:
            raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return self.process_resource_metadata(parsed_xarticle)