Coverage for src/crawler/by_source/mathnetru_crawler.py: 86%
170 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import time
from datetime import datetime, timedelta

import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"
    periode_begin = 0
    periode_end = 0

    issue_regex = r"(?:.+, )?(?P<year>\d{4}), ?(?:Volume|Issue|Number) (?P<volume>\d+)(?:, ?(?:Number|Issue) (?P<number>\d+))?"
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
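    # Illustrative issue titles (examples made up here, not taken from the source data)
    # that these patterns are written to match:
    #   issue_regex     : "Algebra i logika, 2011, Volume 50, Number 3"
    #   issue_regex_alt : "«Chebyshevskii Sbornik», Volume 12 (2011)"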

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Assumes the detector has to distinguish English from Russian metadata,
        # which is what Math-Net.Ru serves.
        self.language_detector = LanguageDetectorBuilder.from_languages(
            lingua.Language.ENGLISH, lingua.Language.RUSSIAN
        ).build()

    def parse_collection_content(self, content):
        xissues = []
        # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Periode
        periode_tag = soup.select_one("td.showUDC[title='Coverage']:nth-child(2)")
        if periode_tag:
            years = periode_tag.text.split("–")
            self.periode_begin = int(years[0])
            self.periode_end = int(years[1])

        self.periode = self.get_or_create_periode()

        # Issues
        issue_tags = soup.select(
            "table.Card td a.SLink[href^='/php'], table.cont td.issue_with_corner a.SLink[href^='/php']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")
            title = link_tag.get("title", None)
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            if isinstance(title, str):
                title = cleanup_str(title)
                volume_re = regex.search(self.issue_regex, title)
            else:
                if not link_tag.parent:  # coverage: condition never true
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} : Title cannot be parsed"
                    )
                title = cleanup_str(link_tag.parent.text)
                volume_re = regex.search(self.issue_regex_alt, title)

            if not volume_re:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )

            volume_data = volume_re.groupdict()
            if volume_data.get("volume", None):
                volume_data["volume"] = volume_data["volume"].strip()
            xissues.append(
                self.create_xissue(
                    self.source_website + href + "&bshow=contents",
                    volume_data["year"],
                    volume_data.get("volume", None),
                    volume_data.get("number", None),
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")

        xarticle.pid = pid

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:  # coverage: condition always true
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:  # coverage: condition always true
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":  # coverage: condition always true
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: condition always true
                xarticle.title_tex = title_match.group(1)

            pages_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if pages_match:  # coverage: condition always true
                page_range = pages_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if pdf_tag:  # coverage: condition always true
            href = pdf_tag.get("href")
            if isinstance(href, str):  # coverage: condition always true
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
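        # The last URL segment mixes the journal id and the numeric paper id; for an
        # illustrative slug like "al123", jrnid becomes "al" and paperid becomes "123".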
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "lang": self.detect_language(kwd), "value": kwd})

        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:  # coverage: condition always true
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_ref(self, tag: Tag):
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):  # coverage: condition never true
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link : {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type header from the source advertises something
        other than the actual content encoding (SASA needs this)."""
        if "charset=" in response.headers["Content-Type"]:  # coverage: condition never true
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e}")
                    attempts += 1
                    # Back off 15 mins, then 30 mins, then 45 mins
                    wait_minutes = attempts * 15
                    print(
                        f"Retrying in {wait_minutes}mins ({(datetime.now() + timedelta(minutes=wait_minutes)).time()})"
                    )
                    time.sleep(wait_minutes * 60)
                    self.download_file(xarticle.url, force_refresh=True)

        if parsed_xarticle is None:  # coverage: condition never true
            raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return self.process_article_metadata(parsed_xarticle)
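

# Rough usage sketch (illustrative only; the constructor arguments and the download/caching
# helpers come from BaseCollectionCrawler and are not shown in this file):
#
#   crawler = MathnetruCrawler(...)                           # hypothetical instantiation
#   xissues = crawler.parse_collection_content(archive_html)  # issue stubs from the archive page
#   crawler.parse_issue_content(issue_html, xissues[0])       # fills xissues[0].articles
#   article = crawler.crawl_article(xissues[0].articles[0], xissues[0])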