Coverage for src/crawler/by_source/mathnetru_crawler.py: 88%
161 statements
coverage.py v7.9.0, created at 2025-10-29 14:25 +0000
from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

class MathnetruCrawler(BaseCollectionCrawler):
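    """Crawler for the Math-Net.Ru archive (https://www.mathnet.ru).

    Walks a collection's archive page, each issue's table of contents and the
    individual article pages, filling ptf model_data issues and articles.
    """
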
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"
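
    # Matches issue titles such as "<journal>, <year>, Volume <v>, Issue <n>";
    # the journal, volume and issue parts are all optional.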
    issue_regex = regex.compile(
        r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[ ](?P<volume>\d+))?(?:, +Issue[ ](?P<number>[\d\-]+))?"
    )

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")

    def parse_collection_content(self, content):
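        """Parse the collection's archive page and return the list of issues found on it."""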
        xissues = []
        # Some mathnetru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")

            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex.search(cleanup_str(title))
            year = None
            volume = None
            number = None
            vseries = None
            if issue_search and self.collection_id != "MP":
                issue_dict = issue_search.groupdict()
                year = issue_dict["year"]
                volume = issue_dict.get("volume", None)
                number = issue_dict.get("number", None)

            # Use another method to parse issue metadata if the first one is not successful
            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            # Query arguments can be lists
            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
            vseries = vseries or next(iter(query_args.get("series") or []), None)

            if not year:
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(url, year, volume, number, vseries)

            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
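        """Parse an issue's table of contents: read the optional issue title and add a stub article for every article link."""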
        soup = BeautifulSoup(content, "html.parser")
        # Parse issue title (if exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url):
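        """Parse an article page: language, title, AMSBIB authors and pages, PDF link, references, keywords and abstract."""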
        soup = BeautifulSoup(content, "html5lib")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language:")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Workaround for pylatexenc not supporting latex \href{}{} commands
        # https://github.com/phfaist/pylatexenc/issues/58

        article_title = cleanup_str(xarticle.title_tex)
        article_title = regex.sub(r"\\href\{(.+)\}(?:\{(.+)\})?", r"\1", article_title)

        xarticle.title_tex = self.latex_converter.latex_to_text(article_title)

        # Pdf
        pdf_tag = soup.select_one(
            "a.button_green[title='Full-text article is available'], a.button_yellow[title='Full-text article is available']"
        )
        if not pdf_tag:
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
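        # The article id is the last URL segment (e.g. "mais123"): its alphabetic
        # prefix is the journal id (jrnid) and its numeric suffix the paper id (paperid).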
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Keywords:")
                ]
            ),
            None,
        )
        if keyword_tag:
            keywords = cleanup_str(keyword_tag.text).removeprefix("Keywords:").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})

        abstract_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Abstract")
                ]
            ),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            abstract_text = cleanup_str(escape(abstract_tag.text))
            xabstract = create_abstract(
                value_tex=abstract_text,
                lang=self.detect_language(abstract_text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
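        """Parse the getRefFromDB.phtml response and store one bibitem per reference row."""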
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        xarticle.bibitems = [self.parse_ref(item) for item in references]

    def parse_ref(self, tag: Tag):
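        """Build a JATS reference from one table row, turning the crossref/mathscinet/zmath icon links into typed ext-links."""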
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                # self.logger.debug(f"Unimplemented reference link : {a_tag.get('href', '')}")
                # alt_text = a_tag.get("alt", "")
                # if not isinstance(alt_text, str):
                #     continue
                # links_xml += get_ext_link_xml(a_href, escape(a_tag.text or alt_text))
                pass
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()
        return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))