Coverage for src/crawler/by_source/mathnetru_crawler.py: 88%
157 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import parse_qs, urljoin, urlparse

import langcodes
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"
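
    # Issue links carry titles like "<journal>, <year>, Volume <v>, Issue <n>";
    # the journal name, volume and issue parts are all optional in the pattern below.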
    issue_regex = regex.compile(
        r"(?:[\w \-,\.\[\]]+, )?(?P<year>\d+)(?:, +Volume[ ](?P<volume>\d+))?(?:, +Issue[ ](?P<number>[\d\-]+))?"
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
        self.language_detector = LanguageDetectorBuilder.from_languages(
            lingua.Language.ENGLISH, lingua.Language.RUSSIAN, lingua.Language.FRENCH
        ).build()
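
    # Parse the collection archive page: collect every issue link and build an
    # xissue from the year/volume/number found in the link title or, failing
    # that, in the URL query string.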
    def parse_collection_content(self, content):
        xissues = []
        # Some mathnetru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Issues without names
        issue_tags = soup.select(
            "table.Card td:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table.cont td.issue_with_corner:not(.year) a.SLink[href^='/php/archive.phtml'], "
            "table[bordercolorlight='black'] tr:not([class]) td:not(.year) a.SLink[href^='/php/archive.phtml']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")

            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            url = urljoin(self.source_website, href) + "&bshow=contents"

            title = link_tag.get("title")
            if not isinstance(title, str):  # coverage: condition never true
                raise ValueError("Couldn't find issue data")
            issue_search = self.issue_regex.search(cleanup_str(title))
            year = None
            volume = None
            number = None
            if issue_search:  # coverage: condition always true
                issue_dict = issue_search.groupdict()
                year = issue_dict["year"]
                volume = issue_dict.get("volume", None)
                number = issue_dict.get("number", None)

            # Use another method to parse the issue metadata if the first one is not successful
            parsed_url = urlparse(url)
            query_args = parse_qs(parsed_url.query)

            # Query arguments can be lists
            year = year or next(iter(query_args.get("year") or []), None)
            volume = volume or next(iter(query_args.get("volume") or []), None)
            number = number or next(iter(query_args.get("issue") or []), None)
            if not year:  # coverage: condition never true
                raise ValueError("Couldn't parse issue year")
            xissue = self.create_xissue(
                url,
                year,
                volume,
                number,
            )

            xissues.append(xissue)

        return xissues
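
    # Parse an issue's table of contents: pick up the issue title when present
    # and register each article link as a stub with its URL and a sequential pid.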
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        # Parse the issue title (if it exists)
        issue_title_tag = soup.select_one("td[valign='top'] div.red font")
        if issue_title_tag:
            title_str = cleanup_str(issue_title_tag.text)
            part_search = regex.search(r"Part (?P<number>\d+)$", title_str)
            if part_search:  # coverage: condition never true
                title_str = title_str.removesuffix(f"Part {xissue.number}")

            xissue.title_tex = title_str

        # Parse Articles
        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)
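
    # Parse a single article page: language, title, authors and page range (from
    # the AMSBIB block), PDF link, references, keywords and abstract.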
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language:")
            ),
            None,
        )

        if not language_span:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        # The <b> tag holds the "Language:" label; the remaining text is the language
        # name, which langcodes maps to a code (e.g. "English" -> "en")
        language_b = language_span.select_one("b")
        if language_b:  # coverage: condition always true
            language_b.decompose()

        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text

        # The AMSBIB export block carries \by (authors), \paper (title) and \pages fields
        amsbib_tag = soup.select_one("div.showamsbib")

        if amsbib_tag:  # coverage: condition always true
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":  # coverage: condition always true
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: condition always true
                xarticle.title_tex = title_match.group(1)

            title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:  # coverage: condition always true
                page_range = title_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        xarticle.title_tex = self.latex_converter.latex_to_text(cleanup_str(xarticle.title_tex))
        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if not pdf_tag:  # coverage: condition never true
            pdf_tag = soup.select_one("a.button_green:-soup-contains-own('English version PDF')")
        if pdf_tag:  # coverage: condition always true
            href = pdf_tag.get("href")
            if isinstance(href, str):  # coverage: condition always true
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
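
        # The bibliography is served by a separate endpoint; its jrnid/paperid query
        # parameters are the alphabetic and numeric parts of the article id, i.e. the
        # last segment of the article URL.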
        # References
        a_id = url.split("/")[-1]
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Keywords:")
                ]
            ),
            None,
        )
        if keyword_tag:
            keywords = cleanup_str(keyword_tag.text).removeprefix("Keywords:").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "value": kwd, "lang": self.detect_language(kwd)})
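
        # Abstract: the "Abstract" label sits in a <b> tag and is stripped before
        # detecting the abstract's language.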
        abstract_tag = next(
            iter(
                [
                    d
                    for d in soup.select("div.around-button")
                    if cleanup_str(d.text).startswith("Abstract")
                ]
            ),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:  # coverage: condition always true
                abstract_tag_b.decompose()
            abstract_text = cleanup_str(abstract_tag.text)
            xabstract = create_abstract(
                value_tex=abstract_text,
                lang=self.detect_language(abstract_text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle
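
    # Parse the bibliography document returned by getRefFromDB.phtml: each
    # reference sits in its own table row.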
    def parse_references(self, content: str, xarticle: ArticleData):
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        xarticle.bibitems = [self.parse_ref(item) for item in references]
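
    # Convert one reference row into a JATS ref: DOI, MathSciNet and zbMATH icon
    # links become typed ext-links, unrecognised icon links are skipped, plain links
    # keep their visible text, and the remaining row text forms the citation body.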
    def parse_ref(self, tag: Tag):
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):  # coverage: condition never true
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                # self.logger.debug(f"Unimplemented reference link : {a_tag.get('href', '')}")
                # alt_text = a_tag.get("alt", "")
                # if not isinstance(alt_text, str):
                #     continue
                # links_xml += get_ext_link_xml(a_href, escape(a_tag.text or alt_text))
                pass
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()
        return JatsBase.bake_ref(cleanup_str(tag.text + links_xml))