Coverage for src/crawler/by_source/mathnetru_crawler.py: 89%
150 statements
import langcodes
import lingua
import regex
import requests
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.xml_utils import escape
from ptf.model_data import ArticleData, create_abstract, create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathnetruCrawler(BaseCollectionCrawler):
    source_domain = "MATHNETRU"
    source_name = "Math-Net.Ru"
    source_website = "https://www.mathnet.ru"
    periode_begin = 0
    periode_end = 0

    issue_regex = r"(?:.+, )?(?P<year>\d{4}), ?(?:Volume|Issue|Number) (?P<volume>\d+)(?:, ?(?:Number|Issue) (?P<number>\d+))?"
    issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
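
    # Illustrative examples (not taken from the source) of titles the two
    # patterns above match:
    #   issue_regex:     "Some Journal, 2012, Volume 51, Number 3" -> year=2012, volume=51, number=3
    #   issue_regex_alt: "«Journal Title», Volume 4 (2001)"        -> volume=4, year=2001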

    def build_language_detector(self):
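        """Build the lingua language detector for this crawler.

        Assumption: Math-Net.Ru content is in English and Russian, so the
        detector is restricted to those two languages."""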
        self.language_detector = LanguageDetectorBuilder.from_languages(
            lingua.Language.ENGLISH, lingua.Language.RUSSIAN
        ).build()

    def parse_collection_content(self, content):
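        """Parse the journal archive page of a collection and return its issues.

        Reads the coverage period ("Periode") when present, then creates one
        xissue per issue link found in the contents tables."""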
        xissues = []
        # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
        soup = BeautifulSoup(content, "html5lib")

        # Periode
        periode_tag = soup.select_one("td.showUDC[title='Coverage']:nth-child(2)")
        if periode_tag:
            years = periode_tag.text.split("–")
            self.periode_begin = int(years[0])
            self.periode_end = int(years[1])

        self.periode = self.get_or_create_periode()

        # Issues
        issue_tags = soup.select(
            "table.Card td a.SLink[href^='/php'], table.cont td.issue_with_corner a.SLink[href^='/php']"
        )
        for link_tag in issue_tags:
            href = link_tag.get("href")
            title = link_tag.get("title", None)
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
                )
            if isinstance(title, str):
                title = cleanup_str(title)
                volume_re = regex.search(self.issue_regex, title)
            else:
                if not link_tag.parent:
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} : Title cannot be parsed"
                    )
                title = cleanup_str(link_tag.parent.text)
                volume_re = regex.search(self.issue_regex_alt, title)

            if not volume_re:
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
                )

            volume_data = volume_re.groupdict()
            if volume_data.get("volume", None):
                volume_data["volume"] = volume_data["volume"].strip()
            xissues.append(
                self.create_xissue(
                    self.source_website + href + "&bshow=contents",
                    volume_data["year"],
                    volume_data.get("volume", None),
                    volume_data.get("number", None),
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
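        """Parse an issue's table of contents and append one article stub per
        article link to xissue.articles."""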
        soup = BeautifulSoup(content, "html.parser")

        # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
        articles_tags = soup.select(
            "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
        )
        for i, a in enumerate(articles_tags):
            article = create_articledata()
            href = a.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
                )

            article.url = self.source_website + href
            article.pid = "a" + str(i)
            xissue.articles.append(article)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
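        """Parse an article page into xarticle: language, title, AMSBIB
        metadata (authors, title, pages), PDF link, references, keywords
        and abstract."""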
        soup = BeautifulSoup(content, "html.parser")

        xarticle.pid = pid

        # Language
        language_candidates = soup.select("div.around-button > div.msc")
        language_span = next(
            (
                span
                for span in language_candidates
                if cleanup_str(span.text).startswith("Language: ")
            ),
            None,
        )

        if not language_span:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
            )

        language_b = language_span.select_one("b")
        if language_b:
            language_b.decompose()
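
        # langcodes.find maps a language name such as "English" to a code such as "en".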
        long_lang = cleanup_str(language_span.text)
        xarticle.lang = str(langcodes.find(long_lang))

        # Title
        title_tag = soup.select_one("span.red font")
        if not title_tag:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} : Article title not found"
            )
        xarticle.title_tex = title_tag.text
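
        # Math-Net.Ru exposes an AMSBIB (TeX) citation block; \by carries the
        # authors, \paper the title and \pages the page range.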
        amsbib_tag = soup.select_one("div.showamsbib")
        if amsbib_tag:
            amsbib = amsbib_tag.text
            authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
            if authors_match:
                authors = authors_match.group(1).split(",")
                for author_text in authors:
                    if author_text != "":
                        author_text = self.latext_parser.latex_to_text(author_text)
                        author = create_contributor()
                        author["role"] = "author"
                        author["string_name"] = cleanup_str(author_text)
                        xarticle.contributors.append(author)

            title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
            if title_match:
                xarticle.title_tex = title_match.group(1)

            pages_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
            if pages_match:
                page_range = pages_match.group(1)
                pages = page_range.split("--")
                if len(pages) == 2:
                    xarticle.fpage = pages[0].replace(",", "")
                    xarticle.lpage = pages[1].replace(",", "")
                else:
                    xarticle.page_range = page_range

        # Pdf
        pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
        if pdf_tag:
            href = pdf_tag.get("href")
            if isinstance(href, str):
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)

        # References
        a_id = url.split("/")[-1]
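        # The last URL segment is an id such as "im1234" (illustrative): its
        # alphabetic part is the journal id, its numeric part the paper id.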
        ref_url = (
            self.source_website
            + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
        )

        self.parse_references(self.download_file(ref_url), xarticle)

        # Keywords
        keyword_tag = next(
            iter(
                [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
            ),
            None,
        )
        if keyword_tag:
            keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
            for kwd in keywords:
                xarticle.kwds.append({"type": "", "lang": self.detect_language(kwd), "value": kwd})

        # Abstract
        abstract_tag = next(
            iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
            None,
        )
        if abstract_tag:
            abstract_tag_b = abstract_tag.select_one("b")
            if abstract_tag_b:
                abstract_tag_b.decompose()
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract_tag.text,
                lang=self.detect_language(abstract_tag.text),
            )
            xarticle.abstracts.append(xabstract)
        return xarticle

    def parse_references(self, content: str, xarticle: ArticleData):
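        """Parse the HTML bibliography returned by getRefFromDB.phtml and
        attach it to the article."""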
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select('tr:has(td[valign="top"])')

        bibitems = [self.parse_ref(item) for item in references]
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_ref(self, tag: Tag):
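        """Convert one reference table row into a bibitem, turning the
        crossref, mathscinet and zmath icon links into ext-link XML."""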
        links_xml = ""
        for a_tag in tag.select("a"):
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            a_href = escape(a_href)
            if a_tag.select_one("img[alt='crossref']"):
                links_xml += get_ext_link_xml(
                    a_href, a_href.removeprefix("https://doi.org/"), "doi"
                )
            elif a_tag.select_one("img[alt='mathscinet']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="),
                    "mr-item-id",
                )
            elif a_tag.select_one("img[alt='zmath']"):
                links_xml += get_ext_link_xml(
                    a_href,
                    a_href.removeprefix("https://zbmath.org/?q=an:"),
                    "zbl-item-id",
                )
            elif a_tag.select_one("img"):
                print(f"Unimplemented reference link: {a_tag.get('href', '')}")
            else:
                links_xml += get_ext_link_xml(a_href, escape(a_tag.text))
            a_tag.decompose()

        return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml))

    def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
        """Override this if the Content-Type headers from the source advertise
        something other than the actual content encoding. SASA needs this."""
        if "charset=" in response.headers["Content-Type"]:
            encoding = response.headers["Content-Type"].split("charset=")[1]
        return response.content.decode(encoding)
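

# Minimal usage sketch (hypothetical): the constructor signature and crawl
# entry points are defined by BaseCollectionCrawler and are assumed here.
#
#     crawler = MathnetruCrawler(collection_id="...", collection_url="...")
#     html = crawler.download_file(crawler.collection_url)
#     xissues = crawler.parse_collection_content(html)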