Coverage for src/crawler/by_source/mathnetru_crawler.py: 89%

150 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

  1  import langcodes
  2  import lingua
  3  import regex
  4  import requests
  5  from bs4 import BeautifulSoup, Tag
  6  from lingua import LanguageDetectorBuilder
  7  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
  8  from ptf.cmds.xml.xml_utils import escape
  9  from ptf.model_data import ArticleData, create_abstract, create_articledata, create_contributor
 10
 11  from crawler.base_crawler import BaseCollectionCrawler
 12  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 13
 14
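     # Crawler for journals hosted on Math-Net.Ru: parse_collection_content walks the
     # journal archive, parse_issue_content walks an issue's table of contents, and
     # parse_article_content extracts the metadata of a single article.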

 15  class MathnetruCrawler(BaseCollectionCrawler):
 16      source_domain = "MATHNETRU"
 17      source_name = "Math-Net.Ru"
 18      source_website = "https://www.mathnet.ru"
 19      periode_begin = 0
 20      periode_end = 0
 21
 22      issue_regex = r"(?:.+, )?(?P<year>\d{4}), ?(?:Volume|Issue|Number) (?P<volume>\d+)(?:, ?(?:Number|Issue) (?P<number>\d+))?"
 23      issue_regex_alt = r"«.+»(?:, Volume (?P<volume>\d+))? \((?P<year>\d+)\)"
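         # Illustrative only: issue_regex is aimed at issue headers such as
         # "…, 2005, Volume 44, Number 3" (named groups year/volume/number), while
         # issue_regex_alt covers the guillemet form "«…», Volume 5 (2001)", where
         # the ", Volume N" part is optional.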

 24
 25      def build_language_detector(self):
 26          self.language_detector = LanguageDetectorBuilder.from_languages(
 27              lingua.Language.ENGLISH, lingua.Language.RUSSIAN
 28          ).build()
 29
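         # Parses the journal archive page: reads the coverage years into
         # periode_begin/periode_end and creates one xissue per issue link,
         # appending "&bshow=contents" to each issue URL.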

 30      def parse_collection_content(self, content):
 31          xissues = []
 32          # Some Math-Net.Ru pages are broken: view-source:https://www.mathnet.ru/php/archive.phtml?jrnid=al&wshow=contents&option_lang=eng
 33          soup = BeautifulSoup(content, "html5lib")
 34
 35          # Periode (coverage years)
 36          periode_tag = soup.select_one("td.showUDC[title='Coverage']:nth-child(2)")
 37          if periode_tag:
 38              years = periode_tag.text.split("–")
 39              self.periode_begin = int(years[0])
 40              self.periode_end = int(years[1])
 41
 42          self.periode = self.get_or_create_periode()
 43
 44          # Issues
 45          issue_tags = soup.select(
 46              "table.Card td a.SLink[href^='/php'], table.cont td.issue_with_corner a.SLink[href^='/php']"
 47          )
 48          for link_tag in issue_tags:
 49              href = link_tag.get("href")
 50              title = link_tag.get("title", None)
 51              if not isinstance(href, str):  [51 ↛ 52: condition on line 51 was never true]
 52                  raise ValueError(
 53                      f"[{self.source_domain}] {self.collection_id} : Issue link cannot be parsed"
 54                  )
 55              if isinstance(title, str):
 56                  title = cleanup_str(title)
 57                  volume_re = regex.search(self.issue_regex, title)
 58              else:
 59                  if not link_tag.parent:  [59 ↛ 60: condition on line 59 was never true]
 60                      raise ValueError(
 61                          f"[{self.source_domain}] {self.collection_id} : Title cannot be parsed"
 62                      )
 63                  title = cleanup_str(link_tag.parent.text)
 64                  volume_re = regex.search(self.issue_regex_alt, title)
 65
 66              if not volume_re:  [66 ↛ 67: condition on line 66 was never true]
 67                  raise ValueError(
 68                      f"[{self.source_domain}] {self.collection_id} : Volume cannot be parsed"
 69                  )
 70
 71              volume_data = volume_re.groupdict()
 72              if volume_data.get("volume", None):
 73                  volume_data["volume"] = volume_data["volume"].strip()
 74              xissues.append(
 75                  self.create_xissue(
 76                      self.source_website + href + "&bshow=contents",
 77                      volume_data["year"],
 78                      volume_data.get("volume", None),
 79                      volume_data.get("number", None),
 80                  )
 81              )
 82          return xissues
 83
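         # Parses an issue's table of contents and appends one article stub
         # (url plus pid "a<index>") per article link found.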

 84      def parse_issue_content(self, content, xissue):
 85          soup = BeautifulSoup(content, "html.parser")
 86
 87          # Workaround for https://www.mathnet.ru/php/archive.phtml?jrnid=mais&wshow=issue&year=2012&volume=19&issue=1&option_lang=eng
 88          articles_tags = soup.select(
 89              "td[colspan='2'] a.SLink[href^='/eng'], td[colspan='2'] a.SLink[href^='/rus']"
 90          )
 91          for i, a in enumerate(articles_tags):
 92              article = create_articledata()
 93              href = a.get("href")
 94              if not isinstance(href, str):  [94 ↛ 95: condition on line 94 was never true]
 95                  raise ValueError(
 96                      f"[{self.source_domain}] {self.collection_id} : Article link cannot be parsed"
 97                  )
 98
 99              article.url = self.source_website + href
100              article.pid = "a" + str(i)
101              xissue.articles.append(article)
102
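         # Parses a single article page: language, title, AMSBIB metadata (authors,
         # title, pages), PDF link, references (fetched from getRefFromDB.phtml),
         # keywords and abstract.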

103      def parse_article_content(self, content, xissue, xarticle, url, pid):
104          soup = BeautifulSoup(content, "html.parser")
105
106          xarticle.pid = pid
107
108          # Language
109          language_candidates = soup.select("div.around-button > div.msc")
110          language_span = next(
111              (
112                  span
113                  for span in language_candidates
114                  if cleanup_str(span.text).startswith("Language: ")
115              ),
116              None,
117          )
118
119          if not language_span:  [119 ↛ 120: condition on line 119 was never true]
120              raise ValueError(
121                  f"[{self.source_domain}] {self.collection_id} : Couldn't find article language"
122              )
123
124          language_b = language_span.select_one("b")
125          if language_b:  [125 ↛ 128: condition on line 125 was always true]
126              language_b.decompose()
127
128          long_lang = cleanup_str(language_span.text)
129          xarticle.lang = str(langcodes.find(long_lang))
130
131          # Title
132          title_tag = soup.select_one("span.red font")
133          if not title_tag:  [133 ↛ 134: condition on line 133 was never true]
134              raise ValueError(
135                  f"[{self.source_domain}] {self.collection_id} : Article title not found"
136              )
137          xarticle.title_tex = title_tag.text
138
139          amsbib_tag = soup.select_one("div.showamsbib")
140
141          if amsbib_tag:  [141 ↛ 169: condition on line 141 was always true]
142              amsbib = amsbib_tag.text
143              authors_match = regex.search(r"^\\by (.*)$", amsbib, flags=regex.MULTILINE)
144              if authors_match:
145                  authors = authors_match.group(1).split(",")
146                  for author_text in authors:
147                      if author_text != "":  [147 ↛ 146: condition on line 147 was always true]
148                          author_text = self.latext_parser.latex_to_text(author_text)
149                          author = create_contributor()
150                          author["role"] = "author"
151                          author["string_name"] = cleanup_str(author_text)
152                          xarticle.contributors.append(author)
153
154              title_match = regex.search(r"^\\paper (.*)$", amsbib, flags=regex.MULTILINE)
155              if title_match:  [155 ↛ 158: condition on line 155 was always true]
156                  xarticle.title_tex = title_match.group(1)
157
158              title_match = regex.search(r"^\\pages (.*)$", amsbib, flags=regex.MULTILINE)
159              if title_match:  [159 ↛ 169: condition on line 159 was always true]
160                  page_range = title_match.group(1)
161                  pages = page_range.split("--")
162                  if len(pages) == 2:
163                      xarticle.fpage = pages[0].replace(",", "")
164                      xarticle.lpage = pages[1].replace(",", "")
165                  else:
166                      xarticle.page_range = page_range
167
168          # Pdf
169          pdf_tag = soup.select_one("a.button_green[title='Full-text article is available']")
170          if pdf_tag:  [170 ↛ 176: condition on line 170 was always true]
171              href = pdf_tag.get("href")
172              if isinstance(href, str):  [172 ↛ 176: condition on line 172 was always true]
173                  add_pdf_link_to_xarticle(xarticle, self.source_website + href)
174
175          # References
176          a_id = url.split("/")[-1]
177          ref_url = (
178              self.source_website
179              + f"/php/getRefFromDB.phtml?jrnid={''.join(filter(str.isalpha, a_id))}&paperid={''.join(filter(str.isnumeric, a_id))}&output=htm&option_lang=eng"
180          )
181
182          self.parse_references(self.download_file(ref_url), xarticle)
183
184          # Keywords
185          keyword_tag = next(
186              iter(
187                  [d for d in soup.select("div.around-button") if d.text.startswith("\nKeywords:")]
188              ),
189              None,
190          )
191          if keyword_tag:
192              keywords = keyword_tag.text.removeprefix("\nKeywords:\n").strip().split(", ")
193              for kwd in keywords:
194                  xarticle.kwds.append({"type": "", "lang": self.detect_language(kwd), "value": kwd})
195
196          abstract_tag = next(
197              iter([d for d in soup.select("div.around-button") if d.text.startswith("\nAbstract")]),
198              None,
199          )
200          if abstract_tag:
201              abstract_tag_b = abstract_tag.select_one("b")
202              if abstract_tag_b:  [202 ↛ 204: condition on line 202 was always true]
203                  abstract_tag_b.decompose()
204              xabstract = create_abstract(
205                  tag="abstract",
206                  value_tex=abstract_tag.text,
207                  lang=self.detect_language(abstract_tag.text),
208              )
209              xarticle.abstracts.append(xabstract)
210          return xarticle
211
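         # Parses the standalone references page and, when any bibitems are found,
         # stores the resulting bibliography on the article.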

212      def parse_references(self, content: str, xarticle: ArticleData):
213          soup = BeautifulSoup(content, "html.parser")
214          references = soup.select('tr:has(td[valign="top"])')
215
216          bibitems = [self.parse_ref(item) for item in references]
217          if len(bibitems) > 0:
218              xarticle.abstracts.append(self.create_bibliography(bibitems))
219

221 links_xml = "" 

222 for a_tag in tag.select("a"): 

223 a_href = a_tag.get("href") 

224 if not isinstance(a_href, str): 224 ↛ 225line 224 didn't jump to line 225 because the condition on line 224 was never true

225 continue 

226 a_href = escape(a_href) 

227 if a_tag.select_one("img[alt='crossref']"): 

228 links_xml += get_ext_link_xml( 

229 a_href, a_href.removeprefix("https://doi.org/"), "doi" 

230 ) 

231 elif a_tag.select_one("img[alt='mathscinet']"): 

232 links_xml += get_ext_link_xml( 

233 a_href, 

234 a_href.removeprefix("http://mathscinet.ams.org/mathscinet-getitem?mr="), 

235 "mr-item-id", 

236 ) 

237 elif a_tag.select_one("img[alt='zmath']"): 

238 links_xml += get_ext_link_xml( 

239 a_href, 

240 a_href.removeprefix("https://zbmath.org/?q=an:"), 

241 "zbl-item-id", 

242 ) 

243 elif a_tag.select_one("img"): 

244 print(f"Unimplemented reference link : {a_tag.get('href', '')}") 

245 else: 

246 links_xml += get_ext_link_xml(a_href, escape(a_tag.text)) 

247 a_tag.decompose() 

248 

249 return self.create_crawled_bibitem(cleanup_str(tag.text + links_xml)) 

250 
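         # Illustrative only: a response with the header
         # "Content-Type: text/html; charset=windows-1251" would make decode_response()
         # decode the body as windows-1251 instead of the utf-8 default.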

251      def decode_response(self, response: requests.Response, encoding: str = "utf-8"):
252          """Override this if the Content-Type headers from the source advertise
253          something other than the actual content encoding. SASA needs this."""
254          if "charset=" in response.headers["Content-Type"]:  [254 ↛ 255: condition on line 254 was never true]
255              encoding = response.headers["Content-Type"].split("charset=")[1]
256          return response.content.decode(encoding)
256 return response.content.decode(encoding)