Coverage for src / crawler / by_source / amuc_crawler.py: 9%

207 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1import logging 

2import re 

3 

4from bs4 import BeautifulSoup 

5from ptf.model_data import create_abstract, create_articledata, create_contributor 

6 

7from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

8from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict 

9 

10 

class AmucCrawler(BaseCollectionCrawler):
    """Crawler for the "AMUC" source hosted at iam.fmph.uniba.sk.

    Regular issues are parsed from the modern site layout; volumes 60 to 80
    live in an older "archived" layout handled by the parse_archived_* and
    extract_archived_* methods below.
    """

    source_name = "AMUC"
    source_domain = "AMUC"
    source_website = "http://www.iam.fmph.uniba.sk"

    # Matches issue headings such as "Vol 12 No 3 (2020)" on collection pages.
    issue_re = r"Vol (?P<volume>\d+) No (?P<issue>\S+) \((?P<year>\S+)\)"
    # Matches "Number N"-style issue labels in the archived (volumes 60-80) listings.
    archive_issue_re = r"Number (?P<issue>\S+)"
    # Matches "Volume NN (YYYY)"-style volume labels in the archived listings.
    archive_volume_re = r"Volume (?P<volume>\S+) \((?P<year>\S+)\)"
    # Captures the text between "Abstract" and "AMS" markers on archived article pages.
    archive_abstract_re = r"Abstract (?P<abstract>\S+) AMS"

20 

21 def parse_collection_content(self, content): 

22 xissues = [] 

23 soup = BeautifulSoup(content, "html.parser") 

24 issues = soup.select("div.issue-summary") 

25 for issue in issues: 

26 issue_group = regex_to_dict( 

27 self.issue_re, issue.text, error_msg="Couldn't parse issue data" 

28 ) 

29 

30 issue_href = issue.select("a.title")[0].get("href") 

31 if not isinstance(issue_href, str): 

32 raise ValueError("Couldn't parse issue url") 

33 

34 if issue_group["volume"] == "0": 

35 xissues += self.parse_archived_collection_content(issue_href) 

36 

37 else: 

38 xissues.append( 

39 self.create_xissue( 

40 url=issue_href, 

41 year=issue_group["year"], 

42 volume_number=issue_group["volume"], 

43 issue_number=issue_group["issue"], 

44 ) 

45 ) 

46 

47 # If next page exist, get next page issues 

48 pagination_tag = soup.select_one("ul.pagination") 

49 next_pages_tag = pagination_tag.select("a") 

50 for next_page_tag in next_pages_tag: 

51 if next_page_tag.text == ">": 

52 next_page_url = next_page_tag.get("href") 

53 next_page_content = self.download_file(next_page_url) 

54 xissues += self.parse_collection_content(next_page_content) 

55 return xissues 

56 

57 def parse_archived_collection_content(self, url): 

58 """ 

59 For volumes from 60 to 80 

60 """ 

61 content = self.download_file(url) 

62 soup = BeautifulSoup(content, "html.parser") 

63 issues_tag = soup.select("ul > li.show > ul > li.show > a") 

64 artived_xissues = [] 

65 for issue_tag in issues_tag: 

66 issue_href = issue_tag.get("href") 

67 if not isinstance(issue_href, str): 

68 raise ValueError("Couldn't parse issue url") 

69 

70 issue_nb = regex_to_dict( 

71 self.archive_issue_re, issue_tag.text, error_msg="Couldn't parse issue data" 

72 ).get("issue") 

73 

74 volume_tag = issue_tag.parent.parent.parent.select_one("span > strong") 

75 volume_group = regex_to_dict( 

76 self.archive_volume_re, volume_tag.text, error_msg="Couldn't parse volume data" 

77 ) 

78 

79 artived_xissues.append( 

80 self.create_xissue( 

81 url=issue_href, 

82 year=volume_group["year"], 

83 volume_number=volume_group["volume"], 

84 issue_number=issue_nb, 

85 ) 

86 ) 

87 return artived_xissues 

88 

89 def parse_issue_content(self, content, xissue): 

90 soup = BeautifulSoup(content, "html.parser") 

91 articles = soup.select("div.article-summary") 

92 if len(articles) == 0: 

93 self.parse_archived_issue_content(soup, xissue) 

94 else: 

95 article_number = 0 

96 for article in articles: 

97 xarticle = create_articledata() 

98 article_href = article.select("h3.media-heading")[0].select("a")[0].get("href") 

99 if not isinstance(article_href, str): 

100 raise ValueError("Couldn't parse article href") 

101 xarticle.url = article_href 

102 xarticle.pid = "a" + str(article_number) 

103 xissue.articles.append(xarticle) 

104 article_number += 1 

105 

106 def parse_archived_issue_content(self, soup, xissue): 

107 """ 

108 For issues from volumes 60 to 80 

109 """ 

110 article_number = 0 

111 body_tag = soup.find("body") 

112 if not body_tag: 

113 raise ValueError("Couldn't find body tag in archived issue") 

114 if body_tag and body_tag.get("bgcolor") == "#FFFFF0": 

115 # On cible les liens "Abstract" qui sont présents dans tous les formats 

116 abstract_links = soup.find_all( 

117 "a", string=lambda t: t and "abstract" in t.strip().lower() 

118 ) 

119 if not abstract_links: 

120 raise ValueError( 

121 "Couldn't find abstract links in archived issue with white background" 

122 ) 

123 for abstract_link in abstract_links: 

124 href = abstract_link.get("href") 

125 if not href: 

126 continue 

127 

128 xarticle = create_articledata() 

129 abstract_url = "/".join(xissue.url.split("/")[0:-1]) + "/" + href 

130 xarticle.pid = "a" + str(article_number) 

131 article_number += 1 

132 xarticle.title_tex = "Archived article white background" 

133 xarticle.url = abstract_url 

134 xissue.articles.append(xarticle) 

135 if body_tag and body_tag.get("bgcolor") == "#CCE6FF": 

136 articles_abstract_tags = soup.findAll("a", href=True, text="Abstract") 

137 if not articles_abstract_tags: 

138 raise ValueError( 

139 "Couldn't find abstract links in archived issue with blue background" 

140 ) 

141 

142 for article_abstract_tag in articles_abstract_tags: 

143 xarticle = create_articledata() 

144 abstract_url = ( 

145 "/".join(xissue.url.split("/")[0:-1]) + "/" + article_abstract_tag.get("href") 

146 ) 

147 xarticle.pid = "a" + str(article_number) 

148 article_number += 1 

149 xarticle.title_tex = "Archived article blue background" 

150 xarticle.url = abstract_url 

151 xissue.articles.append(xarticle) 

152 

153 def parse_article_content(self, content, xissue, xarticle, url): 

154 soup = BeautifulSoup(content, "html.parser") 

155 

156 # If archived/old article : 

157 if xarticle.title_tex in [ 

158 "Archived article white background", 

159 "Archived article blue background", 

160 ]: 

161 return self.parse_archived_article_content(soup, xissue, xarticle) 

162 

163 self.get_metadata_using_citation_meta( 

164 xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page", "title"] 

165 ) 

166 

167 # Contributors 

168 contributors = soup.select_one("div.authors").select("strong") 

169 for contributor in contributors: 

170 xarticle.contributors.append( 

171 create_contributor(role="author", string_name=contributor.text) 

172 ) 

173 

174 # pdf link 

175 pdf_url = soup.select_one("div.download").select_one("a").get("href") 

176 if isinstance(pdf_url, str): 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 # Abstract 

180 abstract_tag = soup.select_one("div.article-abstract") 

181 if abstract_tag: 

182 xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text))) 

183 return xarticle 

184 

185 def parse_archived_article_content(self, soup, xissue, xarticle): 

186 """ 

187 Parse content of archived articles (from volumes 60 to 80) 

188 """ 

189 try: 

190 extract_metadata = self.extract_archived_metadata(soup, xarticle) 

191 except ValueError as e: 

192 logging.error(f"Error extracting metadata for archived article: {e}") 

193 xarticle = self.parse_archived_article_content(soup, xissue, xarticle) 

194 

195 xarticle.title_tex = extract_metadata["title"] 

196 if extract_metadata.get("authors"): 

197 for author in extract_metadata["authors"]: 

198 xarticle.contributors.append(create_contributor(role="author", string_name=author)) 

199 if extract_metadata.get("abstract"): 

200 xarticle.abstracts.append( 

201 create_abstract(value_tex=cleanup_str(extract_metadata["abstract"])) 

202 ) 

203 if extract_metadata.get("keywords"): 

204 xarticle.keywords = extract_metadata["keywords"] 

205 if extract_metadata.get("pdf_url"): 

206 pdf_url = "/".join(xarticle.url.split("/")[0:-1]) + "/" + extract_metadata["pdf_url"] 

207 add_pdf_link_to_xarticle(xarticle, pdf_url) 

208 return xarticle 

209 

210 def extract_archived_metadata_blue_bg(self, soup): 

211 """ 

212 Extract metadata for articles with blue background 

213 """ 

214 title_tag = soup.find("font", {"color": "#A52A2A"}) 

215 if not title_tag: 

216 raise ValueError("Couldn't find title in archived article with blue background") 

217 title = title_tag.get_text(separator=" ", strip=True) if title_tag else None 

218 

219 author_tag = soup.find("font", {"color": "#008B8B"}) 

220 if not author_tag: 

221 raise ValueError("Couldn't find authors in archived article with blue background") 

222 authors = author_tag.get_text(strip=True) if author_tag else None 

223 authors = re.split(", | and ", authors) if authors else [] 

224 

225 pdf_url_tag = soup.select_one("a", href=True, text="PDF") 

226 if not pdf_url_tag: 

227 raise ValueError("Couldn't find pdf url") 

228 pdf_url = pdf_url_tag.get("href") 

229 

230 return title, authors, pdf_url 

231 

232 def extract_archived_metadata_white_bg(self, soup): 

233 """ 

234 Extract metadata for articles with white background 

235 """ 

236 title_tag = soup.find("font", {"color": "#A52A2A"}) 

237 if not title_tag: 

238 title_tag = soup.select_one('span[style*="color: brown"]') 

239 if not title_tag: 

240 title_tag = soup.select("font", {"color": "#a52a2a"}) 

241 if not title_tag: 

242 raise ValueError( 

243 "Couldn't find title in archived article with white background" 

244 ) 

245 title_tag = title_tag[4] 

246 title = title_tag.get_text(separator=" ", strip=True) if title_tag else None 

247 

248 author_tag = soup.find("font", {"color": "#008B8B"}) 

249 if not author_tag: 

250 author_tag = soup.select_one('span[style*="color: darkcyan"]') 

251 if not author_tag: 

252 author_tag = soup.select("font") 

253 if not author_tag: 

254 raise ValueError( 

255 "Couldn't find authors in archived article with white background" 

256 ) 

257 author_tag = author_tag[5] 

258 authors = author_tag.get_text(strip=True) if author_tag else None 

259 authors = re.split(", | and ", authors) if authors else [] 

260 authors = self.parse_authors_caps_names(authors) 

261 pdf_url_tag = soup.select_one("a", href=True, text="Adobe PDF") 

262 

263 if not pdf_url_tag: 

264 raise ValueError("Couldn't find pdf url") 

265 pdf_url = pdf_url_tag.get("href") 

266 

267 return title, authors, pdf_url 

268 

269 def get_text_until_next_section(self, tag): 

270 """ 

271 For archived articles, get the text content of a section (abstract or keywords) until the next section (keywords or AMS) or the end of the document. 

272 """ 

273 SECTION_KEYWORDS = ["abstract", "keyword", "ams"] 

274 content = [] 

275 for sibling in tag.next_siblings: 

276 if sibling.name == "b": 

277 if any( 

278 sibling.get_text(strip=True).lower().startswith(k) for k in SECTION_KEYWORDS 

279 ): 

280 break 

281 content.append(sibling if isinstance(sibling, str) else sibling.get_text()) 

282 return " ".join(content).strip().lstrip(":. \xa0") 

283 

284 def extract_abstract_and_keywords(self, soup): 

285 """ 

286 Extract abstract and keywords for archived articles, which can be in different formats and places depending on the article. The method looks for the "Abstract" section and the "Keywords" section, and extracts their content until the next section or the end of the document. 

287 """ 

288 abstract = None 

289 keywords = [] 

290 

291 for tag in soup.find_all("b"): 

292 text = tag.get_text(strip=True).lower() 

293 if text.startswith("abstract"): 

294 abstract = self.get_text_until_next_section(tag) 

295 elif text.startswith("keyword"): 

296 raw = self.get_text_until_next_section(tag) 

297 keywords = [kw.strip() for kw in raw.split(";") if kw.strip()] 

298 

299 return abstract, keywords 

300 

301 def extract_archived_metadata(self, soup, xarticle): 

302 """ 

303 Extract metadata for archived articles. 

304 """ 

305 background_color = xarticle.title_tex 

306 if background_color == "Archived article blue background": 

307 title, authors, pdf_url = self.extract_archived_metadata_blue_bg(soup) 

308 elif background_color == "Archived article white background": 

309 title, authors, pdf_url = self.extract_archived_metadata_white_bg(soup) 

310 else: 

311 raise ValueError("Unrecognized archived article backgroud color") 

312 

313 abstract, keywords = self.extract_abstract_and_keywords(soup) 

314 

315 return { 

316 "title": title, 

317 "authors": authors, 

318 "pdf_url": pdf_url, 

319 "abstract": abstract, 

320 "keywords": keywords, 

321 } 

322 

323 def parse_authors_caps_names(self, string_name_list): 

324 final_string_name_list = [] 

325 for string_name in string_name_list: 

326 string_name_split = string_name.split() 

327 family_name = string_name_split[-1] 

328 family_name = family_name[0].upper() + family_name[1:].lower() 

329 string_name = " ".join(string_name_split[:-1] + [family_name]) 

330 final_string_name_list.append(string_name) 

331 return final_string_name_list