Coverage for src / crawler / by_source / amuc_crawler.py: 9%

207 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1import logging 

2import re 

3 

4from bs4 import BeautifulSoup 

5from ptf.model_data import create_abstract, create_articledata, create_contributor 

6 

7from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

8from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict 

9 

10 

class AmucCrawler(BaseCollectionCrawler):
    """Crawler for the "AMUC" source hosted at iam.fmph.uniba.sk.

    Regular issues are parsed from the modern site layout; volumes 60 to 80
    live in an older "archived" layout handled by the parse_archived_* and
    extract_archived_* methods below.
    """

    source_name = "AMUC"
    source_domain = "AMUC"
    source_website = "http://www.iam.fmph.uniba.sk"

    # Matches issue headings such as "Vol 12 No 3 (2020)" on collection pages.
    issue_re = r"Vol (?P<volume>\d+) No (?P<issue>\S+) \((?P<year>\S+)\)"
    # Matches "Number N"-style issue labels in the archived (volumes 60-80) listings.
    archive_issue_re = r"Number (?P<issue>\S+)"
    # Matches "Volume NN (YYYY)"-style volume labels in the archived listings.
    archive_volume_re = r"Volume (?P<volume>\S+) \((?P<year>\S+)\)"
    # Captures the text between "Abstract" and "AMS" markers on archived article pages.
    archive_abstract_re = r"Abstract (?P<abstract>\S+) AMS"

20 

21 def parse_collection_content(self, content): 

22 xissues = [] 

23 soup = BeautifulSoup(content, "html.parser") 

24 issues = soup.select("div.issue-summary") 

25 for issue in issues: 

26 issue_group = regex_to_dict( 

27 self.issue_re, issue.text, error_msg="Couldn't parse issue data" 

28 ) 

29 

30 issue_href = issue.select("a.title")[0].get("href") 

31 if not isinstance(issue_href, str): 

32 raise ValueError("Couldn't parse issue url") 

33 

34 if issue_group["volume"] == "0": 

35 xissues += self.parse_archived_collection_content(issue_href) 

36 

37 else: 

38 xissues.append( 

39 self.create_xissue( 

40 url=issue_href, 

41 year=issue_group["year"], 

42 volume_number=issue_group["volume"], 

43 issue_number=issue_group["issue"], 

44 ) 

45 ) 

46 

47 # If next page exist, get next page issues 

48 pagination_tag = soup.select_one("ul.pagination") 

49 next_pages_tag = pagination_tag.select("a") 

50 for next_page_tag in next_pages_tag: 

51 if next_page_tag.text == ">": 

52 next_page_url = next_page_tag.get("href") 

53 next_page_content = self.download_file(next_page_url) 

54 xissues += self.parse_collection_content(next_page_content) 

55 return xissues 

56 

57 def parse_archived_collection_content(self, url): 

58 """ 

59 For volumes from 60 to 80 

60 """ 

61 content = self.download_file(url) 

62 soup = BeautifulSoup(content, "html.parser") 

63 issues_tag = soup.select("ul > li.show > ul > li.show > a") 

64 artived_xissues = [] 

65 for issue_tag in issues_tag: 

66 issue_href = issue_tag.get("href") 

67 if not isinstance(issue_href, str): 

68 raise ValueError("Couldn't parse issue url") 

69 

70 issue_nb = regex_to_dict( 

71 self.archive_issue_re, issue_tag.text, error_msg="Couldn't parse issue data" 

72 ).get("issue") 

73 

74 volume_tag = issue_tag.parent.parent.parent.select_one("span > strong") 

75 volume_group = regex_to_dict( 

76 self.archive_volume_re, volume_tag.text, error_msg="Couldn't parse volume data" 

77 ) 

78 

79 artived_xissues.append( 

80 self.create_xissue( 

81 url=issue_href, 

82 year=volume_group["year"], 

83 volume_number=volume_group["volume"], 

84 issue_number=issue_nb, 

85 ) 

86 ) 

87 return artived_xissues 

88 

89 def parse_issue_content(self, content, xissue): 

90 soup = BeautifulSoup(content, "html.parser") 

91 articles = soup.select("div.article-summary") 

92 if len(articles) == 0: 

93 self.parse_archived_issue_content(soup, xissue) 

94 else: 

95 article_number = 0 

96 for article in articles: 

97 xarticle = create_articledata() 

98 article_href = article.select("h3.media-heading")[0].select("a")[0].get("href") 

99 if not isinstance(article_href, str): 

100 raise ValueError("Couldn't parse article href") 

101 xarticle.url = article_href 

102 xarticle.pid = "a" + str(article_number) 

103 xissue.articles.append(xarticle) 

104 article_number += 1 

105 

106 def parse_archived_issue_content(self, soup, xissue): 

107 """ 

108 For issues from volumes 60 to 80 

109 """ 

110 article_number = 0 

111 body_tag = soup.find("body") 

112 if not body_tag: 

113 raise ValueError("Couldn't find body tag in archived issue") 

114 if body_tag and body_tag.get("bgcolor") == "#FFFFF0": 

115 # On cible les liens "Abstract" qui sont présents dans tous les formats 

116 abstract_links = soup.find_all( 

117 "a", string=lambda t: t and "abstract" in t.strip().lower() 

118 ) 

119 if not abstract_links: 

120 raise ValueError( 

121 "Couldn't find abstract links in archived issue with white background" 

122 ) 

123 for abstract_link in abstract_links: 

124 href = abstract_link.get("href") 

125 if not href: 

126 continue 

127 

128 xarticle = create_articledata() 

129 abstract_url = "/".join(xissue.url.split("/")[0:-1]) + "/" + href 

130 xarticle.pid = "a" + str(article_number) 

131 article_number += 1 

132 xarticle.title_tex = "Archived article white background" 

133 xarticle.url = abstract_url 

134 xissue.articles.append(xarticle) 

135 if body_tag and body_tag.get("bgcolor") == "#CCE6FF": 

136 articles_abstract_tags = soup.findAll("a", href=True, text="Abstract") 

137 if not articles_abstract_tags: 

138 raise ValueError( 

139 "Couldn't find abstract links in archived issue with blue background" 

140 ) 

141 

142 for article_abstract_tag in articles_abstract_tags: 

143 xarticle = create_articledata() 

144 abstract_url = ( 

145 "/".join(xissue.url.split("/")[0:-1]) + "/" + article_abstract_tag.get("href") 

146 ) 

147 xarticle.pid = "a" + str(article_number) 

148 article_number += 1 

149 xarticle.title_tex = "Archived article blue background" 

150 xarticle.url = abstract_url 

151 xissue.articles.append(xarticle) 

152 

153 def parse_article_content(self, content, xissue, xarticle, url): 

154 soup = BeautifulSoup(content, "html.parser") 

155 

156 # If archived/old article : 

157 if xarticle.title_tex in [ 

158 "Archived article white background", 

159 "Archived article blue background", 

160 ]: 

161 return self.parse_archived_article_content(soup, xissue, xarticle) 

162 

163 self.get_metadata_using_citation_meta( 

164 xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page", "title"] 

165 ) 

166 

167 # Contributors 

168 contributors = soup.select_one("div.authors").select("strong") 

169 for contributor in contributors: 

170 xarticle.contributors.append( 

171 create_contributor(role="author", string_name=contributor.text) 

172 ) 

173 

174 # pdf link 

175 pdf_url = soup.select_one("div.download").select_one("a").get("href") 

176 if isinstance(pdf_url, str): 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 # Abstract 

180 abstract_tag = soup.select_one("div.article-abstract") 

181 if abstract_tag: 

182 xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text))) 

183 return xarticle 

184 

185 def parse_archived_article_content(self, soup, xissue, xarticle): 

186 """ 

187 Parse content of archived articles (from volumes 60 to 80) 

188 """ 

189 try: 

190 extract_metadata = self.extract_archived_metadata(soup, xarticle) 

191 except ValueError as e: 

192 logging.error(f"Error extracting metadata for archived article: {e}") 

193 xarticle = self.parse_archived_article_content(soup, xissue, xarticle) 

194 

195 xarticle.title_tex = extract_metadata["title"] 

196 if extract_metadata.get("authors"): 

197 for author in extract_metadata["authors"]: 

198 xarticle.contributors.append(create_contributor(role="author", string_name=author)) 

199 if extract_metadata.get("abstract"): 

200 xarticle.abstracts.append( 

201 create_abstract(value_tex=cleanup_str(extract_metadata["abstract"])) 

202 ) 

203 if extract_metadata.get("keywords"): 

204 xarticle.keywords = extract_metadata["keywords"] 

205 if extract_metadata.get("pdf_url"): 

206 pdf_url = "/".join(xarticle.url.split("/")[0:-1]) + "/" + extract_metadata["pdf_url"] 

207 add_pdf_link_to_xarticle(xarticle, pdf_url) 

208 return xarticle 

209 

210 def extract_archived_metadata_blue_bg(self, soup): 

211 """ 

212 Extract metadata for articles with blue background 

213 """ 

214 title_tag = soup.find("font", {"color": "#A52A2A"}) 

215 if not title_tag: 

216 raise ValueError("Couldn't find title in archived article with blue background") 

217 title = title_tag.get_text(separator=" ", strip=True) if title_tag else None 

218 

219 author_tag = soup.find("font", {"color": "#008B8B"}) 

220 if not author_tag: 

221 raise ValueError("Couldn't find authors in archived article with blue background") 

222 authors = author_tag.get_text(strip=True) if author_tag else None 

223 authors = re.split(", | and ", authors) if authors else [] 

224 

225 pdf_url_tag = soup.select_one("a", href=True, text="PDF") 

226 if not pdf_url_tag: 

227 raise ValueError("Couldn't find pdf url") 

228 pdf_url = pdf_url_tag.get("href") 

229 

230 return title, authors, pdf_url 

231 

232 def extract_archived_metadata_white_bg(self, soup): 

233 """ 

234 Extract metadata for articles with white background 

235 """ 

236 title_tag = soup.find("font", {"color": "#A52A2A"}) 

237 if not title_tag: 

238 title_tag = soup.select_one('span[style*="color: brown"]') 

239 if not title_tag: 

240 title_tag = soup.select("font", {"color": "#a52a2a"}) 

241 if not title_tag: 

242 raise ValueError( 

243 "Couldn't find title in archived article with white background" 

244 ) 

245 title_tag = title_tag[4] 

246 title = title_tag.get_text(separator=" ", strip=True) if title_tag else None 

247 

248 author_tag = soup.find("font", {"color": "#008B8B"}) 

249 if not author_tag: 

250 author_tag = soup.select_one('span[style*="color: darkcyan"]') 

251 if not author_tag: 

252 author_tag = soup.select("font") 

253 if not author_tag: 

254 raise ValueError( 

255 "Couldn't find authors in archived article with white background" 

256 ) 

257 author_tag = author_tag[5] 

258 authors = author_tag.get_text(strip=True) if author_tag else None 

259 authors = re.split(", | and ", authors) if authors else [] 

260 authors = self.parse_authors_caps_names(authors) 

261 pdf_url_tag = soup.select_one("a", href=True, text="Adobe PDF") 

262 

263 if not pdf_url_tag: 

264 raise ValueError("Couldn't find pdf url") 

265 pdf_url = pdf_url_tag.get("href") 

266 

267 return title, authors, pdf_url 

268 

269 def get_text_until_next_section(self, tag): 

270 """ 

271 For archived articles, get the text content of a section (abstract or keywords) until the next section (keywords or AMS) or the end of the document. 

272 """ 

273 SECTION_KEYWORDS = ["abstract", "keyword", "ams"] 

274 content = [] 

275 for sibling in tag.next_siblings: 

276 if sibling.name == "b": 

277 if any( 

278 sibling.get_text(strip=True).lower().startswith(k) for k in SECTION_KEYWORDS 

279 ): 

280 break 

281 content.append(sibling if isinstance(sibling, str) else sibling.get_text()) 

282 return " ".join(content).strip().lstrip(":. \xa0") 

283 

284 def extract_abstract_and_keywords(self, soup): 

285 """ 

286 Extract abstract and keywords for archived articles, which can be in different formats and places depending on the article. The method looks for the "Abstract" section and the "Keywords" section, and extracts their content until the next section or the end of the document. 

287 """ 

288 abstract = None 

289 keywords = [] 

290 

291 for tag in soup.find_all("b"): 

292 text = tag.get_text(strip=True).lower() 

293 if text.startswith("abstract"): 

294 abstract = self.get_text_until_next_section(tag) 

295 elif text.startswith("keyword"): 

296 raw = self.get_text_until_next_section(tag) 

297 keywords = [kw.strip() for kw in raw.split(";") if kw.strip()] 

298 

299 return abstract, keywords 

300 

301 def extract_archived_metadata(self, soup, xarticle): 

302 """ 

303 Extract metadata for archived articles. 

304 """ 

305 background_color = xarticle.title_tex 

306 if background_color == "Archived article blue background": 

307 title, authors, pdf_url = self.extract_archived_metadata_blue_bg(soup) 

308 elif background_color == "Archived article white background": 

309 title, authors, pdf_url = self.extract_archived_metadata_white_bg(soup) 

310 else: 

311 raise ValueError("Unrecognized archived article backgroud color") 

312 

313 abstract, keywords = self.extract_abstract_and_keywords(soup) 

314 

315 return { 

316 "title": title, 

317 "authors": authors, 

318 "pdf_url": pdf_url, 

319 "abstract": abstract, 

320 "keywords": keywords, 

321 } 

322 

323 def parse_authors_caps_names(self, string_name_list): 

324 final_string_name_list = [] 

325 for string_name in string_name_list: 

326 string_name_split = string_name.split() 

327 family_name = string_name_split[-1] 

328 family_name = family_name[0].upper() + family_name[1:].lower() 

329 string_name = " ".join(string_name_split[:-1] + [family_name]) 

330 final_string_name_list.append(string_name) 

331 return final_string_name_list