Coverage for src / crawler / by_source / amuc_crawler.py: 9%
207 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
1import logging
2import re
4from bs4 import BeautifulSoup
5from ptf.model_data import create_abstract, create_articledata, create_contributor
7from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
8from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict
class AmucCrawler(BaseCollectionCrawler):
    """Crawler for Acta Mathematica Universitatis Comenianae (AMUC).

    Handles two site generations:

    - the current journal site (``div.issue-summary`` / ``div.article-summary``
      markup, with ``ul.pagination`` paging);
    - static archive pages for volumes 60 to 80, reached through the
      pseudo-volume "0" link. Archive pages come in two visual formats,
      distinguished by the page body's ``bgcolor`` attribute: white
      (``#FFFFF0``) and blue (``#CCE6FF``).
    """

    source_name = "AMUC"
    source_domain = "AMUC"
    source_website = "http://www.iam.fmph.uniba.sk"

    # Matches e.g. "Vol 88 No 2 (2019)" on the main issue listing.
    issue_re = r"Vol (?P<volume>\d+) No (?P<issue>\S+) \((?P<year>\S+)\)"
    # Matches e.g. "Number 1" in the archive navigation tree.
    archive_issue_re = r"Number (?P<issue>\S+)"
    # Matches e.g. "Volume 60 (1991)" headings in the archive navigation tree.
    archive_volume_re = r"Volume (?P<volume>\S+) \((?P<year>\S+)\)"
    archive_abstract_re = r"Abstract (?P<abstract>\S+) AMS"

    def parse_collection_content(self, content):
        """Parse the collection page HTML and return the list of xissues.

        Volume "0" is a pseudo-volume linking to the static archive
        (volumes 60 to 80); its issues are resolved by
        :meth:`parse_archived_collection_content`. Pagination (">" link)
        is followed recursively.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for issue in soup.select("div.issue-summary"):
            issue_group = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse issue data"
            )

            issue_href = issue.select("a.title")[0].get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            if issue_group["volume"] == "0":
                xissues += self.parse_archived_collection_content(issue_href)
            else:
                xissues.append(
                    self.create_xissue(
                        url=issue_href,
                        year=issue_group["year"],
                        volume_number=issue_group["volume"],
                        issue_number=issue_group["issue"],
                    )
                )

        # If a next page exists, get next page issues.
        # BUGFIX: select_one can return None when there is no pagination
        # block; the original crashed with AttributeError in that case.
        pagination_tag = soup.select_one("ul.pagination")
        if pagination_tag is not None:
            for next_page_tag in pagination_tag.select("a"):
                if next_page_tag.text == ">":
                    next_page_url = next_page_tag.get("href")
                    next_page_content = self.download_file(next_page_url)
                    xissues += self.parse_collection_content(next_page_content)
        return xissues

    def parse_archived_collection_content(self, url):
        """Parse the static archive index (volumes 60 to 80).

        The archive is a nested ``ul``/``li`` tree: issue links live three
        levels under a ``span > strong`` volume heading.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html.parser")
        archived_xissues = []
        for issue_tag in soup.select("ul > li.show > ul > li.show > a"):
            issue_href = issue_tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            issue_nb = regex_to_dict(
                self.archive_issue_re, issue_tag.text, error_msg="Couldn't parse issue data"
            ).get("issue")

            # The volume heading sits on the grand-parent <li> of the issue link.
            volume_tag = issue_tag.parent.parent.parent.select_one("span > strong")
            volume_group = regex_to_dict(
                self.archive_volume_re, volume_tag.text, error_msg="Couldn't parse volume data"
            )

            archived_xissues.append(
                self.create_xissue(
                    url=issue_href,
                    year=volume_group["year"],
                    volume_number=volume_group["volume"],
                    issue_number=issue_nb,
                )
            )
        return archived_xissues

    def parse_issue_content(self, content, xissue):
        """Populate ``xissue.articles`` from an issue page.

        Pages without ``div.article-summary`` markup are archived issues and
        are delegated to :meth:`parse_archived_issue_content`.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("div.article-summary")
        if not articles:
            self.parse_archived_issue_content(soup, xissue)
            return
        for article_number, article in enumerate(articles):
            xarticle = create_articledata()
            article_href = article.select("h3.media-heading")[0].select("a")[0].get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")
            xarticle.url = article_href
            xarticle.pid = "a" + str(article_number)
            xissue.articles.append(xarticle)

    def parse_archived_issue_content(self, soup, xissue):
        """Populate ``xissue.articles`` from an archived issue page (volumes 60-80).

        The two archive formats are told apart by the body's ``bgcolor``.
        Articles are created from the "Abstract" links; the placeholder
        ``title_tex`` values act as dispatch sentinels later on in
        :meth:`parse_article_content` — do not change them.
        """
        article_number = 0
        body_tag = soup.find("body")
        if not body_tag:
            raise ValueError("Couldn't find body tag in archived issue")

        if body_tag.get("bgcolor") == "#FFFFF0":
            # Target the "Abstract" links, present in every white-bg format.
            abstract_links = soup.find_all(
                "a", string=lambda t: t and "abstract" in t.strip().lower()
            )
            if not abstract_links:
                raise ValueError(
                    "Couldn't find abstract links in archived issue with white background"
                )
            for abstract_link in abstract_links:
                href = abstract_link.get("href")
                if not href:
                    continue

                xarticle = create_articledata()
                # Abstract pages are relative to the issue page's directory.
                abstract_url = "/".join(xissue.url.split("/")[0:-1]) + "/" + href
                xarticle.pid = "a" + str(article_number)
                article_number += 1
                xarticle.title_tex = "Archived article white background"
                xarticle.url = abstract_url
                xissue.articles.append(xarticle)

        if body_tag.get("bgcolor") == "#CCE6FF":
            # BUGFIX: findAll/text= are deprecated bs4 spellings; use
            # find_all/string= (identical matching behavior).
            articles_abstract_tags = soup.find_all("a", href=True, string="Abstract")
            if not articles_abstract_tags:
                raise ValueError(
                    "Couldn't find abstract links in archived issue with blue background"
                )

            for article_abstract_tag in articles_abstract_tags:
                xarticle = create_articledata()
                abstract_url = (
                    "/".join(xissue.url.split("/")[0:-1]) + "/" + article_abstract_tag.get("href")
                )
                xarticle.pid = "a" + str(article_number)
                article_number += 1
                xarticle.title_tex = "Archived article blue background"
                xarticle.url = abstract_url
                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page into ``xarticle``; returns the xarticle.

        Archived articles (recognized by the sentinel ``title_tex`` set in
        :meth:`parse_archived_issue_content`) are delegated to
        :meth:`parse_archived_article_content`.
        """
        soup = BeautifulSoup(content, "html.parser")

        # If archived/old article:
        if xarticle.title_tex in [
            "Archived article white background",
            "Archived article blue background",
        ]:
            return self.parse_archived_article_content(soup, xissue, xarticle)

        # BUGFIX: "title" was listed twice in the original field list.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "doi", "title", "pdf", "page"]
        )

        # Contributors
        authors_tag = soup.select_one("div.authors")
        if authors_tag is not None:
            for contributor in authors_tag.select("strong"):
                xarticle.contributors.append(
                    create_contributor(role="author", string_name=contributor.text)
                )

        # PDF link
        download_tag = soup.select_one("div.download")
        if download_tag is not None:
            pdf_anchor = download_tag.select_one("a")
            pdf_url = pdf_anchor.get("href") if pdf_anchor is not None else None
            if isinstance(pdf_url, str):
                add_pdf_link_to_xarticle(xarticle, pdf_url)

        # Abstract
        abstract_tag = soup.select_one("div.article-abstract")
        if abstract_tag:
            xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text)))
        return xarticle

    def parse_archived_article_content(self, soup, xissue, xarticle):
        """Parse the content of archived articles (volumes 60 to 80).

        Raises:
            ValueError: if the metadata cannot be extracted.
        """
        # BUGFIX: on extraction failure the original recursed into itself
        # with the same arguments (infinite recursion) and then read a
        # variable that was never bound. Log the error and re-raise instead.
        try:
            extracted = self.extract_archived_metadata(soup, xarticle)
        except ValueError:
            logging.exception("Error extracting metadata for archived article")
            raise

        xarticle.title_tex = extracted["title"]
        if extracted.get("authors"):
            for author in extracted["authors"]:
                xarticle.contributors.append(create_contributor(role="author", string_name=author))
        if extracted.get("abstract"):
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(extracted["abstract"]))
            )
        if extracted.get("keywords"):
            xarticle.keywords = extracted["keywords"]
        if extracted.get("pdf_url"):
            # PDFs are linked relative to the abstract page's directory.
            pdf_url = "/".join(xarticle.url.split("/")[0:-1]) + "/" + extracted["pdf_url"]
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def extract_archived_metadata_blue_bg(self, soup):
        """Extract (title, authors, pdf_url) for blue-background articles.

        Title and authors are color-coded ``<font>`` tags; the PDF is the
        link labelled "PDF".
        """
        title_tag = soup.find("font", {"color": "#A52A2A"})
        if not title_tag:
            raise ValueError("Couldn't find title in archived article with blue background")
        title = title_tag.get_text(separator=" ", strip=True)

        author_tag = soup.find("font", {"color": "#008B8B"})
        if not author_tag:
            raise ValueError("Couldn't find authors in archived article with blue background")
        authors = author_tag.get_text(strip=True)
        authors = re.split(", | and ", authors) if authors else []

        # BUGFIX: select_one only takes a CSS selector; href=/text= keyword
        # filters belong to find(). Target the link whose text is "PDF".
        pdf_url_tag = soup.find("a", href=True, string="PDF")
        if not pdf_url_tag:
            raise ValueError("Couldn't find pdf url")
        pdf_url = pdf_url_tag.get("href")

        return title, authors, pdf_url

    def extract_archived_metadata_white_bg(self, soup):
        """Extract (title, authors, pdf_url) for white-background articles.

        White-background pages exist in several HTML variants; each field is
        tried in order of decreasing specificity. The positional fallbacks
        (``[4]``/``[5]``) reflect the fixed layout of the oldest variant —
        assumed stable across those pages; TODO confirm on more samples.
        """
        title_tag = soup.find("font", {"color": "#A52A2A"})
        if not title_tag:
            title_tag = soup.select_one('span[style*="color: brown"]')
            if not title_tag:
                # BUGFIX: select() does not take an attrs dict (its second
                # positional parameter is `namespaces`); use find_all().
                title_tags = soup.find_all("font", {"color": "#a52a2a"})
                if not title_tags:
                    raise ValueError(
                        "Couldn't find title in archived article with white background"
                    )
                title_tag = title_tags[4]
        title = title_tag.get_text(separator=" ", strip=True) if title_tag else None

        author_tag = soup.find("font", {"color": "#008B8B"})
        if not author_tag:
            author_tag = soup.select_one('span[style*="color: darkcyan"]')
            if not author_tag:
                author_tags = soup.select("font")
                if not author_tags:
                    raise ValueError(
                        "Couldn't find authors in archived article with white background"
                    )
                author_tag = author_tags[5]
        authors = author_tag.get_text(strip=True) if author_tag else None
        authors = re.split(", | and ", authors) if authors else []
        authors = self.parse_authors_caps_names(authors)

        # BUGFIX: same select_one misuse as in the blue-background variant.
        pdf_url_tag = soup.find("a", href=True, string="Adobe PDF")
        if not pdf_url_tag:
            raise ValueError("Couldn't find pdf url")
        pdf_url = pdf_url_tag.get("href")

        return title, authors, pdf_url

    def get_text_until_next_section(self, tag):
        """
        For archived articles, get the text content of a section (abstract or keywords) until the next section (keywords or AMS) or the end of the document.
        """
        SECTION_KEYWORDS = ["abstract", "keyword", "ams"]
        content = []
        for sibling in tag.next_siblings:
            # A <b> tag starting with a section keyword marks the next section.
            if sibling.name == "b":
                if any(
                    sibling.get_text(strip=True).lower().startswith(k) for k in SECTION_KEYWORDS
                ):
                    break
            content.append(sibling if isinstance(sibling, str) else sibling.get_text())
        # Strip the leading ":"/"." separator and non-breaking spaces.
        return " ".join(content).strip().lstrip(":. \xa0")

    def extract_abstract_and_keywords(self, soup):
        """
        Extract abstract and keywords for archived articles, which can be in different formats and places depending on the article. The method looks for the "Abstract" section and the "Keywords" section, and extracts their content until the next section or the end of the document.
        """
        abstract = None
        keywords = []

        for tag in soup.find_all("b"):
            text = tag.get_text(strip=True).lower()
            if text.startswith("abstract"):
                abstract = self.get_text_until_next_section(tag)
            elif text.startswith("keyword"):
                raw = self.get_text_until_next_section(tag)
                keywords = [kw.strip() for kw in raw.split(";") if kw.strip()]

        return abstract, keywords

    def extract_archived_metadata(self, soup, xarticle):
        """Extract metadata for archived articles.

        Dispatches on the sentinel ``xarticle.title_tex`` set by
        :meth:`parse_archived_issue_content`.

        Returns:
            dict with keys "title", "authors", "pdf_url", "abstract",
            "keywords".

        Raises:
            ValueError: on an unrecognized sentinel or extraction failure.
        """
        background_color = xarticle.title_tex
        if background_color == "Archived article blue background":
            title, authors, pdf_url = self.extract_archived_metadata_blue_bg(soup)
        elif background_color == "Archived article white background":
            title, authors, pdf_url = self.extract_archived_metadata_white_bg(soup)
        else:
            # BUGFIX: fixed "backgroud" typo in the error message.
            raise ValueError("Unrecognized archived article background color")

        abstract, keywords = self.extract_abstract_and_keywords(soup)

        return {
            "title": title,
            "authors": authors,
            "pdf_url": pdf_url,
            "abstract": abstract,
            "keywords": keywords,
        }

    def parse_authors_caps_names(self, string_name_list):
        """Normalize ALL-CAPS family names to capitalized form.

        The last whitespace-separated token of each name is treated as the
        family name (e.g. "John SMITH" -> "John Smith").
        """
        final_string_name_list = []
        for string_name in string_name_list:
            string_name_split = string_name.split()
            # BUGFIX: skip empty/whitespace-only tokens that re.split can
            # produce; the original raised IndexError on them.
            if not string_name_split:
                continue
            family_name = string_name_split[-1]
            family_name = family_name[0].upper() + family_name[1:].lower()
            final_string_name_list.append(" ".join(string_name_split[:-1] + [family_name]))
        return final_string_name_list