Coverage for src/crawler/by_source/amc_crawler.py: 83%
124 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1import lingua
2from bs4 import BeautifulSoup, Tag
3from lingua import LanguageDetectorBuilder
4from ptf.model_data import (
5 create_abstract,
6 create_articledata,
7 create_contributor,
8 create_issuedata,
9 create_subj,
10)
12from crawler.base_crawler import BaseCollectionCrawler
13from crawler.utils import add_pdf_link_to_xarticle
class AmcCrawler(BaseCollectionCrawler):
    """Crawler for the Ars Mathematica Contemporanea (AMC) journal website."""

    source_name = "Ars Mathematica Contemporanea website"
    source_domain = "AMC"
    source_website = "https://amc-journal.eu"

    # AMC publishes in English, French and Slovene; restricting the detector
    # to those languages improves detection accuracy on short abstracts.
    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.SLOVENE
    ).build()

    def parse_collection_content(self, content):
        """
        Parse the HTML archive page of Ars Mathematica Contemporanea and
        return a list of xissues.

        Each xissue carries its volume/number/year metadata plus its url.
        The archive is paginated, so every "next" page is fetched and parsed
        until no `a.next` link remains.

        :param content: HTML of the first archive page.
        :return: list of issue data objects.
        """
        xissues = []

        soup = BeautifulSoup(content, "html.parser")
        self.parse_one_issues_page(content, xissues)
        next_button = soup.select_one("a.next")

        while next_button:
            url = next_button.get("href")
            if not isinstance(url, str):
                # Missing or multi-valued href: stop paginating.
                break
            content = self.download_file(url)
            soup = BeautifulSoup(content, "html.parser")
            self.parse_one_issues_page(content, xissues)
            next_button = soup.select_one("a.next")
        return xissues

    def parse_one_issues_page(self, content, xissues):
        """
        Parse one archive page and append its issues to *xissues* (in place).

        Issue headings look like ``Vol. 23 No. 4 (2023)``; volume, number and
        year are extracted from that text.

        :param content: HTML of one archive page.
        :param xissues: list accumulating issue data across pages.
        """
        soup = BeautifulSoup(content, "html.parser")

        # Each issue is an <h2> heading wrapping a link to the issue page.
        issue_nodes = soup.find_all("h2")

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if not issue_link_node:
                continue
            url = issue_link_node.get("href")
            text = issue_link_node.get_text().strip()
            if text.find("Vol.") != 0:
                continue

            text = text[5:]  # drop the leading "Vol. "
            parts = text.split("No.")
            if len(parts) < 2 or "(" not in parts[1]:
                # Heading does not match "Vol. V No. N (YYYY)": skip it
                # instead of raising IndexError on a malformed entry.
                continue
            volume = parts[0].strip()
            parts = parts[1].split("(")
            number = parts[0].strip()
            year = parts[1][0:4]

            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url

            xissues.append(xissue)

    def parse_issue_content(self, content, xissue):
        """
        Parse an issue's table of contents and append its articles to *xissue*.

        For each article, the url and a provisional pid ("a0", "a1", ...) are
        set; page information is read from the sibling ``div.pages`` node,
        which is either "id, N pp." (article number + page count) or
        "fpage-lpage".

        :param content: HTML of the issue page.
        :param xissue: issue data object whose ``articles`` list is filled.
        """
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h3", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if not article_link_node:
                continue
            url = article_link_node.get("href")
            xarticle = create_articledata()
            # Provisional pid; replaced by a DOI-based pid in
            # parse_article_content when a DOI is found.
            xarticle.pid = "a" + str(index_article)
            xarticle.url = url

            meta_node = article_node.find_next_sibling("div")
            if meta_node:
                pages_node = meta_node.find("div", {"class": "pages"})
                if pages_node is not None:
                    text = pages_node.get_text()

                    if "," in text and "pp" in text:
                        # Format "#id, N pp.": article number + page count.
                        parts = text.split(",")
                        number_parts = parts[0].split(".")
                        if len(number_parts) == 2:
                            xarticle.article_number = number_parts[1].strip()

                        text = parts[1].split("pp")[0].strip()
                        xarticle.counts.append(("page-count", text))
                    elif "-" in text:
                        # Format "fpage-lpage".
                        parts = text.split("-")
                        xarticle.fpage = parts[0].strip()
                        xarticle.lpage = parts[1].strip()

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse an article landing page with BeautifulSoup and return the
        enriched ArticleData.

        Extracts title, authors, DOI (which also becomes the pid), keywords,
        abstract (language-detected) and the PDF link.

        :param content: HTML of the article page.
        :param xissue: enclosing issue data (unused here, kept for interface).
        :param xarticle: article data object to fill.
        :param url: url of the article page (unused here, kept for interface).
        :raises ValueError: if the PDF link node has no usable href.
        :return: the filled xarticle.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.select_one("h1.page_title")
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        authors_node = soup.select_one("ul.authors")
        if authors_node and isinstance(authors_node, Tag):
            span_nodes = authors_node.find_all("span", {"class": "name"})
            for span_node in span_nodes:
                text = span_node.get_text().strip()
                author = create_contributor(role="author", string_name=text)
                xarticle.contributors.append(author)

        # DOI
        doi_node = soup.select_one("section.item.doi")
        if doi_node:
            doi_node = doi_node.find("a")
            if doi_node and isinstance(doi_node, Tag):
                doi_url = doi_node.get("href")
                if isinstance(doi_url, str):
                    # All DOIs start with the "10." directory indicator.
                    # Use != -1 (not > 0) so a bare-DOI href such as
                    # "10.26493/..." is not silently missed.
                    pos = doi_url.find("10.")
                    if pos != -1:
                        doi = doi_url[pos:]
                        xarticle.doi = doi
                        xarticle.pid = (
                            doi.replace("/", "_").replace(".", "_").replace("-", "_")
                        )

        # KEYWORDS
        kwds_node = soup.select_one("section.item.keywords")
        if kwds_node:
            span_node = kwds_node.select_one("span.value")
            if span_node:
                text = span_node.get_text().strip()
                for kwd in text.split(", "):
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = xarticle.lang
                    xarticle.kwds.append(subject)

        # ABSTRACT
        abstract_node = soup.select_one("section.item.abstract")
        if abstract_node:
            text = abstract_node.get_text().strip()
            if text.find("Abstract") == 0:
                # Drop the "Abstract" label plus the separator character
                # that follows it (9 chars total).
                text = text[9:]
                xarticle.abstracts.append(
                    create_abstract(lang=self.detect_language(text), value_tex=text)
                )

        # PDF
        pdf_node = soup.select_one("a.obj_galley_link.pdf")
        if pdf_node and isinstance(pdf_node, Tag):
            pdf_url = pdf_node.get("href")
            if isinstance(pdf_url, list):
                raise ValueError("pdf_url is a list")
            if pdf_url is None:
                raise ValueError("pdf_url not found")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle