Coverage for src/crawler/by_source/amc_crawler.py: 83% (124 statements)
coverage.py v7.7.0, created at 2025-04-02 15:25 +0000

import lingua
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class AmcCrawler(BaseCollectionCrawler):
    source_name = "Ars Mathematica Contemporanea website"
    source_domain = "AMC"
    source_website = "https://amc-journal.eu"
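
    # Used by detect_language() on article abstracts; detection is restricted
    # to English, French and Slovene.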
    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.SLOVENE
    ).build()

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Ars Mathematica Contemporanea and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        This website spreads its issues across several pages, so we need to crawl all of them.
        """
        xissues = []

        soup = BeautifulSoup(content, "html.parser")
        self.parse_one_issues_page(content, xissues)
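        # The issues listing is paginated; follow the <a class="next"> link until it disappears.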
        next_button = soup.select_one("a.next")

        while next_button:
            url = next_button.get("href")
            if not isinstance(url, str):  # coverage: branch never taken
                next_button = None
                continue
            content = self.download_file(url)
            soup = BeautifulSoup(content, "html.parser")
            self.parse_one_issues_page(content, xissues)
            next_button = soup.select_one("a.next")
        return xissues
    def parse_one_issues_page(self, content, xissues):
        soup = BeautifulSoup(content, "html.parser")

        # Extract the list of issues
        issue_nodes = soup.find_all("h2")

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                text = issue_link_node.get_text().strip()
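                # Issue headings are expected to look like "Vol. 12 No. 3 (2017)"
                # (values illustrative): strip "Vol. ", split on "No." and "(",
                # and take the first four characters after "(" as the year.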
                if text.find("Vol.") == 0:
                    text = text[5:]
                    parts = text.split("No.")
                    volume = parts[0].strip()
                    parts = parts[1].split("(")
                    number = parts[0].strip()
                    year = parts[1][0:4]

                    xissue = create_issuedata()
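                    # e.g. year "2017", volume "12", number "3" give the pid
                    # "<collection_id>_2017__12_3" (values illustrative).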
                    xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
                    xissue.year = year
                    xissue.volume = volume
                    xissue.number = number
                    xissue.url = url

                    xissues.append(xissue)
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h3", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: branch always taken
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url

                meta_node = article_node.find_next_sibling("div")
                if meta_node:  # coverage: branch always taken
                    pages_node = meta_node.find("div", {"class": "pages"})
                    if pages_node is not None:
                        text = pages_node.get_text()
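
                        # The "pages" div holds either an article number plus a page
                        # count (e.g. "1.05, 12 pp.") or a page range (e.g. "123-145");
                        # both examples are illustrative.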
88 if "," in text and "pp" in text: 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true
89 parts = text.split(",")
90 number_parts = parts[0].split(".")
91 if len(number_parts) == 2:
92 xarticle.article_number = number_parts[1].strip()
94 text = parts[1].split("pp")[0].strip()
95 xarticle.counts.append(("page-count", text))
96 elif "-" in text: 96 ↛ 101line 96 didn't jump to line 101 because the condition on line 96 was always true
97 parts = text.split("-")
98 xarticle.fpage = parts[0].strip()
99 xarticle.lpage = parts[1].strip()
101 xissue.articles.append(xarticle)
    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.select_one("h1.page_title")
        if title_node:  # coverage: branch always taken
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
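        # Each author appears as a <span class="name"> inside the <ul class="authors"> list.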
        authors_node = soup.select_one("ul.authors")
        if authors_node and isinstance(authors_node, Tag):  # coverage: branch always taken
            span_nodes = authors_node.find_all("span", {"class": "name"})
            for span_node in span_nodes:
                text = span_node.get_text().strip()

                author = create_contributor(role="author", string_name=text)

                xarticle.contributors.append(author)

        # DOI
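        # The DOI is taken from the href of the link inside section.item.doi,
        # starting at the first "10." (e.g. "https://doi.org/10.xxxx/..." gives
        # "10.xxxx/...", prefix illustrative); the pid is derived from it.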
        doi_node = soup.select_one("section.item.doi")
        if doi_node:  # coverage: branch always taken
            doi_node = doi_node.find("a")
            if doi_node and isinstance(doi_node, Tag):  # coverage: branch always taken
                url = doi_node.get("href")
                if isinstance(url, str):  # coverage: branch always taken
                    pos = url.find("10.")
                    if pos > 0:  # coverage: branch always taken
                        doi = url[pos:]
                        xarticle.doi = doi
                        xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        # KEYWORDS
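        # Keywords are given as one comma-separated string inside span.value;
        # each keyword inherits the article language.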
        kwds_node = soup.select_one("section.item.keywords")
        if kwds_node:  # coverage: branch always taken
            span_node = kwds_node.select_one("span.value")
            if span_node and not isinstance(span_node, int):  # coverage: branch always taken
                text = span_node.get_text().strip()
                for kwd in text.split(", "):
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = xarticle.lang
                    xarticle.kwds.append(subject)

        # ABSTRACT
        abstract_node = soup.select_one("section.item.abstract")
        if abstract_node:  # coverage: branch always taken
            text = abstract_node.get_text().strip()
            if text.find("Abstract") == 0:  # coverage: branch always taken
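                # Drop the leading "Abstract" label (and the character that follows it)
                # from the extracted text.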
                text = text[9:]
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": text,
                        "value_xml": "",
                        "lang": self.detect_language(text),
                    }
                )

        # PDF
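        # The PDF link appears to be an OJS-style <a class="obj_galley_link pdf"> galley anchor.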
        pdf_node = soup.select_one("a.obj_galley_link.pdf")
        if pdf_node and isinstance(pdf_node, Tag):  # coverage: branch always taken
            pdf_url = pdf_node.get("href")
            if isinstance(pdf_url, list):  # coverage: branch never taken
                raise ValueError("pdf_url is a list")
            if pdf_url is None:  # coverage: branch never taken
                raise ValueError("pdf_url not found")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle