Coverage for src/crawler/by_source/amc_crawler.py: 82%
119 statements
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class AmcCrawler(BaseCollectionCrawler):
    source_domain = "AMC"
    source_name = "Ars Mathematica Contemporanea website"
    source_website = "https://amc-journal.eu"
    periode_begin = 2009
    periode_end = 2024
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        # self.collection_id = "AM"
        # self.collection_url = "https://annals.math.princeton.edu"

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()
    def parse_collection_content(self, content):
        """
        Parse the HTML page of Ars Mathematica Contemporanea and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        The website spreads its issues across multiple pages, so we crawl all of them.
        """
        xissues = []
        self.parse_one_issues_page(content, xissues)
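
        # The issue archive is split across several pages; fetch and parse the second one as well.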
        url = self.collection_url + "/2"
        content = self.download_file(url)
        self.parse_one_issues_page(content, xissues)

        return xissues

    def parse_one_issues_page(self, content, xissues):
        soup = BeautifulSoup(content, "html.parser")

        # Extract the list of issues
        issue_nodes = soup.find_all("h2")

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                text = issue_link_node.get_text().strip()
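                # Issue headings follow the pattern "Vol. <volume> No. <number> (<year>...)";
                # strip the "Vol. " prefix, then split on "No." and "(" to recover the metadata.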
                if text.find("Vol.") == 0:
                    text = text[5:]
                    parts = text.split("No.")
                    volume = parts[0].strip()
                    parts = parts[1].split("(")
                    number = parts[0].strip()
                    year = parts[1][0:4]

                    xissue = create_issuedata()
                    xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
                    xissue.year = year
                    xissue.volume = volume
                    xissue.number = number
                    xissue.url = url

                    xissues.append(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h3", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url

                meta_node = article_node.find_next_sibling("div")
                if meta_node:
                    pages_node = meta_node.find("div", {"class": "pages"})
                    if pages_node is not None:
                        text = pages_node.get_text()
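
                        # The pages string is either "<x>.<n>, <count> pp." (article number
                        # plus page count) or a plain "<fpage>-<lpage>" range.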
85 if "," in text and "pp" in text: 85 ↛ 86line 85 didn't jump to line 86 because the condition on line 85 was never true
86 parts = text.split(",")
87 number_parts = parts[0].split(".")
88 if len(number_parts) == 2:
89 xarticle.article_number = number_parts[1].strip()
91 text = parts[1].split("pp")[0].strip()
92 xarticle.counts.append(("page-count", text))
93 elif "-" in text: 93 ↛ 98line 93 didn't jump to line 98 because the condition on line 93 was always true
94 parts = text.split("-")
95 xarticle.fpage = parts[0].strip()
96 xarticle.lpage = parts[1].strip()
98 xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.find("h1", {"class": "page_title"})
        if title_node:
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        authors_node = soup.find("ul", {"class": "authors"})
        if authors_node and isinstance(authors_node, Tag):
            span_nodes = authors_node.find_all("span", {"class": "name"})
            for span_node in span_nodes:
                text = span_node.get_text().strip()

                author = create_contributor(role="author", string_name=text)

                xarticle.contributors.append(author)

        # DOI
        doi_node = soup.find("section", {"class": "item doi"})
        if doi_node:
            doi_node = doi_node.find("a")
            if doi_node and isinstance(doi_node, Tag):
                url = doi_node.get("href")
                if isinstance(url, str):
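                    # A DOI always starts with "10."; extract it from the link target
                    # and derive a storage-safe pid from it.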
                    pos = url.find("10.")
                    if pos > 0:
                        doi = url[pos:]
                        xarticle.doi = doi
                        xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        # KEYWORDS
        kwds_node = soup.find("section", {"class": "item keywords"})
        if kwds_node:
            span_node = kwds_node.find("span", {"class": "value"})
            if span_node and not isinstance(span_node, int):
                text = span_node.get_text().strip()
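                # Keywords come as a single comma-separated string.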
                for kwd in text.split(", "):
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

        # ABSTRACT
        abstract_node = soup.find("section", {"class": "item abstract"})
        if abstract_node:
            text = abstract_node.get_text().strip()
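            # Drop the leading "Abstract" label and the separator character that follows it.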
            if text.find("Abstract") == 0:
                text = text[9:]
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": text,
                        "value_xml": "",
                        "lang": "en",
                    }
                )

        # PDF
        pdf_node = soup.find("a", {"class": "obj_galley_link pdf"})
        if pdf_node and isinstance(pdf_node, Tag):
            pdf_url = pdf_node.get("href")
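            # Tag.get() can return a list for multi-valued attributes, or None when
            # the attribute is missing; neither is a valid href, so fail loudly.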
            if isinstance(pdf_url, list):
                raise ValueError("pdf_url is a list")
            if pdf_url is None:
                raise ValueError("pdf_url not found")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle