Coverage for src/crawler/by_source/csis_crawler.py: 69%
139 statements
1"""
2This source has invalid DOIs in some article.
3For now, those are ignored in order to be able to crawl the collection.
4"""

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"
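
    # Issue headings look like "Volume 15, Issue 2 (June 2018)"; the named
    # groups feed get_issue_pid in parse_issue_content.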
    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"

    def parse_collection_content(self, content):
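        # One <p> under #content per issue; a temporary pid is assigned here
        # and replaced by the real one once the issue page is parsed.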
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
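        # Each issue paragraph links to its issue page through a hidden anchor.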
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
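        # Volume, number and year come from the issue page's <h1> heading,
        # matched against issue_re.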
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_search = regex.search(self.issue_re, title_tag.text)
        if not title_search:
            raise ValueError("Couldn't parse issue title")
        title_group = title_search.groupdict()

        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

        for index, article_tag in enumerate(content.select("p")):
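            # A <p> with a single child node carries no article link
            # (likely a heading or filler); skip it.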
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
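        # Article pages keep everything under #content: a p.id header, .title,
        # .authors, an <ol> of affiliations, then <h3>-delimited sections.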
        xarticle.pid = pid

        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
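        # The title of CSIS_2012_9_3_a13 is hardcoded, presumably because its
        # page markup cannot be parsed like the others.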
105 if xarticle.pid == "CSIS_2012_9_3_a13": 105 ↛ 106line 105 didn't jump to line 106 because the condition on line 105 was never true
106 xarticle.title_tex = "Modeling a Holonic Agent based Solution"
107 else:
108 title_tag = content.select_one(".title")
109 if not title_tag: 109 ↛ 110line 109 didn't jump to line 110 because the condition on line 109 was never true
110 raise ValueError("Couldn't find title")
111 xarticle.title_tex = title_tag.text
112 title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
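        # Author names arrive as bare text nodes, interleaved with <sup>
        # affiliation markers and <a> ORCID links that attach to the last
        # contributor seen.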
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
147 current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
148 authors_tag.decompose()

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()
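
        # The remaining body alternates <h3> section headers ("Abstract",
        # "Full text", "Key words", ...) with <p> content; map each header
        # to its paragraph.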
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.findChildren(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping pdf")

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle