Coverage for src/crawler/by_source/csis_crawler.py: 69%
134 statements
1"""
2This source has invalid DOIs in some article.
3For now, those are ignored in order to be able to crawl the collection.
4"""
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
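
    # For reference, issue_re matches issue headings of the form
    # "Volume <n>, Issue <n> (<Month> <year>)". A quick doctest-style sketch
    # (the sample heading is illustrative, not taken from the site):
    #
    #   >>> import re
    #   >>> re.match(CsisCrawler.issue_re, "Volume 20, Issue 4 (October 2023)").groupdict()
    #   {'volume': '20', 'number': '4', 'year': '2023'}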

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            # Temporary pid; the definitive issue pid is computed from the
            # issue page in parse_issue_content.
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    self.logger.warning(
                        "Couldn't parse contributor orcid.",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    self.logger.warning(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()
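
        # Expected shape of the .authors block parsed above (illustrative
        # markup, inferred from the parsing logic rather than verified against
        # the site):
        #
        #   <p class="authors">Jane Doe<sup>1</sup>, John Smith<sup>2</sup> and
        #   Ada Lovelace<sup>1</sup><a href="https://orcid.org/0000-0001-2345-6789">…</a></p>
        #
        # Plain string children become contributors (after stripping the ", "
        # and "and " separators), <sup> tags carry affiliation indices and are
        # skipped, and an <a> tag attaches an ORCID to the preceding contributor.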

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header
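
        # At this point categories maps each <h3> section header to its <p>
        # body, e.g. {"Abstract": <p>, "Full text": <p>, "Key words": <p>};
        # the exact set of headers varies per article, and only the ones
        # consumed below are handled.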

        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            self.logger.debug("No PDF Found", extra={"pid": xarticle.pid})

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
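
# Minimal usage sketch. How a crawler instance is constructed and how pages
# are fetched lives in BaseCollectionCrawler, which is not shown here, so the
# constructor arguments and the download helper below are hypothetical:
#
#   crawler = CsisCrawler(collection_id="CSIS", collection_url="http://www.comsis.org/archive.php")
#   html = crawler.download_file(crawler.collection_url)
#   for xissue in crawler.parse_collection_content(html):
#       ...  # the base crawler then visits xissue.url and each xarticle.url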