Coverage for src/crawler/by_source/csis_crawler.py: 69%
134 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
1"""
2This source has invalid DOIs in some article.
3For now, those are ignored in order to be able to crawl the collection.
4"""
6from urllib.parse import urljoin
8from bs4 import BeautifulSoup, Tag
9from ptf.model_data import (
10 ContributorDict,
11 create_abstract,
12 create_articledata,
13 create_contributor,
14 create_issuedata,
15 create_subj,
16)
18from crawler.base_crawler import BaseCollectionCrawler
19from crawler.crawler_utils import get_issue_pid
20from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict
23class CsisCrawler(BaseCollectionCrawler):
24 source_name = "Computer Science and Information Systems website"
25 source_domain = "CSIS"
26 source_website = "http://www.comsis.org/"
28 issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
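    # A sample title the regex above should match (example string is an
    # assumption, not taken from the live site):
    #   "Volume 19, Issue 2 (June 2022)" -> volume="19", number="2", year="2022"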

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            # Temporary pid; the definitive pid is assigned in
            # parse_issue_content once volume, number and year are known.
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )
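        # get_issue_pid presumably builds pids of the form "CSIS_2012_9_3"
        # (inferred from the hardcoded article pids below, not verified here).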

        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        # Author names arrive as bare text nodes; <sup> tags mark affiliations
        # and <a> tags may carry ORCID links.
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    self.logger.warning(
                        "Couldn't parse contributor orcid.",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    self.logger.warning(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/",
                        extra={"pid": xarticle.pid},
                    )
                    continue
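                # e.g. "https://orcid.org/0000-0002-1825-0097" becomes
                # "0000-0002-1825-0097" (sample iD for illustration only).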
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.findChildren(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header
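        # `categories` now maps each h3 header to its following <p> tag, e.g.
        # {"Abstract": <p>, "Full text": <p>, "Key words": <p>} (keys observed
        # in the handling below; the site may use other headers too).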

        # Abstract
        if "Abstract" in categories:
            xarticle.abstracts.append(
                create_abstract(value_tex=categories["Abstract"].text, lang="en")
            )

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            self.logger.debug("No PDF Found", extra={"pid": xarticle.pid})

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle