Coverage for src / crawler / by_source / csis_crawler.py: 69%
135 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
"""
This source has invalid DOIs in some articles.
For now, those are ignored in order to be able to crawl the collection.
"""
6from urllib.parse import urljoin
8from bs4 import BeautifulSoup, Tag
9from ptf.model_data import (
10 ContributorDict,
11 create_abstract,
12 create_articledata,
13 create_contributor,
14 create_subj,
15)
17from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler
18from crawler.crawler_utils import create_xissue
19from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict
class CsisCrawler(ThreadedCrawler):
    """Crawler for the "Computer Science and Information Systems" (ComSIS) journal.

    NOTE: this source publishes invalid DOIs on some articles, so DOI parsing
    is currently disabled (see the commented-out section in
    ``parse_article_content``) to keep the collection crawlable.
    """

    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    # Collection page, issue link text, e.g. "Number 2, June 2023"
    issue_browse_re = r"Number (?P<number>\d+), \w+ (?P<year>\d+)"
    # Collection page, volume heading, e.g. "Volume 20 (2023)"
    volume_browse_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Issue page heading, e.g. "Volume 20, Issue 2 (June 2023)"
    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"

    def parse_collection_content(self, content):
        """Parse the collection index page.

        Each ``<p>`` directly under ``#content`` describes one issue; return
        one xissue per such paragraph.
        """
        soup = BeautifulSoup(content, "html.parser")
        # FIX: the previous loop enumerated an unused index; build the list directly.
        return [self.parse_col_issue_tag(tag) for tag in soup.select("#content > p")]

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        """Build an xissue from one issue paragraph of the collection page.

        The issue link (``a.hidden``) carries the issue URL and the
        "Number N, Month YYYY" text; the volume number comes from the closest
        preceding ``<h3>`` heading.

        Raises:
            ValueError: if the issue link, its href, or the volume heading
                cannot be found.
        """
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")

        # The volume for this issue is the nearest "Volume N (YYYY)" heading above it.
        volume_tag = col_issue_tag.findPrevious("h3")
        if not volume_tag or not volume_tag.text.startswith("Volume"):
            raise ValueError("Could not find volume tag")

        volume_group = regex_to_dict(self.volume_browse_re, volume_tag.text)
        title_group = regex_to_dict(self.issue_browse_re, issue_title.text)

        # FIX: removed a dead re-assignment of issue_title (select_one("a"))
        # that immediately preceded the return and was never used.
        return create_xissue(
            self.collection_id,
            url=urljoin(self.source_website, issue_href),
            year=title_group["year"],
            volume_number=volume_group["volume"],
            issue_number=title_group["number"],
        )

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one xarticle per article paragraph.

        Article pids are ``"a" + index`` where index enumerates ALL ``<p>``
        tags (including skipped ones), preserving the historical pid numbering.
        """
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        for index, article_tag in enumerate(content.select("p")):
            # Paragraphs with a single child are filler text, not article entries.
            if len(article_tag.contents) == 1:
                continue
            # Editorials are not crawled as regular articles.
            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse one article page: title, authors (with optional ORCID),
        abstract, PDF link and keywords.

        DOI parsing is intentionally disabled (invalid DOIs upstream);
        see the commented-out section below.

        Raises:
            ValueError: when a required element (content, title, authors,
                PDF link) is missing or malformed.
        """
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        # One article has a broken title on the website; hard-code it.
        if xarticle.pid == "CSIS_2012_9_3_a13":
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors: the .authors tag interleaves text nodes (names) with
        # <sup> (affiliation markers, ignored) and <a> (ORCID links) tags.
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                # Names are separated by ", " and the last one by "and ".
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                # ORCID link attached to the preceding author name.
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    self.logger.warning(
                        "Couldn't parse contributor orcid.",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    # FIX: corrected "ocrid" typos in the warning message.
                    self.logger.warning(
                        "Couldn't parse contributor orcid : orcid must start with https://orcid.org/",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations (ordered list) are not parsed; drop them so they don't
        # interfere with the h3/p section scan below.
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        # Remaining content is a sequence of <h3> headers each followed by a
        # <p> body; collect them into a header -> paragraph mapping.
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.findChildren(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:
            xarticle.abstracts.append(
                create_abstract(value_tex=categories["Abstract"].text, lang="en")
            )

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            self.logger.debug("No PDF Found", extra={"pid": xarticle.pid})

        # DOI
        # TODO : contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # Known articles with broken DOIs on the website:
        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle