Coverage for src/crawler/by_source/csis_crawler.py: 69%
134 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
1"""
2This source has invalid DOIs in some article.
3For now, those are ignored in order to be able to crawl the collection.
4"""

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
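    # Illustrative only (assumed example string, matching the regex above):
    #   "Volume 21, Issue 2 (June 2024)" -> volume="21", number="2", year="2024"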

    def parse_collection_content(self, content):
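        """Parse the collection page: one issue per <p> tag under #content.

        Issues get a temporary pid here; the definitive pid is derived in
        parse_issue_content once volume/number/year are known.
        """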
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
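        """Build an issue stub (URL only) from a collection-page <p> tag."""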
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
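        """Parse an issue page: read volume/number/year from the <h1> title,
        derive the definitive issue pid, and collect article stubs.
        """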
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )
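
        # Each remaining <p> under #content describes one article;
        # single-child paragraphs (plain text) and the editorial are skipped.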
        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
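        """Parse an article page: title, authors (with optional ORCID),
        abstract, keywords and PDF link. DOIs are ignored for now (see the
        module docstring).
        """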
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
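        # .authors mixes text nodes (author names, separated by commas/"and")
        # with child tags: <sup> affiliation markers (skipped) and <a> ORCID
        # links attached to the preceding author.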
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()
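
        # Pair each <h3> section header with the <p> that follows it,
        # building a section-name -> tag mapping ("Abstract", "Full text", ...).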
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping pdf")

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
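

# Minimal usage sketch. Hypothetical: the constructor arguments and the crawl
# entry point below are assumptions for illustration, not the actual
# BaseCollectionCrawler API, which is not shown in this file:
#
#   crawler = CsisCrawler(collection_id="CSIS", collection_url="http://www.comsis.org/")
#   crawler.crawl_collection()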