Coverage for src/crawler/by_source/csis_crawler.py: 69% (138 statements)
1"""
2This source has invalid DOIs in some article.
3For now, those are ignored in order to be able to crawl the collection.
4"""

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
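    # e.g. a hypothetical heading "Volume 21, Issue 2 (June 2024)" would yield
    # volume="21", number="2", year="2024" (see the sketch at the end of the file).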

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
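            # TEMPPID is a placeholder; parse_issue_content later replaces it
            # with a stable pid built from year, volume and number.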
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_search = regex.search(self.issue_re, title_tag.text)
        if not title_search:
            raise ValueError("Couldn't parse issue title")
        title_group = title_search.groupdict()

        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )
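        # Presumably yields pids like "CSIS_2012_9_3" (cf. the hard-coded
        # article pids referenced in parse_article_content).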

        for index, article_tag in enumerate(content.select("p")):
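            # A <p> with a single child carries no article link (likely a
            # spacer or plain-text entry); skip it.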
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
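            # The pid is the article's index within the issue; the framework
            # appears to combine it with the issue pid (e.g. "CSIS_2012_9_3_a13").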
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
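            # Presumably a workaround for broken title markup on this one page.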
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

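        # The rest of the page alternates <h3> section headers ("Abstract",
        # "Full text", "Key words", ...) with <p> bodies; collect them into a
        # header -> paragraph map for the extraction steps below.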
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping PDF")

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
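

# A minimal, self-contained sketch (not part of the crawler): it exercises
# issue_re against a hypothetical issue heading to show the captured groups.
# The sample string is an assumption modeled on the pattern, not taken from
# the live site.
if __name__ == "__main__":
    sample = "Volume 21, Issue 2 (June 2024)"
    match = regex.search(CsisCrawler.issue_re, sample)
    assert match is not None
    print(match.groupdict())  # {'volume': '21', 'number': '2', 'year': '2024'}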