Coverage for src / crawler / by_source / csis_crawler.py: 69%

135 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1""" 

2This source has invalid DOIs in some articles. 

3For now, those are ignored in order to be able to crawl the collection. 

4""" 

5 

6from urllib.parse import urljoin 

7 

8from bs4 import BeautifulSoup, Tag 

9from ptf.model_data import ( 

10 ContributorDict, 

11 create_abstract, 

12 create_articledata, 

13 create_contributor, 

14 create_subj, 

15) 

16 

17from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler 

18from crawler.crawler_utils import create_xissue 

19from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict 

20 

21 

22class CsisCrawler(ThreadedCrawler): 

23 source_name = "Computer Science and Information Systems website" 

24 source_domain = "CSIS" 

25 source_website = "http://www.comsis.org/" 

26 

27 issue_browse_re = r"Number (?P<number>\d+), \w+ (?P<year>\d+)" 

28 volume_browse_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)" 

29 issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)" 

30 

31 def parse_collection_content(self, content): 

32 xissues = [] 

33 soup = BeautifulSoup(content, "html.parser") 

34 col_issue_tags = soup.select("#content > p") 

35 for index, tag in enumerate(col_issue_tags): 

36 xissue = self.parse_col_issue_tag(tag) 

37 xissues.append(xissue) 

38 return xissues 

39 

40 def parse_col_issue_tag(self, col_issue_tag: Tag): 

41 issue_title = col_issue_tag.select_one("a.hidden") 

42 if not issue_title: 42 ↛ 43line 42 didn't jump to line 43 because the condition on line 42 was never true

43 raise ValueError("Couldn't parse issue link") 

44 issue_href = issue_title.get("href") 

45 if not isinstance(issue_href, str): 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true

46 raise ValueError("Couldn't parse issue href") 

47 

48 volume_tag = col_issue_tag.findPrevious("h3") 

49 if not volume_tag or not volume_tag.text.startswith("Volume"): 49 ↛ 50line 49 didn't jump to line 50 because the condition on line 49 was never true

50 raise ValueError("Could not find volume tag") 

51 

52 volume_group = regex_to_dict(self.volume_browse_re, volume_tag.text) 

53 title_group = regex_to_dict(self.issue_browse_re, issue_title.text) 

54 

55 xissue = create_xissue( 

56 self.collection_id, 

57 url=urljoin(self.source_website, issue_href), 

58 year=title_group["year"], 

59 volume_number=volume_group["volume"], 

60 issue_number=title_group["number"], 

61 ) 

62 issue_title = col_issue_tag.select_one("a") 

63 

64 return xissue 

65 

66 def parse_issue_content(self, content, xissue): 

67 soup = BeautifulSoup(content, "html.parser") 

68 

69 content = soup.select_one("#content") 

70 if not content: 70 ↛ 71line 70 didn't jump to line 71 because the condition on line 70 was never true

71 raise ValueError("Couldn't find issue content") 

72 title_tag = content.select_one("h1") 

73 if not title_tag: 73 ↛ 74line 73 didn't jump to line 74 because the condition on line 73 was never true

74 raise ValueError("Couldn't find issue title") 

75 

76 for index, article_tag in enumerate(content.select("p")): 

77 if len(article_tag.contents) == 1: 

78 continue 

79 

80 if article_tag.text == "Editorial": 80 ↛ 81line 80 didn't jump to line 81 because the condition on line 80 was never true

81 continue 

82 

83 article_title = article_tag.select_one("a.hidden") 

84 if not article_title: 84 ↛ 85line 84 didn't jump to line 85 because the condition on line 84 was never true

85 raise ValueError("Couldn't parse article link") 

86 article_href = article_title.get("href") 

87 if not isinstance(article_href, str): 87 ↛ 88line 87 didn't jump to line 88 because the condition on line 87 was never true

88 raise ValueError("Couldn't parse article href") 

89 

90 xarticle = create_articledata() 

91 xarticle.url = urljoin(self.source_website, article_href) 

92 xarticle.pid = "a" + str(index) 

93 xissue.articles.append(xarticle) 

94 

95 def parse_article_content(self, content, xissue, xarticle, url): 

96 soup = BeautifulSoup(content, "html.parser") 

97 content = soup.select_one("#content") 

98 if not content: 98 ↛ 99line 98 didn't jump to line 99 because the condition on line 98 was never true

99 raise ValueError("Couldn't parse article content") 

100 id_tag = content.select_one("p.id") 

101 if id_tag: 101 ↛ 105line 101 didn't jump to line 105 because the condition on line 101 was always true

102 id_tag.decompose() 

103 

104 # Title 

105 if xarticle.pid == "CSIS_2012_9_3_a13": 105 ↛ 106line 105 didn't jump to line 106 because the condition on line 105 was never true

106 xarticle.title_tex = "Modeling a Holonic Agent based Solution" 

107 else: 

108 title_tag = content.select_one(".title") 

109 if not title_tag: 109 ↛ 110line 109 didn't jump to line 110 because the condition on line 109 was never true

110 raise ValueError("Couldn't find title") 

111 xarticle.title_tex = title_tag.text 

112 title_tag.decompose() 

113 

114 # Authors 

115 authors_tag = content.select_one(".authors") 

116 if not authors_tag: 116 ↛ 117line 116 didn't jump to line 117 because the condition on line 116 was never true

117 raise ValueError("Couldn't find authors") 

118 current_contributor: ContributorDict | None = None 

119 for c in authors_tag.children: 

120 if isinstance(c, str): 

121 author_str = cleanup_str(c) 

122 if author_str == "": 122 ↛ 123line 122 didn't jump to line 123 because the condition on line 122 was never true

123 continue 

124 author_str = author_str.removeprefix(", ").removeprefix("and ").strip() 

125 current_contributor = create_contributor(role="author", string_name=author_str) 

126 xarticle.contributors.append(current_contributor) 

127 continue 

128 

129 if not isinstance(c, Tag): 129 ↛ 130line 129 didn't jump to line 130 because the condition on line 129 was never true

130 continue 

131 if not current_contributor: 131 ↛ 132line 131 didn't jump to line 132 because the condition on line 131 was never true

132 raise ValueError("Couldn't find author") 

133 

134 if c.name == "sup": 134 ↛ 137line 134 didn't jump to line 137 because the condition on line 134 was always true

135 # affiliations 

136 continue 

137 if c.name == "a": 

138 orcid_href = c.get("href") 

139 if not isinstance(orcid_href, str): 

140 self.logger.warning( 

141 "Couldn't parse contributor orcid.", 

142 extra={"pid": xarticle.pid}, 

143 ) 

144 continue 

145 if not orcid_href.startswith("https://orcid.org/"): 

146 self.logger.warning( 

147 "Couldn't parse contributor ocrid : ocrid must start with https://orcid.org/", 

148 extra={"pid": xarticle.pid}, 

149 ) 

150 continue 

151 current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/") 

152 authors_tag.decompose() 

153 

154 # Affiliations 

155 affiliations_tag = content.select_one("ol") 

156 if affiliations_tag: 156 ↛ 159line 156 didn't jump to line 159 because the condition on line 156 was always true

157 affiliations_tag.decompose() 

158 

159 current_header: str | None = None 

160 categories: dict[str, Tag] = {} 

161 for tag in content.findChildren(recursive=False): 

162 if tag.name == "h3": 

163 current_header = tag.text 

164 continue 

165 if tag.name == "p": 165 ↛ 170line 165 didn't jump to line 170 because the condition on line 165 was always true

166 if current_header is None: 166 ↛ 167line 166 didn't jump to line 167 because the condition on line 166 was never true

167 raise ValueError("Couldn't parse article content") 

168 categories[current_header] = tag 

169 continue 

170 raise ValueError("Found foreign tag in article content") 

171 del current_header 

172 

173 # Abstract 

174 if "Abstract" in categories: 174 ↛ 180line 174 didn't jump to line 180 because the condition on line 174 was always true

175 xarticle.abstracts.append( 

176 create_abstract(value_tex=categories["Abstract"].text, lang="en") 

177 ) 

178 

179 # PDF 

180 if "Full text" in categories: 180 ↛ 189line 180 didn't jump to line 189 because the condition on line 180 was always true

181 pdf_tag = categories["Full text"].select_one("a.download") 

182 if not pdf_tag: 182 ↛ 183line 182 didn't jump to line 183 because the condition on line 182 was never true

183 raise ValueError("Couldn't find pdf url") 

184 pdf_url = pdf_tag.get("href") 

185 if not isinstance(pdf_url, str): 185 ↛ 186line 185 didn't jump to line 186 because the condition on line 185 was never true

186 raise ValueError("Couldn't parse pdf url") 

187 add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url)) 

188 else: 

189 self.logger.debug("No PDF Found", extra={"pid": xarticle.pid}) 

190 

191 # DOI 

192 # TODO : contact CSIS to make them fix their DOIs 

193 # if "Digital Object Identifier (DOI)" in categories: 

194 # doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a") 

195 # if not doi_tag: 

196 # raise ValueError("Couldn't find doi url") 

197 # doi_url = doi_tag.get("href") 

198 # if not isinstance(doi_url, str): 

199 # raise ValueError("Couldn't parse doi url") 

200 # if not doi_url.startswith("https://doi.org/"): 

201 # raise ValueError("Malformed DOI url") 

202 # doi_url = doi_url.removeprefix("https://doi.org/") 

203 # xarticle.doi = doi_url 

204 

205 # if xarticle.pid == "CSIS_2023_20_4_a2": 

206 # xarticle.doi = "10.2298/CSIS230400viiL" 

207 # if xarticle.pid == "CSIS_2023_20_1_a0": 

208 # xarticle.doi = "10.2298/CSIS230100iI" 

209 # if xarticle.pid == "CSIS_2021_18_1_a4": 

210 # xarticle.doi = "10.2298/CSIS200330035A" 

211 # if xarticle.pid == "CSIS_2020_17_1_a14": 

212 # xarticle.doi = "10.2298/CSIS180717038L" 

213 # if xarticle.pid == "CSIS_2020_17_1_a15": 

214 # xarticle.doi = "10.2298/CSIS190430041C" 

215 # if xarticle.pid == "CSIS_2020_17_1_a16": 

216 # xarticle.doi = "10.2298/CSIS190501042A" 

217 # if xarticle.pid == "CSIS_2020_17_1_a17": 

218 # xarticle.doi = "10.2298/CSIS190511043L" 

219 

220 # Keywords 

221 if "Key words" in categories: 221 ↛ 222line 221 didn't jump to line 222 because the condition on line 221 was never true

222 keywords = categories["Key words"].text.split(", ") 

223 for k in keywords: 

224 xarticle.kwds.append(create_subj(value=k, lang="en")) 

225 return xarticle