Coverage for src/crawler/by_source/csis_crawler.py: 69%

134 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

1""" 

2This source has invalid DOIs in some article. 

3For now, those are ignored in order to be able to crawl the collection. 

4""" 

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
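    # Illustrative match, assuming the site's "Month YYYY" issue-title format:
    #   "Volume 20, Issue 4 (October 2023)" -> volume="20", number="4", year="2023"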

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            # Placeholder pid; replaced with the real one in parse_issue_content
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:  # coverage: condition never true
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):  # coverage: condition never true
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:  # coverage: condition never true
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:  # coverage: condition never true
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )
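        # Judging by the pids referenced in the DOI notes below, this yields
        # e.g. "CSIS_2023_20_4" (collection_year_volume_number).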

        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":  # coverage: condition never true
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:  # coverage: condition never true
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            # Presumably combined with the issue pid later (cf. "CSIS_2023_20_4_a2" below)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:  # coverage: condition never true
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:  # coverage: condition always true
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":  # coverage: condition never true
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:  # coverage: condition never true
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:  # coverage: condition never true
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":  # coverage: condition never true
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):  # coverage: condition never true
                continue
            if not current_contributor:  # coverage: condition never true
                raise ValueError("Couldn't find author")

            if c.name == "sup":  # coverage: condition always true
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    self.logger.warning(
                        "Couldn't parse contributor orcid.",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    self.logger.warning(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()
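        # The loop above assumes .authors markup of roughly this shape (names
        # and ORCID illustrative):
        #   Jane Doe<sup>1</sup>, John Roe<sup>2</sup><a href="https://orcid.org/...">...</a>
        # Text nodes become contributors; <sup> affiliation markers are skipped,
        # and <a> ORCID links attach to the most recent contributor.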

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:  # coverage: condition always true
            affiliations_tag.decompose()

        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":  # coverage: condition always true
                if current_header is None:  # coverage: condition never true
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header
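        # `categories` now maps section headers to their bodies, assuming the
        # page alternates <h3>/<p> pairs, e.g.:
        #   <h3>Abstract</h3><p>...</p>
        #   <h3>Full text</h3><p><a class="download" href="...">...</a></p>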

        # Abstract
        if "Abstract" in categories:  # coverage: condition always true
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:  # coverage: condition always true
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:  # coverage: condition never true
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: condition never true
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            self.logger.debug("No PDF Found", extra={"pid": xarticle.pid})

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:  # coverage: condition never true
            # e.g. "ontology, semantic web" -> ["ontology", "semantic web"]
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
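
# Pipeline sketch (assumed from the method names): BaseCollectionCrawler fetches
# each page and feeds it through parse_collection_content -> parse_issue_content
# -> parse_article_content.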