Coverage for src/crawler/by_source/csis_crawler.py: 69%

134 statements  

1""" 

2This source has invalid DOIs in some article. 

3For now, those are ignored in order to be able to crawl the collection. 

4""" 

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
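    # e.g. a heading like "Volume 21, Issue 2 (June 2024)" (hypothetical values).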

    def parse_collection_content(self, content):
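        # One <p> per issue on the collection index page. The pid set here is a
        # placeholder; parse_issue_content assigns the real pid later.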
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
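        # The <a class="hidden"> tag inside the entry links to the issue page.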
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:  # coverage: condition never true
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):  # coverage: condition never true
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
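        # The <h1> heading carries volume/number/year (see issue_re); each <p>
        # wrapping an <a class="hidden"> link is an article entry.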
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:  # coverage: condition never true
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:  # coverage: condition never true
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

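        # A <p> with a single child is not an article entry; "Editorial"
        # paragraphs are skipped as well.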
        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":  # coverage: condition never true
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:  # coverage: condition never true
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
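        # Strips the title, author and affiliation blocks, then collects the
        # remaining <h3>/<p> sections ("Abstract", "Full text", "Key words", ...).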
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:  # coverage: condition never true
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:  # coverage: condition always true
            id_tag.decompose()

        # Title
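        # (Hardcoded below for one article whose page presumably lacks a
        # parsable .title tag.)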
        if xarticle.pid == "CSIS_2012_9_3_a13":  # coverage: condition never true
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:  # coverage: condition never true
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:  # coverage: condition never true
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
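        # .authors mixes text nodes (names), <sup> tags (affiliation markers)
        # and <a> tags (ORCID links).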
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":  # coverage: condition never true
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):  # coverage: condition never true
                continue
            if not current_contributor:  # coverage: condition never true
                raise ValueError("Couldn't find author")

            if c.name == "sup":  # coverage: condition always true
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    self.logger.warning(
                        "Couldn't parse contributor orcid.",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    self.logger.warning(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/",
                        extra={"pid": xarticle.pid},
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:  # coverage: condition always true
            affiliations_tag.decompose()

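        # The remaining children alternate <h3> section headers with <p> bodies;
        # collect them into a header -> tag mapping.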
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.findChildren(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":  # coverage: condition always true
                if current_header is None:  # coverage: condition never true
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:  # coverage: condition always true
            xarticle.abstracts.append(
                create_abstract(value_tex=categories["Abstract"].text, lang="en")
            )

        # PDF
        if "Full text" in categories:  # coverage: condition always true
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:  # coverage: condition never true
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: condition never true
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            self.logger.debug("No PDF Found", extra={"pid": xarticle.pid})

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
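        # The "Key words" section holds a comma-separated list.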
        if "Key words" in categories:  # coverage: condition never true
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle