Coverage for src/crawler/by_source/csis_crawler.py: 69%

134 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

"""
This source has invalid DOIs in some articles.
For now, those are ignored in order to be able to crawl the collection.
"""

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"
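    # Matches issue titles such as "Volume 9, Issue 3 (June 2012)";
    # the month name is matched by \w+ but not captured.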

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
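            # Temporary pid: the definitive pid is built from year/volume/number
            # by get_issue_pid once the issue page itself is parsed
            # (see parse_issue_content).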
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues

    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_group = regex_to_dict(
            self.issue_re, title_tag.text, error_msg="Couldn't parse issue title"
        )
        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

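        # Each <p> in the issue body is a candidate article entry; paragraphs
        # with a single child node (no article link) and the "Editorial" entry
        # are skipped.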
        for index, article_tag in enumerate(content.select("p")):
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
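        # Drop the article id paragraph (p.id) up front so it doesn't get
        # picked up by the header/paragraph walk further down.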
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
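        # Manual override: the title of this specific article cannot be parsed
        # from its page.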
        if xarticle.pid == "CSIS_2012_9_3_a13":
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()

        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
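        # The .authors element mixes node types: plain text nodes hold the
        # author names, <sup> tags are affiliation markers (ignored here), and
        # <a> tags carry the ORCID link of the preceding author.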
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # affiliations
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()

        # Affiliations
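        # The affiliation list (<ol>) is removed without being parsed; together
        # with the <sup> markers skipped above, affiliations are not attached
        # to contributors for this source.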
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        current_header: str | None = None
        categories: dict[str, Tag] = {}
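        # The remaining body is a flat sequence of <h3> section headers, each
        # followed by a <p>; index the paragraphs by their header text
        # ("Abstract", "Full text", "Key words", ...).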
        for tag in content.findChildren(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header

        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping pdf")

        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"

        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle