Coverage for src/crawler/by_source/csis_crawler.py: 69%

138 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

"""
This source has invalid DOIs in some articles.

For now, those are ignored in order to be able to crawl the collection.
"""

5 

6from urllib.parse import urljoin 

7 

8import regex 

9from bs4 import BeautifulSoup, Tag 

10from ptf.model_data import ( 

11 ContributorDict, 

12 create_abstract, 

13 create_articledata, 

14 create_contributor, 

15 create_issuedata, 

16 create_subj, 

17) 

18 

19from crawler.base_crawler import BaseCollectionCrawler 

20from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

21 

22 

class CsisCrawler(BaseCollectionCrawler):
    # Human-readable label of the source, used in crawler metadata.
    source_name = "Computer Science and Information Systems website"
    # Short identifier; also the prefix of generated PIDs (e.g. "CSIS_2012_9_3_a13").
    source_domain = "CSIS"
    # Base URL against which relative issue/article/pdf hrefs are resolved.
    source_website = "http://www.comsis.org/"

    # Matches issue headings like "Volume 9, Issue 3 (September 2012)".
    # Captures volume, number and year; the month word is matched but not captured.
    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"

29 

30 def parse_collection_content(self, content): 

31 xissues = [] 

32 soup = BeautifulSoup(content, "html.parser") 

33 col_issue_tags = soup.select("#content > p") 

34 for index, tag in enumerate(col_issue_tags): 

35 xissue = self.parse_col_issue_tag(tag) 

36 xissue.pid = self.collection_id + "_TEMPPID_" + str(index) 

37 xissues.append(xissue) 

38 return xissues 

39 

40 def parse_col_issue_tag(self, col_issue_tag: Tag): 

41 issue_title = col_issue_tag.select_one("a.hidden") 

42 if not issue_title: 42 ↛ 43line 42 didn't jump to line 43 because the condition on line 42 was never true

43 raise ValueError("Couldn't parse issue link") 

44 issue_href = issue_title.get("href") 

45 if not isinstance(issue_href, str): 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true

46 raise ValueError("Couldn't parse issue href") 

47 xissue = create_issuedata() 

48 xissue.url = urljoin(self.source_website, issue_href) 

49 return xissue 

50 

51 def parse_issue_content(self, content, xissue): 

52 soup = BeautifulSoup(content, "html.parser") 

53 

54 content = soup.select_one("#content") 

55 if not content: 55 ↛ 56line 55 didn't jump to line 56 because the condition on line 55 was never true

56 raise ValueError("Couldn't find issue content") 

57 title_tag = content.select_one("h1") 

58 if not title_tag: 58 ↛ 59line 58 didn't jump to line 59 because the condition on line 58 was never true

59 raise ValueError("Couldn't find issue title") 

60 

61 title_search = regex.search(self.issue_re, title_tag.text) 

62 if not title_search: 62 ↛ 63line 62 didn't jump to line 63 because the condition on line 62 was never true

63 raise ValueError("Couldn't parse issue title") 

64 title_group = title_search.groupdict() 

65 

66 xissue.number = title_group["number"] 

67 xissue.volume = title_group["volume"] 

68 xissue.year = title_group["year"] 

69 

70 xissue.pid = self.get_issue_pid( 

71 self.collection_id, title_group["year"], title_group["volume"], title_group["number"] 

72 ) 

73 

74 for index, article_tag in enumerate(content.select("p")): 

75 if len(article_tag.contents) == 1: 

76 continue 

77 

78 if article_tag.text == "Editorial": 78 ↛ 79line 78 didn't jump to line 79 because the condition on line 78 was never true

79 continue 

80 

81 article_title = article_tag.select_one("a.hidden") 

82 if not article_title: 82 ↛ 83line 82 didn't jump to line 83 because the condition on line 82 was never true

83 raise ValueError("Couldn't parse article link") 

84 article_href = article_title.get("href") 

85 if not isinstance(article_href, str): 85 ↛ 86line 85 didn't jump to line 86 because the condition on line 85 was never true

86 raise ValueError("Couldn't parse article href") 

87 

88 xarticle = create_articledata() 

89 xarticle.url = urljoin(self.source_website, article_href) 

90 xarticle.pid = "a" + str(index) 

91 xissue.articles.append(xarticle) 

92 

93 def parse_article_content(self, content, xissue, xarticle, url): 

94 soup = BeautifulSoup(content, "html.parser") 

95 content = soup.select_one("#content") 

96 if not content: 96 ↛ 97line 96 didn't jump to line 97 because the condition on line 96 was never true

97 raise ValueError("Couldn't parse article content") 

98 id_tag = content.select_one("p.id") 

99 if id_tag: 99 ↛ 103line 99 didn't jump to line 103 because the condition on line 99 was always true

100 id_tag.decompose() 

101 

102 # Title 

103 if xarticle.pid == "CSIS_2012_9_3_a13": 103 ↛ 104line 103 didn't jump to line 104 because the condition on line 103 was never true

104 xarticle.title_tex = "Modeling a Holonic Agent based Solution" 

105 else: 

106 title_tag = content.select_one(".title") 

107 if not title_tag: 107 ↛ 108line 107 didn't jump to line 108 because the condition on line 107 was never true

108 raise ValueError("Couldn't find title") 

109 xarticle.title_tex = title_tag.text 

110 title_tag.decompose() 

111 

112 # Authors 

113 authors_tag = content.select_one(".authors") 

114 if not authors_tag: 114 ↛ 115line 114 didn't jump to line 115 because the condition on line 114 was never true

115 raise ValueError("Couldn't find authors") 

116 current_contributor: ContributorDict | None = None 

117 for c in authors_tag.children: 

118 if isinstance(c, str): 

119 author_str = cleanup_str(c) 

120 if author_str == "": 120 ↛ 121line 120 didn't jump to line 121 because the condition on line 120 was never true

121 continue 

122 author_str = author_str.removeprefix(", ").removeprefix("and ").strip() 

123 current_contributor = create_contributor(role="author", string_name=author_str) 

124 xarticle.contributors.append(current_contributor) 

125 continue 

126 

127 if not isinstance(c, Tag): 127 ↛ 128line 127 didn't jump to line 128 because the condition on line 127 was never true

128 continue 

129 if not current_contributor: 129 ↛ 130line 129 didn't jump to line 130 because the condition on line 129 was never true

130 raise ValueError("Couldn't find author") 

131 

132 if c.name == "sup": 132 ↛ 135line 132 didn't jump to line 135 because the condition on line 132 was always true

133 # affiliations 

134 continue 

135 if c.name == "a": 

136 orcid_href = c.get("href") 

137 if not isinstance(orcid_href, str): 

138 print("Couldn't parse contributor orcid") 

139 continue 

140 if not orcid_href.startswith("https://orcid.org/"): 

141 print( 

142 "Couldn't parse contributor ocrid : ocrid must start with https://orcid.org/" 

143 ) 

144 continue 

145 current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/") 

146 authors_tag.decompose() 

147 

148 # Affiliations 

149 affiliations_tag = content.select_one("ol") 

150 if affiliations_tag: 150 ↛ 153line 150 didn't jump to line 153 because the condition on line 150 was always true

151 affiliations_tag.decompose() 

152 

153 current_header: str | None = None 

154 categories: dict[str, Tag] = {} 

155 for tag in content.findChildren(recursive=False): 

156 if tag.name == "h3": 

157 current_header = tag.text 

158 continue 

159 if tag.name == "p": 159 ↛ 164line 159 didn't jump to line 164 because the condition on line 159 was always true

160 if current_header is None: 160 ↛ 161line 160 didn't jump to line 161 because the condition on line 160 was never true

161 raise ValueError("Couldn't parse article content") 

162 categories[current_header] = tag 

163 continue 

164 raise ValueError("Found foreign tag in article content") 

165 del current_header 

166 

167 # Abstract 

168 if "Abstract" in categories: 168 ↛ 175line 168 didn't jump to line 175 because the condition on line 168 was always true

169 xabstract = create_abstract( 

170 tag="abstract", value_tex=categories["Abstract"].text, lang="en" 

171 ) 

172 xarticle.abstracts.append(xabstract) 

173 

174 # PDF 

175 if "Full text" in categories: 175 ↛ 184line 175 didn't jump to line 184 because the condition on line 175 was always true

176 pdf_tag = categories["Full text"].select_one("a.download") 

177 if not pdf_tag: 177 ↛ 178line 177 didn't jump to line 178 because the condition on line 177 was never true

178 raise ValueError("Couldn't find pdf url") 

179 pdf_url = pdf_tag.get("href") 

180 if not isinstance(pdf_url, str): 180 ↛ 181line 180 didn't jump to line 181 because the condition on line 180 was never true

181 raise ValueError("Couldn't parse pdf url") 

182 add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url)) 

183 else: 

184 print(f"No PDF Found for article {xarticle.pid}. Skipping pdf") 

185 

186 # DOI 

187 # TODO : contact CSIS to make them fix their DOIs 

188 # if "Digital Object Identifier (DOI)" in categories: 

189 # doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a") 

190 # if not doi_tag: 

191 # raise ValueError("Couldn't find doi url") 

192 # doi_url = doi_tag.get("href") 

193 # if not isinstance(doi_url, str): 

194 # raise ValueError("Couldn't parse doi url") 

195 # if not doi_url.startswith("https://doi.org/"): 

196 # raise ValueError("Malformed DOI url") 

197 # doi_url = doi_url.removeprefix("https://doi.org/") 

198 # xarticle.doi = doi_url 

199 

200 # if xarticle.pid == "CSIS_2023_20_4_a2": 

201 # xarticle.doi = "10.2298/CSIS230400viiL" 

202 # if xarticle.pid == "CSIS_2023_20_1_a0": 

203 # xarticle.doi = "10.2298/CSIS230100iI" 

204 # if xarticle.pid == "CSIS_2021_18_1_a4": 

205 # xarticle.doi = "10.2298/CSIS200330035A" 

206 # if xarticle.pid == "CSIS_2020_17_1_a14": 

207 # xarticle.doi = "10.2298/CSIS180717038L" 

208 # if xarticle.pid == "CSIS_2020_17_1_a15": 

209 # xarticle.doi = "10.2298/CSIS190430041C" 

210 # if xarticle.pid == "CSIS_2020_17_1_a16": 

211 # xarticle.doi = "10.2298/CSIS190501042A" 

212 # if xarticle.pid == "CSIS_2020_17_1_a17": 

213 # xarticle.doi = "10.2298/CSIS190511043L" 

214 

215 # Keywords 

216 if "Key words" in categories: 216 ↛ 217line 216 didn't jump to line 217 because the condition on line 216 was never true

217 keywords = categories["Key words"].text.split(", ") 

218 for k in keywords: 

219 xarticle.kwds.append(create_subj(value=k, lang="en")) 

220 return xarticle