Coverage for src/crawler/by_source/csis_crawler.py: 69%

139 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

1""" 

2This source has invalid DOIs in some article. 

3For now, those are ignored in order to be able to crawl the collection. 

4""" 


from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str



class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            # Temporary pid; the real pid is derived in parse_issue_content.
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues


    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_search = regex.search(self.issue_re, title_tag.text)
        if not title_search:
            raise ValueError("Couldn't parse issue title")
        title_group = title_search.groupdict()

        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

        for index, article_tag in enumerate(content.select("p")):
            # Paragraphs with a single child node do not hold an article entry.
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid

        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
            # Workaround: this article's title cannot be parsed from its page.
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()


        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                # Strip separators, e.g. ", and Jane Doe" -> "Jane Doe"
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # Affiliation markers (superscripts) are skipped.
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()


        # Affiliations (an ordered list) are not parsed; drop them.
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        # Group the remaining paragraphs into sections keyed by their h3 header.
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header


        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping PDF")


        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"


        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
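

# --- Illustrative sketch (not part of the crawler) --------------------------
# A minimal, hypothetical demo of two parsing building blocks used above:
# the issue_re pattern and urljoin-based URL resolution. The heading and the
# relative href below are made-up samples, not taken from comsis.org.
if __name__ == "__main__":
    heading = "Volume 21, Issue 2 (April 2024)"  # hypothetical sample heading
    match = regex.search(CsisCrawler.issue_re, heading)
    if match:
        # -> {'volume': '21', 'number': '2', 'year': '2024'}
        print(match.groupdict())

    # Relative hrefs found on the site resolve against source_website:
    # -> http://www.comsis.org/archive.php?volume=21  (hypothetical href)
    print(urljoin(CsisCrawler.source_website, "archive.php?volume=21"))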