Coverage for src/crawler/by_source/csis_crawler.py: 69%

139 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

1""" 

2This source has invalid DOIs in some article. 

3For now, those are ignored in order to be able to crawl the collection. 

4""" 


from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ContributorDict,
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str



class CsisCrawler(BaseCollectionCrawler):
    source_name = "Computer Science and Information Systems website"
    source_domain = "CSIS"
    source_website = "http://www.comsis.org/"

    issue_re = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) \(\w+ (?P<year>\d+)\)"

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        col_issue_tags = soup.select("#content > p")
        for index, tag in enumerate(col_issue_tags):
            xissue = self.parse_col_issue_tag(tag)
            # Temporary pid; the real pid is derived in parse_issue_content.
            xissue.pid = self.collection_id + "_TEMPPID_" + str(index)
            xissues.append(xissue)
        return xissues


    def parse_col_issue_tag(self, col_issue_tag: Tag):
        issue_title = col_issue_tag.select_one("a.hidden")
        if not issue_title:
            raise ValueError("Couldn't parse issue link")
        issue_href = issue_title.get("href")
        if not isinstance(issue_href, str):
            raise ValueError("Couldn't parse issue href")
        xissue = create_issuedata()
        xissue.url = urljoin(self.source_website, issue_href)
        return xissue


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")

        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't find issue content")
        title_tag = content.select_one("h1")
        if not title_tag:
            raise ValueError("Couldn't find issue title")

        title_search = regex.search(self.issue_re, title_tag.text)
        if not title_search:
            raise ValueError("Couldn't parse issue title")
        title_group = title_search.groupdict()

        xissue.number = title_group["number"]
        xissue.volume = title_group["volume"]
        xissue.year = title_group["year"]

        xissue.pid = self.get_issue_pid(
            self.collection_id, title_group["year"], title_group["volume"], title_group["number"]
        )

        for index, article_tag in enumerate(content.select("p")):
            # Paragraphs with a single child node do not hold an article entry.
            if len(article_tag.contents) == 1:
                continue

            if article_tag.text == "Editorial":
                continue

            article_title = article_tag.select_one("a.hidden")
            if not article_title:
                raise ValueError("Couldn't parse article link")
            article_href = article_title.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article href")

            xarticle = create_articledata()
            xarticle.url = urljoin(self.source_website, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid

        soup = BeautifulSoup(content, "html.parser")
        content = soup.select_one("#content")
        if not content:
            raise ValueError("Couldn't parse article content")
        id_tag = content.select_one("p.id")
        if id_tag:
            id_tag.decompose()

        # Title
        if xarticle.pid == "CSIS_2012_9_3_a13":
            # Workaround: this article's title cannot be parsed from its page.
            xarticle.title_tex = "Modeling a Holonic Agent based Solution"
        else:
            title_tag = content.select_one(".title")
            if not title_tag:
                raise ValueError("Couldn't find title")
            xarticle.title_tex = title_tag.text
            title_tag.decompose()


        # Authors
        authors_tag = content.select_one(".authors")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        current_contributor: ContributorDict | None = None
        for c in authors_tag.children:
            if isinstance(c, str):
                author_str = cleanup_str(c)
                if author_str == "":
                    continue
                # Strip separators, e.g. ", and Jane Doe" -> "Jane Doe"
                author_str = author_str.removeprefix(", ").removeprefix("and ").strip()
                current_contributor = create_contributor(role="author", string_name=author_str)
                xarticle.contributors.append(current_contributor)
                continue

            if not isinstance(c, Tag):
                continue
            if not current_contributor:
                raise ValueError("Couldn't find author")

            if c.name == "sup":
                # Affiliation markers (superscripts) are skipped.
                continue
            if c.name == "a":
                orcid_href = c.get("href")
                if not isinstance(orcid_href, str):
                    print("Couldn't parse contributor orcid")
                    continue
                if not orcid_href.startswith("https://orcid.org/"):
                    print(
                        "Couldn't parse contributor orcid: orcid must start with https://orcid.org/"
                    )
                    continue
                current_contributor["orcid"] = orcid_href.removeprefix("https://orcid.org/")
        authors_tag.decompose()


        # Affiliations (an ordered list) are not parsed; drop them.
        affiliations_tag = content.select_one("ol")
        if affiliations_tag:
            affiliations_tag.decompose()

        # Group the remaining paragraphs into sections keyed by their h3 header.
        current_header: str | None = None
        categories: dict[str, Tag] = {}
        for tag in content.find_all(recursive=False):
            if tag.name == "h3":
                current_header = tag.text
                continue
            if tag.name == "p":
                if current_header is None:
                    raise ValueError("Couldn't parse article content")
                categories[current_header] = tag
                continue
            raise ValueError("Found foreign tag in article content")
        del current_header


        # Abstract
        if "Abstract" in categories:
            xabstract = create_abstract(
                tag="abstract", value_tex=categories["Abstract"].text, lang="en"
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        if "Full text" in categories:
            pdf_tag = categories["Full text"].select_one("a.download")
            if not pdf_tag:
                raise ValueError("Couldn't find pdf url")
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(self.source_website, pdf_url))
        else:
            print(f"No PDF found for article {xarticle.pid}. Skipping PDF")


        # DOI
        # TODO: contact CSIS to make them fix their DOIs
        # if "Digital Object Identifier (DOI)" in categories:
        #     doi_tag = categories["Digital Object Identifier (DOI)"].select_one("a")
        #     if not doi_tag:
        #         raise ValueError("Couldn't find doi url")
        #     doi_url = doi_tag.get("href")
        #     if not isinstance(doi_url, str):
        #         raise ValueError("Couldn't parse doi url")
        #     if not doi_url.startswith("https://doi.org/"):
        #         raise ValueError("Malformed DOI url")
        #     doi_url = doi_url.removeprefix("https://doi.org/")
        #     xarticle.doi = doi_url

        # if xarticle.pid == "CSIS_2023_20_4_a2":
        #     xarticle.doi = "10.2298/CSIS230400viiL"
        # if xarticle.pid == "CSIS_2023_20_1_a0":
        #     xarticle.doi = "10.2298/CSIS230100iI"
        # if xarticle.pid == "CSIS_2021_18_1_a4":
        #     xarticle.doi = "10.2298/CSIS200330035A"
        # if xarticle.pid == "CSIS_2020_17_1_a14":
        #     xarticle.doi = "10.2298/CSIS180717038L"
        # if xarticle.pid == "CSIS_2020_17_1_a15":
        #     xarticle.doi = "10.2298/CSIS190430041C"
        # if xarticle.pid == "CSIS_2020_17_1_a16":
        #     xarticle.doi = "10.2298/CSIS190501042A"
        # if xarticle.pid == "CSIS_2020_17_1_a17":
        #     xarticle.doi = "10.2298/CSIS190511043L"


        # Keywords
        if "Key words" in categories:
            keywords = categories["Key words"].text.split(", ")
            for k in keywords:
                xarticle.kwds.append(create_subj(value=k, lang="en"))
        return xarticle
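

# --- Illustrative sketch (not part of the crawler) --------------------------
# A minimal, hypothetical demo of two parsing building blocks used above:
# the issue_re pattern and urljoin-based URL resolution. The heading and the
# relative href below are made-up samples, not taken from comsis.org.
if __name__ == "__main__":
    heading = "Volume 21, Issue 2 (April 2024)"  # hypothetical sample heading
    match = regex.search(CsisCrawler.issue_re, heading)
    if match:
        # -> {'volume': '21', 'number': '2', 'year': '2024'}
        print(match.groupdict())

    # Relative hrefs found on the site resolve against source_website:
    # -> http://www.comsis.org/archive.php?volume=21  (hypothetical href)
    print(urljoin(CsisCrawler.source_website, "archive.php?volume=21"))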