Coverage for src / crawler / by_source / heldermann_crawler.py: 7%

217 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1from urllib.parse import urldefrag, urljoin 

2 

3import regex 

4from bs4 import BeautifulSoup, Comment 

5from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser 

6from ptf.cmds.xml.jats.builder.issue import get_abstract_xml, get_single_title_xml 

7from ptf.model_data import ( 

8 IssueData, 

9 create_abstract, 

10 create_articledata, 

11 create_contributor, 

12 create_extlink, 

13 create_subj, 

14) 

15 

16from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

17from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

18 

19 

class HeldermannCrawler(MatchingCrawler):
    """Crawler for journals hosted by Heldermann Verlag (https://www.heldermann.de/)."""

    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    # Collection page link text: "Volume 12 (2004)" -> volume number and year.
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Volume page issue header: "Number 3" -> issue number.
    issue_re = r"Number (?P<number>\d+)"
    # Issue-page article entry: a page range (digits, occasionally containing a
    # literal "a"), followed by optional "[Abstract]", "[Fulltext-pdf (...)]"
    # and "[Abstract / Full Text]" anchor tags.
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    # Article page, primary layout: abstract sits between the "Abstract-pdf"
    # link and the optional "Keywords:" / "MSC:" sections, then the pdf anchor.
    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    # Article page, alternate layout: abstract wrapped in a colored <font> block.
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'

32 def parse_collection_content(self, content): 

33 xissues = [] 

34 soup = BeautifulSoup(content, "html5lib") 

35 issues = soup.select("b > a") 

36 for issue in issues: 

37 volume_search = regex.search(self.volume_re, issue.text) 

38 if not volume_search: 

39 self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping") 

40 continue 

41 issue_href = issue.get("href") 

42 if not isinstance(issue_href, str): 

43 raise ValueError("Couldn't parse issue href") 

44 volume_dict = volume_search.groupdict() 

45 parsed_issues = self.parse_heldermann_issue_content( 

46 urljoin(self.collection_url, issue_href), 

47 volume_dict["year"], 

48 volume_dict["volume"], 

49 ) 

50 

51 xissues.extend(parsed_issues) 

52 return xissues 

53 

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page)

        Therefore, we must parse volume pages when crawling the collection

        Returns the list of issues found on the volume page at ``url``, each
        populated with its parsed articles.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent :
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                # Special case: this volume page merges issues 1 and 2.
                # NOTE(review): this branch runs on *every* div of the page and
                # appends a fresh "1-2" issue each iteration — confirm that the
                # duplicates are intended/deduplicated downstream.
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1  # force the article-list branch below

            if index % 2 == 0:
                # Even divs carry the issue header ("Number N").
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                # Odd divs carry the article list, wrapped either in <strong>
                # or in nested <font> tags (sometimes with an extra <b>).
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                # Drop HTML comments (commented-out entries) before splitting.
                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                # Article entries are separated by a double <br/>.
                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags), unsafe=True)
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str, unsafe=True)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        # No anchor tag: not an article entry.
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    if xarticle.url is None:
                        # Article has no page of its own: prefix with the issue
                        # pid to keep the pid globally unique.
                        xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    else:
                        xarticle.pid = f"a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

129 

130 def parse_heldermann_article(self, article_content: str, issue_href: str): 

131 """ 

132 Parse an article's data directly from the issue page 

133 Some collections in Heldermann do not have a, article-specific page (article data in issue) 

134 so we must parse the article data first before proceeding. 

135 

136 https://www.heldermann.de/JGG/jgg02.htm 

137 """ 

138 

139 content_strs = article_content.split("<br/>") 

140 content_strs = [c for c in content_strs if c != ""] 

141 

142 authors_str = None 

143 # cleanup_str(content_strs[0]) 

144 

145 if content_strs[0] == '<font color="#0000A0" size="2"> ': 

146 content_strs.pop(0) 

147 

148 if len(content_strs) >= 3: 

149 authors_str = content_strs.pop(0) 

150 cut_index = authors_str.rfind(">") 

151 cut_index = cut_index + 1 if cut_index > 0 else 0 

152 authors_str = cleanup_str(authors_str[cut_index:]) 

153 

154 title_str = get_single_title_xml(content_strs[0]) 

155 

156 xarticle = create_articledata() 

157 

158 article_search = regex.search(self.article_re, content_strs[1]) 

159 if not article_search: 

160 self.logger.debug( 

161 "Couldn't find article url. Skipping article", extra={"url": issue_href} 

162 ) 

163 return None 

164 # raise ValueError("Couldn't find article url") 

165 

166 xarticle.title_tex = title_str 

167 

168 if authors_str: 

169 for a in authors_str.split(", "): 

170 author = create_contributor(role="author", string_name=a) 

171 if len(a) > 256: 

172 pass 

173 xarticle.contributors.append(author) 

174 

175 article_data = article_search.groupdict() 

176 # Remove padding : 001 -> 1 

177 xarticle.fpage = article_data["fpage"].lstrip("0") 

178 

179 if article_data["lpage"] is not None: 

180 xarticle.lpage = article_data["lpage"].lstrip("0") 

181 

182 if article_data["articleurl"] is not None: 

183 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a") 

184 href = a_tag.get("href") 

185 if not isinstance(href, str): 

186 raise ValueError("Couldn't parse article url") 

187 xarticle.url = urljoin(issue_href, href) 

188 else: 

189 if article_data["abstracturl"] is not None: 

190 abstract_tag = BeautifulSoup( 

191 article_data["abstracturl"], "html.parser" 

192 ).select_one("a") 

193 abstract_href = abstract_tag.get("href") 

194 if not isinstance(abstract_href, str): 

195 raise ValueError("Couldn't parse abstract url") 

196 

197 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href)) 

198 if xabstract is not None: 

199 xarticle.abstracts.append(xabstract) 

200 

201 if article_data["pdfurl"] is None: 

202 raise ValueError("Cannot find article pdf") 

203 

204 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a") 

205 pdf_href = pdf_tag.get("href") 

206 if not isinstance(pdf_href, str): 

207 raise ValueError("Couldn't parse pdf url") 

208 

209 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href)) 

210 ext_link = create_extlink( 

211 rel="source", location=issue_href, metadata=self.source_domain 

212 ) 

213 xarticle.ext_links.append(ext_link) 

214 return xarticle 

215 

    def parse_heldermann_abstract(self, url: str):
        """
        Download an abstract page and extract the abstract addressed by the
        URL fragment (e.g. ``...abstracts.htm#paper12``).

        Returns the abstract data, or None when the anchor is not found.
        Raises ValueError when the anchor exists but has no abstract text.
        """
        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        # The target is an element whose name attribute equals the fragment
        # (old-style <a name="..."> anchor).
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            self.logger.debug(
                f"Couldn't parse abstract for url : {url} with fragment : {fragment}"
            )
            return None
        # The abstract text is the <font> sibling following the anchor's <dt>.
        # NOTE(review): find_parent("dt") can return None on malformed pages,
        # which would raise AttributeError here — confirm pages are uniform.
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

231 

232 def parse_article_content(self, content, xissue, xarticle, url): 

233 soup = BeautifulSoup(content, "html5lib") 

234 content = cleanup_str(content, unsafe=True) 

235 article_search = regex.search(self.article_page_re, content) 

236 if not article_search: 

237 if "This article plagiarizes" in content: 

238 return None 

239 article_search = regex.search(self.article_page_re_2, content) 

240 

241 if not article_search: 

242 raise ValueError("Couldn't parse article page") 

243 

244 article_dict = article_search.groupdict() 

245 ckeditor_data = CkeditorParser( 

246 html_value=article_dict["abstract"], 

247 mml_formulas="", 

248 ) 

249 

250 abstract = create_abstract( 

251 lang="en", 

252 value_xml=get_abstract_xml(ckeditor_data.value_xml, lang="en"), 

253 value_tex=ckeditor_data.value_tex, 

254 value_html=ckeditor_data.value_html, 

255 ) 

256 

257 xarticle.abstracts.append(abstract) 

258 title_tags = soup.select("font[size='4'] b") 

259 if len(title_tags) == 1: 

260 xarticle.title_tex = get_single_title_xml( 

261 str(title_tags[0]) 

262 .lstrip("<b>") 

263 .rstrip("</b>") 

264 .strip() 

265 .replace("&lt;", "<") 

266 .replace("&gt;", ">") 

267 ) 

268 

269 contributors = [] 

270 author = None 

271 author_tags = soup.select("font[size='3']") 

272 for author_tag in author_tags: 

273 author_name = cleanup_str(author_tag.get_text(), unsafe=True) 

274 if len(author_name) > 256: 

275 continue 

276 if author_name != "\x86": 

277 # 1 author has a dagger (deceased after publication) and the HTML becomes worse than usual 

278 # Ignore it and append address/email to the previous author 

279 author = create_contributor(role="author", string_name=author_name) 

280 

281 siblings = author_tag.find_next_sibling("font") 

282 if siblings: 

283 for sibling in siblings: 

284 parent = sibling.parent 

285 if parent.name == "font" and parent.get("size") == "2": 

286 children = sibling.contents 

287 

288 pos = 0 

289 keep_searching_for_address = True 

290 while pos < len(children) and keep_searching_for_address: 

291 if isinstance(children[pos], str): 

292 address = cleanup_str( 

293 children[pos].get_text(), unsafe=True 

294 ).replace("and: ", "") 

295 if address: 

296 author["addresses"].append(address) 

297 elif children[pos].name == "a": 

298 keep_searching_for_address = False 

299 href = children[pos].get("href") 

300 if href.find("mailto:") == 0: 

301 email = cleanup_str(children[pos].get_text(), unsafe=True) 

302 author["email"] = email 

303 pos += 1 

304 if author_name != "\x86" and author: 

305 contributors.append(author) 

306 xarticle.contributors = contributors 

307 

308 if article_dict.get("keywords", None) is not None: 

309 for kwd in article_dict["keywords"].removesuffix(".").split(", "): 

310 xarticle.kwds.append(create_subj(value=kwd)) 

311 

312 if article_dict.get("msc", None) is not None: 

313 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".") 

314 for msc in article_dict["msc"].split(", "): 

315 xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc))) 

316 

317 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a") 

318 href = href_soup.get("href") 

319 if not isinstance(href, str): 

320 raise ValueError("Article pdf cannot be parsed") 

321 href = urljoin(url, href) 

322 add_pdf_link_to_xarticle(xarticle, href) 

323 

324 # Paywall check on pdf 

325 is_openaccess, response, *_ = self.check_pdf_link_validity(href) 

326 if not is_openaccess: 

327 return None 

328 

329 return xarticle