Coverage for src/crawler/by_source/heldermann_crawler.py: 82%

177 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

  1  from urllib.parse import urldefrag, urljoin
  2
  3  import regex
  4  import requests
  5  from bs4 import BeautifulSoup, Comment
  6  from ptf.model_data import (
  7      IssueData,
  8      create_abstract,
  9      create_articledata,
 10      create_contributor,
 11      create_subj,
 12  )
 13
 14  from crawler.base_crawler import BaseCollectionCrawler
 15  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 16
 17
 18  class HeldermannCrawler(BaseCollectionCrawler):
 19      source_name = "Heldermann Verlag"
 20      source_domain = "HELDERMANN"
 21      source_website = "https://www.heldermann.de/"
 22
 23      volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
 24      issue_re = r"Number (?P<number>\d+)"
 25      article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"
 26
 27      article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
 28      article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
 29
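A rough sanity check of the volume and issue heading patterns; the sample heading strings below are hypothetical, not taken from the site:

    import regex

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"

    m = regex.search(volume_re, "Volume 12 (2005)")       # hypothetical volume link label
    print(m.groupdict())                                   # {'volume': '12', 'year': '2005'}
    print(regex.search(issue_re, "Number 3")["number"])    # '3'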

 30      def parse_collection_content(self, content):
 31          xissues = []
 32          soup = BeautifulSoup(content, "html5lib")
 33          issues = soup.select("b > a")
 34          for issue in issues:
 35              volume_search = regex.search(self.volume_re, issue.text)
 36              if not volume_search:
 37                  self.logger.debug(f"Couldn't parse volume year for: {issue.text}. Skipping")
 38                  continue
 39              issue_href = issue.get("href")
 40              if not isinstance(issue_href, str):    [40 ↛ 41: condition never true]
 41                  raise ValueError("Couldn't parse issue href")
 42              volume_dict = volume_search.groupdict()
 43              parsed_issues = self.parse_heldermann_issue_content(
 44                  urljoin(self.collection_url, issue_href),
 45                  volume_dict["year"],
 46                  volume_dict["volume"],
 47              )
 48
 49              xissues.extend(parsed_issues)
 50          return xissues
 51

 52      def parse_heldermann_issue_content(self, url, year, volume):
 53          """
 54          Heldermann has pages for volumes, but no pages for issues (multiple issues sit inside one page).
 55
 56          Therefore, we must parse the volume pages when crawling the collection.
 57          """
 58          content = self.download_file(url)
 59          soup = BeautifulSoup(content, "html5lib")
 60          div = soup.select("div[align='center']")
 61          xissues = []
 62          current_issue: IssueData | None = None
 63          # Let's hope the website is consistent:
 64          # the first div should be the issue number,
 65          # the second div should be the issue contents.
 66          for index, el in enumerate(div):
 67              if url == "https://www.heldermann.de/JCA/jca02.htm":    [67 ↛ 68: condition never true]
 68                  current_issue = self.create_xissue(None, year, volume, "1-2")
 69                  xissues.append(current_issue)
 70                  index = 1
 71
 72              if index % 2 == 0:
 73                  title = el.select_one("td:first-child font:-soup-contains('Number ')")
 74                  if title:    [74 ↛ 66: condition always true]
 75                      issue_number = None
 76                      number_search = regex.search(self.issue_re, title.text)
 77                      if number_search:    [77 ↛ 66: condition always true]
 78                          number_data = number_search.groupdict()
 79                          issue_number = number_data["number"]
 80                          current_issue = self.create_xissue(None, year, volume, issue_number)
 81                          xissues.append(current_issue)
 82                          continue
 83              else:
 84                  strong = el.select_one("strong")
 85                  if strong:
 86                      a_tags = strong
 87                  else:
 88                      a_tags = el.select_one("font font:last-child")
 89                      if a_tags is None:    [89 ↛ 90: condition never true]
 90                          raise ValueError("Couldn't parse issue data")
 91                  if a_tags and a_tags.select_one("b"):
 92                      a_tags = a_tags.select_one("b")
 93                  del strong
 94
 95                  for child in a_tags.contents:
 96                      if isinstance(child, Comment):
 97                          child.extract()
 98
 99                  articles_tags = regex.split(
100                      r"<br\/> ?<br\/>",
101                      cleanup_str(str(a_tags))
102                      .removeprefix("<strong>")
103                      .removeprefix("<b>")
104                      .removesuffix("</strong>")
105                      .removesuffix("</b>"),
106                  )
107
108                  article_index = 0
109                  for a_str in articles_tags:
110                      a_str = cleanup_str(a_str)
111                      if a_str == "":
112                          continue
113                      if "</a>" not in a_str:
114                          continue
115                      if not current_issue:    [115 ↛ 116: condition never true]
116                          raise ValueError("Error while parsing issue articles")
117                      xarticle = self.parse_heldermann_article(a_str, url)
118                      if xarticle is None:    [118 ↛ 119: condition never true]
119                          continue
120                      xarticle.pid = f"{current_issue.pid}_a{article_index}"
121                      article_index += 1
122                      current_issue.articles.append(xarticle)
123          return xissues
124
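A minimal sketch of the even/odd pairing that this method assumes; the HTML below is invented for illustration, not copied from heldermann.de:

    from bs4 import BeautifulSoup

    # Invented volume-page snippet: header div followed by a contents div.
    html = """
    <div align='center'><table><tr><td><font>Number 1</font></td></tr></table></div>
    <div align='center'><strong>A. Author<br/>Some title<br/>1--15 [...]</strong></div>
    """
    for index, el in enumerate(BeautifulSoup(html, "html5lib").select("div[align='center']")):
        if index % 2 == 0:
            print("issue header  :", el.get_text(strip=True))      # "Number 1"
        else:
            print("issue contents:", el.get_text(" ", strip=True))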

125      def parse_heldermann_article(self, article_content: str, issue_href: str):
126          """
127          Some collections in Heldermann do not have an article-specific page (the article data
128          sits directly in the issue page), so we must parse the article data first before proceeding.
129
130          https://www.heldermann.de/JGG/jgg02.htm
131          """
132
133          content_strs = article_content.split("<br/>")
134          content_strs = [c for c in content_strs if c != ""]
135
136          authors_str = None
137          # cleanup_str(content_strs[0])
138
139          if content_strs[0] == '<font color="#0000A0" size="2"> ':    [139 ↛ 140: condition never true]
140              content_strs.pop(0)
141
142          if len(content_strs) >= 3:    [142 ↛ 148: condition always true]
143              authors_str = content_strs.pop(0)
144              cut_index = authors_str.rfind(">")
145              cut_index = cut_index + 1 if cut_index > 0 else 0
146              authors_str = cleanup_str(authors_str[cut_index:])
147
148          title_str = cleanup_str(content_strs[0])
149
150          xarticle = create_articledata()
151
152          article_search = regex.search(self.article_re, content_strs[1])
153          if not article_search:    [153 ↛ 154: condition never true]
154              self.logger.debug(
155                  "Couldn't find article url. Skipping article", extra={"url": issue_href}
156              )
157              return None
158              # raise ValueError("Couldn't find article url")
159
160          xarticle.title_tex = title_str
161
162          if authors_str:
163              for a in authors_str.split(", "):
164                  author = create_contributor(role="author", string_name=a)
165                  if len(a) > 256:    [165 ↛ 166: condition never true]
166                      pass
167                  xarticle.contributors.append(author)
168
169          article_data = article_search.groupdict()

170          # Remove padding: 001 -> 1 (strip leading zeros)
171          xarticle.fpage = article_data["fpage"].lstrip("0")
172
173          if article_data["lpage"] is not None:    [173 ↛ 176: condition always true]
174              xarticle.lpage = article_data["lpage"].lstrip("0")
175

176          if article_data["articleurl"] is not None:
177              a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
178              href = a_tag.get("href")
179              if not isinstance(href, str):    [179 ↛ 180: condition never true]
180                  raise ValueError("Couldn't parse article url")
181              xarticle.url = urljoin(issue_href, href)
182          else:
183              if article_data["abstracturl"] is not None:
184                  abstract_tag = BeautifulSoup(
185                      article_data["abstracturl"], "html.parser"
186                  ).select_one("a")
187                  abstract_href = abstract_tag.get("href")
188                  if not isinstance(abstract_href, str):    [188 ↛ 189: condition never true]
189                      raise ValueError("Couldn't parse abstract url")
190
191                  xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
192                  if xabstract is not None:
193                      xarticle.abstracts.append(xabstract)
194
195              if article_data["pdfurl"] is None:    [195 ↛ 196: condition never true]
196                  raise ValueError("Cannot find article pdf")
197
198              pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
199              pdf_href = pdf_tag.get("href")
200              if not isinstance(pdf_href, str):    [200 ↛ 201: condition never true]
201                  raise ValueError("Couldn't parse pdf url")
202              add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))
203
204          return xarticle
205
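To make the article_re capture groups concrete, here is an invented issue-page entry (not real Heldermann markup) run through the same pattern:

    import regex

    article_re = (
        r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? "
        r"(?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?"
        r"(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"
    )

    # Hypothetical "pages" line of one <br/><br/>-separated entry.
    pages_line = '001--015 [<a href="jgg02.htm#a1">Abstract / Full Text</a>]'
    print(regex.search(article_re, pages_line).groupdict())
    # {'fpage': '001', 'lpage': '015', 'abstracturl': None, 'pdfurl': None,
    #  'articleurl': '<a href="jgg02.htm#a1">Abstract / Full Text</a>'}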

206      def parse_heldermann_abstract(self, url: str):
207          url, fragment = urldefrag(url)
208          content = self.download_file(url)
209          content = cleanup_str(content)
210          soup = BeautifulSoup(content, "html5lib")
211          abstract_title = soup.select_one(f"[name={fragment}]")
212          if not abstract_title:
213              self.logger.debug(
214                  f"Couldn't parse abstract for url: {url} with fragment: {fragment}"
215              )
216              return None
217          abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
218          if not abstract_tag:    [218 ↛ 219: condition never true]
219              raise ValueError("Cannot parse abstract")
220          return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))
221
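The abstract lookup above keys off the URL fragment and a named anchor; a minimal sketch with invented markup (the real abstract pages may differ):

    from urllib.parse import urldefrag
    from bs4 import BeautifulSoup

    # Placeholder abstract link, split into page URL and fragment.
    url, fragment = urldefrag("https://www.heldermann.de/JGG/abstracts.htm#a3")
    print(url)       # https://www.heldermann.de/JGG/abstracts.htm
    print(fragment)  # a3

    # Invented abstract page: the abstract <font> follows the <dt> holding the anchor.
    html = '<dl><dt><a name="a3">Some title</a></dt><font>Abstract text ...</font></dl>'
    anchor = BeautifulSoup(html, "html5lib").select_one(f"[name={fragment}]")
    print(anchor.find_parent("dt").find_next_sibling("font").text)  # Abstract text ...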

222      def parse_article_content(self, content, xissue, xarticle, url):
223          content = cleanup_str(content)
224          article_search = regex.search(self.article_page_re, content)
225          if not article_search:    [225 ↛ 226: condition never true]
226              if "This article plagiarizes" in content:
227                  return None
228              article_search = regex.search(self.article_page_re_2, content)
229
230          if not article_search:    [230 ↛ 231: condition never true]
231              raise ValueError("Couldn't parse article page")
232
233          article_dict = article_search.groupdict()
234
235          xarticle.abstracts.append(
236              create_abstract(tag="abstract", value_tex=article_dict["abstract"])
237          )
238          if article_dict.get("keywords", None) is not None:    [238 ↛ 242: condition always true]
239              for kwd in article_dict["keywords"].removesuffix(".").split(", "):
240                  xarticle.kwds.append(create_subj(value=kwd))
241
242          if article_dict.get("msc", None) is not None:    [242 ↛ 247: condition always true]
243              article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
244              for msc in article_dict["msc"].split(", "):
245                  xarticle.kwds.append(create_subj(type="msc", value=msc))
246
247          href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
248          href = href_soup.get("href")
249          if not isinstance(href, str):    [249 ↛ 250: condition never true]
250              raise ValueError("Article pdf cannot be parsed")
251          add_pdf_link_to_xarticle(xarticle, href)
252
253          # Paywall check on pdf
254          pdf_check = self.session.head(href)
255          if pdf_check.status_code == 401:    [255 ↛ 256: condition never true]
256              return None
257
258          return xarticle
259
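For illustration, the keyword and MSC handling above boils down to plain string splitting; the captured strings here are made up:

    keywords = "monotone operators, proximal point algorithm."  # hypothetical "Keywords:" capture
    print(keywords.removesuffix(".").split(", "))
    # ['monotone operators', 'proximal point algorithm']

    msc = "47H05; 90C25."                                        # hypothetical "MSC:" capture
    print(msc.replace(";", ",").removesuffix(".").split(", "))
    # ['47H05', '90C25']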

260      def decode_response(self, response: requests.Response, encoding: str = ""):
261          """Override this if the content-type headers from the source advertise something
262          other than the actual content. SASA needs this."""
263          if encoding != "":    [263 ↛ 264: condition never true]
264              response.encoding = encoding
265          return response.text
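A minimal sketch of what the override does with a non-empty encoding, using plain requests (the URL and charset are placeholders):

    import requests

    response = requests.get("https://www.heldermann.de/")
    response.encoding = "iso-8859-1"  # force the charset instead of trusting Content-Type
    text = response.text              # now decoded with the forced charset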