Coverage for src / crawler / by_source / heldermann_crawler.py: 7%

217 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-04-08 09:35 +0000

1from urllib.parse import urldefrag, urljoin 

2 

3import regex 

4from bs4 import BeautifulSoup, Comment 

5from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser 

6from ptf.cmds.xml.jats.builder.issue import get_abstract_xml, get_single_title_xml 

7from ptf.model_data import ( 

8 IssueData, 

9 create_abstract, 

10 create_articledata, 

11 create_contributor, 

12 create_extlink, 

13 create_subj, 

14) 

15 

16from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

17from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

18 

19 

class HeldermannCrawler(MatchingCrawler):
    """Crawler for journals hosted by Heldermann Verlag (https://www.heldermann.de/)."""

    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    # Collection page link text: "Volume 12 (2004)" -> volume number and year.
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Volume page issue header: "Number 3" -> issue number.
    issue_re = r"Number (?P<number>\d+)"
    # Issue-page article entry: a page range (digits, occasionally containing a
    # literal "a"), followed by optional "[Abstract]", "[Fulltext-pdf (...)]"
    # and "[Abstract / Full Text]" anchor tags.
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    # Article page, primary layout: abstract sits between the "Abstract-pdf"
    # link and the optional "Keywords:" / "MSC:" sections, then the pdf anchor.
    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    # Article page, alternate layout: abstract wrapped in a colored <font> block.
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'

32 def parse_collection_content(self, content): 

33 xissues = [] 

34 soup = BeautifulSoup(content, "html5lib") 

35 issues = soup.select("b > a") 

36 for issue in issues: 

37 volume_search = regex.search(self.volume_re, issue.text) 

38 if not volume_search: 

39 self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping") 

40 continue 

41 issue_href = issue.get("href") 

42 if not isinstance(issue_href, str): 

43 raise ValueError("Couldn't parse issue href") 

44 volume_dict = volume_search.groupdict() 

45 parsed_issues = self.parse_heldermann_issue_content( 

46 urljoin(self.collection_url, issue_href), 

47 volume_dict["year"], 

48 volume_dict["volume"], 

49 ) 

50 

51 xissues.extend(parsed_issues) 

52 return xissues 

53 

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page)

        Therefore, we must parse volume pages when crawling the collection

        Returns the list of issues found on the volume page at ``url``, each
        populated with its parsed articles.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent :
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                # Special case: this volume page merges issues 1 and 2.
                # NOTE(review): this branch runs on *every* div of the page and
                # appends a fresh "1-2" issue each iteration — confirm that the
                # duplicates are intended/deduplicated downstream.
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1  # force the article-list branch below

            if index % 2 == 0:
                # Even divs carry the issue header ("Number N").
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                # Odd divs carry the article list, wrapped either in <strong>
                # or in nested <font> tags (sometimes with an extra <b>).
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                # Drop HTML comments (commented-out entries) before splitting.
                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                # Article entries are separated by a double <br/>.
                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags), unsafe=True)
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str, unsafe=True)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        # No anchor tag: not an article entry.
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    if xarticle.url is None:
                        # Article has no page of its own: prefix with the issue
                        # pid to keep the pid globally unique.
                        xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    else:
                        xarticle.pid = f"a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

129 

130 def parse_heldermann_article(self, article_content: str, issue_href: str): 

131 """ 

132 Parse an article's data directly from the issue page 

133 Some collections in Heldermann do not have a, article-specific page (article data in issue) 

134 so we must parse the article data first before proceeding. 

135 

136 https://www.heldermann.de/JGG/jgg02.htm 

137 """ 

138 

139 content_strs = article_content.split("<br/>") 

140 content_strs = [c for c in content_strs if c != ""] 

141 

142 authors_str = None 

143 # cleanup_str(content_strs[0]) 

144 

145 if content_strs[0] == '<font color="#0000A0" size="2"> ': 

146 content_strs.pop(0) 

147 

148 if len(content_strs) >= 3: 

149 authors_str = content_strs.pop(0) 

150 cut_index = authors_str.rfind(">") 

151 cut_index = cut_index + 1 if cut_index > 0 else 0 

152 authors_str = cleanup_str(authors_str[cut_index:]) 

153 

154 title_str = get_single_title_xml(content_strs[0]) 

155 

156 xarticle = create_articledata() 

157 

158 article_search = regex.search(self.article_re, content_strs[1]) 

159 if not article_search: 

160 self.logger.debug( 

161 "Couldn't find article url. Skipping article", extra={"url": issue_href} 

162 ) 

163 return None 

164 # raise ValueError("Couldn't find article url") 

165 

166 xarticle.title_tex = title_str 

167 

168 if authors_str: 

169 for a in authors_str.split(", "): 

170 author = create_contributor(role="author", string_name=a) 

171 if len(a) > 256: 

172 pass 

173 xarticle.contributors.append(author) 

174 

175 article_data = article_search.groupdict() 

176 # Remove padding : 001 -> 1 

177 xarticle.fpage = article_data["fpage"].lstrip("0") 

178 

179 if article_data["lpage"] is not None: 

180 xarticle.lpage = article_data["lpage"].lstrip("0") 

181 

182 if article_data["articleurl"] is not None: 

183 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a") 

184 href = a_tag.get("href") 

185 if not isinstance(href, str): 

186 raise ValueError("Couldn't parse article url") 

187 xarticle.url = urljoin(issue_href, href) 

188 else: 

189 if article_data["abstracturl"] is not None: 

190 abstract_tag = BeautifulSoup( 

191 article_data["abstracturl"], "html.parser" 

192 ).select_one("a") 

193 abstract_href = abstract_tag.get("href") 

194 if not isinstance(abstract_href, str): 

195 raise ValueError("Couldn't parse abstract url") 

196 

197 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href)) 

198 if xabstract is not None: 

199 xarticle.abstracts.append(xabstract) 

200 

201 if article_data["pdfurl"] is None: 

202 raise ValueError("Cannot find article pdf") 

203 

204 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a") 

205 pdf_href = pdf_tag.get("href") 

206 if not isinstance(pdf_href, str): 

207 raise ValueError("Couldn't parse pdf url") 

208 

209 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href)) 

210 ext_link = create_extlink( 

211 rel="source", location=issue_href, metadata=self.source_domain 

212 ) 

213 xarticle.ext_links.append(ext_link) 

214 return xarticle 

215 

    def parse_heldermann_abstract(self, url: str):
        """
        Download an abstract page and extract the abstract addressed by the
        URL fragment (e.g. ``...abstracts.htm#paper12``).

        Returns the abstract data, or None when the anchor is not found.
        Raises ValueError when the anchor exists but has no abstract text.
        """
        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        # The target is an element whose name attribute equals the fragment
        # (old-style <a name="..."> anchor).
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            self.logger.debug(
                f"Couldn't parse abstract for url : {url} with fragment : {fragment}"
            )
            return None
        # The abstract text is the <font> sibling following the anchor's <dt>.
        # NOTE(review): find_parent("dt") can return None on malformed pages,
        # which would raise AttributeError here — confirm pages are uniform.
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

231 

232 def parse_article_content(self, content, xissue, xarticle, url): 

233 soup = BeautifulSoup(content, "html5lib") 

234 content = cleanup_str(content, unsafe=True) 

235 article_search = regex.search(self.article_page_re, content) 

236 if not article_search: 

237 if "This article plagiarizes" in content: 

238 return None 

239 article_search = regex.search(self.article_page_re_2, content) 

240 

241 if not article_search: 

242 raise ValueError("Couldn't parse article page") 

243 

244 article_dict = article_search.groupdict() 

245 ckeditor_data = CkeditorParser( 

246 html_value=article_dict["abstract"], 

247 mml_formulas="", 

248 ) 

249 

250 abstract = create_abstract( 

251 lang="en", 

252 value_xml=get_abstract_xml(ckeditor_data.value_xml, lang="en"), 

253 value_tex=ckeditor_data.value_tex, 

254 value_html=ckeditor_data.value_html, 

255 ) 

256 

257 xarticle.abstracts.append(abstract) 

258 title_tags = soup.select("font[size='4'] b") 

259 if len(title_tags) == 1: 

260 xarticle.title_tex = get_single_title_xml( 

261 str(title_tags[0]) 

262 .lstrip("<b>") 

263 .rstrip("</b>") 

264 .strip() 

265 .replace("&lt;", "<") 

266 .replace("&gt;", ">") 

267 ) 

268 

269 contributors = [] 

270 author = None 

271 author_tags = soup.select("font[size='3']") 

272 for author_tag in author_tags: 

273 author_name = cleanup_str(author_tag.get_text(), unsafe=True) 

274 if len(author_name) > 256: 

275 continue 

276 if author_name != "\x86": 

277 # 1 author has a dagger (deceased after publication) and the HTML becomes worse than usual 

278 # Ignore it and append address/email to the previous author 

279 author = create_contributor(role="author", string_name=author_name) 

280 

281 siblings = author_tag.find_next_sibling("font") 

282 if siblings: 

283 for sibling in siblings: 

284 parent = sibling.parent 

285 if parent.name == "font" and parent.get("size") == "2": 

286 children = sibling.contents 

287 

288 pos = 0 

289 keep_searching_for_address = True 

290 while pos < len(children) and keep_searching_for_address: 

291 if isinstance(children[pos], str): 

292 address = cleanup_str( 

293 children[pos].get_text(), unsafe=True 

294 ).replace("and: ", "") 

295 if address: 

296 author["addresses"].append(address) 

297 elif children[pos].name == "a": 

298 keep_searching_for_address = False 

299 href = children[pos].get("href") 

300 if href.find("mailto:") == 0: 

301 email = cleanup_str(children[pos].get_text(), unsafe=True) 

302 author["email"] = email 

303 pos += 1 

304 if author_name != "\x86" and author: 

305 contributors.append(author) 

306 xarticle.contributors = contributors 

307 

308 if article_dict.get("keywords", None) is not None: 

309 for kwd in article_dict["keywords"].removesuffix(".").split(", "): 

310 xarticle.kwds.append(create_subj(value=kwd)) 

311 

312 if article_dict.get("msc", None) is not None: 

313 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".") 

314 for msc in article_dict["msc"].split(", "): 

315 xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc))) 

316 

317 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a") 

318 href = href_soup.get("href") 

319 if not isinstance(href, str): 

320 raise ValueError("Article pdf cannot be parsed") 

321 href = urljoin(url, href) 

322 add_pdf_link_to_xarticle(xarticle, href) 

323 

324 # Paywall check on pdf 

325 is_openaccess, response, *_ = self.check_pdf_link_validity(href) 

326 if not is_openaccess: 

327 return None 

328 

329 return xarticle