Coverage for src / crawler / by_source / heldermann_crawler.py: 8%

177 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-03-19 14:59 +0000

1from urllib.parse import urldefrag, urljoin 

2 

3import regex 

4from bs4 import BeautifulSoup, Comment 

5from ptf.model_data import ( 

6 IssueData, 

7 create_abstract, 

8 create_articledata, 

9 create_contributor, 

10 create_extlink, 

11 create_subj, 

12) 

13 

14from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

15from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

16 

17 

class HeldermannCrawler(MatchingCrawler):
    """Crawler for journals hosted by Heldermann Verlag (www.heldermann.de)."""

    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    # Matches "Volume 12 (2004)" in the collection index -> volume number + year.
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Matches "Number 3" in a volume page -> issue number.
    issue_re = r"Number (?P<number>\d+)"
    # Matches one table-of-contents entry: page range (digits, occasionally with
    # an 'a' suffix) followed by one of the observed link layouts:
    # optional [Abstract] + [Fulltext-pdf (...)], or a combined
    # [Abstract / Full Text] link pointing at a dedicated article page.
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    # Two observed layouts of the dedicated article page: abstract text after an
    # "Abstract-pdf" link (optionally wrapped in an html comment), then optional
    # "Keywords:" and "MSC:" sections, then the pdf link.
    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'

29 

30 def parse_collection_content(self, content): 

31 xissues = [] 

32 soup = BeautifulSoup(content, "html5lib") 

33 issues = soup.select("b > a") 

34 for issue in issues: 

35 volume_search = regex.search(self.volume_re, issue.text) 

36 if not volume_search: 

37 self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping") 

38 continue 

39 issue_href = issue.get("href") 

40 if not isinstance(issue_href, str): 

41 raise ValueError("Couldn't parse issue href") 

42 volume_dict = volume_search.groupdict() 

43 parsed_issues = self.parse_heldermann_issue_content( 

44 urljoin(self.collection_url, issue_href), 

45 volume_dict["year"], 

46 volume_dict["volume"], 

47 ) 

48 

49 xissues.extend(parsed_issues) 

50 return xissues 

51 

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page)

        Therefore, we must parse volume pages when crawling the collection
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent :
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            # Special case: this volume page has no "Number ..." headers, so a
            # single "1-2" issue is created and every div is treated as content.
            # NOTE(review): this runs on every iteration, so an issue is appended
            # once per div — confirm this page really has a single content div.
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                # Even divs are issue headers: read the issue number and open a
                # new issue; articles found later are attached to it.
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                # Odd divs hold the table of contents; the article list lives in
                # a <strong>, or a nested <font>, possibly wrapped in a <b>.
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                if a_tags is None:
                    raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

            # Drop html comments (commented-out entries) before serializing.
            for child in a_tags.contents:
                if isinstance(child, Comment):
                    child.extract()

            # Entries are separated by a double <br/>; strip the wrapper tags
            # left over from the container element before splitting.
            articles_tags = regex.split(
                r"<br\/> ?<br\/>",
                cleanup_str(str(a_tags), unsafe=True)
                .removeprefix("<strong>")
                .removeprefix("<b>")
                .removesuffix("</strong>")
                .removesuffix("</b>"),
            )

            article_index = 0
            for a_str in articles_tags:
                a_str = cleanup_str(a_str, unsafe=True)
                if a_str == "":
                    continue
                # Fragments without a link cannot be articles (no pdf/abstract).
                if "</a>" not in a_str:
                    continue
                if not current_issue:
                    raise ValueError("Error while parsing issue articles")
                xarticle = self.parse_heldermann_article(a_str, url)
                if xarticle is None:
                    continue
                # Inline articles (no dedicated page) get a fully-qualified pid;
                # articles with their own url are re-parsed later and keep a
                # relative pid.
                if xarticle.url is None:
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                else:
                    xarticle.pid = f"a{article_index}"
                article_index += 1
                current_issue.articles.append(xarticle)
        return xissues

127 

128 def parse_heldermann_article(self, article_content: str, issue_href: str): 

129 """ 

130 Some collections in Heldermann do not have a, article-specific page (article data in issue) 

131 so we must parse the article data first before proceeding. 

132 

133 https://www.heldermann.de/JGG/jgg02.htm 

134 """ 

135 

136 content_strs = article_content.split("<br/>") 

137 content_strs = [c for c in content_strs if c != ""] 

138 

139 authors_str = None 

140 # cleanup_str(content_strs[0]) 

141 

142 if content_strs[0] == '<font color="#0000A0" size="2"> ': 

143 content_strs.pop(0) 

144 

145 if len(content_strs) >= 3: 

146 authors_str = content_strs.pop(0) 

147 cut_index = authors_str.rfind(">") 

148 cut_index = cut_index + 1 if cut_index > 0 else 0 

149 authors_str = cleanup_str(authors_str[cut_index:]) 

150 

151 title_str = cleanup_str(content_strs[0]) 

152 

153 xarticle = create_articledata() 

154 

155 article_search = regex.search(self.article_re, content_strs[1]) 

156 if not article_search: 

157 self.logger.debug( 

158 "Couldn't find article url. Skipping article", extra={"url": issue_href} 

159 ) 

160 return None 

161 # raise ValueError("Couldn't find article url") 

162 

163 xarticle.title_tex = title_str 

164 

165 if authors_str: 

166 for a in authors_str.split(", "): 

167 author = create_contributor(role="author", string_name=a) 

168 if len(a) > 256: 

169 pass 

170 xarticle.contributors.append(author) 

171 

172 article_data = article_search.groupdict() 

173 # Remove padding : 001 -> 1 

174 xarticle.fpage = article_data["fpage"].rstrip("0") 

175 

176 if article_data["lpage"] is not None: 

177 xarticle.lpage = article_data["lpage"].rstrip("0") 

178 

179 if article_data["articleurl"] is not None: 

180 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a") 

181 href = a_tag.get("href") 

182 if not isinstance(href, str): 

183 raise ValueError("Couldn't parse article url") 

184 xarticle.url = urljoin(issue_href, href) 

185 else: 

186 if article_data["abstracturl"] is not None: 

187 abstract_tag = BeautifulSoup( 

188 article_data["abstracturl"], "html.parser" 

189 ).select_one("a") 

190 abstract_href = abstract_tag.get("href") 

191 if not isinstance(abstract_href, str): 

192 raise ValueError("Couldn't parse abstract url") 

193 

194 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href)) 

195 if xabstract is not None: 

196 xarticle.abstracts.append(xabstract) 

197 

198 if article_data["pdfurl"] is None: 

199 raise ValueError("Cannot find article pdf") 

200 

201 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a") 

202 pdf_href = pdf_tag.get("href") 

203 if not isinstance(pdf_href, str): 

204 raise ValueError("Couldn't parse pdf url") 

205 

206 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href)) 

207 ext_link = create_extlink( 

208 rel="source", location=issue_href, metadata=self.source_domain 

209 ) 

210 xarticle.ext_links.append(ext_link) 

211 return xarticle 

212 

213 def parse_heldermann_abstract(self, url: str): 

214 url, fragment = urldefrag(url) 

215 content = self.download_file(url) 

216 content = cleanup_str(content) 

217 soup = BeautifulSoup(content, "html5lib") 

218 abstract_title = soup.select_one(f"[name={fragment}]") 

219 if not abstract_title: 

220 self.logger.debug( 

221 f"Couldn't parse abstract for url : {url} with fragment : {fragment}" 

222 ) 

223 return None 

224 abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font") 

225 if not abstract_tag: 

226 raise ValueError("Cannot parse abstract") 

227 return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text)) 

228 

229 def parse_article_content(self, content, xissue, xarticle, url): 

230 content = cleanup_str(content, unsafe=True) 

231 article_search = regex.search(self.article_page_re, content) 

232 if not article_search: 

233 if "This article plagiarizes" in content: 

234 return None 

235 article_search = regex.search(self.article_page_re_2, content) 

236 

237 if not article_search: 

238 raise ValueError("Couldn't parse article page") 

239 

240 article_dict = article_search.groupdict() 

241 

242 xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(article_dict["abstract"]))) 

243 

244 if article_dict.get("keywords", None) is not None: 

245 for kwd in article_dict["keywords"].removesuffix(".").split(", "): 

246 xarticle.kwds.append(create_subj(value=kwd)) 

247 

248 if article_dict.get("msc", None) is not None: 

249 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".") 

250 for msc in article_dict["msc"].split(", "): 

251 xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc))) 

252 

253 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a") 

254 href = href_soup.get("href") 

255 if not isinstance(href, str): 

256 raise ValueError("Article pdf cannot be parsed") 

257 href = urljoin(url, href) 

258 add_pdf_link_to_xarticle(xarticle, href) 

259 

260 # Paywall check on pdf 

261 is_openaccess, response, *_ = self.check_pdf_link_validity(href) 

262 if not is_openaccess: 

263 return None 

264 

265 return xarticle