Coverage for src/crawler/by_source/heldermann_crawler.py: 8%

175 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from urllib.parse import urldefrag, urljoin

import regex
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
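    # Issue TOC entries look like "123-145 [Abstract] [Fulltext-pdf (...)]" or
    # "123-145 [Abstract / Full Text]"; article_re captures the page range plus
    # whichever of the three links is present.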

    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"
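    # Standalone article pages come in two layouts: article_page_re matches the
    # common one and article_page_re_2 is a fallback. Both capture the abstract,
    # the optional "Keywords:" and "MSC:" lists, and the full-text pdf link.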

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                self.logger.debug(f"Couldn't parse volume year for: {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes but no pages for issues (several issues share
        one volume page).

        Therefore, we must parse volume pages when crawling the collection.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # the first div should be the issue number,
        # the second div should be the issue contents.
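        # JCA volume 2 (jca02.htm) is special-cased below: the whole page is treated
        # as a single double issue "1-2" and every block is parsed as article content.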

        for index, el in enumerate(div):
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong
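                # Drop HTML comment nodes so that commented-out entries do not end up
                # in the serialized markup, which is then split into one string per
                # article on double <br/> tags.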

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article
        data sits directly in the issue page), so we must parse the article data here
        before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """
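        # Each entry is roughly "authors <br/> title <br/> pages [links]"; split on
        # <br/> and peel the pieces off from the front.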

        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':
            content_strs.pop(0)

        if len(content_strs) >= 3:
            authors_str = content_strs.pop(0)
            cut_index = authors_str.rfind(">")
            cut_index = cut_index + 1 if cut_index > 0 else 0
            authors_str = cleanup_str(authors_str[cut_index:])

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:
            self.logger.debug(
                "Couldn't find article url. Skipping article", extra={"url": issue_href}
            )
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove padding: 001 -> 1
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:
            xarticle.lpage = article_data["lpage"].lstrip("0")
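        # Entries with an "Abstract / Full Text" link get their article page URL stored
        # (handled later by parse_article_content); otherwise the abstract and pdf are
        # taken directly from the issue page.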

        if article_data["articleurl"] is not None:
            a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(issue_href, href)
        else:
            if article_data["abstracturl"] is not None:
                abstract_tag = BeautifulSoup(
                    article_data["abstracturl"], "html.parser"
                ).select_one("a")
                abstract_href = abstract_tag.get("href")
                if not isinstance(abstract_href, str):
                    raise ValueError("Couldn't parse abstract url")

                xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
                if xabstract is not None:
                    xarticle.abstracts.append(xabstract)

            if article_data["pdfurl"] is None:
                raise ValueError("Cannot find article pdf")

            pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))

        return xarticle

    def parse_heldermann_abstract(self, url: str):
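        # The abstract link points to a named anchor inside an abstracts page; keep
        # the fragment to locate the matching entry.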

        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            self.logger.debug(
                f"Couldn't parse abstract for url: {url} with fragment: {fragment}"
            )
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

    def parse_article_content(self, content, xissue, xarticle, url):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:
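            # Some pages only contain a plagiarism notice instead of the article;
            # skip those before trying the fallback layout.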

            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(create_abstract(value_tex=article_dict["abstract"]))
        if article_dict.get("keywords", None) is not None:
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):
            raise ValueError("Article pdf cannot be parsed")
        href = urljoin(url, href)
        add_pdf_link_to_xarticle(xarticle, href)

        # Paywall check on pdf
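        # Articles whose pdf is not openly accessible are dropped entirely.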

        is_openaccess, response, *_ = self.check_pdf_link_validity(href, session=self.session)
        if not is_openaccess:
            return None
        if getattr(response, "from_cache", False):
            self._wait_download_delay()

        return xarticle