Coverage for src/crawler/by_source/heldermann_crawler.py: 82%

173 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

1 from urllib.parse import urldefrag, urljoin

2

3 import regex

4 import requests

5 from bs4 import BeautifulSoup, Comment

6 from ptf.model_data import (

7 IssueData, 

8 create_abstract, 

9 create_articledata, 

10 create_contributor, 

11 create_subj, 

12) 

13 

14 from crawler.base_crawler import BaseCollectionCrawler

15 from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

16 

17 

18 class HeldermannCrawler(BaseCollectionCrawler):

19 source_name = "Heldermann Verlag" 

20 source_domain = "HELDERMANN" 

21 source_website = "https://www.heldermann.de/" 

22 

23 volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)" 

24 issue_re = r"Number (?P<number>\d+)" 

25 article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?" 
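# Illustrative sketch (assumed shapes, not copied from the site): article_re targets
# table-of-contents entries where a page range is followed by an optional Abstract link,
# a Fulltext-pdf link, or a combined "Abstract / Full Text" link, e.g.
#   001--015 [<a href="...">Abstract</a>] [<a href="...">Fulltext-pdf (120 KB)</a>]
#   017--032 [<a href="...">Abstract / Full Text</a>]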

26 

27 article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)" 

28 article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)' 

29 

30 def parse_collection_content(self, content): 

31 xissues = [] 

32 soup = BeautifulSoup(content, "html5lib") 

33 issues = soup.select("b > a") 

34 for issue in issues: 

35 volume_search = regex.search(self.volume_re, issue.text) 

36 if not volume_search: 

37 print(f"Couldn't parse volume year for : {issue.text}. Skipping") 

38 continue 

39 issue_href = issue.get("href") 

40 if not isinstance(issue_href, str):  [40 ↛ 41: line 40 didn't jump to line 41 because the condition on line 40 was never true]

41 raise ValueError("Couldn't parse issue href") 

42 volume_dict = volume_search.groupdict() 

43 parsed_issues = self.parse_heldermann_issue_content( 

44 urljoin(self.collection_url, issue_href), 

45 volume_dict["year"], 

46 volume_dict["volume"], 

47 ) 

48 

49 xissues.extend(parsed_issues) 

50 return xissues 

51 

52 def parse_heldermann_issue_content(self, url, year, volume): 

53 """ 

54 Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page) 

55 

56 Therefore, we must parse volume pages when crawling the collection 

57 """ 

58 content = self.download_file(url) 

59 soup = BeautifulSoup(content, "html5lib") 

60 div = soup.select("div[align='center']") 

61 xissues = [] 

62 current_issue: IssueData | None = None 

63 # Let's hope the website is consistent: 

64 # first div should be the issue number 

65 # second div should be the issue contents 
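# Assumed layout (illustrative, not copied from the site):
#   <div align="center">... Number 1 ...</div>                   <- even index: issue header
#   <div align="center"><strong>TOC entries ...</strong></div>   <- odd index: issue contents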

66 for index, el in enumerate(div): 

67 if url == "https://www.heldermann.de/JCA/jca02.htm":  [67 ↛ 68: line 67 didn't jump to line 68 because the condition on line 67 was never true]

68 current_issue = self.create_xissue(None, year, volume, "1-2") 

69 xissues.append(current_issue) 

70 index = 1 

71 

72 if index % 2 == 0: 

73 title = el.select_one("td:first-child font:-soup-contains('Number ')") 

74 if title:  [74 ↛ 66: line 74 didn't jump to line 66 because the condition on line 74 was always true]

75 issue_number = None 

76 number_search = regex.search(self.issue_re, title.text) 

77 if number_search:  [77 ↛ 66: line 77 didn't jump to line 66 because the condition on line 77 was always true]

78 number_data = number_search.groupdict() 

79 issue_number = number_data["number"] 

80 current_issue = self.create_xissue(None, year, volume, issue_number) 

81 xissues.append(current_issue) 

82 continue 

83 else: 

84 strong = el.select_one("strong") 

85 if strong: 

86 a_tags = strong 

87 else: 

88 a_tags = el.select_one("font font:last-child") 

89 if a_tags is None:  [89 ↛ 90: line 89 didn't jump to line 90 because the condition on line 89 was never true]

90 raise ValueError("Couldn't parse issue data") 

91 if a_tags and a_tags.select_one("b"): 

92 a_tags = a_tags.select_one("b") 

93 del strong 

94 

95 for child in a_tags.contents: 

96 if isinstance(child, Comment): 

97 child.extract() 

98 

99 articles_tags = regex.split( 

100 r"<br\/> ?<br\/>", 

101 cleanup_str(str(a_tags)) 

102 .removeprefix("<strong>") 

103 .removeprefix("<b>") 

104 .removesuffix("</strong>") 

105 .removeprefix("</b>"), 

106 ) 

107 

108 article_index = 0 

109 for a_str in articles_tags: 

110 a_str = cleanup_str(a_str) 

111 if a_str == "": 

112 continue 

113 if "</a>" not in a_str: 

114 continue 

115 if not current_issue:  [115 ↛ 116: line 115 didn't jump to line 116 because the condition on line 115 was never true]

116 raise ValueError("Error while parsing issue articles") 

117 xarticle = self.parse_heldermann_article(a_str, url) 

118 if xarticle is None:  [118 ↛ 119: line 118 didn't jump to line 119 because the condition on line 118 was never true]

119 continue 

120 xarticle.pid = f"{current_issue.pid}_a{article_index}" 

121 article_index += 1 

122 current_issue.articles.append(xarticle) 

123 return xissues 

124 

125 def parse_heldermann_article(self, article_content: str, issue_href: str): 

126 """ 

127 Some collections in Heldermann do not have an article-specific page (the article data is embedded in the issue page),

128 so we must parse the article data here before proceeding.

129 

130 https://www.heldermann.de/JGG/jgg02.htm 

131 """ 

132 

133 content_strs = article_content.split("<br/>") 

134 content_strs = [c for c in content_strs if c != ""] 

135 

136 authors_str = None 

137 # cleanup_str(content_strs[0]) 

138 

139 if content_strs[0] == '<font color="#0000A0" size="2"> ':  [139 ↛ 140: line 139 didn't jump to line 140 because the condition on line 139 was never true]

140 content_strs.pop(0) 

141 

142 if len(content_strs) >= 3:  [142 ↛ 148: line 142 didn't jump to line 148 because the condition on line 142 was always true]

143 authors_str = content_strs.pop(0) 

144 cut_index = authors_str.rfind(">") 

145 cut_index = cut_index + 1 if cut_index > 0 else 0 

146 authors_str = cleanup_str(authors_str[cut_index:]) 

147 

148 title_str = cleanup_str(content_strs[0]) 

149 

150 xarticle = create_articledata() 

151 

152 article_search = regex.search(self.article_re, content_strs[1]) 

153 if not article_search:  [153 ↛ 154: line 153 didn't jump to line 154 because the condition on line 153 was never true]

154 print(f"Couldn't find article url. Skipping article. {issue_href}") 

155 return None 

156 # raise ValueError("Couldn't find article url") 

157 

158 xarticle.title_tex = title_str 

159 

160 if authors_str: 

161 for a in authors_str.split(", "): 

162 author = create_contributor(role="author", string_name=a) 

163 if len(a) > 256:  [163 ↛ 164: line 163 didn't jump to line 164 because the condition on line 163 was never true]

164 pass 

165 xarticle.contributors.append(author) 

166 

167 article_data = article_search.groupdict() 

168 # Remove leading zero padding: 001 -> 1 

169 xarticle.fpage = article_data["fpage"].lstrip("0") 

170 

171 if article_data["lpage"] is not None:  [171 ↛ 174: line 171 didn't jump to line 174 because the condition on line 171 was always true]

172 xarticle.lpage = article_data["lpage"].lstrip("0") 

173 

174 if article_data["articleurl"] is not None: 

175 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a") 

176 href = a_tag.get("href") 

177 if not isinstance(href, str):  [177 ↛ 178: line 177 didn't jump to line 178 because the condition on line 177 was never true]

178 raise ValueError("Couldn't parse article url") 

179 xarticle.url = urljoin(issue_href, href) 

180 else: 

181 if article_data["abstracturl"] is not None: 

182 abstract_tag = BeautifulSoup( 

183 article_data["abstracturl"], "html.parser" 

184 ).select_one("a") 

185 abstract_href = abstract_tag.get("href") 

186 if not isinstance(abstract_href, str):  [186 ↛ 187: line 186 didn't jump to line 187 because the condition on line 186 was never true]

187 raise ValueError("Couldn't parse abstract url") 

188 

189 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href)) 

190 if xabstract is not None: 

191 xarticle.abstracts.append(xabstract) 

192 

193 if article_data["pdfurl"] is None:  [193 ↛ 194: line 193 didn't jump to line 194 because the condition on line 193 was never true]

194 raise ValueError("Cannot find article pdf") 

195 

196 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a") 

197 pdf_href = pdf_tag.get("href") 

198 if not isinstance(pdf_href, str):  [198 ↛ 199: line 198 didn't jump to line 199 because the condition on line 198 was never true]

199 raise ValueError("Couldn't parse pdf url") 

200 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href)) 

201 

202 return xarticle 

203 

204 def parse_heldermann_abstract(self, url: str): 

205 url, fragment = urldefrag(url) 

206 content = self.download_file(url) 

207 content = cleanup_str(content) 

208 soup = BeautifulSoup(content, "html5lib") 
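# Assumed markup (illustrative): the URL fragment names an anchor such as <a name="ab3">
# inside a <dt>; the abstract text is expected in the <font> sibling following that <dt>.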

209 abstract_title = soup.select_one(f"[name={fragment}]") 

210 if not abstract_title: 

211 print(f"Couldn't parse abstract for url : {url} with fragment : {fragment}") 

212 return None 

213 abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font") 

214 if not abstract_tag:  [214 ↛ 215: line 214 didn't jump to line 215 because the condition on line 214 was never true]

215 raise ValueError("Cannot parse abstract") 

216 return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text)) 

217 

218 def parse_article_content(self, content, xissue, xarticle, url): 

219 content = cleanup_str(content) 

220 article_search = regex.search(self.article_page_re, content) 

221 if not article_search:  [221 ↛ 222: line 221 didn't jump to line 222 because the condition on line 221 was never true]

222 if "This article plagiarizes" in content: 

223 return None 

224 article_search = regex.search(self.article_page_re_2, content) 

225 

226 if not article_search:  [226 ↛ 227: line 226 didn't jump to line 227 because the condition on line 226 was never true]

227 raise ValueError("Couldn't parse article page") 

228 

229 article_dict = article_search.groupdict() 

230 

231 xarticle.abstracts.append( 

232 create_abstract(tag="abstract", value_tex=article_dict["abstract"]) 

233 ) 

234 if article_dict.get("keywords", None) is not None: 234 ↛ 238line 234 didn't jump to line 238 because the condition on line 234 was always true

235 for kwd in article_dict["keywords"].removesuffix(".").split(", "): 

236 xarticle.kwds.append(create_subj(value=kwd)) 

237 

238 if article_dict.get("msc", None) is not None: 238 ↛ 243line 238 didn't jump to line 243 because the condition on line 238 was always true

239 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".") 

240 for msc in article_dict["msc"].split(", "): 

241 xarticle.kwds.append(create_subj(type="msc", value=msc)) 

242 

243 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a") 

244 href = href_soup.get("href") 

245 if not isinstance(href, str):  [245 ↛ 246: line 245 didn't jump to line 246 because the condition on line 245 was never true]

246 raise ValueError("Article pdf cannot be parsed") 

247 add_pdf_link_to_xarticle(xarticle, href) 

248 

249 return xarticle 

250 

251 def decode_response(self, response: requests.Response, encoding: str = "ISO-8859-1"): 

252 """Override this if the content-type headers from the sources are advertising something else than the actual content 

253 SASA needs this""" 
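# Hypothetical override example (illustrative, not part of this crawler): a subclass
# needing a different charset could call this with another default, e.g.
#   def decode_response(self, response, encoding="UTF-8"):
#       return super().decode_response(response, encoding)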

254 response.encoding = encoding 

255 return response.text
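A minimal sketch of exercising the TOC-line parser above in isolation, assuming a src layout
that makes crawler.by_source importable; the HTML fragment and the __new__-based construction
(which skips the base-class __init__) are illustrative assumptions, not taken from the source:

    from crawler.by_source.heldermann_crawler import HeldermannCrawler

    # Build an instance without running the base-class constructor (sketch only).
    crawler = HeldermannCrawler.__new__(HeldermannCrawler)
    fragment = (
        "A. Author, B. Author<br/>"
        "A Hypothetical Title<br/>"
        '001--015 [<a href="jgg02001.pdf">Fulltext-pdf (120 KB)</a>]'
    )
    xarticle = crawler.parse_heldermann_article(fragment, "https://www.heldermann.de/JGG/jgg02.htm")
    print(xarticle.title_tex, xarticle.fpage, xarticle.lpage)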