Coverage for src/crawler/by_source/heldermann_crawler.py: 83%

162 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urldefrag, urljoin

import regex
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
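    # Illustrative examples of the strings these patterns are written to match
    # (the values are made up; the shapes follow from the regexes above):
    #   volume_re matches headings like "Volume 12 (2004)"  -> volume="12", year="2004"
    #   issue_re  matches headings like "Number 3"          -> number="3"
    #   article_re matches table-of-contents lines such as
    #   "001-015 [<a ...>Abstract / Full Text</a>]"         -> fpage, lpage and one of the link groups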

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                print(f"Couldn't parse volume year for : {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for individual issues
        (multiple issues live on one volume page).

        Therefore, we must parse volume pages when crawling the collection.
        """

        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # first div should be the issue number,
        # second div should be the issue contents
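        # Illustrative layout this loop assumes (hypothetical markup, values made up):
        #   <div align="center"> ... <font>Number 1</font> ... </div>      <- issue header
        #   <div align="center"> <strong>toc entries ...</strong> </div>   <- issue contents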

        for index, el in enumerate(div):
            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:  # coverage: condition always true
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:  # coverage: condition always true
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                        current_issue = self.create_xissue(None, year, volume, issue_number)
                        xissues.append(current_issue)
                        continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                if a_tags is None:  # coverage: condition never true
                    raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:  # coverage: condition never true
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:  # coverage: condition never true
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article
        data is embedded in the issue page), so we must parse the article data here
        before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """
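        # Illustrative shape of `article_content` (hypothetical values, inferred from the
        # splitting and regex logic below):
        #   "Author One, Author Two<br/>Title of the article<br/>001-015 [<a ...>Abstract / Full Text</a>]"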

        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':  # coverage: condition never true
            content_strs.pop(0)

        if len(content_strs) >= 3:  # coverage: condition always true
            authors_str = content_strs.pop(0)

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:  # coverage: condition never true
            print(f"Couldn't find article url. Skipping article. {issue_href}")
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:  # coverage: condition always true
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:  # coverage: condition never true
                    # placeholder guard: overly long author strings are currently ignored
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove zero padding: "001" -> "1"
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:  # coverage: condition always true
            xarticle.lpage = article_data["lpage"].lstrip("0")

        if article_data["articleurl"] is not None:
            a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
            href = a_tag.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(issue_href, href)
        else:
            if article_data["abstracturl"] is not None:
                abstract_tag = BeautifulSoup(
                    article_data["abstracturl"], "html.parser"
                ).select_one("a")
                abstract_href = abstract_tag.get("href")
                if not isinstance(abstract_href, str):  # coverage: condition never true
                    raise ValueError("Couldn't parse abstract url")

                xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
                if xabstract is not None:
                    xarticle.abstracts.append(xabstract)

            if article_data["pdfurl"] is None:  # coverage: condition never true
                raise ValueError("Cannot find article pdf")

            pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))

        return xarticle

    def parse_heldermann_abstract(self, url: str):
        url, fragment = urldefrag(url)
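        # Illustrative only (made-up values): urldefrag("jgg02abs.htm#a3") -> ("jgg02abs.htm", "a3");
        # the fragment names the <a name=...> anchor looked up below for this article's abstract.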

        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            print(f"Couldn't parse abstract for url : {url} with fragment : {fragment}")
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:  # coverage: condition never true
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:  # coverage: condition never true
            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:  # coverage: condition never true
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(
            create_abstract(tag="abstract", value_tex=article_dict["abstract"])
        )
        if article_dict.get("keywords", None) is not None:  # coverage: condition always true
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:  # coverage: condition always true
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):  # coverage: condition never true
            raise ValueError("Article pdf cannot be parsed")
        add_pdf_link_to_xarticle(xarticle, href)

        return super().parse_article_content(content, xissue, xarticle, url, pid)
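
Since the table-of-contents parsing hinges entirely on the class-level patterns, a minimal sanity check for them is sketched below. It assumes the module is importable as crawler.by_source.heldermann_crawler (matching the path in the header); the sample strings are made up but follow the shapes the patterns expect.

import regex

from crawler.by_source.heldermann_crawler import HeldermannCrawler

# Volume heading: "Volume <n> (<year>)"
m = regex.search(HeldermannCrawler.volume_re, "Volume 12 (2004)")
assert m and m["volume"] == "12" and m["year"] == "2004"

# Issue heading: "Number <n>"
m = regex.search(HeldermannCrawler.issue_re, "Number 3")
assert m and m["number"] == "3"

# Table-of-contents line with a combined abstract/fulltext link (hypothetical href)
m = regex.search(
    HeldermannCrawler.article_re,
    '001-015 [<a href="jgg02.htm#a1">Abstract / Full Text</a>]',
)
assert m and m["fpage"] == "001" and m["lpage"] == "015" and m["articleurl"] is not None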