Coverage for src/crawler/by_source/heldermann_crawler.py: 82%
177 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
from urllib.parse import urldefrag, urljoin

import regex
import requests
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
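
    # Illustrative listing lines that article_re is meant to capture (the hrefs and file
    # sizes below are invented examples, not taken from the site):
    #   '1-15 [<a href="jgg01.htm#a1">Abstract / Full Text</a>]'
    #   '001-015 [<a href="abstracts/jca0101.html">Abstract</a>] [<a href="jca0101.pdf">Fulltext-pdf (156 k)</a>]'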

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page).
        Therefore, we must parse volume pages when crawling the collection.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                        current_issue = self.create_xissue(None, year, volume, issue_number)
                        xissues.append(current_issue)
                        continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                if a_tags is None:
                    raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article data
        is embedded in the issue page), so we must parse the article data first before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """
        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':
            content_strs.pop(0)

        if len(content_strs) >= 3:
            authors_str = content_strs.pop(0)
            cut_index = authors_str.rfind(">")
            cut_index = cut_index + 1 if cut_index > 0 else 0
            authors_str = cleanup_str(authors_str[cut_index:])

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:
            self.logger.debug(
                "Couldn't find article url. Skipping article", extra={"url": issue_href}
            )
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove padding: 001 -> 1
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:
            xarticle.lpage = article_data["lpage"].lstrip("0")

        if article_data["articleurl"] is not None:
            a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(issue_href, href)
        else:
            if article_data["abstracturl"] is not None:
                abstract_tag = BeautifulSoup(
                    article_data["abstracturl"], "html.parser"
                ).select_one("a")
                abstract_href = abstract_tag.get("href")
                if not isinstance(abstract_href, str):
                    raise ValueError("Couldn't parse abstract url")

                xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
                if xabstract is not None:
                    xarticle.abstracts.append(xabstract)

            if article_data["pdfurl"] is None:
                raise ValueError("Cannot find article pdf")

            pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))

        return xarticle

    def parse_heldermann_abstract(self, url: str):
        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            self.logger.debug(
                f"Couldn't parse abstract for url : {url} with fragment : {fragment}"
            )
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

    def parse_article_content(self, content, xissue, xarticle, url):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:
            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(
            create_abstract(tag="abstract", value_tex=article_dict["abstract"])
        )
        if article_dict.get("keywords", None) is not None:
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):
            raise ValueError("Article pdf cannot be parsed")
        add_pdf_link_to_xarticle(xarticle, href)

        # Paywall check on pdf
        pdf_check = self.session.head(href)
        if pdf_check.status_code == 401:
            return None

        return xarticle

    def decode_response(self, response: requests.Response, encoding: str = ""):
        """Override this if the content-type headers from the sources advertise something
        other than the actual content. SASA needs this."""
        if encoding != "":
            response.encoding = encoding
        return response.text
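

if __name__ == "__main__":
    # Minimal, hypothetical sketch (not used by the crawler framework): exercise article_re
    # against an invented issue-listing snippet to show which named groups it captures.
    sample = '001-015 [<a href="jca0101.pdf">Fulltext-pdf (156 k)</a>]'
    match = regex.search(HeldermannCrawler.article_re, sample)
    if match:
        # Expect fpage="001", lpage="015" and pdfurl holding the <a> tag.
        print(match.groupdict())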