Coverage for src / crawler / by_source / heldermann_crawler.py: 7%
217 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-04-08 09:35 +0000
1from urllib.parse import urldefrag, urljoin
3import regex
4from bs4 import BeautifulSoup, Comment
5from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
6from ptf.cmds.xml.jats.builder.issue import get_abstract_xml, get_single_title_xml
7from ptf.model_data import (
8 IssueData,
9 create_abstract,
10 create_articledata,
11 create_contributor,
12 create_extlink,
13 create_subj,
14)
16from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
17from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class HeldermannCrawler(MatchingCrawler):
    """Crawler for the Heldermann Verlag website.

    Volume pages list several issues (and sometimes full article metadata)
    inline, so most of the parsing happens while crawling the collection.
    """

    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    # "Volume 12 (2005)" -> volume number + year
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # "Number 3" -> issue number
    issue_re = r"Number (?P<number>\d+)"
    # Page range followed by the bracketed [Abstract] / [Fulltext-pdf] links,
    # or a single [Abstract / Full Text] link to a dedicated article page.
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    # Article page layout 1 : abstract, optional keywords, optional MSC codes, pdf link
    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    # Article page layout 2 (fallback) : same capture groups, different surrounding markup
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
32 def parse_collection_content(self, content):
33 xissues = []
34 soup = BeautifulSoup(content, "html5lib")
35 issues = soup.select("b > a")
36 for issue in issues:
37 volume_search = regex.search(self.volume_re, issue.text)
38 if not volume_search:
39 self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping")
40 continue
41 issue_href = issue.get("href")
42 if not isinstance(issue_href, str):
43 raise ValueError("Couldn't parse issue href")
44 volume_dict = volume_search.groupdict()
45 parsed_issues = self.parse_heldermann_issue_content(
46 urljoin(self.collection_url, issue_href),
47 volume_dict["year"],
48 volume_dict["volume"],
49 )
51 xissues.extend(parsed_issues)
52 return xissues
    def parse_heldermann_issue_content(self, url: str, year: str, volume: str) -> list[IssueData]:
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page)

        Therefore, we must parse volume pages when crawling the collection

        Returns the list of issues found on the volume page at `url`, each
        populated with its parsed articles.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent :
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            # Special case : this volume page has no "Number" header div ; its
            # content is a double issue ("1-2"). Forcing index to 1 routes it
            # straight to the article-parsing branch below.
            # NOTE(review): if that page ever has more than one centered div,
            # this creates one duplicate issue per div — confirm against the page.
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                # Even divs : issue header ("Number N") — start a new issue.
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                # Odd divs : issue contents. The article list lives either in a
                # <strong> tag or, failing that, in the last nested <font> tag.
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

            # Strip HTML comments (some pages comment entries out)
            for child in a_tags.contents:
                if isinstance(child, Comment):
                    child.extract()

            # Articles are separated by two consecutive <br/> tags
            articles_tags = regex.split(
                r"<br\/> ?<br\/>",
                cleanup_str(str(a_tags), unsafe=True)
                .removeprefix("<strong>")
                .removeprefix("<b>")
                .removesuffix("</strong>")
                .removesuffix("</b>"),
            )

            article_index = 0
            for a_str in articles_tags:
                a_str = cleanup_str(a_str, unsafe=True)
                if a_str == "":
                    continue
                if "</a>" not in a_str:
                    # No link at all : not an article entry
                    continue
                if not current_issue:
                    raise ValueError("Error while parsing issue articles")
                xarticle = self.parse_heldermann_article(a_str, url)
                if xarticle is None:
                    continue
                if xarticle.url is None:
                    # Inline article (no dedicated page) : assign the final pid now
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                else:
                    # Article has its own page, crawled later : temporary pid
                    xarticle.pid = f"a{article_index}"
                article_index += 1
                current_issue.articles.append(xarticle)
        return xissues
130 def parse_heldermann_article(self, article_content: str, issue_href: str):
131 """
132 Parse an article's data directly from the issue page
133 Some collections in Heldermann do not have a, article-specific page (article data in issue)
134 so we must parse the article data first before proceeding.
136 https://www.heldermann.de/JGG/jgg02.htm
137 """
139 content_strs = article_content.split("<br/>")
140 content_strs = [c for c in content_strs if c != ""]
142 authors_str = None
143 # cleanup_str(content_strs[0])
145 if content_strs[0] == '<font color="#0000A0" size="2"> ':
146 content_strs.pop(0)
148 if len(content_strs) >= 3:
149 authors_str = content_strs.pop(0)
150 cut_index = authors_str.rfind(">")
151 cut_index = cut_index + 1 if cut_index > 0 else 0
152 authors_str = cleanup_str(authors_str[cut_index:])
154 title_str = get_single_title_xml(content_strs[0])
156 xarticle = create_articledata()
158 article_search = regex.search(self.article_re, content_strs[1])
159 if not article_search:
160 self.logger.debug(
161 "Couldn't find article url. Skipping article", extra={"url": issue_href}
162 )
163 return None
164 # raise ValueError("Couldn't find article url")
166 xarticle.title_tex = title_str
168 if authors_str:
169 for a in authors_str.split(", "):
170 author = create_contributor(role="author", string_name=a)
171 if len(a) > 256:
172 pass
173 xarticle.contributors.append(author)
175 article_data = article_search.groupdict()
176 # Remove padding : 001 -> 1
177 xarticle.fpage = article_data["fpage"].lstrip("0")
179 if article_data["lpage"] is not None:
180 xarticle.lpage = article_data["lpage"].lstrip("0")
182 if article_data["articleurl"] is not None:
183 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
184 href = a_tag.get("href")
185 if not isinstance(href, str):
186 raise ValueError("Couldn't parse article url")
187 xarticle.url = urljoin(issue_href, href)
188 else:
189 if article_data["abstracturl"] is not None:
190 abstract_tag = BeautifulSoup(
191 article_data["abstracturl"], "html.parser"
192 ).select_one("a")
193 abstract_href = abstract_tag.get("href")
194 if not isinstance(abstract_href, str):
195 raise ValueError("Couldn't parse abstract url")
197 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
198 if xabstract is not None:
199 xarticle.abstracts.append(xabstract)
201 if article_data["pdfurl"] is None:
202 raise ValueError("Cannot find article pdf")
204 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
205 pdf_href = pdf_tag.get("href")
206 if not isinstance(pdf_href, str):
207 raise ValueError("Couldn't parse pdf url")
209 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))
210 ext_link = create_extlink(
211 rel="source", location=issue_href, metadata=self.source_domain
212 )
213 xarticle.ext_links.append(ext_link)
214 return xarticle
216 def parse_heldermann_abstract(self, url: str):
217 url, fragment = urldefrag(url)
218 content = self.download_file(url)
219 content = cleanup_str(content)
220 soup = BeautifulSoup(content, "html5lib")
221 abstract_title = soup.select_one(f"[name={fragment}]")
222 if not abstract_title:
223 self.logger.debug(
224 f"Couldn't parse abstract for url : {url} with fragment : {fragment}"
225 )
226 return None
227 abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
228 if not abstract_tag:
229 raise ValueError("Cannot parse abstract")
230 return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))
232 def parse_article_content(self, content, xissue, xarticle, url):
233 soup = BeautifulSoup(content, "html5lib")
234 content = cleanup_str(content, unsafe=True)
235 article_search = regex.search(self.article_page_re, content)
236 if not article_search:
237 if "This article plagiarizes" in content:
238 return None
239 article_search = regex.search(self.article_page_re_2, content)
241 if not article_search:
242 raise ValueError("Couldn't parse article page")
244 article_dict = article_search.groupdict()
245 ckeditor_data = CkeditorParser(
246 html_value=article_dict["abstract"],
247 mml_formulas="",
248 )
250 abstract = create_abstract(
251 lang="en",
252 value_xml=get_abstract_xml(ckeditor_data.value_xml, lang="en"),
253 value_tex=ckeditor_data.value_tex,
254 value_html=ckeditor_data.value_html,
255 )
257 xarticle.abstracts.append(abstract)
258 title_tags = soup.select("font[size='4'] b")
259 if len(title_tags) == 1:
260 xarticle.title_tex = get_single_title_xml(
261 str(title_tags[0])
262 .lstrip("<b>")
263 .rstrip("</b>")
264 .strip()
265 .replace("<", "<")
266 .replace(">", ">")
267 )
269 contributors = []
270 author = None
271 author_tags = soup.select("font[size='3']")
272 for author_tag in author_tags:
273 author_name = cleanup_str(author_tag.get_text(), unsafe=True)
274 if len(author_name) > 256:
275 continue
276 if author_name != "\x86":
277 # 1 author has a dagger (deceased after publication) and the HTML becomes worse than usual
278 # Ignore it and append address/email to the previous author
279 author = create_contributor(role="author", string_name=author_name)
281 siblings = author_tag.find_next_sibling("font")
282 if siblings:
283 for sibling in siblings:
284 parent = sibling.parent
285 if parent.name == "font" and parent.get("size") == "2":
286 children = sibling.contents
288 pos = 0
289 keep_searching_for_address = True
290 while pos < len(children) and keep_searching_for_address:
291 if isinstance(children[pos], str):
292 address = cleanup_str(
293 children[pos].get_text(), unsafe=True
294 ).replace("and: ", "")
295 if address:
296 author["addresses"].append(address)
297 elif children[pos].name == "a":
298 keep_searching_for_address = False
299 href = children[pos].get("href")
300 if href.find("mailto:") == 0:
301 email = cleanup_str(children[pos].get_text(), unsafe=True)
302 author["email"] = email
303 pos += 1
304 if author_name != "\x86" and author:
305 contributors.append(author)
306 xarticle.contributors = contributors
308 if article_dict.get("keywords", None) is not None:
309 for kwd in article_dict["keywords"].removesuffix(".").split(", "):
310 xarticle.kwds.append(create_subj(value=kwd))
312 if article_dict.get("msc", None) is not None:
313 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
314 for msc in article_dict["msc"].split(", "):
315 xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
317 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
318 href = href_soup.get("href")
319 if not isinstance(href, str):
320 raise ValueError("Article pdf cannot be parsed")
321 href = urljoin(url, href)
322 add_pdf_link_to_xarticle(xarticle, href)
324 # Paywall check on pdf
325 is_openaccess, response, *_ = self.check_pdf_link_validity(href)
326 if not is_openaccess:
327 return None
329 return xarticle