Coverage for src / crawler / by_source / heldermann_crawler.py: 8%
177 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-03-19 14:59 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-03-19 14:59 +0000
1from urllib.parse import urldefrag, urljoin
3import regex
4from bs4 import BeautifulSoup, Comment
5from ptf.model_data import (
6 IssueData,
7 create_abstract,
8 create_articledata,
9 create_contributor,
10 create_extlink,
11 create_subj,
12)
14from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
15from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class HeldermannCrawler(MatchingCrawler):
    """Crawler for journals hosted by Heldermann Verlag (https://www.heldermann.de/)."""

    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    # Volume headings on the collection page, e.g. "Volume 12 (2004)".
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    # Issue headings inside a volume page, e.g. "Number 3".
    issue_re = r"Number (?P<number>\d+)"
    # One article line inside a volume page: page range (fpage/lpage may contain
    # 'a' — presumably appendix-style numbering; TODO confirm), then optional
    # bracketed links: [Abstract], [Fulltext-pdf (...)], or a combined
    # [Abstract / Full Text] link captured as raw <a> HTML in named groups.
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    # Article-detail page layout #1: abstract after the "Abstract-pdf" link,
    # then optional "Keywords:" and "MSC:" sections, then the fulltext pdf <a>.
    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    # Article-detail page layout #2 (fallback): abstract wrapped in a colored
    # <font> block instead of following an "Abstract-pdf" link.
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
30 def parse_collection_content(self, content):
31 xissues = []
32 soup = BeautifulSoup(content, "html5lib")
33 issues = soup.select("b > a")
34 for issue in issues:
35 volume_search = regex.search(self.volume_re, issue.text)
36 if not volume_search:
37 self.logger.debug(f"Couldn't parse volume year for : {issue.text}. Skipping")
38 continue
39 issue_href = issue.get("href")
40 if not isinstance(issue_href, str):
41 raise ValueError("Couldn't parse issue href")
42 volume_dict = volume_search.groupdict()
43 parsed_issues = self.parse_heldermann_issue_content(
44 urljoin(self.collection_url, issue_href),
45 volume_dict["year"],
46 volume_dict["volume"],
47 )
49 xissues.extend(parsed_issues)
50 return xissues
    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page)

        Therefore, we must parse volume pages when crawling the collection

        Returns the list of IssueData parsed from the volume page at `url`.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent :
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            # Special case: this volume page merges Numbers 1-2 and has no
            # header div, so force "contents" parsing (index = 1 makes the
            # parity check below take the article branch).
            # NOTE(review): this appends a fresh "1-2" issue on every loop
            # iteration — presumably the page has a single content div;
            # TODO confirm against https://www.heldermann.de/JCA/jca02.htm
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                # Even divs: issue header. Extract "Number N" if present.
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                # Odd divs: issue contents. The article list lives either in a
                # <strong> tag or in a nested <font> tag, sometimes wrapped in <b>.
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                # Strip HTML comments so commented-out entries aren't parsed.
                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                # Articles are separated by double <br>; drop the wrapper tags
                # before splitting so the first/last entries are clean.
                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags), unsafe=True)
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str, unsafe=True)
                    if a_str == "":
                        continue
                    # An article entry must contain at least one link.
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    # Articles without their own page get an issue-scoped pid;
                    # those with a page get a bare pid (resolved later by the
                    # matching crawler — TODO confirm).
                    if xarticle.url is None:
                        xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    else:
                        xarticle.pid = f"a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues
128 def parse_heldermann_article(self, article_content: str, issue_href: str):
129 """
130 Some collections in Heldermann do not have a, article-specific page (article data in issue)
131 so we must parse the article data first before proceeding.
133 https://www.heldermann.de/JGG/jgg02.htm
134 """
136 content_strs = article_content.split("<br/>")
137 content_strs = [c for c in content_strs if c != ""]
139 authors_str = None
140 # cleanup_str(content_strs[0])
142 if content_strs[0] == '<font color="#0000A0" size="2"> ':
143 content_strs.pop(0)
145 if len(content_strs) >= 3:
146 authors_str = content_strs.pop(0)
147 cut_index = authors_str.rfind(">")
148 cut_index = cut_index + 1 if cut_index > 0 else 0
149 authors_str = cleanup_str(authors_str[cut_index:])
151 title_str = cleanup_str(content_strs[0])
153 xarticle = create_articledata()
155 article_search = regex.search(self.article_re, content_strs[1])
156 if not article_search:
157 self.logger.debug(
158 "Couldn't find article url. Skipping article", extra={"url": issue_href}
159 )
160 return None
161 # raise ValueError("Couldn't find article url")
163 xarticle.title_tex = title_str
165 if authors_str:
166 for a in authors_str.split(", "):
167 author = create_contributor(role="author", string_name=a)
168 if len(a) > 256:
169 pass
170 xarticle.contributors.append(author)
172 article_data = article_search.groupdict()
173 # Remove padding : 001 -> 1
174 xarticle.fpage = article_data["fpage"].rstrip("0")
176 if article_data["lpage"] is not None:
177 xarticle.lpage = article_data["lpage"].rstrip("0")
179 if article_data["articleurl"] is not None:
180 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
181 href = a_tag.get("href")
182 if not isinstance(href, str):
183 raise ValueError("Couldn't parse article url")
184 xarticle.url = urljoin(issue_href, href)
185 else:
186 if article_data["abstracturl"] is not None:
187 abstract_tag = BeautifulSoup(
188 article_data["abstracturl"], "html.parser"
189 ).select_one("a")
190 abstract_href = abstract_tag.get("href")
191 if not isinstance(abstract_href, str):
192 raise ValueError("Couldn't parse abstract url")
194 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
195 if xabstract is not None:
196 xarticle.abstracts.append(xabstract)
198 if article_data["pdfurl"] is None:
199 raise ValueError("Cannot find article pdf")
201 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
202 pdf_href = pdf_tag.get("href")
203 if not isinstance(pdf_href, str):
204 raise ValueError("Couldn't parse pdf url")
206 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))
207 ext_link = create_extlink(
208 rel="source", location=issue_href, metadata=self.source_domain
209 )
210 xarticle.ext_links.append(ext_link)
211 return xarticle
213 def parse_heldermann_abstract(self, url: str):
214 url, fragment = urldefrag(url)
215 content = self.download_file(url)
216 content = cleanup_str(content)
217 soup = BeautifulSoup(content, "html5lib")
218 abstract_title = soup.select_one(f"[name={fragment}]")
219 if not abstract_title:
220 self.logger.debug(
221 f"Couldn't parse abstract for url : {url} with fragment : {fragment}"
222 )
223 return None
224 abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
225 if not abstract_tag:
226 raise ValueError("Cannot parse abstract")
227 return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))
229 def parse_article_content(self, content, xissue, xarticle, url):
230 content = cleanup_str(content, unsafe=True)
231 article_search = regex.search(self.article_page_re, content)
232 if not article_search:
233 if "This article plagiarizes" in content:
234 return None
235 article_search = regex.search(self.article_page_re_2, content)
237 if not article_search:
238 raise ValueError("Couldn't parse article page")
240 article_dict = article_search.groupdict()
242 xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(article_dict["abstract"])))
244 if article_dict.get("keywords", None) is not None:
245 for kwd in article_dict["keywords"].removesuffix(".").split(", "):
246 xarticle.kwds.append(create_subj(value=kwd))
248 if article_dict.get("msc", None) is not None:
249 article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
250 for msc in article_dict["msc"].split(", "):
251 xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
253 href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
254 href = href_soup.get("href")
255 if not isinstance(href, str):
256 raise ValueError("Article pdf cannot be parsed")
257 href = urljoin(url, href)
258 add_pdf_link_to_xarticle(xarticle, href)
260 # Paywall check on pdf
261 is_openaccess, response, *_ = self.check_pdf_link_validity(href)
262 if not is_openaccess:
263 return None
265 return xarticle