Coverage for src/crawler/by_source/heldermann_crawler.py: 8%
175 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
from urllib.parse import urldefrag, urljoin

import regex
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"
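
    # Headings follow the patterns matched below, e.g. "Volume 12 (2003)" for a volume
    # link on the collection page and "Number 3" for an issue heading on a volume page
    # (examples are illustrative, derived from the regexes rather than the live site).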
    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
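    # Illustrative only: a TOC entry such as "123-145 [<a href=...>Abstract / Full Text</a>]"
    # yields article_re groups fpage="123", lpage="145" and articleurl set, while
    # abstracturl and pdfurl stay None (the exact site markup may differ).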

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                self.logger.debug(f"Couldn't parse volume year for: {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues sit on one page).

        Therefore, we must parse volume pages when crawling the collection.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # the first div should be the issue number,
        # the second div should be the issue contents.
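        # One volume page (JCA jca02.htm) has no "Number" heading and bundles
        # Numbers 1-2 in a single listing; it is special-cased below.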
        for index, el in enumerate(div):
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                    current_issue = self.create_xissue(None, year, volume, issue_number)
                    xissues.append(current_issue)
                continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                if a_tags is None:
                    raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()
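
                # Individual article entries in the listing are separated by a
                # double <br/>, which is what the split below keys on.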
                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article
        data sits directly in the issue page), so we must parse the article data here
        before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """
        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':
            content_strs.pop(0)

        if len(content_strs) >= 3:
            authors_str = content_strs.pop(0)
            cut_index = authors_str.rfind(">")
            cut_index = cut_index + 1 if cut_index > 0 else 0
            authors_str = cleanup_str(authors_str[cut_index:])

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:
            self.logger.debug(
                "Couldn't find article url. Skipping article", extra={"url": issue_href}
            )
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:
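                    # NOTE: this length check is currently a no-op placeholder;
                    # overly long author strings are appended unchanged.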
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove padding: 001 -> 1
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:
            xarticle.lpage = article_data["lpage"].lstrip("0")
175 if article_data["articleurl"] is not None:
176 a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
177 href = a_tag.get("href")
178 if not isinstance(href, str):
179 raise ValueError("Couldn't parse article url")
180 xarticle.url = urljoin(issue_href, href)
181 else:
182 if article_data["abstracturl"] is not None:
183 abstract_tag = BeautifulSoup(
184 article_data["abstracturl"], "html.parser"
185 ).select_one("a")
186 abstract_href = abstract_tag.get("href")
187 if not isinstance(abstract_href, str):
188 raise ValueError("Couldn't parse abstract url")
190 xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
191 if xabstract is not None:
192 xarticle.abstracts.append(xabstract)
194 if article_data["pdfurl"] is None:
195 raise ValueError("Cannot find article pdf")
197 pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
198 pdf_href = pdf_tag.get("href")
199 if not isinstance(pdf_href, str):
200 raise ValueError("Couldn't parse pdf url")
201 add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))
203 return xarticle
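
    # The abstract URL's fragment identifies an <a name=...> anchor inside a <dt> tag;
    # the abstract text sits in the <font> sibling that follows (see the selectors below).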
    def parse_heldermann_abstract(self, url: str):
        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            self.logger.debug(
                f"Couldn't parse abstract for url: {url} with fragment: {fragment}"
            )
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))
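
    # Assumption: for collections where each article has its own page (xarticle.url set
    # above), the base crawler fetches that page and hands its HTML to the method below.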
    def parse_article_content(self, content, xissue, xarticle, url):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:
            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(create_abstract(value_tex=article_dict["abstract"]))
        if article_dict.get("keywords", None) is not None:
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):
            raise ValueError("Article pdf cannot be parsed")
        href = urljoin(url, href)
        add_pdf_link_to_xarticle(xarticle, href)

        # Paywall check on pdf
        is_openaccess, response, *_ = self.check_pdf_link_validity(href, session=self.session)
        if not is_openaccess:
            return None
        if getattr(response, "from_cache", False):
            self._wait_download_delay()

        return xarticle