Coverage for src/crawler/by_source/heldermann_crawler.py: 82% (173 statements)
from urllib.parse import urldefrag, urljoin

import regex
import requests
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
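
    # Illustrative strings these patterns are meant to match (assumed examples,
    # not copied from the live site):
    #   volume_re  : "Volume 12 (2005)"  -> volume="12", year="2005"
    #   issue_re   : "Number 3"          -> number="3"
    #   article_re : "123-145 [<a href='...'>Abstract / Full Text</a>]" -> fpage/lpage plus one of the link groups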

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                print(f"Couldn't parse volume year for : {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page).

        Therefore, we must parse volume pages when crawling the collection.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # first div should be the issue number
        # second div should be the issue contents
        for index, el in enumerate(div):
            # Special case: this volume page combines issue numbers 1-2 on a single page.
            if url == "https://www.heldermann.de/JCA/jca02.htm":
                current_issue = self.create_xissue(None, year, volume, "1-2")
                xissues.append(current_issue)
                index = 1

            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                        current_issue = self.create_xissue(None, year, volume, issue_number)
                        xissues.append(current_issue)
                        continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )
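
                # Each fragment produced by the split above is assumed to hold one article
                # (author line, title line and page/link line, separated by single <br/> tags).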
                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article data is in the issue page),
        so we must parse the article data first before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """
        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':
            content_strs.pop(0)

        if len(content_strs) >= 3:
            authors_str = content_strs.pop(0)
            cut_index = authors_str.rfind(">")
            cut_index = cut_index + 1 if cut_index > 0 else 0
            authors_str = cleanup_str(authors_str[cut_index:])

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:
            print(f"Couldn't find article url. Skipping article. {issue_href}")
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:
                    # Placeholder: overly long author strings are currently not handled specially.
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove left padding: 001 -> 1
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:
            xarticle.lpage = article_data["lpage"].lstrip("0")

        if article_data["articleurl"] is not None:
            a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(issue_href, href)
        else:
            if article_data["abstracturl"] is not None:
                abstract_tag = BeautifulSoup(
                    article_data["abstracturl"], "html.parser"
                ).select_one("a")
                abstract_href = abstract_tag.get("href")
                if not isinstance(abstract_href, str):
                    raise ValueError("Couldn't parse abstract url")

                xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
                if xabstract is not None:
                    xarticle.abstracts.append(xabstract)

            if article_data["pdfurl"] is None:
                raise ValueError("Cannot find article pdf")

            pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))

        return xarticle

    def parse_heldermann_abstract(self, url: str):
        url, fragment = urldefrag(url)
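        # urldefrag splits the anchor off the URL, e.g. (hypothetical URL):
        #   "https://www.heldermann.de/JCA/abs05.htm#a3" -> ("https://www.heldermann.de/JCA/abs05.htm", "a3")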
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            print(f"Couldn't parse abstract for url : {url} with fragment : {fragment}")
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

    def parse_article_content(self, content, xissue, xarticle, url):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:
            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(
            create_abstract(tag="abstract", value_tex=article_dict["abstract"])
        )
        if article_dict.get("keywords", None) is not None:
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):
            raise ValueError("Article pdf cannot be parsed")
        add_pdf_link_to_xarticle(xarticle, href)

        return xarticle

    def decode_response(self, response: requests.Response, encoding: str = "ISO-8859-1"):
        """Override this if the content-type headers from the source advertise something other
        than the actual content encoding. SASA needs this."""
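        # Example (assumption): a page served with a "charset=utf-8" header but actually
        # encoded as ISO-8859-1 decodes correctly once response.encoding is forced below.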
        response.encoding = encoding
        return response.text