Coverage for src/crawler/by_source/heldermann_crawler.py: 83% (162 statements)
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
from urllib.parse import urldefrag, urljoin

import regex
from bs4 import BeautifulSoup, Comment
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class HeldermannCrawler(BaseCollectionCrawler):
    source_name = "Heldermann Verlag"
    source_domain = "HELDERMANN"
    source_website = "https://www.heldermann.de/"

    volume_re = r"Volume (?P<volume>\d+) \((?P<year>\d+)\)"
    issue_re = r"Number (?P<number>\d+)"
    article_re = r"(?P<fpage>[\da]+)(?:-+(?P<lpage>[\da]+))? (?:(?:\[(?P<abstracturl><a.*>Abstract<\/a>)\] ?)?\[?\[(?P<pdfurl><a.*>Full[tT]ext-pdf \(.*\)<\/a>)\])?(?:\[(?P<articleurl><a.*>Abstract \/ Full Text<\/a>)\])?"

    article_page_re = r"Abstract-pdf<\/a>\]<br\/?><br\/?>(?:-->)? (?P<abstract>.+?) (?:<!--)?<br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)"
    article_page_re_2 = r'(?:<font size="3" color="#0000A0"><b> )(?P<abstract>.+?)\. <br\/?><br\/?> (?:Keywords: (?:(?P<keywords>.*?)\.?) <br\/?><br\/?> )?(?:MSC: (?P<msc>.*?)\.? ?<br\/?><br\/?> )?.*?\[?(?P<pdfurl><a.*<\/a>)'
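    # Illustrative (hypothetical) inputs these listing patterns are written for, inferred
    # from their capture groups rather than from verified Heldermann markup:
    #   volume_re matches e.g. "Volume 12 (2003)"  -> volume="12", year="2003"
    #   issue_re  matches e.g. "Number 3"          -> number="3"
    #   article_re is run on one table-of-contents entry and captures the page range
    #   (fpage/lpage) plus the Abstract, Fulltext-pdf or "Abstract / Full Text" links.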

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues = soup.select("b > a")
        for issue in issues:
            volume_search = regex.search(self.volume_re, issue.text)
            if not volume_search:
                print(f"Couldn't parse volume year for : {issue.text}. Skipping")
                continue
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")
            volume_dict = volume_search.groupdict()
            parsed_issues = self.parse_heldermann_issue_content(
                urljoin(self.collection_url, issue_href),
                volume_dict["year"],
                volume_dict["volume"],
            )

            xissues.extend(parsed_issues)
        return xissues

    def parse_heldermann_issue_content(self, url, year, volume):
        """
        Heldermann has pages for volumes, but no pages for issues (multiple issues inside one page).

        Therefore, we must parse volume pages when crawling the collection.
        """
        content = self.download_file(url)
        soup = BeautifulSoup(content, "html5lib")
        div = soup.select("div[align='center']")
        xissues = []
        current_issue: IssueData | None = None
        # Let's hope the website is consistent:
        # the first div of each pair should be the issue number,
        # the second div should be the issue contents.
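        # e.g. (hypothetical layout): div[0] holds the "Number 1" heading, div[1] its
        # article list, div[2] the "Number 2" heading, div[3] its article list, and so on.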
        for index, el in enumerate(div):
            if index % 2 == 0:
                title = el.select_one("td:first-child font:-soup-contains('Number ')")
                if title:  # coverage: condition always true
                    issue_number = None
                    number_search = regex.search(self.issue_re, title.text)
                    if number_search:  # coverage: condition always true
                        number_data = number_search.groupdict()
                        issue_number = number_data["number"]
                        current_issue = self.create_xissue(None, year, volume, issue_number)
                        xissues.append(current_issue)
                        continue
            else:
                strong = el.select_one("strong")
                if strong:
                    a_tags = strong
                else:
                    a_tags = el.select_one("font font:last-child")
                    if a_tags is None:  # coverage: condition never true
                        raise ValueError("Couldn't parse issue data")
                if a_tags and a_tags.select_one("b"):
                    a_tags = a_tags.select_one("b")
                del strong

                for child in a_tags.contents:
                    if isinstance(child, Comment):
                        child.extract()

                articles_tags = regex.split(
                    r"<br\/> ?<br\/>",
                    cleanup_str(str(a_tags))
                    .removeprefix("<strong>")
                    .removeprefix("<b>")
                    .removesuffix("</strong>")
                    .removesuffix("</b>"),
                )

                article_index = 0
                for a_str in articles_tags:
                    a_str = cleanup_str(a_str)
                    if a_str == "":
                        continue
                    if "</a>" not in a_str:
                        continue
                    if not current_issue:  # coverage: condition never true
                        raise ValueError("Error while parsing issue articles")
                    xarticle = self.parse_heldermann_article(a_str, url)
                    if xarticle is None:  # coverage: condition never true
                        continue
                    xarticle.pid = f"{current_issue.pid}_a{article_index}"
                    article_index += 1
                    current_issue.articles.append(xarticle)
        return xissues

    def parse_heldermann_article(self, article_content: str, issue_href: str):
        """
        Some collections in Heldermann do not have an article-specific page (the article data
        is in the issue page), so we must parse the article data first before proceeding.

        https://www.heldermann.de/JGG/jgg02.htm
        """

        content_strs = article_content.split("<br/>")
        content_strs = [c for c in content_strs if c != ""]

        authors_str = None
        # cleanup_str(content_strs[0])

        if content_strs[0] == '<font color="#0000A0" size="2"> ':  # coverage: condition never true
            content_strs.pop(0)

        if len(content_strs) >= 3:  # coverage: condition always true
            authors_str = content_strs.pop(0)

        title_str = cleanup_str(content_strs[0])

        xarticle = create_articledata()

        article_search = regex.search(self.article_re, content_strs[1])
        if not article_search:  # coverage: condition never true
            print(f"Couldn't find article url. Skipping article. {issue_href}")
            return None
            # raise ValueError("Couldn't find article url")

        xarticle.title_tex = title_str

        if authors_str:  # coverage: condition always true
            for a in authors_str.split(", "):
                author = create_contributor(role="author", string_name=a)
                if len(a) > 256:  # coverage: condition never true
                    pass
                xarticle.contributors.append(author)

        article_data = article_search.groupdict()
        # Remove padding: "001" -> "1"
        xarticle.fpage = article_data["fpage"].lstrip("0")

        if article_data["lpage"] is not None:  # coverage: condition always true
            xarticle.lpage = article_data["lpage"].lstrip("0")

        if article_data["articleurl"] is not None:
            a_tag = BeautifulSoup(article_data["articleurl"], "html.parser").select_one("a")
            href = a_tag.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(issue_href, href)
        else:
            if article_data["abstracturl"] is not None:
                abstract_tag = BeautifulSoup(
                    article_data["abstracturl"], "html.parser"
                ).select_one("a")
                abstract_href = abstract_tag.get("href")
                if not isinstance(abstract_href, str):  # coverage: condition never true
                    raise ValueError("Couldn't parse abstract url")

                xabstract = self.parse_heldermann_abstract(urljoin(issue_href, abstract_href))
                if xabstract is not None:
                    xarticle.abstracts.append(xabstract)

            if article_data["pdfurl"] is None:  # coverage: condition never true
                raise ValueError("Cannot find article pdf")

            pdf_tag = BeautifulSoup(article_data["pdfurl"], "html.parser").select_one("a")
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse pdf url")
            add_pdf_link_to_xarticle(xarticle, urljoin(issue_href, pdf_href))

        return xarticle

    def parse_heldermann_abstract(self, url: str):
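        # The abstract URL is expected to point at a named anchor, e.g. (hypothetical)
        # "abstracts.htm#3": urldefrag() splits off the "3" fragment, which is then used
        # to locate the matching <a name="3"> element on the downloaded page.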
        url, fragment = urldefrag(url)
        content = self.download_file(url)
        content = cleanup_str(content)
        soup = BeautifulSoup(content, "html5lib")
        abstract_title = soup.select_one(f"[name={fragment}]")
        if not abstract_title:
            print(f"Couldn't parse abstract for url : {url} with fragment : {fragment}")
            return None
        abstract_tag = abstract_title.find_parent("dt").find_next_sibling("font")
        if not abstract_tag:  # coverage: condition never true
            raise ValueError("Cannot parse abstract")
        return create_abstract(tag="abstract", value_tex=cleanup_str(abstract_tag.text))

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        content = cleanup_str(content)
        article_search = regex.search(self.article_page_re, content)
        if not article_search:  # coverage: condition never true
            if "This article plagiarizes" in content:
                return None
            article_search = regex.search(self.article_page_re_2, content)

        if not article_search:  # coverage: condition never true
            raise ValueError("Couldn't parse article page")

        article_dict = article_search.groupdict()

        xarticle.abstracts.append(
            create_abstract(tag="abstract", value_tex=article_dict["abstract"])
        )
        if article_dict.get("keywords", None) is not None:  # coverage: condition always true
            for kwd in article_dict["keywords"].removesuffix(".").split(", "):
                xarticle.kwds.append(create_subj(value=kwd))

        if article_dict.get("msc", None) is not None:  # coverage: condition always true
            article_dict["msc"] = article_dict["msc"].replace(";", ",").removesuffix(".")
            for msc in article_dict["msc"].split(", "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        href_soup = BeautifulSoup(article_dict["pdfurl"], "html.parser").select_one("a")
        href = href_soup.get("href")
        if not isinstance(href, str):  # coverage: condition never true
            raise ValueError("Article pdf cannot be parsed")
        add_pdf_link_to_xarticle(xarticle, href)

        return super().parse_article_content(content, xissue, xarticle, url, pid)