Coverage for src/crawler/by_source/dmlpl_crawler.py: 76%
164 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_extlink,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    periode_begin = 0
    periode_end = 9999

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    subCrawlers = {LofplCrawler: None}
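
    # A minimal sketch of how a test might use this hook, assuming a pytest-style
    # test and a hypothetical FakeLofplCrawler double (neither name is part of
    # this module):
    #
    #     def test_crawl_article_with_fake_subcrawler(dmlpl_crawler):
    #         dmlpl_crawler.subCrawlers[LofplCrawler] = FakeLofplCrawler()
    #         ...  # exercise crawl_article and assert against the fake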

    def parse_collection_content(self, content):
        """
        Parse the JSON collection tree returned by DMLPL and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):  # coverage: condition never true
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:  # coverage: condition never true
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues
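
    # Illustrative shape of one entry in the collection JSON handled above (a
    # hedged reconstruction from the fields this crawler reads; the real feed
    # may carry additional fields, and the id and values here are placeholders):
    #
    #     {"id": "example-volume-id",
    #      "text": 'tom/rocznik <a href="...">12</a> <a href="...">1984</a>'}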

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:  # coverage: condition never true
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):  # coverage: condition always true
                has_articles = True

        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues

    def parse_issue_content(self, content, xissue):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA : manually following redirects would let us get the redirect URL
    # without downloading the body (for bibliotekanauki); see the sketch below.
    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            url = xarticle.url

            article_source = self.source_domain
            response = self.get(xarticle.url)
            content = self.decode_response(response)
            pid = f"{xissue.pid}_{xarticle.pid}"

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
                content = self.download_file(xarticle.url)
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:  # coverage: condition never true
                    raise ValueError("Crawler incorrectly initialized")
                targetCrawler.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid  # type: ignore
                )
                article_source = targetCrawler.source_domain
            elif response.url.startswith("http://pldml.icm.edu.pl"):  # coverage: condition always true
                parsed_xarticle = self.parse_article_content(
                    content, xissue, xarticle, xarticle.url, pid
                )
            else:
                raise NotImplementedError

            # ARTICLE URL as an ExtLink (to display the link in the article page)
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = url
            ext_link["metadata"] = article_source
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)
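
    # A hedged sketch of the IDEA above: resolve the redirect target without
    # downloading the body. This hypothetical helper is illustrative only
    # (nothing in this crawler calls it) and assumes the `requests` library
    # is available.
    def resolve_redirect_url(self, url: str) -> str:
        import requests

        # Ask for the resource but do not follow the redirect, so the
        # (potentially large) target body is never transferred.
        response = requests.get(url, allow_redirects=False, timeout=30)
        if response.is_redirect:
            # The Location header may be relative; resolve it against `url`.
            return parse.urljoin(url, response.headers["Location"])
        return url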

    def parse_dmlpl_generic_page(self, content: str):
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:  # coverage: condition never true
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:  # coverage: condition never true
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:  # coverage: condition always true
                sections_dict[row_label.text] = tag

        return sections_dict
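
    # The metadata rows collected above look roughly like this (a hedged
    # reconstruction from the selectors used; the real markup may carry extra
    # attributes and nesting):
    #
    #     <div class="details-content">
    #       <div class="row">
    #         <div class="row-label">Tytuł artykułu</div>
    #         <div class="row-desc">Some article title</div>
    #       </div>
    #       ...
    #     </div>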

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        sections_dict = self.parse_dmlpl_generic_page(content)

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
        xarticle.pid = pid

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:  # coverage: condition always true
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:  # coverage: condition never true
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):  # coverage: condition never true
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"[{pid}] PDF not found")

        # Lang
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":  # coverage: condition always true
                xarticle.lang = "pl"
                print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:  # coverage: condition always true
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:  # coverage: loop never started
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                xabstract = create_abstract(
                    tag="abstract", value_tex=cleanup_str(text_div.text), lang=lang
                )
                xarticle.abstracts.append(xabstract)

        # Keywords
        if "Słowa kluczowe" in sections_dict:  # coverage: condition always true
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keywords_list in keywords_lists:  # coverage: loop never started
                lang = "und"
                lang_div = keywords_list.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keywords_list.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:  # coverage: condition never true
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author