Coverage for src/crawler/by_source/dmlpl_crawler.py: 9%
154 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
import json
from urllib import parse

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.by_source.lofpl_crawler import LofplCrawler
from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class DmlplCrawler(BaseCollectionCrawler):
    source_name = "The Polish Digital Mathematics Library"
    source_domain = "DMLPL"
    source_website = "http://pldml.icm.edu.pl/pldml"

    # HACK : Workaround for tests (monkeypatching)
    # We store the class here, so we can monkeypatch it when running tests
    subCrawlers = {LofplCrawler: None}
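
    # Illustrative only, not part of the crawler: because subCrawlers lives on
    # the class, a test fixture can monkeypatch it before crawling, e.g.
    # (hypothetical fixture code):
    #
    #     DmlplCrawler.subCrawlers[LofplCrawler] = lofpl_crawler_stub
    #
    # where `lofpl_crawler_stub` is whatever LofplCrawler instance (or test
    # double) the fixture provides; crawl_article() raises
    # "Crawler incorrectly initialized" if a bibliotekanauki redirect is hit
    # while the entry is still None.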

    def parse_collection_content(self, content):
        """
        Parse the collection JSON of the DMLPL website and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            link += "?" + parse.urlencode(params)

            text: str = entry["text"]
            if not text.startswith("tom/rocznik"):
                raise ValueError(
                    'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
                )
            soup = BeautifulSoup(text, "html.parser")
            a_tags = soup.select("a")
            if len(a_tags) < 2:
                raise ValueError("Cannot parse Collection : couldn't find volume information")
            volume = a_tags[0].text
            year = a_tags[1].text

            issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
        return issues
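
    # For reference, a sketch (hypothetical values) of the JSON entries this
    # method expects from /tree/hierarchy.action: each entry carries an "id"
    # and an HTML "text" fragment starting with "tom/rocznik", whose first two
    # <a> tags hold the volume and the year, e.g.:
    #
    #     {"id": "...", "text": "tom/rocznik <a>12</a> <a>1961</a>"}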

    def parse_dmlpl_volume_content(self, link, year, volume):
        content = self.download_file(link)
        has_articles = False
        issues = []
        data = json.loads(content)
        for entry in data:
            entry_link = self.source_website + "/tree/hierarchy.action"
            params = {"root": entry["id"]}
            entry_link += "?" + parse.urlencode(params)

            number = None
            text: str = entry["text"]
            if text.startswith("numer"):
                soup = BeautifulSoup(text, "html.parser")
                a_tag = soup.select_one("a")
                if not a_tag:
                    raise ValueError("Cannot parse Collection : couldn't find issue information")
                number = a_tag.text.replace(" ", "_")
                issues.append(self.create_xissue(entry_link, year, volume, number))
            elif text.startswith("artykuł"):
                has_articles = True

        if has_articles:
            issues.append(self.create_xissue(link, year, volume))

        return issues
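
    # Similarly sketched (hypothetical values): entries whose "text" starts
    # with "numer" describe an issue and their single <a> tag holds the issue
    # number, while entries starting with "artykuł" mark articles attached
    # directly to the volume, e.g.:
    #
    #     {"id": "...", "text": "numer <a>3</a>"}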

    def parse_issue_content(self, content, xissue):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/element/" + entry["id"]
            xissue.articles.append(xarticle)

    # IDEA : manually following redirections would allow us to get the
    # redirection URL without the body (for bibliotekanauki)
    def crawl_article(self, xarticle, xissue):
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:
            response = self._get(xarticle.url)

            # Crawl using LOFPL if detected
            if response.url.startswith("https://bibliotekanauki.pl"):
                xarticle.url = response.url.replace(
                    "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
                )
                targetCrawler = self.subCrawlers[LofplCrawler]
                if targetCrawler is None:
                    raise ValueError("Crawler incorrectly initialized")
                parsed_xarticle = targetCrawler.crawl_article(xarticle, xissue)
            elif response.url.startswith("http://pldml.icm.edu.pl"):
                parsed_xarticle = super().crawl_article(xarticle, xissue)
            else:
                raise NotImplementedError

        if not parsed_xarticle:
            raise ValueError("Couldn't crawl article")
        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle)
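
    # Regarding the IDEA note above: a minimal sketch of fetching the redirect
    # target by hand, assuming a `requests` session is reachable as
    # `self.session` (an assumption, not something this class guarantees):
    #
    #     resp = self.session.get(xarticle.url, allow_redirects=False)
    #     redirect_url = resp.headers.get("Location")
    #
    # A 3xx response has no article body to download, and `redirect_url` is
    # enough to tell whether the element lives on bibliotekanauki.pl; relative
    # Location values would need parse.urljoin() against xarticle.url.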

    def parse_dmlpl_generic_page(self, content: str):
        soup = BeautifulSoup(content, "html.parser")
        main = soup.select_one("div.details-content")
        if not main:
            raise ValueError("Cannot parse article : main div not found")

        sections = main.select("div.row")
        sections_dict: dict[str, Tag] = {}
        for s in sections:
            row_label = s.select_one("div.row-label")
            if not row_label:
                raise ValueError("Cannot parse article : row label not found")
            tag = s.select_one("div.row-desc")
            if tag:
                sections_dict[row_label.text] = tag

        return sections_dict
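
    # The returned dict maps each row's label text (as it appears on the page,
    # in Polish) to its "div.row-desc" Tag; rows without a description div are
    # skipped.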

    def parse_article_content(self, content, xissue, xarticle, url):
        sections_dict = self.parse_dmlpl_generic_page(content)

        xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)

        # Author
        for a_tag in sections_dict["Autorzy"].select("a"):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("author href is not a string")
            author = self.parse_author(self.download_file(self.source_website + "/" + href))
            author["role"] = "author"
            xarticle.contributors.append(author)

        # TODO : Contributor ? (Twórcy)

        # PDF
        if "Treść / Zawartość" in sections_dict:
            pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
            if not pdf_a_tag:
                raise ValueError("Cannot find pdf for article")
            pdf_url = pdf_a_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Cannot parse pdf url for article")
            if not pdf_url.startswith("http"):
                pdf_url = self.source_website + "/" + pdf_url
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            self.logger.info("PDF not found", extra={"pid": xarticle.pid})

        # Lang
        xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
        if len(xarticle.lang) > 3:
            if xarticle.lang == "pl fr":
                xarticle.lang = "pl"
                self.logger.info(
                    f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)",
                    extra={"pid": xarticle.pid},
                )
            else:
                raise ValueError("Cannot parse article lang")

        # Abstract
        if "Abstrakty" in sections_dict:
            abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
            for div in abstract_divs:
                lang = "und"
                lang_div = div.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                text_div = div.select_one("div.articleDetails-abstract")
                if not text_div:
                    raise ValueError(
                        "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
                    )
                abstract_text = cleanup_str(text_div.text)
                if abstract_text != "-":
                    xarticle.abstracts.append(create_abstract(value_tex=abstract_text, lang=lang))

        # Keywords
        if "Słowa kluczowe" in sections_dict:
            keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
            for keywords_row in keywords_lists:
                lang = "und"
                lang_div = keywords_row.select_one("div.articleDetails-langCell")
                if lang_div:
                    lang = cleanup_str(lang_div.text).lower()
                keywords_a_tags = keywords_row.select("a")
                for a_tag in keywords_a_tags:
                    subject = create_subj()
                    subject["value"] = a_tag.text
                    subject["lang"] = lang
                    xarticle.kwds.append(subject)

        # Page
        if "Strony" in sections_dict:
            set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

        return xarticle
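
    # Row labels consulted above, for reference: "Tytuł artykułu" = article
    # title, "Autorzy" = authors, "Twórcy" = creators, "Treść / Zawartość" =
    # content, "Języki publikacji" = publication languages, "Abstrakty" =
    # abstracts, "Słowa kluczowe" = keywords, "Strony" = pages.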

    def parse_author(self, content: str):
        author = create_contributor()
        sections_dict = self.parse_dmlpl_generic_page(content)
        author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
        author["first_name"] = cleanup_str(sections_dict["Imię"].text)
        if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:
            author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
        return author
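
# Author-page row labels used above, for reference: "Nazwisko" = surname,
# "Imię" = first name, "Twórca" = creator (used as a fallback display name when
# the structured name fields are empty).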