Coverage for src/crawler/by_source/dmlpl_crawler.py: 74%
155 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
  1  import json
  2  from urllib import parse

  4  from bs4 import BeautifulSoup, Tag
  5  from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

  7  from crawler.base_crawler import BaseCollectionCrawler
  8  from crawler.by_source.lofpl_crawler import LofplCrawler
  9  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


 12  class DmlplCrawler(BaseCollectionCrawler):
 13      source_name = "The Polish Digital Mathematics Library"
 14      source_domain = "DMLPL"
 15      source_website = "http://pldml.icm.edu.pl/pldml"

 17      periode_begin = 0
 18      periode_end = 9999

 20      # HACK : Workaround for tests (monkeypatching)
 21      # We store the class here, so we can monkeypatch it when running tests
 22      subCrawlers = {LofplCrawler: None}
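
         # Editorial note, not part of the source file: the dict above is the hook the tests
         # patch. A pytest-style sketch, assuming a hypothetical `fake_lofpl_crawler` stub that
         # exposes crawl_article():
         #
         #     def test_crawl_article_via_lofpl(monkeypatch, fake_lofpl_crawler):
         #         monkeypatch.setitem(DmlplCrawler.subCrawlers, LofplCrawler, fake_lofpl_crawler)
         #         ...  # crawl_article() then delegates to the stub for bibliotekanauki URLs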
 24      def parse_collection_content(self, content):
 25          """
 26          Parse the DMLPL collection JSON and return a list of xissues.
 27          Each xissue has its pid/volume/number/year metadata + its url.

 29          self.periode is set at the end based on the xissue years.
 30          """
 31          issues = []
 32          data = json.loads(content)
 33          for entry in data:
 34              link = self.source_website + "/tree/hierarchy.action"
 35              params = {"root": entry["id"]}
 36              link += "?" + parse.urlencode(params)

 38              text: str = entry["text"]
 39              if not text.startswith("tom/rocznik"):  # 39 ↛ 40: condition was never true
 40                  raise ValueError(
 41                      'Cannot parse Collection : couldn\'t find "tom/rocznik" at the start of the string'
 42                  )
 43              soup = BeautifulSoup(text, "html.parser")
 44              a_tags = soup.select("a")
 45              if len(a_tags) < 2:  # 45 ↛ 46: condition was never true
 46                  raise ValueError("Cannot parse Collection : couldn't find volume information")
 47              volume = a_tags[0].text
 48              year = a_tags[1].text

 50              issues.extend(self.parse_dmlpl_volume_content(link, year, volume))
 51          return issues

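         # Editorial note (shape inferred from the parsing above, illustrative values only):
         # each collection-level entry is expected to look roughly like
         #     {"id": "...", "text": "tom/rocznik <a ...>12</a> ... <a ...>1931</a>"}
         # where the first two <a> tags of the "text" HTML fragment carry the volume and the year.
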
 53      def parse_dmlpl_volume_content(self, link, year, volume):
 54          content = self.download_file(link)
 55          has_articles = False
 56          issues = []
 57          data = json.loads(content)
 58          for entry in data:
 59              entry_link = self.source_website + "/tree/hierarchy.action"
 60              params = {"root": entry["id"]}
 61              entry_link += "?" + parse.urlencode(params)

 63              number = None
 64              text: str = entry["text"]
 65              if text.startswith("numer"):
 66                  soup = BeautifulSoup(text, "html.parser")
 67                  a_tag = soup.select_one("a")
 68                  if not a_tag:  # 68 ↛ 69: condition was never true
 69                      raise ValueError("Cannot parse Collection : couldn't find issue information")
 70                  number = a_tag.text.replace(" ", "_")
 71                  issues.append(self.create_xissue(entry_link, year, volume, number))
 72              elif text.startswith("artykuł"):  # 72 ↛ 58: condition was always true
 73                  has_articles = True

 75          if has_articles:
 76              issues.append(self.create_xissue(link, year, volume))

 78          return issues

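         # Editorial note (inferred from the branches above, illustrative values only):
         # volume-level entries start with either "numer" (an issue, e.g.
         #     {"id": "...", "text": "numer <a ...>2</a>"}
         # which yields one xissue per entry) or "artykuł" (an article attached directly to
         # the volume, in which case the volume itself is registered as an xissue).
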
 80      def parse_issue_content(self, content, xissue):
 81          data = json.loads(content)
 82          for index, entry in enumerate(data):
 83              xarticle = create_articledata()
 84              xarticle.pid = "a" + str(index)
 85              xarticle.url = self.source_website + "/element/" + entry["id"]
 86              xissue.articles.append(xarticle)

 88      # IDEA : manually following redirections would allow us to get the redirection URL without the body (for bibliotekanauki)
 89      def crawl_article(self, xarticle, xissue):
 90          parsed_xarticle = xarticle
 91          if hasattr(xarticle, "url") and xarticle.url:  # 91 ↛ 108: condition was always true
 92              response = self.get(xarticle.url)

 94              # Crawl using LOFPL if detected
 95              if response.url.startswith("https://bibliotekanauki.pl"):
 96                  xarticle.url = response.url.replace(
 97                      "https://bibliotekanauki.pl", "https://bibliotekanauki.pl/api"
 98                  )
 99                  targetCrawler = self.subCrawlers[LofplCrawler]
100                  if targetCrawler is None:  # 100 ↛ 101: condition was never true
101                      raise ValueError("Crawler incorrectly initialized")
102                  parsed_xarticle = targetCrawler.crawl_article(xarticle, xissue)
103              elif response.url.startswith("http://pldml.icm.edu.pl"):  # 103 ↛ 106: condition was always true
104                  parsed_xarticle = super().crawl_article(xarticle, xissue)
105              else:
106                  raise NotImplementedError

108          if not parsed_xarticle:  # 108 ↛ 109: condition was never true
109              raise ValueError("Couldn't crawl article")
110          # The article title may have formulas surrounded with '$'
111          return self.process_article_metadata(parsed_xarticle)

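         # Editorial sketch of the IDEA noted at line 88, assuming a requests-compatible
         # session object is available on the crawler (an assumption, not verified here):
         #
         #     resp = self.session.get(xarticle.url, allow_redirects=False)
         #     redirect_url = resp.headers.get("Location")  # redirect target, body never downloaded
         #     if redirect_url and redirect_url.startswith("https://bibliotekanauki.pl"):
         #         ...  # hand the article over to the LOFPL crawler directly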
113      def parse_dmlpl_generic_page(self, content: str):
114          soup = BeautifulSoup(content, "html.parser")
115          main = soup.select_one("div.details-content")
116          if not main:  # 116 ↛ 117: condition was never true
117              raise ValueError("Cannot parse article : main div not found")

119          sections = main.select("div.row")
120          sections_dict: dict[str, Tag] = {}
121          for s in sections:
122              row_label = s.select_one("div.row-label")
123              if not row_label:  # 123 ↛ 124: condition was never true
124                  raise ValueError("Cannot parse article : row label not found")
125              tag = s.select_one("div.row-desc")
126              if tag:  # 126 ↛ 121: condition was always true
127                  sections_dict[row_label.text] = tag

129          return sections_dict

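         # Editorial note: the markup this helper expects, reconstructed from the selectors
         # above (illustrative fragment, not copied from pldml.icm.edu.pl):
         #
         #     <div class="details-content">
         #       <div class="row">
         #         <div class="row-label">Tytuł artykułu</div>
         #         <div class="row-desc">...</div>
         #       </div>
         #       ...
         #     </div>
         #
         # The returned dict maps each row label to its "row-desc" Tag.
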
131      def parse_article_content(self, content, xissue, xarticle, url, pid):
132          sections_dict = self.parse_dmlpl_generic_page(content)

134          xarticle.title_tex = cleanup_str(sections_dict["Tytuł artykułu"].text)
135          xarticle.pid = pid

137          # Author
138          for a_tag in sections_dict["Autorzy"].select("a"):
139              href = a_tag.get("href")
140              if not isinstance(href, str):  # 140 ↛ 141: condition was never true
141                  raise ValueError("author href is not a string")
142              author = self.parse_author(self.download_file(self.source_website + "/" + href))
143              author["role"] = "author"
144              xarticle.contributors.append(author)

146          # TODO : Contributor ? (Twórcy)

148          # PDF
149          if "Treść / Zawartość" in sections_dict:  # 149 ↛ 160: condition was always true
150              pdf_a_tag = sections_dict["Treść / Zawartość"].select_one("a")
151              if not pdf_a_tag:  # 151 ↛ 152: condition was never true
152                  raise ValueError("Cannot find pdf for article")
153              pdf_url = pdf_a_tag.get("href")
154              if not isinstance(pdf_url, str):  # 154 ↛ 155: condition was never true
155                  raise ValueError("Cannot parse pdf url for article")
156              if not pdf_url.startswith("http"):
157                  pdf_url = self.source_website + "/" + pdf_url
158              add_pdf_link_to_xarticle(xarticle, pdf_url)
159          else:
160              print(f"[{pid}]PDF not found")

162          # Lang
163          xarticle.lang = cleanup_str(sections_dict["Języki publikacji"].text.lower())
164          if len(xarticle.lang) > 3:
165              if xarticle.lang == "pl fr":  # 165 ↛ 169: condition was always true
166                  xarticle.lang = "pl"
167                  print(f"[{xarticle.pid}] Patch : set article lang to 'pl' (was 'pl fr' before)")
168              else:
169                  raise ValueError("Cannot parse article lang")

171          # Abstract
172          if "Abstrakty" in sections_dict:  # 172 ↛ 190: condition was always true
173              abstract_divs = sections_dict["Abstrakty"].select("div.listing-row")
174              for div in abstract_divs:  # 174 ↛ 175: loop never started
175                  lang = "und"
176                  lang_div = div.select_one("div.articleDetails-langCell")
177                  if lang_div:
178                      lang = cleanup_str(lang_div.text).lower()
179                  text_div = div.select_one("div.articleDetails-abstract")
180                  if not text_div:
181                      raise ValueError(
182                          "Error while parsing abstract : abstract presence detected, but abstract cannot be parsed"
183                      )
184                  xabstract = create_abstract(
185                      tag="abstract", value_tex=cleanup_str(text_div.text), lang=lang
186                  )
187                  xarticle.abstracts.append(xabstract)

189          # Keywords
190          if "Słowa kluczowe" in sections_dict:  # 190 ↛ 202: condition was always true
191              keywords_lists = sections_dict["Słowa kluczowe"].select("div.listing-row")
192              for list in keywords_lists:  # 192 ↛ 193: loop never started
193                  lang = "und"
194                  lang_div = list.select_one("div.articleDetails-langCell")
195                  keywords_a_tags = list.select("a")
196                  for a_tag in keywords_a_tags:
197                      subject = create_subj()
198                      subject["value"] = a_tag.text
199                      subject["lang"] = lang
200                      xarticle.kwds.append(subject)
201          # Page
202          if "Strony" in sections_dict:
203              self.set_pages(xarticle, cleanup_str(sections_dict["Strony"].text))

205          return xarticle

207      def parse_author(self, content: str):
208          author = create_contributor()
209          sections_dict = self.parse_dmlpl_generic_page(content)
210          author["last_name"] = cleanup_str(sections_dict["Nazwisko"].text)
211          author["first_name"] = cleanup_str(sections_dict["Imię"].text)
212          if len(author["last_name"]) == 0 or len(author["first_name"]) == 0:  # 212 ↛ 213: condition was never true
213              author["string_name"] = cleanup_str(sections_dict["Twórca"].text)
214          return author