Coverage for src/crawler/by_source/dml_e_crawler.py: 28%
176 statements
coverage.py v7.12.0, created at 2025-12-23 15:27 +0000
import regex
import requests
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf, article_has_source
from crawler.models.extlink_checked import ExtlinkChecked
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
19 """
20 DML_E is quite peculiar :
21 There is no issue page, and articles are separated into "years" instead of volumes/issues.
22 volume/issue number is stored inside each article page.
23 In order to being able to parse volume and issue numbers, we must parse the articles before creating volumes and issues.
24 """
    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
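    # For illustration: on the last sample above, the first match of issue_regex should
    # yield a groupdict() roughly like
    #   {"volume": "57", "number": "Extra", "page_start": "327", "page_end": "342"},
    # which is how parse_dml_e_article_content recovers the volume, number and page range.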
    requests_interval = 60
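    # The collection index is paginated: parse_collection_content follows every link in
    # "div.prevnext" and merges the issues found on each page.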
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues
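    # Each collection page interleaves <a name="..."> anchors (one per year) with
    # "ul.art_info" lists holding that year's articles, so the loop below keeps track of
    # the year seen last and attaches the following article lists to it.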
    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = False
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues
    # def parse_issue_content(self, content, xissue):
    #     pass
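    # Each <li> in a year's "ul.art_info" block is one article: the link text is the
    # title and the href (relative to source_website) is the article page URL.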
    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)
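    # Article metadata lives in a table under "div#centro": each row pairs a <th> label
    # (title, authors, "Publicación", language, ...) with a <td> value, and the row
    # without a <th> carries the link to the PDF.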
    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title
            if header_tag.text == "Título español":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título original":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título inglés":
                xarticle.title_tex = value_tag.text
                continue

            # Author
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue
            # Page
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    # raise ValueError("Cannot parse Article page")
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    if "volume" in volume_data:
                        issue_volume = volume_data["volume"]
                    if "number" in volume_data:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # LANG
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number
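    # The "year" issues built from the collection pages are only provisional: crawl_issue
    # re-groups the crawled articles into real issues keyed by the volume/number parsed
    # from each article page, falling back to the year when neither is present.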
    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if article_has_pdf(a)]

            if not self.dry and len(value.articles) > 0:
                self.process_resource_metadata(xissue, resource_type="issue")
                self.add_xissue_into_database(value)
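    # crawl_dml_e_article downloads and parses a single article page, then makes sure the
    # article carries a "source" ext_link pointing back to the DML-E page it came from.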
    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have an url")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number
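    # Instead of downloading candidate PDFs in full, the override below requests only the
    # first bytes (via an HTTP Range header) and accepts the link if either the
    # Content-Type is application/pdf or the body starts with a "%PDF-x.y" magic marker.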
    @classmethod
    def check_pdf_link_validity(cls, url: str, verify: bool):
        # We override this base_crawler method to handle links that do not point to an article PDF.
        # Avoid downloading the whole PDF
        CHUNK_SIZE = 100  # number of characters fetched
        # If the url contains Movingwall, it does not lead to the article
        if "Movingwall" in url:
            print("The url does not link to the PDF article because of a moving wall")
            return (
                False,
                "No query sent",
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "The url does not link to the PDF article because of a moving wall",
                },
            )
        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
        with requests.get(
            url, stream=True, allow_redirects=True, headers=header, verify=verify
        ) as response:
            content_type = response.headers.get("Content-Type", "")
            if "application/pdf" not in content_type:
                # Content type is wrong, let's check the PDF header
                try:
                    pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                    if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                        return (
                            False,
                            response,
                            {
                                "status": ExtlinkChecked.Status.ERROR,
                                "message": f"Content-Type header: {content_type}; PDF header not found: got {pdf_header}",
                            },
                        )
                    else:
                        return (
                            True,
                            response,
                            {
                                "status": ExtlinkChecked.Status.WARNING,
                                "message": f"Content-Type header: {content_type}",
                            },
                        )
                except StopIteration:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"Content-Type header: {content_type}.",
                        },
                    )
            try:
                pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"PDF header not found: got {pdf_header}",
                        },
                    )
            except StopIteration:
                return (
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": f"Content-Type header: {content_type}.",
                    },
                )
            if response.status_code not in (200, 206):
                raise ValueError("Invalid status code")

            return (
                True,
                response,
                {
                    "status": ExtlinkChecked.Status.OK,
                    "message": "",
                },
            )
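

if __name__ == "__main__":
    # Minimal usage sketch: the URL below is a made-up placeholder, and a real check
    # needs network access to the source site; check_pdf_link_validity returns
    # (ok, response_or_message, details) as defined above.
    ok, response, details = Dml_eCrawler.check_pdf_link_validity(
        "http://dmle.icmat.es/pdf/EXAMPLE.pdf", verify=True
    )
    print(ok, details["status"], details["message"])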