Coverage for src/crawler/by_source/dml_e_crawler.py: 31% (155 statements)
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf, article_has_source
from crawler.models import ExtlinkChecked
from crawler.utils import add_pdf_link_to_xarticle

class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar:
    there is no issue page, and articles are grouped by "year" instead of by volume/issue.
    The volume/issue number is stored inside each article page.
    To be able to parse volume and issue numbers, we must parse the articles before creating volumes and issues.
    """
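
    # Crawl flow (informal sketch, inferred from the methods below):
    #   - parse_collection_content follows the "div.prevnext" pagination links,
    #   - parse_collection_page creates one provisional issue per year anchor,
    #   - crawl_issue parses every article, then regroups the parsed articles
    #     into volume/number issues before saving them.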

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Sample "Publicación" values that issue_regex must handle:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
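    # Worked example (sketch, not checked against the live site): on
    # "2000, 51 (1): 49-58, 13 Ref." the first match should yield
    # volume="51", number="1", page_start="49" and page_end="58".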

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = False
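        # The collection page interleaves year anchors (<a name="...">) with article
        # lists (<ul class="art_info">); each list belongs to the most recently seen
        # year anchor.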
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
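        # The article page stores its metadata in a table: each row pairs a <th>
        # label ("Título ...", "Autor/es", "Publicación", "Idioma") with a <td>
        # value; a row without a <th> carries the link to the article PDF.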
        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title
            if header_tag.text == "Título español":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título original":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título inglés":
                xarticle.title_tex = value_tag.text
                continue

            # Author
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue
            # Page
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    # raise ValueError("Cannot parse Article page")
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    if "volume" in volume_data:
                        issue_volume = volume_data["volume"]
                    if "number" in volume_data:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # LANG
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
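            # Regroup articles into their real issues, keyed by volume/number
            # (e.g. volume "51" and number "1" give the key "51_1"); articles
            # with neither fall back to the year of the provisional issue.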
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if article_has_pdf(a)]

            if not self.dry and len(value.articles) > 0:
                # save the regrouped per-volume issue, not the provisional year-level xissue
                self.process_resource_metadata(value, resource_type="issue")
                self.database_executor.submit(
                    self.add_xissue_into_database, value
                ).add_done_callback(self._handle_future_exceptions)

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a URL")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number

    @classmethod
    def check_pdf_link_validity(cls, url, verify=True):
        # We override this base_crawler method to handle PDF links that do not lead to the article PDF.
        # Avoid downloading the whole PDF
        # CHUNK_SIZE = 100  # number of characters fetched
        # If the url contains "Movingwall", it does not lead to the article
        # TODO this should be in the harvest tasks
        if "Movingwall" in url:
            print("The url does not link to the PDF article because of a moving wall")
            return (
                False,
                None,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "The url does not link to the PDF article because of a moving wall",
                },
            )
        return super().check_pdf_link_validity(url)