Coverage for src / crawler / by_source / dml_e_crawler.py: 28%
166 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import regex
2from bs4 import BeautifulSoup, Tag
3from ptf.model_data import (
4 ArticleData,
5 IssueData,
6 create_articledata,
7 create_contributor,
8 create_extlink,
9)
11from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
12from crawler.crawler_utils import article_has_source
13from crawler.models import ExtlinkChecked
14from crawler.utils import add_pdf_link_to_xarticle
class Dml_eCrawler(MatchingCrawler):
    """
    Crawler for the DML-E digital library.

    DML_E is quite peculiar:
    There is no issue page, and articles are separated into "years" instead of volumes/issues.
    The volume/issue number is stored inside each article page.
    In order to be able to parse volume and issue numbers, we must parse the articles
    before creating volumes and issues.
    """

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Samples of the "Publicación" field this pattern has to cope with:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = (
        # Year.
        r"\d+,? ?"
        # Optional volume. The lookahead rejects digits that are really the
        # first page of a range ("2008, 53-62,"): without it, 53 is captured
        # as the volume and the page range is lost.
        r"(?:(?P<volume>\d+)(?![\d\-]),? ?)?"
        # Optional issue number between parentheses, e.g. "(1-2)" or "(Extra)".
        r"(?:\((?P<number>[\d\w\-]+)\))?"
        r"(?:[:,])? ?"
        # Optional page range.
        r"(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
    )

    def parse_collection_content(self, content):
        """
        Parse the collection page and collect the issues of every paginated page.

        Every link found under ``div.prevnext`` is downloaded and handed to
        `parse_collection_page`. Links without a string ``href`` are skipped.

        Returns the list of issues gathered from all the pages.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for page in soup.select("div.prevnext a"):
            href = page.get("href")
            if not isinstance(href, str):
                continue
            page_url = self.source_website + href
            page_content = self.download_file(page_url)
            xissues.extend(self.parse_collection_page(page_content, page_url))

        return xissues

    def parse_collection_page(self, content: str, url: str):
        """
        Parse one paginated collection page into a list of issues (one per article list).

        The page interleaves ``<a name="...">`` anchors, whose name is used as the
        year, with ``<ul class="art_info">`` article lists belonging to the most
        recent anchor.

        Raises:
            ValueError: if an anchor name is not a string, or if an article list
                appears before any usable year anchor.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year: str | None = None
        for issue_tag in soup.select("a[name], ul.art_info"):
            if issue_tag.name == "a":
                year = issue_tag.get("name")
                if not isinstance(year, str):
                    raise ValueError("Issue year cannot be parsed")
                current_year = year
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            # The year is passed for both the second and third arguments of create_xissue.
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        """
        Append one article stub (title, url, provisional pid) to ``xissue`` per ``<li>`` of ``tag``.

        Raises:
            ValueError: if a list item has no link or the link has no string ``href``.
        """
        for index, art_tag in enumerate(tag.select("li")):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            href = href_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse Article URL")

            article = create_articledata()
            article.title_tex = href_tag.text
            article.url = self.source_website + href
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle: ArticleData, url, pid):
        """
        Fill ``xarticle`` from the metadata table of an article page.

        ``xissue`` and ``url`` are accepted but not used by this method.

        Returns:
            A tuple ``(xarticle, issue_volume, issue_number)``. The volume and number
            come from the "Publicación" row and are ``None`` when absent.

        Raises:
            ValueError: if a table row has no ``<td>``, or if the PDF row has no
                link with a string ``href``.
        """
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
        title_headers = ("Título español", "Título original", "Título inglés")
        issue_volume: str | None = None
        issue_number: str | None = None
        for line in soup.select("div#centro table tr"):
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF: a row without a header cell is treated as the PDF link row.
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            header = header_tag.text
            value = value_tag.text

            # Title: every title row overwrites the previous one, so the last row wins.
            if header in title_headers:
                xarticle.title_tex = value
                continue

            # Author
            if header == "Autor/es":
                for author_tag in value_tag.select("a"):
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = author_tag.text
                    xarticle.contributors.append(author)
                continue

            # Page, volume and number
            if header == "Publicación":
                match = regex.search(self.issue_regex, value)
                if match is not None:
                    # groupdict() always holds every named group (None when it
                    # did not participate), so no membership test is needed.
                    volume_data = match.groupdict()
                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    issue_volume = volume_data["volume"]
                    issue_number = volume_data["number"]

            # Language
            if header == "Idioma" and value in languages:
                xarticle.lang = languages[value]

            if header == "Código MathReviews" and value.startswith("MR"):
                xarticle.extids.append(("mr-item-id", value))
            if header == "Código Z-Math" and value.startswith("Zbl "):
                xarticle.extids.append(("zbl-item-id", value.removeprefix("Zbl ")))

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        """
        Crawl the articles of a "year" pseudo-issue and regroup them into real issues.

        Each article is crawled first. It is then filed under a key of the form
        ``"<volume>_<number>"`` when either value was found on its page, otherwise
        under the year. One issue is created per key and, unless ``self.dry`` is
        set, stored in the database.

        Raises:
            ValueError: if no grouping key can be computed for an article.
        """
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        for xarticle in xissue.articles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if self.article_has_pdf(a)]
            if self.dry:
                return
            issue_has_pdf = self.article_has_pdf(value)
            if len(value.articles) == 0 and not issue_has_pdf:
                continue
            # Article pids can only be finalised now that the real issue is known.
            for index, article in enumerate(value.articles):
                article.pid = f"{value.pid}_a{index}"
            self.process_resource_metadata(value, resource_type="issue")
            self.add_xissue_into_database(value)

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        """
        Download and parse one article page.

        Returns:
            A tuple ``(article, issue_volume, issue_number)`` where ``article`` is the
            result of ``process_article_metadata`` on the parsed article.

        Raises:
            ValueError: if the article has no url.
        """
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have an url")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number

    @classmethod
    def check_pdf_link_validity(cls, url, verify=True):
        """
        Check a PDF link, rejecting moving-wall urls without downloading anything.

        Overrides the inherited check to handle links that do not point to the
        article PDF: a url containing "Movingwall" does not lead to the article.
        Any other url is delegated to the parent implementation, with ``verify``
        passed through.

        Returns:
            For a moving-wall url, ``(False, None, info)`` where ``info`` holds an
            error status and message; otherwise whatever the parent check returns.
        """
        # TODO this should be in the harvest tasks
        if "Movingwall" in url:
            message = "The url does not link to the PDF article because of a moving wall"
            print(message)
            return (
                False,
                None,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": message,
                },
            )
        # Forward `verify`: dropping it would silently ignore the caller's choice.
        return super().check_pdf_link_validity(url, verify)