Coverage for src/crawler/by_source/dml_e_crawler.py: 30%
147 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import regex
2from bs4 import BeautifulSoup, Tag
3from ptf.model_data import (
4 ArticleData,
5 IssueData,
6 create_articledata,
7 create_contributor,
8 create_extlink,
9)
11from crawler.base_crawler import BaseCollectionCrawler
12from crawler.utils import add_pdf_link_to_xarticle
class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar:
    There is no issue page, and articles are separated into "years" instead of volumes/issues.
    The volume/issue number is stored inside each article page.
    In order to be able to parse volume and issue numbers, we must parse the articles
    before creating volumes and issues.
    """

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Samples of the "Publicación" strings this regex must handle
    # (year, optional volume, optional "(number)", optional page range):
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"

    # Delay (seconds) between consecutive downloads from this source.
    requests_interval = 60
37 def parse_collection_content(self, content):
38 xissues = []
39 soup = BeautifulSoup(content, "html.parser")
40 pagination_elements = soup.select("div.prevnext a")
41 for page in pagination_elements:
42 href = page.get("href")
43 if not isinstance(href, str): 43 ↛ 44line 43 didn't jump to line 44 because the condition on line 43 was never true
44 continue
45 href = self.source_website + href
46 content = self.download_file(href)
47 xissues = [*xissues, *self.parse_collection_page(content, href)]
49 return xissues
51 def parse_collection_page(self, content: str, url: str):
52 soup = BeautifulSoup(content, "html.parser")
53 xissues = []
54 current_year = False
55 issues_tags = soup.select("a[name], ul.art_info")
56 for issue_tag in issues_tags:
57 if issue_tag.name == "a":
58 current_year = issue_tag.get("name")
59 if not isinstance(current_year, str): 59 ↛ 60line 59 didn't jump to line 60 because the condition on line 59 was never true
60 raise ValueError("Issue year cannot be parsed")
61 continue
63 if not current_year: 63 ↛ 64line 63 didn't jump to line 64 because the condition on line 63 was never true
64 raise ValueError("Issue year not found")
65 issue = self.create_xissue(url, current_year, current_year)
66 self.parse_issue_tag(issue_tag, issue)
67 xissues.append(issue)
68 return xissues
70 # def parse_issue_content(self, content, xissue):
71 # pass
73 def parse_issue_tag(self, tag: Tag, xissue: IssueData):
74 article_tags = tag.select("li")
75 for index, art_tag in enumerate(article_tags):
76 href_tag = art_tag.select_one("a[href]")
77 if not href_tag: 77 ↛ 78line 77 didn't jump to line 78 because the condition on line 77 was never true
78 raise ValueError("Cannot parse article")
79 url = href_tag.get("href")
80 if not isinstance(url, str): 80 ↛ 81line 80 didn't jump to line 81 because the condition on line 80 was never true
81 raise ValueError("Cannot parse Article URL")
82 url = self.source_website + url
84 title = href_tag.text
86 article = create_articledata()
87 article.title_tex = title
88 article.url = url
89 article.pid = "a" + str(index)
90 xissue.articles.append(article)
92 def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
93 xarticle.pid = pid
94 soup = BeautifulSoup(content, "html.parser")
95 table_lines = soup.select("div#centro table tr")
96 issue_volume: str | None = None
97 issue_number: str | None = None
98 for line in table_lines:
99 header_tag = line.select_one("th")
100 value_tag = line.select_one("td")
101 if not value_tag:
102 raise ValueError("Cannot parse article")
104 # PDF
105 if not header_tag:
106 href_tag = line.select_one("a")
107 if not href_tag:
108 raise ValueError("Cannot parse article pdf link")
109 href = href_tag.get("href")
110 if not isinstance(href, str):
111 raise ValueError("Cannot parse article pdf link")
112 add_pdf_link_to_xarticle(xarticle, self.source_website + href)
113 continue
115 # Title
116 if header_tag.text == "Título español":
117 xarticle.title_tex = value_tag.text
118 continue
119 if header_tag.text == "Título original":
120 xarticle.title_tex = value_tag.text
121 continue
122 if header_tag.text == "Título inglés":
123 xarticle.title_tex = value_tag.text
124 continue
126 # Author
127 if header_tag.text == "Autor/es":
128 authors_tags = value_tag.select("a")
129 for a in authors_tags:
130 author = create_contributor()
131 author["role"] = "author"
132 author["string_name"] = a.text
133 xarticle.contributors.append(author)
134 continue
135 # Page
136 if header_tag.text == "Publicación":
137 volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
138 if len(volume_re) != 0:
139 # raise ValueError("Cannot parse Article page")
140 volume_data = volume_re[0].groupdict()
142 if volume_data["page_start"] and volume_data["page_end"]:
143 xarticle.page_range = (
144 volume_data["page_start"] + "-" + volume_data["page_end"]
145 )
146 if "volume" in volume_data:
147 issue_volume = volume_data["volume"]
148 if "number" in volume_data:
149 issue_number = volume_data["number"]
150 else:
151 raise ValueError("issue volume or number not found")
153 # LANG
154 if header_tag.text == "Idioma":
155 languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
156 if value_tag.text in languages:
157 xarticle.lang = languages[value_tag.text]
159 return xarticle, issue_volume, issue_number
    def crawl_issue(self, xissue: IssueData):
        """Crawl every article of a "year" pseudo-issue, regroup the parsed
        articles into real issues keyed by the volume/number found on each
        article page, then store those issues in the database."""
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        # Articles of the same year may belong to different volumes/issues, so
        # they are regrouped using the volume/number parsed per article.
        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                # Neither volume nor number known: group by year instead.
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if self.article_has_pdf(a)]

            if not self.test_mode and len(value.articles) > 0:
                # NOTE(review): this processes the original *xissue*, while the
                # regrouped *value* is what gets stored below — confirm that
                # `value` was not the intended argument here.
                self.process_resource_metadata(xissue)
                self.add_xissue_into_database(value)
194 def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
195 parsed_xarticle = xarticle
196 if not hasattr(xarticle, "url") or not xarticle.url:
197 raise ValueError("article does not have an url")
198 # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")
200 content = self.download_file(xarticle.url)
201 pid = f"{xissue.pid}_{xarticle.pid}"
203 parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
204 content, xissue, xarticle, xarticle.url, pid
205 )
207 if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
208 ext_link = create_extlink()
209 ext_link["rel"] = "source"
210 ext_link["location"] = parsed_xarticle.url
211 ext_link["metadata"] = self.source_domain
212 parsed_xarticle.ext_links.append(ext_link)
214 # The article title may have formulas surrounded with '$'
215 return self.process_resource_metadata(parsed_xarticle), xissue_vol, xissue_number