Coverage for src/crawler/by_source/dml_e_crawler.py: 34% (144 statements)

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import ArticleData, IssueData, create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar:
    there is no issue page, and articles are grouped by "year" instead of by volume/issue.
    The volume and issue numbers are stored inside each article page, so to be able to
    build volumes and issues we must parse the articles first.
    """

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas"
    source_website = "http://dmle.icmat.es/revistas/"

    periode_begin = 1968
    periode_end = 1969

    # Sample "Publicación" values the regex below must handle:
    #   1987, 1: 1-17
    #   1999,19: 1-11
    #   2008, 53-62,
    #   1963 (1-2):
    #   2000, 51 (1): 49-58, 13 Ref.
    #   2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
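
    # A quick sanity check of issue_regex against one of the samples above; the
    # expected groupdict is an assumption based on the pattern as written:
    #
    #   >>> import regex
    #   >>> m = regex.search(Dml_eCrawler.issue_regex, "2000, 51 (1): 49-58, 13 Ref.")
    #   >>> m.groupdict()
    #   {'volume': '51', 'number': '1', 'page_start': '49', 'page_end': '58'}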

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        # Follow every pagination link and merge the issues found on each page
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        # Derive the collection period from the years actually seen
        years = [int(issue.year) for issue in xissues]
        self.periode_begin = min(years)
        self.periode_end = max(years)
        self.periode = self.get_or_create_periode()
        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = None
        # Year anchors and article lists are interleaved; see the markup sketch below
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

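    # A sketch of the listing markup parse_collection_page expects, inferred from the
    # selectors above; the exact attributes on the real page are an assumption:
    #
    #   <a name="1999"></a>
    #   <ul class="art_info">
    #     <li><a href="...">Article title</a></li>
    #   </ul>
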
    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF: rows without a header cell hold the download link
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title: each matching row overwrites the previous one, so the last
            # title row present on the page wins
            if header_tag.text in ("Título español", "Título original", "Título inglés"):
                xarticle.title_tex = value_tag.text
                continue

            # Authors
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue

            # Volume, issue number and page range
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) == 0:
                    raise ValueError("issue volume or number not found")
                volume_data = volume_re[0].groupdict()

                if volume_data["page_start"] and volume_data["page_end"]:
                    xarticle.page_range = (
                        volume_data["page_start"] + "-" + volume_data["page_end"]
                    )
                # groupdict() always contains every named group, so check the
                # values (None when a group did not participate in the match)
                # instead of key membership
                if volume_data["volume"]:
                    issue_volume = volume_data["volume"]
                if volume_data["number"]:
                    issue_number = volume_data["number"]
                continue

            # Language
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        # Regroup articles into real issues, keyed by the volume/number parsed
        # from each article page (falling back to the year)
        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if not self.test_mode and len(value.articles) > 0:
                self.add_xissue_into_database(value)

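    # Illustrative values for the regrouping key built in crawl_issue:
    #   volume "51", number "1"  -> issue_tag "51_1"
    #   volume "51", no number   -> issue_tag "51_"
    #   neither parsed           -> issue_tag falls back to xissue.year, e.g. "1999"
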
    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a URL")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number
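

# A minimal sketch of how this crawler might be driven end to end; the constructor
# arguments are an assumption (BaseCollectionCrawler's real signature may differ),
# while download_file, parse_collection_content and crawl_issue are used above:
#
#   crawler = Dml_eCrawler(collection_id="...", collection_url=Dml_eCrawler.source_website)
#   content = crawler.download_file(crawler.source_website)
#   for xissue in crawler.parse_collection_content(content):
#       crawler.crawl_issue(xissue)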