Coverage for src/crawler/by_source/dml_e_crawler.py: 31%

155 statements  

coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf, article_has_source
from crawler.models import ExtlinkChecked
from crawler.utils import add_pdf_link_to_xarticle

class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar:
    there is no issue page, and articles are grouped by "year" instead of by volume/issue.
    The volume/issue number is stored inside each article page.
    To be able to parse volume and issue numbers, we must parse the articles before creating the volumes and issues.
    """
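    # Rough flow: parse_collection_content() follows the pagination links,
    # parse_collection_page() builds one provisional "issue" per year, and
    # crawl_issue() then downloads every article, reads its "Publicación" row
    # and regroups the articles into real volume/issue buckets before saving.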

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Sample "Publicación" strings the regex below has to cover:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
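    # For instance, on "2000, 51 (1): 49-58, 13 Ref." the first match should
    # yield volume="51", number="1", page_start="49", page_end="58"; on
    # "1963 (1-2):" only number="1-2" is captured.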

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = False
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):  # coverage: branch never taken
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:  # coverage: branch never taken
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:  # coverage: branch never taken
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):  # coverage: branch never taken
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title
            if header_tag.text == "Título español":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título original":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título inglés":
                xarticle.title_tex = value_tag.text
                continue

            # Author
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue

            # Publication string (volume, issue number and page range)
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    # raise ValueError("Cannot parse Article page")
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    if "volume" in volume_data:
                        issue_volume = volume_data["volume"]
                    if "number" in volume_data:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # LANG
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        # Save each regrouped issue.
        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if article_has_pdf(a)]

            if not self.dry and len(value.articles) > 0:
                self.process_resource_metadata(value, resource_type="issue")
                self.database_executor.submit(
                    self.add_xissue_into_database, value
                ).add_done_callback(self._handle_future_exceptions)
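    # Note on crawl_issue() above: an article whose "Publicación" field yields
    # volume "51" and number "1" lands in the bucket keyed "51_1"; articles
    # without a volume/number fall back to a single bucket keyed by the year.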

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have an url")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number

    @classmethod
    def check_pdf_link_validity(cls, url, verify=True):
        # We override this base_crawler method to handle PDF links that do not point to an article PDF.
        # Avoid downloading the whole PDF
        # CHUNK_SIZE = 100  # number of characters fetched
        # If the url contains Movingwall it does not lead to the article
        # TODO this should be in the harvest tasks
        if "Movingwall" in url:
            print("The url does not link to the PDF article because of a moving wall")
            return (
                False,
                None,
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "The url does not link to the PDF article because of a moving wall",
                },
            )
        return super().check_pdf_link_validity(url)