Coverage for src / crawler / by_source / dml_e_crawler.py: 28%

166 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import regex 

2from bs4 import BeautifulSoup, Tag 

3from ptf.model_data import ( 

4 ArticleData, 

5 IssueData, 

6 create_articledata, 

7 create_contributor, 

8 create_extlink, 

9) 

10 

11from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

12from crawler.crawler_utils import article_has_source 

13from crawler.models import ExtlinkChecked 

14from crawler.utils import add_pdf_link_to_xarticle 

15 

16 

17class Dml_eCrawler(MatchingCrawler): 

18 """ 

19 DML_E is quite peculiar : 

20 There is no issue page, and articles are separated into "years" instead of volumes/issues. 

21 volume/issue number is stored inside each article page. 

22 In order to be able to parse volume and issue numbers, we must parse the articles before creating volumes and issues. 

23 """ 

24 

25 source_domain = "DML_E" 

26 source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas " 

27 source_website = "http://dmle.icmat.es/revistas/" 

28 

# Shapes seen in the "Publicación" field: a leading year, then an optional
# volume, an optional "(number)" and an optional page range.
#   1987, 1: 1-17
#   1999,19: 1-11
#   2008, 53-62,
#   1963 (1-2):
#   2000, 51 (1): 49-58, 13 Ref.
#   2006, 57 (Extra): 327-342, 10 Ref.
# The negative lookahead right after the volume digits stops the first half
# of a page range ("53" in "2008, 53-62,") from being captured as a volume,
# which would otherwise leave page_start/page_end empty for that shape.
issue_regex = r"\d+,? ?(?:(?P<volume>\d+)(?![\d\-]),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"

36 

def parse_collection_content(self, content):
    """Gather the issues found on every paginated page of the collection.

    Each link under ``div.prevnext`` is resolved against
    ``source_website``, downloaded, and handed to
    ``parse_collection_page``. Links whose ``href`` is not a string are
    skipped.

    Returns:
        The issues of all visited pages, in page order.
    """
    landing_page = BeautifulSoup(content, "html.parser")
    collected = []
    for link in landing_page.select("div.prevnext a"):
        relative_url = link.get("href")
        if isinstance(relative_url, str):
            page_url = self.source_website + relative_url
            page_html = self.download_file(page_url)
            collected.extend(self.parse_collection_page(page_html, page_url))
    return collected

50 

def parse_collection_page(self, content: str, url: str):
    """Build one issue per ``ul.art_info`` list found on a collection page.

    The page is walked in document order. An ``<a name=...>`` anchor sets
    the current year; every following ``ul.art_info`` becomes an issue
    created with that year (passed both as year and as volume) and filled
    by ``parse_issue_tag``.

    Raises:
        ValueError: an anchor has no string ``name``, or an article list
            appears before any usable year.
    """
    page = BeautifulSoup(content, "html.parser")
    year = False
    found = []
    for node in page.select("a[name], ul.art_info"):
        if node.name != "a":
            # Article list: it belongs to the most recent year anchor.
            if not year:
                raise ValueError("Issue year not found")
            year_issue = self.create_xissue(url, year, year)
            self.parse_issue_tag(node, year_issue)
            found.append(year_issue)
            continue

        # Year anchor: remember it for the lists that follow.
        year = node.get("name")
        if not isinstance(year, str):
            raise ValueError("Issue year cannot be parsed")
    return found

69 

70 # def parse_issue_content(self, content, xissue): 

71 # pass 

72 

def parse_issue_tag(self, tag: Tag, xissue: IssueData):
    """Append one stub article per ``<li>`` of *tag* to ``xissue.articles``.

    Each stub carries the link text as title, the link target (prefixed
    with ``source_website``) as URL, and a positional pid ``a<index>``.

    Raises:
        ValueError: an item has no ``a[href]`` link, or its ``href`` is
            not a string.
    """
    for position, item in enumerate(tag.select("li")):
        link = item.select_one("a[href]")
        if not link:
            raise ValueError("Cannot parse article")
        relative_url = link.get("href")
        if not isinstance(relative_url, str):
            raise ValueError("Cannot parse Article URL")

        entry = create_articledata()
        entry.title_tex = link.text
        entry.url = self.source_website + relative_url
        entry.pid = f"a{position}"
        xissue.articles.append(entry)

91 

def parse_dml_e_article_content(self, content, xissue, xarticle: ArticleData, url, pid):
    """Fill *xarticle* from a DML-E article page and extract its issue info.

    Every row of the ``div#centro`` table is dispatched on the text of its
    ``<th>`` header; a row without header is expected to hold the PDF link.

    Args:
        content: HTML of the article page.
        xissue: not used by this method.
        xarticle: article updated in place.
        url: not used by this method.
        pid: identifier stored in ``xarticle.pid``.

    Returns:
        ``(xarticle, issue_volume, issue_number)``. Volume and number are
        ``None`` when there is no "Publicación" row or when the matching
        part of ``issue_regex`` is absent.

    Raises:
        ValueError: a row has no ``<td>``, the header-less row has no
            usable link, or the "Publicación" text does not match
            ``issue_regex``.
    """
    xarticle.pid = pid
    soup = BeautifulSoup(content, "html.parser")
    issue_volume: str | None = None
    issue_number: str | None = None

    # Hoisted out of the row loop: both are constant lookups.
    title_headers = ("Título español", "Título original", "Título inglés")
    languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}

    for line in soup.select("div#centro table tr"):
        header_tag = line.select_one("th")
        value_tag = line.select_one("td")
        if not value_tag:
            raise ValueError("Cannot parse article")

        # PDF: the only row without a header cell.
        if not header_tag:
            href_tag = line.select_one("a")
            href = href_tag.get("href") if href_tag else None
            if not isinstance(href, str):
                raise ValueError("Cannot parse article pdf link")
            add_pdf_link_to_xarticle(xarticle, self.source_website + href)
            continue

        header = header_tag.text
        value = value_tag.text

        if header in title_headers:
            # All title variants are written to title_tex, so the last
            # title row of the table is the one that is kept.
            xarticle.title_tex = value
        elif header == "Autor/es":
            for author_tag in value_tag.select("a"):
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = author_tag.text
                xarticle.contributors.append(author)
        elif header == "Publicación":
            # Only the first match is used, so search() replaces
            # materialising every finditer() result.
            match = regex.search(self.issue_regex, value)
            # NOTE(review): indentation was lost in the listing this was
            # restored from; this raise is read as the "no match" branch.
            # Confirm against the repository copy.
            if match is None:
                raise ValueError("issue volume or number not found")
            page_start = match.group("page_start")
            page_end = match.group("page_end")
            if page_start and page_end:
                xarticle.page_range = page_start + "-" + page_end
            # groupdict() always contains every named group, so the former
            # `"volume" in ...` / `"number" in ...` tests were always true.
            issue_volume = match.group("volume")
            issue_number = match.group("number")
        elif header == "Idioma":
            if value in languages:
                xarticle.lang = languages[value]
        elif header == "Código MathReviews":
            if value.startswith("MR"):
                xarticle.extids.append(("mr-item-id", value))
        elif header == "Código Z-Math":
            if value.startswith("Zbl "):
                xarticle.extids.append(("zbl-item-id", value.removeprefix("Zbl ")))

    return xarticle, issue_volume, issue_number

167 

def crawl_issue(self, xissue: IssueData):
    """Crawl a per-year pseudo issue and store its real volume/number issues.

    Every article of *xissue* is crawled with ``crawl_dml_e_article``,
    then grouped under a key built from the volume and number found on
    its page (falling back to the year). One issue is created per key.
    Unless ``self.dry`` is set, each non-empty group gets its article pids
    renumbered and is passed to ``process_resource_metadata`` and
    ``add_xissue_into_database``.

    Raises:
        ValueError: no grouping key could be derived for an article.
    """
    if getattr(xissue, "url", None):
        # parse_issue_content is not defined in this class; presumably
        # inherited from the base crawler.
        page = self.download_file(xissue.url)
        self.parse_issue_content(page, xissue)

    grouped: dict[str, IssueData] = {}

    for candidate in xissue.articles:
        crawled, volume, number = self.crawl_dml_e_article(candidate, xissue)
        if crawled is None:
            continue

        if volume or number:
            key = (volume or "") + "_" + (number or "")
        else:
            key = xissue.year
        if not key:
            raise ValueError("issue_tag is None")

        if key not in grouped:
            grouped[key] = self.create_xissue(
                xissue.url, xissue.year, volume, number or None
            )
        grouped[key].articles.append(crawled)

    for real_issue in grouped.values():
        if self.ignore_missing_pdf:
            real_issue.articles = [
                item for item in real_issue.articles if self.article_has_pdf(item)
            ]
        if self.dry:
            return

        has_pdf = self.article_has_pdf(real_issue)
        if not has_pdf and len(real_issue.articles) == 0:
            continue

        # Articles were numbered per year; renumber them within the issue.
        for position, item in enumerate(real_issue.articles):
            item.pid = f"{real_issue.pid}_a{position}"
        self.process_resource_metadata(real_issue, resource_type="issue")
        self.add_xissue_into_database(real_issue)

205 

def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
    """Download and parse one article page.

    The page at ``xarticle.url`` is parsed with
    ``parse_dml_e_article_content`` using the pid
    ``<xissue.pid>_<xarticle.pid>``. If the parsed article has a URL and
    no source link yet, a ``source`` ext-link pointing at that URL is
    added.

    Returns:
        ``(article, volume, number)`` where ``article`` is the result of
        ``process_article_metadata``.

    Raises:
        ValueError: the article has no URL.
    """
    if not getattr(xarticle, "url", None):
        raise ValueError("article does not have an url")

    html = self.download_file(xarticle.url)
    article_pid = f"{xissue.pid}_{xarticle.pid}"
    parsed, volume, number = self.parse_dml_e_article_content(
        html, xissue, xarticle, xarticle.url, article_pid
    )

    needs_source_link = not article_has_source(parsed) and parsed.url
    if needs_source_link:
        source_link = create_extlink()
        source_link["rel"] = "source"
        source_link["location"] = parsed.url
        source_link["metadata"] = self.source_domain
        parsed.ext_links.append(source_link)

    # The article title may have formulas surrounded with '$'
    processed = self.process_article_metadata(parsed)
    return processed, volume, number

228 

@classmethod
def check_pdf_link_validity(cls, url, verify=True):
    """Reject moving-wall links, otherwise defer to the parent check.

    A URL containing ``"Movingwall"`` does not lead to the article PDF,
    so it is reported as an error without being fetched.

    Args:
        url: link to check.
        verify: accepted for signature compatibility; see the note below.

    Returns:
        ``(False, None, {"status": ..., "message": ...})`` for a
        moving-wall URL, otherwise whatever the parent implementation
        returns.
    """
    # TODO this should be in the harvest tasks
    if "Movingwall" in url:
        # One message for both the console and the result, so they cannot
        # drift apart (the printed copy used to read "of o moving wall").
        message = "The url does not link to the PDF article because of a moving wall"
        print(message)
        return (
            False,
            None,
            {
                "status": ExtlinkChecked.Status.ERROR,
                "message": message,
            },
        )
    # NOTE(review): `verify` is not forwarded to the parent; confirm the
    # parent signature before passing it through.
    return super().check_pdf_link_validity(url)

246 return super().check_pdf_link_validity(url)