Coverage for src/crawler/by_source/dml_e_crawler.py: 28%

176 statements  

coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

import regex
import requests
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import article_has_pdf, article_has_source
from crawler.models.extlink_checked import ExtlinkChecked
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar: there is no issue page, and articles are grouped by
    "year" instead of by volume/issue. The volume/issue number is stored inside
    each article page, so in order to parse volume and issue numbers we must
    parse the articles before creating volumes and issues.
    """
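    # Crawl flow, as implemented below: parse_collection_content walks the
    # pagination, parse_collection_page and parse_issue_tag build one
    # provisional issue per year, then crawl_issue re-buckets the parsed
    # articles into the volume/issue pairs extracted by
    # parse_dml_e_article_content (via crawl_dml_e_article).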

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas "
    source_website = "http://dmle.icmat.es/revistas/"

    # Examples of "Publicación" strings that issue_regex must handle:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
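    # For example, on "2000, 51 (1): 49-58, 13 Ref." the first match yields
    # volume="51", number="1", page_start="49", page_end="58".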

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = False
        issues_tags = soup.select("a[name], ul.art_info")
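        # The selector returns year anchors (a[name]) and article lists
        # (ul.art_info) interleaved in document order, so each anchor sets the
        # year used for the article lists that follow it.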

        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
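        # The article page is a two-column table: the <th> cell carries a Spanish
        # label ("Título ...", "Autor/es", "Publicación", "Idioma") and the <td>
        # cell carries the value; the row without a <th> holds the PDF link.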

        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title
            if header_tag.text == "Título español":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título original":
                xarticle.title_tex = value_tag.text
                continue
            if header_tag.text == "Título inglés":
                xarticle.title_tex = value_tag.text
                continue

            # Authors
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue
            # Publication (volume, number, page range)
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    # raise ValueError("Cannot parse Article page")
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    if "volume" in volume_data:
                        issue_volume = volume_data["volume"]
                    if "number" in volume_data:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # Language
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles
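        # Articles are re-bucketed into one issue per (volume, number) pair
        # parsed from each article page; when neither is present, the year is
        # used as the bucket key.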

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if article_has_pdf(a)]

            if not self.dry and len(value.articles) > 0:
                self.process_resource_metadata(xissue, resource_type="issue")
                self.add_xissue_into_database(value)

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a URL")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number

    @classmethod
    def check_pdf_link_validity(cls, url: str, verify: bool):
        # We override this base_crawler method to handle PDF links that do not
        # point to the article PDF.
        # Avoid downloading the whole PDF.
        CHUNK_SIZE = 100  # number of bytes fetched
        # If the url contains "Movingwall", it does not lead to the article.
        if "Movingwall" in url:
            print("The url does not link to the PDF article because of a moving wall")
            return (
                False,
                "No query sent",
                {
                    "status": ExtlinkChecked.Status.ERROR,
                    "message": "The url does not link to the PDF article because of a moving wall",
                },
            )

        header = {
            "Range": f"bytes=0-{CHUNK_SIZE}",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
        }
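        # The Range header asks the server for only the first CHUNK_SIZE bytes;
        # servers that honour it reply with 206 Partial Content, which is enough
        # to read the "%PDF-x.y" magic bytes without downloading the whole file.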

        with requests.get(
            url, stream=True, allow_redirects=True, headers=header, verify=verify
        ) as response:
            content_type = response.headers.get("Content-Type", "")
            if "application/pdf" not in content_type:
                # Content type is wrong, let's check the PDF header instead
                try:
                    pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                    if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                        return (
                            False,
                            response,
                            {
                                "status": ExtlinkChecked.Status.ERROR,
                                "message": f"Content-Type header: {content_type}; PDF header not found: got {pdf_header}",
                            },
                        )
                    else:
                        return (
                            True,
                            response,
                            {
                                "status": ExtlinkChecked.Status.WARNING,
                                "message": f"Content-Type header: {content_type}",
                            },
                        )
                except StopIteration:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"Content-Type header: {content_type}.",
                        },
                    )
            try:
                pdf_header = next(response.iter_lines(chunk_size=CHUNK_SIZE))
                if regex.match(r"^%PDF-\d\.\d", pdf_header.decode()) is None:
                    return (
                        False,
                        response,
                        {
                            "status": ExtlinkChecked.Status.ERROR,
                            "message": f"PDF header not found: got {pdf_header}",
                        },
                    )
            except StopIteration:
                return (
                    False,
                    response,
                    {
                        "status": ExtlinkChecked.Status.ERROR,
                        "message": f"Content-Type header: {content_type}.",
                    },
                )

            if response.status_code not in (200, 206):
                raise ValueError("Invalid status code")

            return (
                True,
                response,
                {
                    "status": ExtlinkChecked.Status.OK,
                    "message": "",
                },
            )