Coverage for src/crawler/by_source/dml_e_crawler.py: 30%

147 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar: there is no issue page, and articles are separated
    into "years" instead of volumes/issues. The volume/issue number is stored
    inside each article page, so to be able to parse volume and issue numbers
    we must parse the articles before creating the volumes and issues.
    """

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas"
    source_website = "http://dmle.icmat.es/revistas/"

    # Examples of "Publicación" strings this regex must handle:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
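
    # A minimal sketch of what issue_regex captures from the samples above,
    # assuming the regex module's re-compatible search()/groupdict() API:
    #
    #   regex.search(issue_regex, "2000, 51 (1): 49-58, 13 Ref.").groupdict()
    #   # -> {"volume": "51", "number": "1", "page_start": "49", "page_end": "58"}
    #
    #   regex.search(issue_regex, "1963 (1-2):").groupdict()
    #   # -> {"volume": None, "number": "1-2", "page_start": None, "page_end": None}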

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year: str | None = None
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues
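
    # The collection pages interleave year anchors with article lists; a
    # hypothetical minimal sketch of the markup the selectors above expect
    # (not verbatim DML-E output):
    #
    #   <a name="1987"></a>
    #   <ul class="art_info">
    #     <li><a href="...">Article title</a></li>
    #   </ul>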

69 

70 # def parse_issue_content(self, content, xissue): 

71 # pass 

72 

73 def parse_issue_tag(self, tag: Tag, xissue: IssueData): 

74 article_tags = tag.select("li") 

75 for index, art_tag in enumerate(article_tags): 

76 href_tag = art_tag.select_one("a[href]") 

77 if not href_tag: 77 ↛ 78line 77 didn't jump to line 78 because the condition on line 77 was never true

78 raise ValueError("Cannot parse article") 

79 url = href_tag.get("href") 

80 if not isinstance(url, str): 80 ↛ 81line 80 didn't jump to line 81 because the condition on line 80 was never true

81 raise ValueError("Cannot parse Article URL") 

82 url = self.source_website + url 

83 

84 title = href_tag.text 

85 

86 article = create_articledata() 

87 article.title_tex = title 

88 article.url = url 

89 article.pid = "a" + str(index) 

90 xissue.articles.append(article) 

91 

92 def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid): 

93 xarticle.pid = pid 

94 soup = BeautifulSoup(content, "html.parser") 

95 table_lines = soup.select("div#centro table tr") 

96 issue_volume: str | None = None 

97 issue_number: str | None = None 

98 for line in table_lines: 

99 header_tag = line.select_one("th") 

100 value_tag = line.select_one("td") 

101 if not value_tag: 

102 raise ValueError("Cannot parse article") 

103 

104 # PDF 

105 if not header_tag: 

106 href_tag = line.select_one("a") 

107 if not href_tag: 

108 raise ValueError("Cannot parse article pdf link") 

109 href = href_tag.get("href") 

110 if not isinstance(href, str): 

111 raise ValueError("Cannot parse article pdf link") 

112 add_pdf_link_to_xarticle(xarticle, self.source_website + href) 

113 continue 

114 

115 # Title 

116 if header_tag.text == "Título español": 

117 xarticle.title_tex = value_tag.text 

118 continue 

119 if header_tag.text == "Título original": 

120 xarticle.title_tex = value_tag.text 

121 continue 

122 if header_tag.text == "Título inglés": 

123 xarticle.title_tex = value_tag.text 

124 continue 

125 

126 # Author 

127 if header_tag.text == "Autor/es": 

128 authors_tags = value_tag.select("a") 

129 for a in authors_tags: 

130 author = create_contributor() 

131 author["role"] = "author" 

132 author["string_name"] = a.text 

133 xarticle.contributors.append(author) 

134 continue 

135 # Page 

136 if header_tag.text == "Publicación": 

137 volume_re = list(regex.finditer(self.issue_regex, value_tag.text)) 

138 if len(volume_re) != 0: 

139 # raise ValueError("Cannot parse Article page") 

140 volume_data = volume_re[0].groupdict() 

141 

142 if volume_data["page_start"] and volume_data["page_end"]: 

143 xarticle.page_range = ( 

144 volume_data["page_start"] + "-" + volume_data["page_end"] 

145 ) 

146 if "volume" in volume_data: 

147 issue_volume = volume_data["volume"] 

148 if "number" in volume_data: 

149 issue_number = volume_data["number"] 

150 else: 

151 raise ValueError("issue volume or number not found") 

152 

153 # LANG 

154 if header_tag.text == "Idioma": 

155 languages = {"Inglés": "en", "Español": "es", "Francés": "fr"} 

156 if value_tag.text in languages: 

157 xarticle.lang = languages[value_tag.text] 

158 

159 return xarticle, issue_volume, issue_number 
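
    # The article page stores its metadata in <div id="centro"> table rows; a
    # hypothetical minimal sketch matching the branches above (not verbatim
    # DML-E output):
    #
    #   <tr><th>Título español</th><td>...</td></tr>
    #   <tr><th>Autor/es</th><td><a>...</a> <a>...</a></td></tr>
    #   <tr><th>Publicación</th><td>2000, 51 (1): 49-58, 13 Ref.</td></tr>
    #   <tr><th>Idioma</th><td>Español</td></tr>
    #   <tr><td><a href="...">PDF</a></td></tr>   <!-- no <th>: the PDF branch -->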

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if self.ignore_missing_pdf:
                value.articles = [a for a in value.articles if self.article_has_pdf(a)]

            if not self.test_mode and len(value.articles) > 0:
                # Process the regrouped issue (not the original year-keyed
                # stub) before saving it.
                self.process_resource_metadata(value)
                self.add_xissue_into_database(value)
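
    # Worked example of the regrouping above: articles whose "Publicación"
    # rows parse to (volume="51", number="1") and (volume="57", number="Extra")
    # land in issues keyed "51_1" and "57_Extra"; an article with neither
    # falls back to the year key (e.g. "1987").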

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a URL")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        if not self.article_has_source(parsed_xarticle) and parsed_xarticle.url:
            ext_link = create_extlink()
            ext_link["rel"] = "source"
            ext_link["location"] = parsed_xarticle.url
            ext_link["metadata"] = self.source_domain
            parsed_xarticle.ext_links.append(ext_link)

        # The article title may have formulas surrounded with '$'
        return self.process_resource_metadata(parsed_xarticle), xissue_vol, xissue_number