Coverage for src/crawler/by_source/dml_e_crawler.py: 34%

144 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import ArticleData, IssueData, create_articledata, create_contributor

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class Dml_eCrawler(BaseCollectionCrawler):
    """
    DML_E is quite peculiar: there is no issue page, and articles are grouped
    by "year" instead of by volume/issue. The volume/issue number is stored
    inside each article page, so in order to be able to parse volume and issue
    numbers, we must parse the articles before creating the volumes and issues.
    """

    source_domain = "DML_E"
    source_name = "Proyecto DML-E: Biblioteca Digital de Matemáticas"
    source_website = "http://dmle.icmat.es/revistas/"

    periode_begin = 1968
    periode_end = 1969

    # Sample "Publicación" values that issue_regex must handle:
    # 1987, 1: 1-17
    # 1999,19: 1-11
    # 2008, 53-62,
    # 1963 (1-2):
    # 2000, 51 (1): 49-58, 13 Ref.
    # 2006, 57 (Extra): 327-342, 10 Ref.
    issue_regex = r"\d+,? ?(?:(?P<volume>\d+),? ?)?(?:\((?P<number>[\d\w\-]+)\))?(?:[:,])? ?(?:(?P<page_start>\d+)-(?P<page_end>\d+))?"
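    # Worked examples for issue_regex, traced by hand against the samples
    # above (illustrative, not exhaustively tested):
    #   "2000, 51 (1): 49-58, 13 Ref." -> volume="51", number="1",
    #                                     page_start="49", page_end="58"
    #   "2006, 57 (Extra): 327-342"    -> volume="57", number="Extra",
    #                                     page_start="327", page_end="342"
    #   "1963 (1-2):"                  -> volume=None, number="1-2", no pages
    #   "2008, 53-62,"                 -> "53" appears to be captured as the
    #                                     volume, not as a page range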

    requests_interval = 60

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        pagination_elements = soup.select("div.prevnext a")
        for page in pagination_elements:
            href = page.get("href")
            if not isinstance(href, str):
                continue
            href = self.source_website + href
            content = self.download_file(href)
            xissues = [*xissues, *self.parse_collection_page(content, href)]

        years = [int(issue.year) for issue in xissues]
        self.periode_begin = min(years)
        self.periode_end = max(years)
        self.periode = self.get_or_create_periode()
        return xissues

    def parse_collection_page(self, content: str, url: str):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        current_year = None
        issues_tags = soup.select("a[name], ul.art_info")
        for issue_tag in issues_tags:
            if issue_tag.name == "a":
                current_year = issue_tag.get("name")
                if not isinstance(current_year, str):
                    raise ValueError("Issue year cannot be parsed")
                continue

            if not current_year:
                raise ValueError("Issue year not found")
            issue = self.create_xissue(url, current_year, current_year)
            self.parse_issue_tag(issue_tag, issue)
            xissues.append(issue)
        return xissues

    # def parse_issue_content(self, content, xissue):
    #     pass

    def parse_issue_tag(self, tag: Tag, xissue: IssueData):
        article_tags = tag.select("li")
        for index, art_tag in enumerate(article_tags):
            href_tag = art_tag.select_one("a[href]")
            if not href_tag:
                raise ValueError("Cannot parse article")
            url = href_tag.get("href")
            if not isinstance(url, str):
                raise ValueError("Cannot parse Article URL")
            url = self.source_website + url

            title = href_tag.text

            article = create_articledata()
            article.title_tex = title
            article.url = url
            article.pid = "a" + str(index)
            xissue.articles.append(article)

    def parse_dml_e_article_content(self, content, xissue, xarticle, url, pid):
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        table_lines = soup.select("div#centro table tr")
        issue_volume: str | None = None
        issue_number: str | None = None
        for line in table_lines:
            header_tag = line.select_one("th")
            value_tag = line.select_one("td")
            if not value_tag:
                raise ValueError("Cannot parse article")

            # PDF
            if not header_tag:
                href_tag = line.select_one("a")
                if not href_tag:
                    raise ValueError("Cannot parse article pdf link")
                href = href_tag.get("href")
                if not isinstance(href, str):
                    raise ValueError("Cannot parse article pdf link")
                add_pdf_link_to_xarticle(xarticle, self.source_website + href)
                continue

            # Title (Spanish, original or English)
            if header_tag.text in ("Título español", "Título original", "Título inglés"):
                xarticle.title_tex = value_tag.text
                continue

            # Authors
            if header_tag.text == "Autor/es":
                authors_tags = value_tag.select("a")
                for a in authors_tags:
                    author = create_contributor()
                    author["role"] = "author"
                    author["string_name"] = a.text
                    xarticle.contributors.append(author)
                continue

            # Volume, issue number and page range
            if header_tag.text == "Publicación":
                volume_re = list(regex.finditer(self.issue_regex, value_tag.text))
                if len(volume_re) != 0:
                    volume_data = volume_re[0].groupdict()

                    if volume_data["page_start"] and volume_data["page_end"]:
                        xarticle.page_range = (
                            volume_data["page_start"] + "-" + volume_data["page_end"]
                        )
                    # groupdict() always contains every named group (None when
                    # unmatched), so test the values rather than the keys.
                    if volume_data["volume"]:
                        issue_volume = volume_data["volume"]
                    if volume_data["number"]:
                        issue_number = volume_data["number"]
                else:
                    raise ValueError("issue volume or number not found")

            # Language
            if header_tag.text == "Idioma":
                languages = {"Inglés": "en", "Español": "es", "Francés": "fr"}
                if value_tag.text in languages:
                    xarticle.lang = languages[value_tag.text]

        return xarticle, issue_volume, issue_number

    def crawl_issue(self, xissue: IssueData):
        if hasattr(xissue, "url") and xissue.url:
            content = self.download_file(xissue.url)
            self.parse_issue_content(content, xissue)

        dml_e_issues: dict[str, IssueData] = {}

        xarticles = xissue.articles

        for xarticle in xarticles:
            parsed_xarticle, xissue_vol, xissue_number = self.crawl_dml_e_article(xarticle, xissue)
            if parsed_xarticle is None:
                continue
            # Regroup the article into its real issue, keyed by
            # "<volume>_<number>", falling back to the year when neither
            # the volume nor the number could be parsed.
            if xissue_vol or xissue_number:
                issue_tag = (xissue_vol or "") + "_" + (xissue_number or "")
            else:
                issue_tag = xissue.year
            if not issue_tag:
                raise ValueError("issue_tag is None")
            if issue_tag not in dml_e_issues:
                dml_e_issues[issue_tag] = self.create_xissue(
                    xissue.url, xissue.year, xissue_vol, xissue_number or None
                )
            dml_e_issues[issue_tag].articles.append(parsed_xarticle)

        for value in dml_e_issues.values():
            if not self.test_mode and len(value.articles) > 0:
                self.add_xissue_into_database(value)

    def crawl_dml_e_article(self, xarticle: ArticleData, xissue: IssueData):
        parsed_xarticle = xarticle
        if not hasattr(xarticle, "url") or not xarticle.url:
            raise ValueError("article does not have a url")
        # self.progress_bar.text(f"{xarticle.pid} - {xarticle.url}")

        content = self.download_file(xarticle.url)
        pid = f"{xissue.pid}_{xarticle.pid}"

        parsed_xarticle, xissue_vol, xissue_number = self.parse_dml_e_article_content(
            content, xissue, xarticle, xarticle.url, pid
        )

        # The article title may contain formulas surrounded with '$'
        return self.process_article_metadata(parsed_xarticle), xissue_vol, xissue_number
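
# A minimal usage sketch (hypothetical: construction arguments and any setup
# depend on BaseCollectionCrawler, which is not shown here):
#
#     crawler = Dml_eCrawler(...)  # assumed constructor
#     content = crawler.download_file(crawler.source_website)
#     for provisional_issue in crawler.parse_collection_content(content):
#         crawler.crawl_issue(provisional_issue)  # regroups into real issues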