Coverage for src/crawler/by_source/elibm_crawler.py: 76%

159 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup

from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle

from ptf.model_data import create_articledata
from ptf.model_data import create_contributor
from ptf.model_data import create_issuedata
from ptf.model_data import create_subj



class ElibmCrawler(BaseCollectionCrawler):
    source_name = "Electronic Library of Mathematics"
    source_domain = "ELIBM"
    source_website = "https://www.elibm.org"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the
        # entire collection catalogue.

        self.source = self.get_or_create_source()

        if self.collection_id == "DOCMA":
            self.delimiter_inline_formula = "\\("
            self.delimiter_disp_formula = "\\["
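
    # Note on the DOCMA delimiters above: the values are Python string escapes,
    # so the opening marker for an inline TeX formula in DOCMA metadata is the
    # two characters "\(", and "\[" opens a display formula.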


    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the eLibM collection's issues and return a
        list of xissues. Each xissue has its pid/volume/number/year metadata
        plus its url.

        self.periode is set at the end, based on the xissue years found in the
        HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        link_nodes = soup.find_all("a")

        # eLibM puts special issue titles as the volume number;
        # to create an issue pid, we use S1, S2...
        last_special_issue_number = 0

        for link_node in link_nodes:
            url = link_node.get("href")
            text = link_node.get_text()
            if url.startswith("/issue"):
                xissue, last_special_issue_number = self.create_xissue(
                    url, text, last_special_issue_number
                )

                # eLibM lists the special issues at the end:
                # set periode_begin when the first special issue is found.
                if last_special_issue_number == 1:  # coverage: never true
                    self.periode_begin = self.get_first_year(xissues[-1].year)

                if xissue:  # coverage: always true
                    xissues.append(xissue)

        self.periode_end = self.get_first_year(xissues[0].year)

        if last_special_issue_number == 0:  # coverage: always true
            self.periode_begin = self.get_first_year(xissues[-1].year)

        self.periode = self.get_or_create_periode()

        return xissues
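
    # Illustrative sketch (assumed markup, reconstructed from the parsing
    # logic above rather than copied from elibm.org): the collection page is
    # expected to contain issue links such as
    #
    #     <a href="/issue/10">21 (2016)</a>
    #     <a href="/issue/11">Mahler Selecta (2016)</a>
    #
    # Every <a> whose href starts with "/issue" becomes one xissue; the
    # non-numeric volume in the second link would be treated as a special
    # issue (pid suffix S1).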


    def get_first_year(self, year):
        if "/" in year:  # coverage: never true
            year = year.split("/")[0]

        return year
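
    # For example (hypothetical input): get_first_year("1998/1999") returns
    # "1998", while a plain "2016" is returned unchanged. Note that
    # create_xissue below already replaces "/" with "-" in years, which is why
    # the branch above is never taken in the recorded crawl.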


    def create_xissue(self, url, text, last_special_issue_number):
        if "(" not in text or ")" not in text:  # coverage: never true
            # Return a pair so the call site in parse_collection_content can
            # always unpack the result (a bare `return None` would raise a
            # TypeError there).
            return None, last_special_issue_number

        parts = text.split("(")

        year = parts[1].split(")")[0]
        year = year.replace("/", "-")

        # The volume might not be an integer: eLibM puts special issue titles
        # as the volume number.
        volume = parts[0].strip()

        number = ""
        if "No. " in volume:  # coverage: never true
            parts = volume.split("No. ")
            volume = parts[0].strip()
            number = parts[1].strip()

        try:
            volume_for_pid = int(volume)
        except ValueError:
            last_special_issue_number += 1
            volume_for_pid = f"S{last_special_issue_number}"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.url = self.source_website + url

        return xissue, last_special_issue_number
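
    # Illustrative pid construction (hypothetical inputs, assuming
    # collection_id == "DOCMA"):
    #   create_xissue("/issue/10", "21 (2016)", 0)
    #       -> pid "DOCMA_2016__21_", volume "21"
    #   create_xissue("/issue/11", "Mahler Selecta (2016)", 0)
    #       -> pid "DOCMA_2016__S1_", volume "Mahler Selecta"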


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("div", {"class": "title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: always true
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + url

                # eLibM lists the articles in reverse order, except for one
                # special issue.
                if xissue.volume == "Mahler Selecta":  # coverage: never true
                    xissue.articles.append(xarticle)
                else:
                    xissue.articles.insert(0, xarticle)

        # If the issue has only 1 article, eLibM skips the issue page and
        # directly displays the article page.
        if len(xissue.articles) == 0:  # coverage: never true
            title_node = soup.find("h2", {"class": "document_title"})
            if title_node is not None:
                xarticle = create_articledata()
                xarticle.pid = "a0"
                xarticle.url = xissue.url

                xissue.articles.append(xarticle)
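
    # Illustrative ordering (hypothetical issue page): three listed articles
    # get pids a0, a1, a2 in page order, but because each is inserted at
    # position 0, xissue.articles ends up as [a2, a1, a0], i.e. the page order
    # reversed; only the "Mahler Selecta" special issue keeps the page order.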


    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
        title_node = soup.find("h2", {"class": "document_title"})
        if title_node:  # coverage: always true
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
        citation_author_node = soup.find("h3", {"class": "document_author"})
        if citation_author_node:  # coverage: always true
            text = citation_author_node.get_text()
            if text:  # coverage: always true
                parts = text.split(";")
                for part in parts:
                    text_author = part.strip()

                    role = "author"
                    if "(ed.)" in text_author:  # coverage: never true
                        role = "editor"
                        text_author = text_author.split("(ed.)")[0].strip()

                    author = create_contributor()
                    author["role"] = role
                    author["string_name"] = text_author

                    xarticle.contributors.append(author)
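
        # Illustrative author parsing (hypothetical heading text): a
        # document_author string of "Mahler, Kurt; Baker, Alan (ed.)" yields
        # two contributors: "Mahler, Kurt" with role "author" and
        # "Baker, Alan" with role "editor".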


        # PDF
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url.startswith("/ft/"):
                pdf_url = self.source_website + url
                add_pdf_link_to_xarticle(xarticle, pdf_url)
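
        # For example, a full-text link href of "/ft/267/5954" (hypothetical)
        # becomes the PDF url "https://www.elibm.org/ft/267/5954".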


        panel_nodes = soup.find_all("h3", {"class": "panel-title"})
        for panel_node in panel_nodes:
            text = panel_node.get_text()
            content_node = panel_node.parent.parent.find("div", {"class": "panel-body"})

            if text == "Summary":
                # ABSTRACT
                abstract = content_node.get_text()
                xabstract = {
                    "tag": "abstract",
                    "value_html": "",
                    "value_tex": abstract,
                    "value_xml": "",
                    "lang": "en",
                }
                xarticle.abstracts.append(xabstract)

            elif text == "Mathematics Subject Classification":
                # MSC
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["type"] = "msc"
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

            elif text == "Keywords/Phrases":
                # KEYWORDS
                subjs = content_node.get_text().split(", ")
                for subj in subjs:
                    subject = create_subj()
                    subject["value"] = subj
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)
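
        # Illustrative subject parsing (hypothetical panel text): an MSC panel
        # body of "11J81, 11K60" yields two "msc" subjects, while a
        # Keywords/Phrases body of "transcendence, metric theory" yields two
        # plain keywords; both land in xarticle.kwds.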


        # PAGES
        citation_node = soup.find("h5", {"class": "document_source"})
        if citation_node:  # coverage: always true
            text = citation_node.get_text()
            year = f"({xissue.year})"
            if year in text:  # coverage: always true
                text = text.split(year)[0]

                if "p." in text:  # coverage: never true
                    text = text.split("p.")[0].split(",")[-1].strip()
                    xarticle.size = text

                elif "-" in text:  # coverage: always true
                    parts = text.split("-")
                    first_page = parts[-2].split(" ")[-1]
                    last_page = parts[-1].split(",")[0].split(" ")[0]

                    xarticle.fpage = first_page
                    xarticle.lpage = last_page

            # DOI
            # The DOI lookup stays inside the `if citation_node:` branch so a
            # missing citation node cannot raise an AttributeError on
            # `citation_node.next_sibling`.
            doi_node = citation_node.next_sibling
            if doi_node.name == "div":  # coverage: never true
                text = doi_node.get_text()
                if text.startswith("DOI: "):
                    doi = text[5:]

                    xarticle.doi = doi
                    xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        return xarticle
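

# A minimal, standalone sketch of the page-range split used in
# parse_article_content, run on an assumed citation string (the real
# "document_source" text on elibm.org may be formatted differently):
if __name__ == "__main__":
    text = "Doc. Math. 21, 1397-1458 (2016)"  # hypothetical citation text
    text = text.split("(2016)")[0]                      # drop the "(year)" suffix
    parts = text.split("-")                             # split around the page-range dash
    first_page = parts[-2].split(" ")[-1]               # token before the dash -> "1397"
    last_page = parts[-1].split(",")[0].split(" ")[0]   # token after the dash -> "1458"
    print(first_page, last_page)  # prints: 1397 1458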