Coverage for src/crawler/by_source/impan_crawler.py: 77%

125 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl/"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
        lingua.Language.POLISH,
        lingua.Language.RUSSIAN,
        lingua.Language.GERMAN,
    ).build()
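
    # Page structure assumed by the parser below: each div.year holds a year
    # label, and its parent element contains div.issues blocks whose <a> links
    # point to the individual issues.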

    def parse_collection_content(self, content):
        """
        Parse the IMPAN collection page and build the list of issues (xissues).
        Issues are grouped by year on the page; each xissue gets its year,
        volume/number and URL. Articles are attached later, in `parse_issue_content`.
        """

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()

            issues_nodes = volume_node.parent
            if issues_nodes is None:  # coverage: condition never true
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = issue_link_node.get("href")
                    if href is None:  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is None"
                        )
                    if isinstance(href, list):  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
                        )
                    url = urljoin(self.source_website, href)

                    xissue = self.create_impan_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE : is this needed ?
                    pid = xissue.pid
                    if pid is None:  # coverage: condition never true
                        continue
                    if pid in xissues_dict:
                        print(
                            f"[{self.source_domain}] {self.collection_id} : Duplicate issue in collection : {pid}"
                        )
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())
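
    # Note on the URL format (inferred from the parsing below, not documented
    # upstream): issue URLs are assumed to end with .../<volume>/<issue>; when
    # the next-to-last segment is "all", the last segment is taken as the volume.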

    def create_impan_xissue(self, url: str, year: str):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number

        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue
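
    # The issue page sometimes comes back without any article blocks; in that
    # case it is re-downloaded (with a 60 s pause between attempts) until the
    # retry counter exceeds 3, after which the issue is treated as empty.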

    def parse_issue_content(self, content, xissue: IssueData, retries=0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        if len(article_nodes) == 0 and xissue.url:  # coverage: condition never true
            if retries > 3:
                print(
                    f"Maximum number of retries reached. This issue seems to be empty : {xissue.url}"
                )
                return
            print("Couldn't find articles... Retrying")
            time.sleep(60)
            return self.parse_issue_content(
                self.download_file(xissue.url, force_refresh=True),
                xissue=xissue,
                retries=retries + 1,
            )
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
            href = article_link_node.get("href")
            if href is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
            xissue_url = xissue.url
            if xissue_url is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)
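
    # Article pages: the title is read from div.info, falling back to
    # h2.product-title; the abstract comes from div.details.abstract; the PDF
    # link is skipped for paywalled articles (download button with the "buy" class).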

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """

        # Parse the IMPAN article page
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
        title_node = title_info.select_one("a")
        if title_node is None:  # coverage: condition always true
            title_node = soup.select_one("h2.product-title")
        if title_node is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex
        xarticle.lang = self.detect_language(xarticle.title_tex)
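
        # Bibliographic fields harvested from the page's citation_* meta tags
        # (an assumption based on the name of get_metadata_using_citation_meta).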

        what: list[CitationLiteral] = [
            "author",
            "page",
            "doi",
            "publisher",
            "keywords",
        ]

        # If download button has the "buy" class, skip adding the pdf.
        if not soup.select_one("a.button.download.noborder.buy"):  # coverage: condition always true
            what.append("pdf")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_tex = abstract_mml_node.get_text()
            xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
            xarticle.abstracts.append(xabstract)

        # href_attrib = soup.select_one("div.order a")

        # if href_attrib is not None:
        #     href = href_attrib.get("href")
        #     if isinstance(href, list):
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
        #     if href is None:
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
        #     pdf_url = urljoin(self.source_website, href)
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
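
    # crawl_article wraps the base crawler with a retry loop: on ValueError the
    # article page is re-downloaded and the crawl retried, with waits of 15, 30
    # and 45 minutes; after three failed attempts a ValueError is raised.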

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e} ")
                    attempts += 1
                    print(
                        f"Retrying in {attempts * 15}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

            if parsed_xarticle is None:  # coverage: condition never true
                raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return parsed_xarticle