Coverage for src/crawler/by_source/impan_crawler.py: 77%

125 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl/"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
        lingua.Language.POLISH,
        lingua.Language.RUSSIAN,
        lingua.Language.GERMAN,
    ).build()

    def parse_collection_content(self, content):
        """
        Parse the IMPAN collection page listing all the issues of a journal.
        Each "div.year" node gives a publication year; the sibling "div.issues"
        nodes contain the links to the individual issue pages.
        Returns the list of IssueData, deduplicated by pid.
        """
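        # Assumed issue-listing markup, inferred from the selectors below
        # (illustrative only, not verified against the live impan.pl pages):
        #   <div ...>
        #       <div class="year">2024</div>
        #       <div class="issues"><a href=".../<volume>/<issue>">...</a> ...</div>
        #   </div>
        # i.e. the "div.issues" nodes are looked up under the parent of each "div.year".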

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()

            issues_nodes = volume_node.parent
            if issues_nodes is None:  # coverage: condition never true
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = issue_link_node.get("href")
                    if href is None:  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is None"
                        )
                    if isinstance(href, list):  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
                        )
                    url = urljoin(self.source_website, href)

                    xissue = self.create_impan_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE: is this needed?
                    pid = xissue.pid
                    if pid is None:  # coverage: condition never true
                        continue
                    if pid in xissues_dict:
                        self.logger.debug("Duplicate issue in collection", extra={"pid": pid})
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())

    def create_impan_xissue(self, url: str, year: str):
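        # The issue URL is expected to end in .../<volume>/<issue>, or .../all/<issue>
        # when there is no volume level; the last two path segments drive the pid and
        # the volume/issue numbers. Commas in the issue segment (presumably combined
        # issues such as "1,2") are normalised to "-".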

        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue: IssueData, retries=0):
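        # Each article on the issue page is expected to sit in a "div.info" node whose
        # first <a> links to the article page (the href is appended to the issue URL).
        # If no article node is found, the issue page is re-downloaded with force_refresh,
        # waiting 60 seconds between attempts, until the retry limit is reached, after
        # which the issue is treated as empty.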

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        if len(article_nodes) == 0 and xissue.url:  # coverage: condition never true
            if retries > 3:
                self.logger.info(
                    "Maximum number of retries reached. This issue seems to be empty",
                    {"pid": xissue.pid, "url": xissue.url},
                )
                return
            self.logger.debug("Couldn't find articles... Retrying", {"pid": xissue.pid})
            time.sleep(60)
            return self.parse_issue_content(
                self.download_file(xissue.url, force_refresh=True),
                xissue=xissue,
                retries=retries + 1,
            )
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
            href = article_link_node.get("href")
            if href is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
            xissue_url = xissue.url
            if xissue_url is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """

        soup = BeautifulSoup(content, "html.parser")
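        # Assumed article-page structure, inferred from the selectors used below
        # (not verified against the live impan.pl pages):
        #   - title: first <a> inside div.info, with h2.product-title as fallback
        #   - abstract: div.details.abstract p
        #   - a download button carrying the "buy" class marks a paywalled PDF, which is skipped
        # The remaining metadata (author, pages, doi, publisher, keywords) is collected by
        # get_metadata_using_citation_meta, presumably from the citation_* meta tags.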

        title_info = soup.select_one("div.info")
        if title_info is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
        title_node = title_info.select_one("a")
        if title_node is None:  # coverage: condition always true
            title_node = soup.select_one("h2.product-title")
        if title_node is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex
        xarticle.lang = self.detect_language(xarticle.title_tex)

        what: list[CitationLiteral] = [
            "author",
            "page",
            "doi",
            "publisher",
            "keywords",
        ]

        # If the download button has the "buy" class, skip adding the pdf.
        if not soup.select_one("a.button.download.noborder.buy"):  # coverage: condition always true
            what.append("pdf")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
        else:
            abstract_tex = abstract_mml_node.get_text()
            xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
            xarticle.abstracts.append(xabstract)

        # href_attrib = soup.select_one("div.order a")

        # if href_attrib is not None:
        #     href = href_attrib.get("href")
        #     if isinstance(href, list):
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
        #     if href is None:
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
        #     pdf_url = urljoin(self.source_website, href)
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
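        # Wraps the base crawl_article with a retry loop: on ValueError the article
        # page is re-downloaded with force_refresh and parsing is attempted again,
        # up to three times in total, sleeping 15, 30 and then 45 minutes after
        # each failure.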

        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    self.logger.debug(f"Caught error : {e}", {"pid": xarticle.pid})
                    attempts += 1
                    self.logger.debug(
                        f"Retrying in {attempts * 15}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})",
                        {"pid": xarticle.pid},
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

        if parsed_xarticle is None:  # coverage: condition never true
            raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return parsed_xarticle