Coverage for src/crawler/by_source/impan_crawler.py: 77%

125 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl/"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
        lingua.Language.POLISH,
        lingua.Language.RUSSIAN,
        lingua.Language.GERMAN,
    ).build()
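    # Languages expected across IMPAN journals; see the detect_language() call on article titles below.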

    def parse_collection_content(self, content):
        """
        Parse the collection page listing the issues of an IMPAN journal.
        Issues are grouped by year on the page; each issue link is turned into an
        IssueData (pid, year, volume, number, url) via create_impan_xissue.
        """

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")
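        # Each div.year node holds the year label; its parent block also contains
        # div.issues elements whose links point to the individual issues.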

        for volume_node in volume_nodes:
            year = volume_node.get_text()

            issues_nodes = volume_node.parent
            if issues_nodes is None:  # coverage: condition never true
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = issue_link_node.get("href")
                    if href is None:  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is None"
                        )
                    if isinstance(href, list):  # coverage: condition never true
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
                        )
                    url = urljoin(self.source_website, href)

                    xissue = self.create_impan_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE : is this needed ?
                    pid = xissue.pid
                    if pid is None:  # coverage: condition never true
                        continue
                    if pid in xissues_dict:
                        print(
                            f"[{self.source_domain}] {self.collection_id} : Duplicate issue in collection : {pid}"
                        )
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())

    def create_impan_xissue(self, url: str, year: str):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]
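        # Hypothetical examples of the URL shapes this assumes:
        #   .../<journal>/all/123 -> volume_number == "all", issue_number == "123" (used as the volume below)
        #   .../<journal>/123/4   -> volume_number == "123", issue_number == "4"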

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue: IssueData, retries=0):
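        # Builds an ArticleData stub (pid and url) for every article listed on the
        # issue page and appends it to xissue.articles in place; nothing is returned.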

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        if len(article_nodes) == 0 and xissue.url:  # coverage: condition never true
            if retries > 3:
                print(f"Maximum number of retries reached. This issue seems to be empty : {xissue.url}")
                return
            print("Couldn't find articles... Retrying")
            time.sleep(60)
            return self.parse_issue_content(
                self.download_file(xissue.url, force_refresh=True),
                xissue=xissue,
                retries=retries + 1,
            )
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
            href = article_link_node.get("href")
            if href is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
            if isinstance(href, list):  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
            xissue_url = xissue.url
            if xissue_url is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and return an ArticleData.
        """

        # Title: taken from the div.info link, falling back to h2.product-title.
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:  # coverage: condition never true
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
        title_node = title_info.select_one("a")
        if title_node is None:  # coverage: condition always true
            title_node = soup.select_one("h2.product-title")
            if title_node is None:  # coverage: condition never true
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
                )

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex
        xarticle.lang = self.detect_language(xarticle.title_tex)
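        # The language detected from the title is reused for the abstract below.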

        what: list[CitationLiteral] = [
            "author",
            "page",
            "doi",
            "publisher",
            "keywords",
        ]
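        # Fields to extract with get_metadata_using_citation_meta() below
        # (presumably read from the page's citation_* <meta> tags).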

        # If the download button has the "buy" class, skip adding the pdf.
        if not soup.select_one("a.button.download.noborder.buy"):  # coverage: condition always true
            what.append("pdf")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_tex = abstract_mml_node.get_text()
            xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
            xarticle.abstracts.append(xabstract)

        # href_attrib = soup.select_one("div.order a")

        # if href_attrib is not None:
        #     href = href_attrib.get("href")
        #     if isinstance(href, list):
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
        #     if href is None:
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
        #     pdf_url = urljoin(self.source_website, href)
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # TODO : set pid in xarticle here instead of passing it to `parse_article_content`
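        # Retry failed article crawls up to 3 times, waiting 15/30/45 minutes between
        # attempts and forcing a fresh download of the article page each time.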

        parsed_xarticle = xarticle
        if hasattr(xarticle, "url") and xarticle.url:  # coverage: condition always true
            parsed_xarticle = None
            attempts = 0
            while parsed_xarticle is None and attempts < 3:
                try:
                    parsed_xarticle = super().crawl_article(xarticle, xissue)
                except ValueError as e:
                    print(f"{xarticle.pid} : Caught error : {e} ")
                    attempts += 1
                    print(
                        f"Retrying in {attempts * 15}mins ({(datetime.now() + timedelta(minutes=attempts * 15)).time()})"
                    )
                    # 15 mins, 30 mins, 45 mins
                    time.sleep(attempts * 15 * 60)
                    self.download_file(xarticle.url, force_refresh=True)

        if parsed_xarticle is None:  # coverage: condition never true
            raise ValueError(f"Couldn't parse article {xarticle.pid}")
        return parsed_xarticle