Coverage for src/crawler/by_source/impan_crawler.py: 78% (109 statements)

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

  1  import time
  2  from urllib.parse import urljoin
  3
  4  import lingua
  5  from bs4 import BeautifulSoup
  6  from lingua import LanguageDetectorBuilder
  7  from ptf.model_data import (
  8      IssueData,
  9      create_abstract,
 10      create_articledata,
 11      create_issuedata,
 12  )
 13
 14  from crawler.base_crawler import BaseCollectionCrawler
 15  from crawler.types import CitationLiteral
 16
 17
 18  class ImpanCrawler(BaseCollectionCrawler):
 19      source_name = "Institute of Mathematics Polish Academy of Sciences"
 20      source_domain = "IMPAN"
 21      source_website = "https://www.impan.pl/"
 22
 23      requests_timeout = 120
 24      language_detector = LanguageDetectorBuilder.from_languages(
 25          lingua.Language.ENGLISH,
 26          lingua.Language.FRENCH,
 27          lingua.Language.POLISH,
 28          lingua.Language.RUSSIAN,
 29          lingua.Language.GERMAN,
 30      ).build()
 31
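The language_detector above restricts lingua to the five languages IMPAN articles appear in. As a point of reference, a minimal standalone sketch of how such a detector is queried (the sample title is invented; detect_language_of is lingua's public API and returns a Language member, or None when detection fails):

from lingua import Language, LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(
    Language.ENGLISH, Language.POLISH, Language.FRENCH
).build()

# Prints Language.ENGLISH for an English title, or None if undecidable
print(detector.detect_language_of("On the spectra of composition operators"))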

 32      def parse_collection_content(self, content):
 33          """
 34          Parse the IMPAN collection page to build the list of issues.
 35          Each div.year node gives the publication year; the sibling div.issues
 36          nodes hold the links to the individual issues of that year.
 37          Each xissue gets its year and issue URL (articles are added later).
 38          """
 39
 40          soup = BeautifulSoup(content, "html.parser")
 41          xissues_dict: dict[str, IssueData] = {}
 42
 43          # Extract the list of issues
 44          volume_nodes = soup.select("div.year")
 45
 46          for volume_node in volume_nodes:
 47              year = volume_node.get_text()
 48
 49              issues_nodes = volume_node.parent
 50              if issues_nodes is None:  # 50 ↛ 51: condition on line 50 was never true
 51                  continue
 52              issues_nodes = issues_nodes.select("div.issues")
 53
 54              for issue_node in issues_nodes:
 55                  issues_link_node = issue_node.select("a")
 56                  for issue_link_node in issues_link_node:
 57                      href = issue_link_node.get("href")
 58                      if href is None:  # 58 ↛ 59: condition on line 58 was never true
 59                          raise ValueError(
 60                              f"[{self.source_domain}] {self.collection_id} : Collection href is None"
 61                          )
 62                      if isinstance(href, list):  # 62 ↛ 63: condition on line 62 was never true
 63                          raise ValueError(
 64                              f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
 65                          )
 66                      url = urljoin(self.source_website, href)
 67
 68                      xissue = self.create_impan_xissue(url, year)
 69                      # Prevent duplicate issues
 70                      # NOTE: is this needed?
 71                      pid = xissue.pid
 72                      if pid is None:  # 72 ↛ 73: condition on line 72 was never true
 73                          continue
 74                      if pid in xissues_dict:
 75                          self.logger.debug("Duplicate issue in collection", extra={"pid": pid})
 76                          continue
 77                      xissues_dict[pid] = xissue
 78
 79          return list(xissues_dict.values())
 80
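The selector chain above (div.year, then the parent's div.issues, then the a links) assumes markup roughly like the fragment below. This is an illustrative, hand-written fragment, not markup captured from impan.pl:

from bs4 import BeautifulSoup

html = """
<div class="volumes">
  <div class="year">2024</div>
  <div class="issues">
    <a href="/en/publishing-house/journals-and-series/studia-mathematica/all/283">283</a>
  </div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
for volume_node in soup.select("div.year"):
    for issue_node in volume_node.parent.select("div.issues"):
        for link in issue_node.select("a"):
            print(volume_node.get_text(), link.get("href"))
# -> 2024 /en/publishing-house/journals-and-series/studia-mathematica/all/283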

 81      def create_impan_xissue(self, url: str, year: str):
 82          if url.endswith("/"):
 83              url = url[:-1]
 84          parts = url.split("/")
 85          issue_number = parts[-1].replace(",", "-")
 86          volume_number = parts[-2]
 87
 88          xissue = create_issuedata()
 89          if volume_number == "all":
 90              xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
 91              xissue.volume = issue_number
 92
 93          else:
 94              xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
 95              xissue.volume = volume_number
 96
 97          xissue.year = year
 98          xissue.number = issue_number.replace(",", "-")
 99          xissue.url = url
100
101          return xissue
102
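To make the pid scheme above concrete, here is how a sample issue URL would be split. The URL, the year and the collection_id value are made up for illustration; the real collection_id comes from BaseCollectionCrawler:

url = "https://www.impan.pl/en/publishing-house/journals-and-series/studia-mathematica/all/283"
parts = url.rstrip("/").split("/")
issue_number = parts[-1].replace(",", "-")   # "283"
volume_number = parts[-2]                    # "all"

collection_id, year = "SM", "2024"           # hypothetical values
if volume_number == "all":
    pid = f"{collection_id}_{year}__{issue_number}"
else:
    pid = f"{collection_id}_{year}__{volume_number}_{issue_number}"
print(pid)  # SM_2024__283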

103      def parse_issue_content(self, content, xissue: IssueData, retries=0):
104          soup = BeautifulSoup(content, "html.parser")
105          article_nodes = soup.select("div.info")
106          if len(article_nodes) == 0 and xissue.url:  # 106 ↛ 107: condition on line 106 was never true
107              if retries > 3:
108                  self.logger.info(
109                      "Maximum number of retries reached. This issue seems to be empty",
110                      {"pid": xissue.pid, "url": xissue.url},
111                  )
112                  return
113              self.logger.debug("Couldn't find articles... Retrying", {"pid": xissue.pid})
114              time.sleep(60)
115              return self.parse_issue_content(
116                  self.download_file(xissue.url, force_refresh=True),
117                  xissue=xissue,
118                  retries=retries + 1,
119              )
120          for index_article, article_node in enumerate(article_nodes):
121              xarticle = create_articledata()
122              xarticle.pid = "a" + str(index_article)
123
124              article_link_node = article_node.select_one("a")
125              if article_link_node is None:  # 125 ↛ 126: condition on line 125 was never true
126                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
127              href = article_link_node.get("href")
128              if href is None:  # 128 ↛ 129: condition on line 128 was never true
129                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
130              if isinstance(href, list):  # 130 ↛ 131: condition on line 130 was never true
131                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
132              xissue_url = xissue.url
133              if xissue_url is None:  # 133 ↛ 134: condition on line 133 was never true
134                  raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
135              xarticle.url = xissue_url + href
136
137              xissue.articles.append(xarticle)
138
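Note that parse_issue_content builds each article URL by string concatenation (xissue_url + href), which assumes href is a path fragment appended to the issue URL. For contrast, a small sketch with made-up values showing how concatenation differs from urljoin:

from urllib.parse import urljoin

issue_url = "https://www.impan.pl/en/publishing-house/journals-and-series/studia-mathematica/all/283"
href = "/sm283-1-1"  # hypothetical article href

print(issue_url + href)          # .../all/283/sm283-1-1  (concatenation, as in the code above)
print(urljoin(issue_url, href))  # https://www.impan.pl/sm283-1-1  (root-relative resolution)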

139      def parse_article_content(self, content, xissue, xarticle, url):
140          """
141          Parse the content with BeautifulSoup and return an ArticleData
142          """
143
144          # Parse the IMPAN article page: title, citation metadata and abstract
145          soup = BeautifulSoup(content, "html.parser")
146
147          title_info = soup.select_one("div.info")
148          if title_info is None:  # 148 ↛ 149: condition on line 148 was never true
149              self.logger.error(
150                  f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
151              )
152              return None
153          title_node = title_info.select_one("a")
154          if title_node is None:  # 154 ↛ 156: condition on line 154 was always true
155              title_node = soup.select_one("h2.product-title")
156          if title_node is None:  # 156 ↛ 157: condition on line 156 was never true
157              self.logger.error(
158                  f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
159              )
160              return None
161
162          title_tex = title_node.get_text()
163          xarticle.title_tex = title_tex
164          xarticle.lang = self.detect_language(xarticle.title_tex)
165
166          what: list[CitationLiteral] = [
167              "author",
168              "page",
169              "doi",
170              "publisher",
171              "page",
172              "keywords",
173          ]
174
175          # If the download button has the "buy" class, skip adding the pdf.
176          if not soup.select_one("a.button.download.noborder.buy"):  # 176 ↛ 179: condition on line 176 was always true
177              what.append("pdf")
178
179          self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
180
181          # abstract
182          abstract_mml_node = soup.select_one("div.details.abstract p")
183          if abstract_mml_node is None:
184              self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
185          else:
186              abstract_tex = abstract_mml_node.get_text()
187
188              xarticle.abstracts.append(create_abstract(value_tex=abstract_tex, lang=xarticle.lang))
189
190          # href_attrib = soup.select_one("div.order a")
191
192          # if href_attrib is not None:
193          # href = href_attrib.get("href")
194          # if isinstance(href, list):
195          # raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
196          # if href is None:
197          # raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
198          # pdf_url = urljoin(self.source_website, href)
199          # add_pdf_link_to_xarticle(xarticle, pdf_url)
200          return xarticle
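get_metadata_using_citation_meta is not defined in this file (it presumably comes from BaseCollectionCrawler); the what list selects which citation_* fields to pull from the article page's Highwire-style meta tags. A minimal sketch of what such tags look like and how BeautifulSoup can read them; all tag values below are invented:

from bs4 import BeautifulSoup

html = """
<head>
  <meta name="citation_author" content="Doe, Jane">
  <meta name="citation_doi" content="10.0000/example-doi">
  <meta name="citation_firstpage" content="1">
  <meta name="citation_lastpage" content="20">
  <meta name="citation_pdf_url" content="https://www.impan.pl/example.pdf">
</head>
"""

soup = BeautifulSoup(html, "html.parser")
for tag in soup.select('meta[name^="citation_"]'):
    print(tag["name"], "=", tag["content"])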