Coverage for src/crawler/by_source/impan_crawler.py: 82%

101 statements  

coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

import time
from urllib.parse import urljoin

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import (
    IssueData,
    create_abstract,
    create_articledata,
    create_issuedata,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl/"

    requests_timeout = 120
    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
        lingua.Language.POLISH,
        lingua.Language.RUSSIAN,
        lingua.Language.GERMAN,
    )

    def parse_collection_content(self, content):
        """
        Parse the IMPAN collection page.
        Each "div.year" node gives a publication year; the "div.issues" blocks in its
        parent hold the links to the individual issues. Every link becomes an xissue
        carrying its year, volume, number and URL.
        (A standalone sketch of this parsing follows the class definition.)
        """

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()

            issues_nodes = volume_node.parent
            if issues_nodes is None:  # coverage: condition never true
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = self.get_str_attr(issue_link_node, "href")
                    url = urljoin(self.source_website, href)

                    xissue = self.create_impan_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE: is this needed?
                    pid = xissue.pid
                    if pid is None:  # coverage: condition never true
                        continue
                    if pid in xissues_dict:
                        self.logger.debug("Duplicate issue in collection", extra={"pid": pid})
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())

    def create_impan_xissue(self, url: str, year: str):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number
        xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue: IssueData, retries=0):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        if len(article_nodes) == 0 and xissue.url:  # coverage: condition never true
            if retries > 3:
                self.logger.info(
                    "Maximum number of retries reached. This issue seems to be empty",
                    extra={"pid": xissue.pid, "url": xissue.url},
                )
                return
            self.logger.debug("Couldn't find articles... Retrying", extra={"pid": xissue.pid})
            time.sleep(60)
            return self.parse_issue_content(
                self.download_file(xissue.url, force_refresh=True),
                xissue=xissue,
                retries=retries + 1,
            )
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article link is None")

            href = self.get_str_attr(article_link_node, "href")

            xissue_url = xissue.url
            if xissue_url is None:  # coverage: condition never true
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """

        # Parse the title, citation metadata, PDF link and abstract from the IMPAN
        # article page. (A standalone language-detection sketch follows the class
        # definition.)
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:  # coverage: condition never true
            self.logger.error(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
            return None
        title_node = title_info.select_one("a")
        if title_node is None:  # coverage: condition always true
            title_node = soup.select_one("h2.product-title")
        if title_node is None:  # coverage: condition never true
            self.logger.error(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
            return None

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex
        xarticle.lang = self.detect_language(xarticle.title_tex)

        what: list[CitationLiteral] = [
            "author",
            "page",
            "doi",
            "publisher",
            "keywords",
        ]

        # If the download button has the "buy" class, skip adding the pdf.
        if not soup.select_one("a.button.download.noborder.buy"):  # coverage: condition always true
            what.append("pdf")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:
            self.logger.debug("Abstract not found", extra={"pid": xarticle.pid})
        else:
            abstract_tex = abstract_mml_node.get_text()

            xarticle.abstracts.append(create_abstract(value_tex=abstract_tex, lang=xarticle.lang))

        # href_attrib = soup.select_one("div.order a")

        # if href_attrib is not None:
        #     href = href_attrib.get("href")
        #     if isinstance(href, list):
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
        #     if href is None:
        #         raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
        #     pdf_url = urljoin(self.source_website, href)
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)
        return xarticle
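
A minimal standalone sketch of the parsing done in parse_collection_content, using the
same selectors ("div.year", "div.issues", "a") against an invented HTML snippet. The
markup and the hrefs below are hypothetical, not the actual IMPAN page structure, and
link.get("href") stands in for the crawler's get_str_attr helper.

# Sketch only: SAMPLE_HTML and its hrefs are invented for illustration.
from urllib.parse import urljoin

from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div>
  <div class="year">2023</div>
  <div class="issues">
    <a href="/some-journal/171/1">171, issue 1</a>
    <a href="/some-journal/171/2,3">171, issue 2-3</a>
  </div>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
for volume_node in soup.select("div.year"):
    year = volume_node.get_text()
    parent = volume_node.parent
    if parent is None:
        continue
    for issues_node in parent.select("div.issues"):
        for link in issues_node.select("a"):
            href = link.get("href")
            if not isinstance(href, str):
                continue
            print(year, urljoin("https://www.impan.pl/", href))
# Prints:
# 2023 https://www.impan.pl/some-journal/171/1
# 2023 https://www.impan.pl/some-journal/171/2,3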
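
The pid/volume/number derivation in create_impan_xissue can be checked with a small
standalone mirror of that logic. The URL, year and collection id below are made up
(collection_id normally comes from the crawler instance), so this only illustrates the
string manipulation, not real IMPAN identifiers.

def impan_issue_parts(url: str, year: str, collection_id: str):
    # Mirrors create_impan_xissue, returning (pid, volume, number) instead of an IssueData.
    if url.endswith("/"):
        url = url[:-1]
    parts = url.split("/")
    issue_number = parts[-1].replace(",", "-")
    volume_number = parts[-2]
    if volume_number == "all":
        return f"{collection_id}_{year}__{issue_number}", issue_number, issue_number
    return f"{collection_id}_{year}__{volume_number}_{issue_number}", volume_number, issue_number


# Hypothetical issue URL whose last two path segments are volume and issue:
print(impan_issue_parts("https://www.impan.pl/cm/171/2,3", "2023", "CM"))
# -> ('CM_2023__171_2-3', '171', '2-3')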
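
The _language_detector_builder attribute configures lingua with the five languages the
collection can appear in; how the base class turns it into detect_language is not shown
in this file, so the following is only a sketch of the underlying lingua API, with an
invented sample title.

# Sketch of the lingua API configured by _language_detector_builder above.
import lingua
from lingua import LanguageDetectorBuilder

detector = LanguageDetectorBuilder.from_languages(
    lingua.Language.ENGLISH,
    lingua.Language.FRENCH,
    lingua.Language.POLISH,
    lingua.Language.RUSSIAN,
    lingua.Language.GERMAN,
).build()

language = detector.detect_language_of("Sur les fonctions presque périodiques")
if language is not None:
    print(language.iso_code_639_1.name.lower())  # expected: "fr"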