Coverage for src/crawler/by_source/impan_crawler.py: 80%

115 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

  1 from urllib.parse import urljoin
  2
  3 from bs4 import BeautifulSoup
  4 from ptf.model_data import IssueData, create_abstract, create_articledata, create_issuedata
  5
  6 from crawler.base_crawler import BaseCollectionCrawler
  7 from crawler.types import CitationLiteral
  8 from crawler.utils import add_pdf_link_to_xarticle
  9
 10
 11 class ImpanCrawler(BaseCollectionCrawler):
 12     source_name = "Institute of Mathematics Polish Academy of Sciences"
 13     source_domain = "IMPAN"
 14     source_website = "https://www.impan.pl/"
 15
 16     periode_end = 2016
 17     periode_begin = 0
 18
 19     def parse_collection_content(self, content):
 20         """
 21         Parse the collection page of an IMPAN journal.
 22         The page lists the volume years (div.year) and, under each year, the issues (div.issues).
 23         An IssueData is created for every issue link whose year falls within the crawled period.
 24         Each xissue carries its year, volume/number and URL.
 25         """
 26         if self.collection_id == "APM":  # 26 ↛ 27: condition never true
 27             self.periode_begin = 1955
 28
 29         if self.collection_id == "DIM":  # 29 ↛ 30: condition never true
 30             self.periode_begin = 2000
 31
 32         soup = BeautifulSoup(content, "html.parser")
 33         xissues_dict: dict[str, IssueData] = {}
 34
 35         # Extract the list of issues
 36         volume_nodes = soup.select("div.year")
 37
 38         for volume_node in volume_nodes:
 39             year = volume_node.get_text()
 40             year_int = int(year)
 41             if self.periode_begin > year_int or year_int > self.periode_end:
 42                 continue
 43             issues_nodes = volume_node.parent
 44             if issues_nodes is None:  # 44 ↛ 45: condition never true
 45                 continue
 46             issues_nodes = issues_nodes.select("div.issues")
 47
 48             for issue_node in issues_nodes:
 49                 issues_link_node = issue_node.select("a")
 50                 for issue_link_node in issues_link_node:
 51                     href = issue_link_node.get("href")
 52                     if href is None:  # 52 ↛ 53: condition never true
 53                         raise ValueError(
 54                             f"[{self.source_domain}] {self.collection_id} : Collection href is None"
 55                         )
 56                     if isinstance(href, list):  # 56 ↛ 57: condition never true
 57                         raise ValueError(
 58                             f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
 59                         )
 60                     url = urljoin(self.source_website, href)
 61
 62                     xissue = self.create_impan_xissue(url, year)
 63                     # Prevent duplicate issues
 64                     # NOTE : is this needed ?
 65                     pid = xissue.pid
 66                     if pid is None:  # 66 ↛ 67: condition never true
 67                         continue
 68                     if pid in xissues_dict:
 69                         print(
 70                             f"[{self.source_domain}] {self.collection_id} : Duplicate issue in collection : {pid}"
 71                         )
 72                         continue
 73                     xissues_dict[pid] = xissue
 74
 75         return list(xissues_dict.values())
 76
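For reference, a minimal sketch of the collection-page markup these selectors expect; the class names and selectors come from the code above, while the year and href values are hypothetical:

    from bs4 import BeautifulSoup

    sample = '''
    <div>
      <div class="year">2015</div>
      <div class="issues">
        <a href="/en/publishing-house/journals-and-series/acta-arithmetica/all/169,2">169,2</a>
      </div>
    </div>
    '''
    soup = BeautifulSoup(sample, "html.parser")
    for volume_node in soup.select("div.year"):
        year = volume_node.get_text()                      # "2015"
        links = volume_node.parent.select("div.issues a")  # issue links under the same parent
        print(year, [a.get("href") for a in links])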

 77     def create_impan_xissue(self, url: str, year: str):
 78         if url.endswith("/"):  # 78 ↛ 79: condition never true
 79             url = url[:-1]
 80         parts = url.split("/")
 81         issue_number = parts[-1].replace(",", "-")
 82         volume_number = parts[-2]
 83
 84         xissue = create_issuedata()
 85         if volume_number == "all":
 86             xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
 87             xissue.volume = issue_number
 88
 89         else:
 90             xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
 91             xissue.volume = volume_number
 92
 93         xissue.year = year
 94         xissue.number = issue_number.replace(",", "-")
 95         xissue.url = url
 96
 97         return xissue
 98
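As a worked example of the pid scheme above (the URL is hypothetical; the splitting logic is the one in create_impan_xissue):

    url = "https://www.impan.pl/en/publishing-house/journals-and-series/annales-polonici-mathematici/116/1"
    parts = url.split("/")
    issue_number = parts[-1].replace(",", "-")   # "1"
    volume_number = parts[-2]                    # "116"
    # volume_number != "all", so the pid becomes "APM_<year>__116_1" and the volume is "116".
    # When the path ends in ".../all/<issue>", the issue number alone is used: "<collection>_<year>__<issue>".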

 99     def parse_issue_content(self, content, xissue: IssueData):
100         soup = BeautifulSoup(content, "html.parser")
101         article_nodes = soup.select("div.info")
102         for index_article, article_node in enumerate(article_nodes):
103             xarticle = create_articledata()
104             xarticle.pid = "a" + str(index_article)
105
106             article_link_node = article_node.select_one("a")
107             if article_link_node is None:  # 107 ↛ 108: condition never true
108                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
109             href = article_link_node.get("href")
110             if href is None:  # 110 ↛ 111: condition never true
111                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
112             if isinstance(href, list):  # 112 ↛ 113: condition never true
113                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
114             xissue_url = xissue.url
115             if xissue_url is None:  # 115 ↛ 116: condition never true
116                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
117             xarticle.url = xissue_url + href
118
119             xissue.articles.append(xarticle)
120
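Note that, unlike parse_collection_content, the article URL here is built by plain string concatenation rather than urljoin, so the href found in div.info is expected to extend the issue URL directly (values below are hypothetical):

    xissue_url = "https://www.impan.pl/en/publishing-house/journals-and-series/annales-polonici-mathematici/116/1"
    href = "/116-1-1"                  # hypothetical suffix taken from the article link
    xarticle_url = xissue_url + href   # ".../116/1/116-1-1"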

121     def parse_article_content(self, content, xissue, xarticle, url, pid):
122         """
123         Parse the content with BeautifulSoup and return an ArticleData.
124         """
125
126         # Parse the article page: title, citation metadata, abstract and PDF link
127         soup = BeautifulSoup(content, "html.parser")
128
129         title_info = soup.select_one("div.info")
130         if title_info is None:  # 130 ↛ 131: condition never true
131             raise ValueError(
132                 f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
133             )
134         title_node = title_info.select_one("a")
135         if title_node is None:  # 135 ↛ 137: condition always true
136             title_node = soup.select_one("h2.product-title")
137         if title_node is None:  # 137 ↛ 138: condition never true
138             raise ValueError(
139                 f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
140             )
141
142         title_tex = title_node.get_text()
143         xarticle.title_tex = title_tex
144         xarticle.lang = self.detect_language(xarticle.title_tex)
145
146         what: list[CitationLiteral] = [
147             "author",
148             "pdf",
149             "page",
150             "doi",
151             "issn",
152             "publisher",
153             "page",
154             "keywords",
155         ]
156         self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
157
158         # abstract
159         abstract_mml_node = soup.select_one("div.details.abstract p")
160         if abstract_mml_node is None:
161             print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
162         else:
163             abstract_tex = abstract_mml_node.get_text()
164             xabstract = create_abstract(tag="abstract", value_tex=abstract_tex, lang=xarticle.lang)
165             xarticle.abstracts.append(xabstract)
166
167         href_attrib = soup.select_one("div.order a")
168
169         if href_attrib is not None:  # 169 ↛ 177: condition always true
170             href = href_attrib.get("href")
171             if isinstance(href, list):  # 171 ↛ 172: condition never true
172                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
173             if href is None:  # 173 ↛ 174: condition never true
174                 raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
175             pdf_url = urljoin(self.source_website, href)
176             add_pdf_link_to_xarticle(xarticle, pdf_url)
177         if xarticle.title_tex is None or xarticle.title_tex == "":  # 177 ↛ 178: condition never true
178             print(xarticle)
179         return xarticle
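The "what" list on lines 146-155 names the fields that get_metadata_using_citation_meta should pick up. Its implementation lives in BaseCollectionCrawler and is not shown in this report; presumably it reads the standard citation_* meta tags of the article page. A minimal, hypothetical sketch of that idea:

    from bs4 import BeautifulSoup

    head = '<meta name="citation_doi" content="10.0000/example"/><meta name="citation_author" content="A. Author"/>'
    soup = BeautifulSoup(head, "html.parser")
    doi_tag = soup.select_one('meta[name="citation_doi"]')
    authors = [m.get("content") for m in soup.select('meta[name="citation_author"]')]
    print(doi_tag.get("content") if doi_tag else None, authors)   # "10.0000/example" ['A. Author']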