Coverage for src/crawler/by_source/impan_crawler.py: 74%

120 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.crawler_types import CitationLiteral

from ptf.model_data import AbstractDict
from ptf.model_data import IssueData
from ptf.model_data import create_articledata
from ptf.model_data import create_issuedata

class ImpanCrawler(BaseCollectionCrawler):
    source_name = "Institute of Mathematics Polish Academy of Sciences"
    source_domain = "IMPAN"
    source_website = "https://www.impan.pl"

    periode_end = 2016
    periode_begin = 0

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the collection page of an IMPAN journal.
        The page lists issues grouped by year; each issue link whose year falls
        inside the crawler's periode is turned into an xissue carrying its year and URL.
        """
        if self.collection_id == "APM":  # coverage: branch never taken
            self.periode_begin = 1955

        if self.collection_id == "DIM":  # coverage: branch never taken
            self.periode_begin = 2000

        soup = BeautifulSoup(content, "html.parser")
        xissues_dict: dict[str, IssueData] = {}

        # Extract the list of issues
        volume_nodes = soup.select("div.year")

        for volume_node in volume_nodes:
            year = volume_node.get_text()
            year_int = int(year)
            if self.periode_begin > year_int or year_int > self.periode_end:
                continue
            issues_nodes = volume_node.parent
            if issues_nodes is None:  # coverage: branch never taken
                continue
            issues_nodes = issues_nodes.select("div.issues")

            for issue_node in issues_nodes:
                issues_link_node = issue_node.select("a")
                for issue_link_node in issues_link_node:
                    href = issue_link_node.get("href")
                    if href is None:  # coverage: branch never taken
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is None"
                        )
                    if isinstance(href, list):  # coverage: branch never taken
                        raise ValueError(
                            f"[{self.source_domain}] {self.collection_id} : Collection href is an array"
                        )
                    url = self.source_website + href

                    xissue = self.create_xissue(url, year)
                    # Prevent duplicate issues
                    # NOTE: is this needed?
                    pid = xissue.pid
                    if pid is None:  # coverage: branch never taken
                        continue
                    if pid in xissues_dict:  # coverage: branch never taken
                        print(
                            f"[{self.source_domain}] {self.collection_id} : Duplicate issue in collection : {pid}"
                        )
                        continue
                    xissues_dict[pid] = xissue

        return list(xissues_dict.values())

    def create_xissue(self, url: str, year: str):
        if url.endswith("/"):  # coverage: branch never taken
            url = url[:-1]
        parts = url.split("/")
        issue_number = parts[-1].replace(",", "-")
        volume_number = parts[-2]

        xissue = create_issuedata()
        if volume_number == "all":  # coverage: branch never taken
            xissue.pid = f"{self.collection_id}_{year}__{issue_number}"
            xissue.volume = issue_number
        else:
            xissue.pid = f"{self.collection_id}_{year}__{volume_number}_{issue_number}"
            xissue.volume = volume_number

        xissue.year = year
        xissue.number = issue_number.replace(",", "-")
        xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue: IssueData):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.select("div.info")
        for index_article, article_node in enumerate(article_nodes):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)

            article_link_node = article_node.select_one("a")
            if article_link_node is None:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue link is None")
            href = article_link_node.get("href")
            if href is None:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is None")
            if isinstance(href, list):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue href is a list")
            xissue_url = xissue.url
            if xissue_url is None:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Issue url is None")
            xarticle.url = xissue_url + href

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """

        # Parse the IMPAN article page: title, citation metadata, abstract and PDF link
        soup = BeautifulSoup(content, "html.parser")

        title_info = soup.select_one("div.info")
        if title_info is None:  # coverage: branch never taken
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )
        title_node = title_info.select_one("a")
        if title_node is None:  # coverage: condition always true
            title_node = soup.select_one("h2.product-title")
        if title_node is None:  # coverage: branch never taken
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xarticle.pid} : Title not found"
            )

        title_tex = title_node.get_text()
        xarticle.title_tex = title_tex

        what: list[CitationLiteral] = [
            "author",
            "pdf",
            "page",
            "doi",
            "issn",
            "publisher",
            "keywords",
        ]
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        # abstract
        abstract_mml_node = soup.select_one("div.details.abstract p")
        if abstract_mml_node is None:  # coverage: branch never taken
            print(f"[{self.source_domain}] {xarticle.pid} : Abstract not found")
        else:
            abstract_tex = abstract_mml_node.get_text()
            xabstract: AbstractDict = {
                "tag": "abstract",
                "value_html": "",
                "value_tex": abstract_tex,
                "value_xml": "",
                "lang": "en",
            }
            xarticle.abstracts.append(xabstract)

        href_attrib = soup.select_one("div.order a")

        if href_attrib is not None:  # coverage: condition always true
            href = href_attrib.get("href")
            if isinstance(href, list):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href is a list")
            if href is None:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xarticle.pid} : Article href not found")
            pdf_url = self.source_website + href
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        if xarticle.title_tex is None or xarticle.title_tex == "":  # coverage: branch never taken
            print(xarticle)
        return xarticle
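
To make the pid construction in create_xissue easier to follow, here is a minimal standalone sketch of the same URL-splitting logic; the URL, year and collection id below are made-up placeholders for illustration, not values taken from this report.

# Hypothetical illustration of create_xissue's pid derivation (not part of the crawler).
url = "https://www.impan.pl/some-journal/171/1,2"  # placeholder issue URL
year = "2015"                                      # placeholder year
collection_id = "XX"                               # placeholder collection id
parts = url.rstrip("/").split("/")
issue_number = parts[-1].replace(",", "-")  # "1-2"
volume_number = parts[-2]                   # "171"
if volume_number == "all":
    pid = f"{collection_id}_{year}__{issue_number}"
else:
    pid = f"{collection_id}_{year}__{volume_number}_{issue_number}"
print(pid)  # XX_2015__171_1-2

In the crawler itself these values come from the issue link href, the div.year text and self.collection_id.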