Coverage for src/crawler/by_source/amc_crawler.py: 82%

119 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class AmcCrawler(BaseCollectionCrawler):
    source_domain = "AMC"
    source_name = "Ars Mathematica Contemporanea website"
    source_website = "https://amc-journal.eu"
    periode_begin = 2009
    periode_end = 2024

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        # self.collection_id = "AM"
        # self.collection_url = "https://annals.math.princeton.edu"

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Ars Mathematica Contemporanea and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        The website splits its issues over multiple pages, so all of them need to be crawled.
        """
        xissues = []
        self.parse_one_issues_page(content, xissues)
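        # The issue archive continues on a second page; download and parse it as well.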

        url = self.collection_url + "/2"
        content = self.download_file(url)
        self.parse_one_issues_page(content, xissues)

        return xissues

    def parse_one_issues_page(self, content, xissues):
        soup = BeautifulSoup(content, "html.parser")

        # Extract the list of issues
        issue_nodes = soup.find_all("h2")

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                text = issue_link_node.get_text().strip()
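                # Issue links read "Vol. <volume> No. <number> (<year>)"; split the text into those parts.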

                if text.find("Vol.") == 0:
                    text = text[5:]
                    parts = text.split("No.")
                    volume = parts[0].strip()
                    parts = parts[1].split("(")
                    number = parts[0].strip()
                    year = parts[1][0:4]

                    xissue = create_issuedata()
                    xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
                    xissue.year = year
                    xissue.volume = volume
                    xissue.number = number
                    xissue.url = url

                    xissues.append(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h3", {"class": "title"})
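        # Each <h3 class="title"> heading links to one article; collect its URL and page metadata.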

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition was always true
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url

                meta_node = article_node.find_next_sibling("div")
                if meta_node:  # coverage: condition was always true
                    pages_node = meta_node.find("div", {"class": "pages"})
                    if pages_node is not None:
                        text = pages_node.get_text()
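                        # The pages text is either "<article number>, <N> pp" (article number plus page count) or an "<fpage>-<lpage>" range.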

                        if "," in text and "pp" in text:  # coverage: condition was never true
                            parts = text.split(",")
                            number_parts = parts[0].split(".")
                            if len(number_parts) == 2:
                                xarticle.article_number = number_parts[1].strip()

                            text = parts[1].split("pp")[0].strip()
                            xarticle.counts.append(("page-count", text))
                        elif "-" in text:  # coverage: condition was always true
                            parts = text.split("-")
                            xarticle.fpage = parts[0].strip()
                            xarticle.lpage = parts[1].strip()

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the article page with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
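        # The article title is the <h1 class="page_title"> text, stored as title_tex.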

        title_node = soup.find("h1", {"class": "page_title"})
        if title_node:  # coverage: condition was always true
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
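        # Author names come from the <span class="name"> entries and are added as "author" contributors.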

        authors_node = soup.find("ul", {"class": "authors"})
        if authors_node and isinstance(authors_node, Tag):  # coverage: condition was always true
            span_nodes = authors_node.find_all("span", {"class": "name"})
            for span_node in span_nodes:
                text = span_node.get_text().strip()

                author = create_contributor(role="author", string_name=text)

                xarticle.contributors.append(author)

        # DOI
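        # The DOI is read from the link inside the "item doi" section; it is also used to build the article PID.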

        doi_node = soup.find("section", {"class": "item doi"})
        if doi_node:  # coverage: condition was always true
            doi_node = doi_node.find("a")
            if doi_node and isinstance(doi_node, Tag):  # coverage: condition was always true
                url = doi_node.get("href")
                if isinstance(url, str):  # coverage: condition was always true
                    pos = url.find("10.")
                    if pos > 0:  # coverage: condition was always true
                        doi = url[pos:]
                        xarticle.doi = doi
                        xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        # KEYWORDS
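        # Keywords are comma-separated in the "value" span; each one becomes an English-language subject.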

        kwds_node = soup.find("section", {"class": "item keywords"})
        if kwds_node:  # coverage: condition was always true
            span_node = kwds_node.find("span", {"class": "value"})
            if span_node and not isinstance(span_node, int):  # coverage: condition was always true
                text = span_node.get_text().strip()
                for kwd in text.split(", "):
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

        # ABSTRACT
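        # The abstract text begins with the "Abstract" label, which is stripped before the abstract is stored.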

        abstract_node = soup.find("section", {"class": "item abstract"})
        if abstract_node:  # coverage: condition was always true
            text = abstract_node.get_text().strip()
            if text.find("Abstract") == 0:  # coverage: condition was always true
                text = text[9:]
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": text,
                        "value_xml": "",
                        "lang": "en",
                    }
                )

        # PDF
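        # The PDF galley link provides the full-text URL, which is attached to the article.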

        pdf_node = soup.find("a", {"class": "obj_galley_link pdf"})
        if pdf_node and isinstance(pdf_node, Tag):  # coverage: condition was always true
            pdf_url = pdf_node.get("href")
            if isinstance(pdf_url, list):  # coverage: condition was never true
                raise ValueError("pdf_url is a list")
            if pdf_url is None:  # coverage: condition was never true
                raise ValueError("pdf_url not found")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle