Coverage for src/crawler/by_source/amc_crawler.py: 83%

124 statements  

coverage.py v7.7.0, created at 2025-04-02 15:25 +0000

1import lingua 

2from bs4 import BeautifulSoup, Tag 

3from lingua import LanguageDetectorBuilder 

4from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj 

5 

6from crawler.base_crawler import BaseCollectionCrawler 

7from crawler.utils import add_pdf_link_to_xarticle 

8 

9 

10class AmcCrawler(BaseCollectionCrawler): 

11 source_name = "Ars Mathematica Contemporanea website" 

12 source_domain = "AMC" 

13 source_website = "https://amc-journal.eu" 

14 

15 language_detector = LanguageDetectorBuilder.from_languages( 

16 lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.SLOVENE 

17 ).build() 

18 

19 def parse_collection_content(self, content): 

20 """ 

21 Parse the HTML page of Ars Mathematica Contemporanea and return a list of xissues.

22 Each xissue carries its volume/number/year metadata and its url.

23 This website spreads its issues over multiple pages, so we need to crawl all of them.

24 """ 

25 xissues = [] 

26 

27 soup = BeautifulSoup(content, "html.parser") 

28 self.parse_one_issues_page(content, xissues) 

29 next_button = soup.select_one("a.next") 

30 

31 while next_button: 

32 url = next_button.get("href") 

33 if not isinstance(url, str):    33 ↛ 34 (condition was never true)

34 next_button = None 

35 continue 

36 content = self.download_file(url) 

37 soup = BeautifulSoup(content, "html.parser") 

38 self.parse_one_issues_page(content, xissues) 

39 next_button = soup.select_one("a.next") 

40 return xissues 

41 

42 def parse_one_issues_page(self, content, xissues): 

43 soup = BeautifulSoup(content, "html.parser") 

44 

45 # Extract the list of issues 

46 issue_nodes = soup.find_all("h2") 

47 

48 for issue_node in issue_nodes: 

49 issue_link_node = issue_node.find("a") 

50 if issue_link_node: 

51 url = issue_link_node.get("href") 

52 text = issue_link_node.get_text().strip() 
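# Assumed heading format (inferred from the slicing below, not stated in the file):
# the issue link text looks like "Vol. 17 No. 2 (2019)"; "Vol. " is stripped, then
# the text is split on "No." and "(" to recover volume, number and four-digit year.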

53 if text.find("Vol.") == 0: 

54 text = text[5:] 

55 parts = text.split("No.") 

56 volume = parts[0].strip() 

57 parts = parts[1].split("(") 

58 number = parts[0].strip() 

59 year = parts[1][0:4] 

60 

61 xissue = create_issuedata() 

62 xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}" 
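# e.g. "AMC_2019__17_2", assuming collection_id is "AMC"; collection_id is set by
# BaseCollectionCrawler and does not appear in this file.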

63 xissue.year = year 

64 xissue.volume = volume 

65 xissue.number = number 

66 xissue.url = url 

67 

68 xissues.append(xissue) 

69 

70 def parse_issue_content(self, content, xissue): 

71 soup = BeautifulSoup(content, "html.parser") 

72 article_nodes = soup.find_all("h3", {"class": "title"}) 

73 

74 for index_article, article_node in enumerate(article_nodes): 

75 article_link_node = article_node.find("a") 

76 if article_link_node:    76 ↛ 74 (condition was always true)

77 url = article_link_node.get("href") 

78 xarticle = create_articledata() 

79 xarticle.pid = "a" + str(index_article) 

80 xarticle.url = url 

81 

82 meta_node = article_node.find_next_sibling("div") 

83 if meta_node:    83 ↛ 101 (condition was always true)

84 pages_node = meta_node.find("div", {"class": "pages"}) 

85 if pages_node is not None: 

86 text = pages_node.get_text() 
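# The pages text is assumed to take one of two shapes: a comma-separated form
# ending in "pp" (an article number followed by a page count), or a plain
# "fpage-lpage" range; the two branches below handle each case.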

87 

88 if "," in text and "pp" in text: 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 parts = text.split(",") 

90 number_parts = parts[0].split(".") 

91 if len(number_parts) == 2: 

92 xarticle.article_number = number_parts[1].strip() 

93 

94 text = parts[1].split("pp")[0].strip() 

95 xarticle.counts.append(("page-count", text)) 

96 elif "-" in text: 96 ↛ 101line 96 didn't jump to line 101 because the condition on line 96 was always true

97 parts = text.split("-") 

98 xarticle.fpage = parts[0].strip() 

99 xarticle.lpage = parts[1].strip() 

100 

101 xissue.articles.append(xarticle) 

102 

103 def parse_article_content(self, content, xissue, xarticle, url): 

104 """ 

105 Parse the content with BeautifulSoup and return an ArticleData.

106 """ 

107 

108 xarticle.lang = "en" 

109 

110 soup = BeautifulSoup(content, "html.parser") 

111 

112 # TITLE 

113 title_node = soup.select_one("h1.page_title") 

114 if title_node:    114 ↛ 118 (condition was always true)

115 xarticle.title_tex = title_node.get_text() 

116 

117 # AUTHORS 

118 authors_node = soup.select_one("ul.authors") 

119 if authors_node and isinstance(authors_node, Tag):    119 ↛ 129 (condition was always true)

120 span_nodes = authors_node.find_all("span", {"class": "name"}) 

121 for span_node in span_nodes: 

122 text = span_node.get_text().strip() 

123 

124 author = create_contributor(role="author", string_name=text) 

125 

126 xarticle.contributors.append(author) 

127 

128 # DOI 

129 doi_node = soup.select_one("section.item.doi") 

130 if doi_node:    130 ↛ 142 (condition was always true)

131 doi_node = doi_node.find("a") 

132 if doi_node and isinstance(doi_node, Tag):    132 ↛ 142 (condition was always true)

133 url = doi_node.get("href") 
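# The DOI is read off the link's href (assumed to look like
# "https://doi.org/10.xxxx/yyyy"); everything from the first "10." onward is kept.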

134 if isinstance(url, str):    134 ↛ 142 (condition was always true)

135 pos = url.find("10.") 

136 if pos > 0:    136 ↛ 142 (condition was always true)

137 doi = url[pos:] 

138 xarticle.doi = doi 

139 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

140 

141 # KEYWORDS 

142 kwds_node = soup.select_one("section.item.keywords") 

143 if kwds_node:    143 ↛ 154 (condition was always true)

144 span_node = kwds_node.select_one("span.value") 

145 if span_node and not isinstance(span_node, int):    145 ↛ 154 (condition was always true)

146 text = span_node.get_text().strip() 

147 for kwd in text.split(", "): 

148 subject = create_subj() 

149 subject["value"] = kwd 

150 subject["lang"] = xarticle.lang 

151 xarticle.kwds.append(subject) 

152 

153 # ABSTRACT 

154 abstract_node = soup.select_one("section.item.abstract") 

155 if abstract_node:    155 ↛ 170 (condition was always true)

156 text = abstract_node.get_text().strip() 
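# The node text is assumed to start with the literal heading "Abstract";
# text[9:] below drops those 8 characters plus one separator character
# (presumably a newline) before keeping the remainder as the abstract.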

157 if text.find("Abstract") == 0: 157 ↛ 170line 157 didn't jump to line 170 because the condition on line 157 was always true

158 text = text[9:] 

159 xarticle.abstracts.append( 

160 { 

161 "tag": "abstract", 

162 "value_html": "", 

163 "value_tex": text, 

164 "value_xml": "", 

165 "lang": self.detect_language(text), 

166 } 

167 ) 

168 

169 # PDF 

170 pdf_node = soup.select_one("a.obj_galley_link.pdf") 

171 if pdf_node and isinstance(pdf_node, Tag):    171 ↛ 179 (condition was always true)

172 pdf_url = pdf_node.get("href") 

173 if isinstance(pdf_url, list):    173 ↛ 174 (condition was never true)

174 raise ValueError("pdf_url is a list") 

175 if pdf_url is None:    175 ↛ 176 (condition was never true)

176 raise ValueError("pdf_url not found") 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 return xarticle
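
Note: detect_language(), download_file() and collection_id come from BaseCollectionCrawler and are not part of the file measured above. A minimal sketch of what the lingua detector configured at the top of the class can do, assuming detect_language() wraps it roughly like this:

import lingua
from lingua import LanguageDetectorBuilder

# Same configuration as the AmcCrawler class attribute above.
detector = LanguageDetectorBuilder.from_languages(
    lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.SLOVENE
).build()

# detect_language_of() returns a lingua.Language member (or None if undecided).
print(detector.detect_language_of("Naj bo G povezan graf brez zank."))  # expected: Language.SLOVENE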