Coverage for src/crawler/by_source/amc_crawler.py: 83%

124 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1import lingua 

2from bs4 import BeautifulSoup, Tag 

3from lingua import LanguageDetectorBuilder 

4from ptf.model_data import ( 

5 create_abstract, 

6 create_articledata, 

7 create_contributor, 

8 create_issuedata, 

9 create_subj, 

10) 

11 

12from crawler.base_crawler import BaseCollectionCrawler 

13from crawler.utils import add_pdf_link_to_xarticle 

14 

15 

class AmcCrawler(BaseCollectionCrawler):
    """Crawler for the Ars Mathematica Contemporanea (AMC) journal website."""

    source_name = "Ars Mathematica Contemporanea website"
    source_domain = "AMC"
    source_website = "https://amc-journal.eu"

    # Language detector used by detect_language() on abstract text.
    # Restricted to the three languages expected on this site
    # (NOTE(review): presumably English/French/Slovene covers all abstracts —
    # confirm against the journal's content).
    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.FRENCH, lingua.Language.SLOVENE
    ).build()

24 

25 def parse_collection_content(self, content): 

26 """ 

27 Parse the HTML page of Ars Mathematica Contemporanea and returns a list of xissue. 

28 Each xissue has its volume/number/year metadata + its url 

29 This web site has multiple pages for its issues. so we need to crawl all of them 

30 """ 

31 xissues = [] 

32 

33 soup = BeautifulSoup(content, "html.parser") 

34 self.parse_one_issues_page(content, xissues) 

35 next_button = soup.select_one("a.next") 

36 

37 while next_button: 

38 url = next_button.get("href") 

39 if not isinstance(url, str): 39 ↛ 40line 39 didn't jump to line 40 because the condition on line 39 was never true

40 next_button = None 

41 continue 

42 content = self.download_file(url) 

43 soup = BeautifulSoup(content, "html.parser") 

44 self.parse_one_issues_page(content, xissues) 

45 next_button = soup.select_one("a.next") 

46 return xissues 

47 

48 def parse_one_issues_page(self, content, xissues): 

49 soup = BeautifulSoup(content, "html.parser") 

50 

51 # Extract the list of issues 

52 issue_nodes = soup.find_all("h2") 

53 

54 for issue_node in issue_nodes: 

55 issue_link_node = issue_node.find("a") 

56 if issue_link_node: 

57 url = issue_link_node.get("href") 

58 text = issue_link_node.get_text().strip() 

59 if text.find("Vol.") == 0: 

60 text = text[5:] 

61 parts = text.split("No.") 

62 volume = parts[0].strip() 

63 parts = parts[1].split("(") 

64 number = parts[0].strip() 

65 year = parts[1][0:4] 

66 

67 xissue = create_issuedata() 

68 xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}" 

69 xissue.year = year 

70 xissue.volume = volume 

71 xissue.number = number 

72 xissue.url = url 

73 

74 xissues.append(xissue) 

75 

76 def parse_issue_content(self, content, xissue): 

77 soup = BeautifulSoup(content, "html.parser") 

78 article_nodes = soup.find_all("h3", {"class": "title"}) 

79 

80 for index_article, article_node in enumerate(article_nodes): 

81 article_link_node = article_node.find("a") 

82 if article_link_node: 82 ↛ 80line 82 didn't jump to line 80 because the condition on line 82 was always true

83 url = article_link_node.get("href") 

84 xarticle = create_articledata() 

85 xarticle.pid = "a" + str(index_article) 

86 xarticle.url = url 

87 

88 meta_node = article_node.find_next_sibling("div") 

89 if meta_node: 89 ↛ 107line 89 didn't jump to line 107 because the condition on line 89 was always true

90 pages_node = meta_node.find("div", {"class": "pages"}) 

91 if pages_node is not None: 

92 text = pages_node.get_text() 

93 

94 if "," in text and "pp" in text: 94 ↛ 95line 94 didn't jump to line 95 because the condition on line 94 was never true

95 parts = text.split(",") 

96 number_parts = parts[0].split(".") 

97 if len(number_parts) == 2: 

98 xarticle.article_number = number_parts[1].strip() 

99 

100 text = parts[1].split("pp")[0].strip() 

101 xarticle.counts.append(("page-count", text)) 

102 elif "-" in text: 102 ↛ 107line 102 didn't jump to line 107 because the condition on line 102 was always true

103 parts = text.split("-") 

104 xarticle.fpage = parts[0].strip() 

105 xarticle.lpage = parts[1].strip() 

106 

107 xissue.articles.append(xarticle) 

108 

109 def parse_article_content(self, content, xissue, xarticle, url): 

110 """ 

111 Parse the content with Beautifulsoup and returns an ArticleData 

112 """ 

113 

114 xarticle.lang = "en" 

115 

116 soup = BeautifulSoup(content, "html.parser") 

117 

118 # TITLE 

119 title_node = soup.select_one("h1.page_title") 

120 if title_node: 120 ↛ 124line 120 didn't jump to line 124 because the condition on line 120 was always true

121 xarticle.title_tex = title_node.get_text() 

122 

123 # AUTHORS 

124 authors_node = soup.select_one("ul.authors") 

125 if authors_node and isinstance(authors_node, Tag): 125 ↛ 135line 125 didn't jump to line 135 because the condition on line 125 was always true

126 span_nodes = authors_node.find_all("span", {"class": "name"}) 

127 for span_node in span_nodes: 

128 text = span_node.get_text().strip() 

129 

130 author = create_contributor(role="author", string_name=text) 

131 

132 xarticle.contributors.append(author) 

133 

134 # DOI 

135 doi_node = soup.select_one("section.item.doi") 

136 if doi_node: 136 ↛ 148line 136 didn't jump to line 148 because the condition on line 136 was always true

137 doi_node = doi_node.find("a") 

138 if doi_node and isinstance(doi_node, Tag): 138 ↛ 148line 138 didn't jump to line 148 because the condition on line 138 was always true

139 url = doi_node.get("href") 

140 if isinstance(url, str): 140 ↛ 148line 140 didn't jump to line 148 because the condition on line 140 was always true

141 pos = url.find("10.") 

142 if pos > 0: 142 ↛ 148line 142 didn't jump to line 148 because the condition on line 142 was always true

143 doi = url[pos:] 

144 xarticle.doi = doi 

145 xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_") 

146 

147 # KEYWORDS 

148 kwds_node = soup.select_one("section.item.keywords") 

149 if kwds_node: 149 ↛ 160line 149 didn't jump to line 160 because the condition on line 149 was always true

150 span_node = kwds_node.select_one("span.value") 

151 if span_node and not isinstance(span_node, int): 151 ↛ 160line 151 didn't jump to line 160 because the condition on line 151 was always true

152 text = span_node.get_text().strip() 

153 for kwd in text.split(", "): 

154 subject = create_subj() 

155 subject["value"] = kwd 

156 subject["lang"] = xarticle.lang 

157 xarticle.kwds.append(subject) 

158 

159 # ABSTRACT 

160 abstract_node = soup.select_one("section.item.abstract") 

161 if abstract_node: 161 ↛ 170line 161 didn't jump to line 170 because the condition on line 161 was always true

162 text = abstract_node.get_text().strip() 

163 if text.find("Abstract") == 0: 163 ↛ 170line 163 didn't jump to line 170 because the condition on line 163 was always true

164 text = text[9:] 

165 xarticle.abstracts.append( 

166 create_abstract(lang=self.detect_language(text), value_tex=text) 

167 ) 

168 

169 # PDF 

170 pdf_node = soup.select_one("a.obj_galley_link.pdf") 

171 if pdf_node and isinstance(pdf_node, Tag): 171 ↛ 179line 171 didn't jump to line 179 because the condition on line 171 was always true

172 pdf_url = pdf_node.get("href") 

173 if isinstance(pdf_url, list): 173 ↛ 174line 173 didn't jump to line 174 because the condition on line 173 was never true

174 raise ValueError("pdf_url is a list") 

175 if pdf_url is None: 175 ↛ 176line 175 didn't jump to line 176 because the condition on line 175 was never true

176 raise ValueError("pdf_url not found") 

177 add_pdf_link_to_xarticle(xarticle, pdf_url) 

178 

179 return xarticle