Coverage for src/crawler/by_source/amc_crawler.py: 82%

119 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class AmcCrawler(BaseCollectionCrawler):
    source_domain = "AMC"
    source_name = "Ars Mathematica Contemporanea website"
    source_website = "https://amc-journal.eu"
    periode_begin = 2009
    periode_end = 2024

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: create a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.
        # self.collection_id = "AM"
        # self.collection_url = "https://annals.math.princeton.edu"

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Ars Mathematica Contemporanea and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its URL.
        The website splits its issues over multiple pages, so all of them need to be crawled.
        """
        xissues = []
        self.parse_one_issues_page(content, xissues)
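        # The issue archive continues on a second page; download and parse it as well.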

        url = self.collection_url + "/2"
        content = self.download_file(url)
        self.parse_one_issues_page(content, xissues)

        return xissues

    def parse_one_issues_page(self, content, xissues):
        soup = BeautifulSoup(content, "html.parser")

        # Extract the list of issues
        issue_nodes = soup.find_all("h2")

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:
                url = issue_link_node.get("href")
                text = issue_link_node.get_text().strip()
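                # Issue links read "Vol. <volume> No. <number> (<year>)"; split the text into those parts.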

                if text.find("Vol.") == 0:
                    text = text[5:]
                    parts = text.split("No.")
                    volume = parts[0].strip()
                    parts = parts[1].split("(")
                    number = parts[0].strip()
                    year = parts[1][0:4]

                    xissue = create_issuedata()
                    xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
                    xissue.year = year
                    xissue.volume = volume
                    xissue.number = number
                    xissue.url = url

                    xissues.append(xissue)

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h3", {"class": "title"})
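        # Each <h3 class="title"> heading links to one article; collect its URL and page metadata.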

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition was always true
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url

                meta_node = article_node.find_next_sibling("div")
                if meta_node:  # coverage: condition was always true
                    pages_node = meta_node.find("div", {"class": "pages"})
                    if pages_node is not None:
                        text = pages_node.get_text()
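                        # The pages text is either "<article number>, <N> pp" (article number plus page count) or an "<fpage>-<lpage>" range.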

                        if "," in text and "pp" in text:  # coverage: condition was never true
                            parts = text.split(",")
                            number_parts = parts[0].split(".")
                            if len(number_parts) == 2:
                                xarticle.article_number = number_parts[1].strip()

                            text = parts[1].split("pp")[0].strip()
                            xarticle.counts.append(("page-count", text))
                        elif "-" in text:  # coverage: condition was always true
                            parts = text.split("-")
                            xarticle.fpage = parts[0].strip()
                            xarticle.lpage = parts[1].strip()

                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the article page with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        # TITLE
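        # The article title is the <h1 class="page_title"> text, stored as title_tex.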

        title_node = soup.find("h1", {"class": "page_title"})
        if title_node:  # coverage: condition was always true
            xarticle.title_tex = title_node.get_text()

        # AUTHORS
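        # Author names come from the <span class="name"> entries and are added as "author" contributors.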

        authors_node = soup.find("ul", {"class": "authors"})
        if authors_node and isinstance(authors_node, Tag):  # coverage: condition was always true
            span_nodes = authors_node.find_all("span", {"class": "name"})
            for span_node in span_nodes:
                text = span_node.get_text().strip()

                author = create_contributor(role="author", string_name=text)

                xarticle.contributors.append(author)

        # DOI
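        # The DOI is read from the link inside the "item doi" section; it is also used to build the article PID.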

        doi_node = soup.find("section", {"class": "item doi"})
        if doi_node:  # coverage: condition was always true
            doi_node = doi_node.find("a")
            if doi_node and isinstance(doi_node, Tag):  # coverage: condition was always true
                url = doi_node.get("href")
                if isinstance(url, str):  # coverage: condition was always true
                    pos = url.find("10.")
                    if pos > 0:  # coverage: condition was always true
                        doi = url[pos:]
                        xarticle.doi = doi
                        xarticle.pid = doi.replace("/", "_").replace(".", "_").replace("-", "_")

        # KEYWORDS
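        # Keywords are comma-separated in the "value" span; each one becomes an English-language subject.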

        kwds_node = soup.find("section", {"class": "item keywords"})
        if kwds_node:  # coverage: condition was always true
            span_node = kwds_node.find("span", {"class": "value"})
            if span_node and not isinstance(span_node, int):  # coverage: condition was always true
                text = span_node.get_text().strip()
                for kwd in text.split(", "):
                    subject = create_subj()
                    subject["value"] = kwd
                    subject["lang"] = "en"
                    xarticle.kwds.append(subject)

        # ABSTRACT
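        # The abstract text begins with the "Abstract" label, which is stripped before the abstract is stored.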

        abstract_node = soup.find("section", {"class": "item abstract"})
        if abstract_node:  # coverage: condition was always true
            text = abstract_node.get_text().strip()
            if text.find("Abstract") == 0:  # coverage: condition was always true
                text = text[9:]
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": text,
                        "value_xml": "",
                        "lang": "en",
                    }
                )

        # PDF
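        # The PDF galley link provides the full-text URL, which is attached to the article.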

        pdf_node = soup.find("a", {"class": "obj_galley_link pdf"})
        if pdf_node and isinstance(pdf_node, Tag):  # coverage: condition was always true
            pdf_url = pdf_node.get("href")
            if isinstance(pdf_url, list):  # coverage: condition was never true
                raise ValueError("pdf_url is a list")
            if pdf_url is None:  # coverage: condition was never true
                raise ValueError("pdf_url not found")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle