Coverage for src/crawler/by_source/amp_crawler.py: 91%

91 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import lingua
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH,
        lingua.Language.FRENCH,
    ).build()
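
    # The detector is restricted to English and French, presumably because
    # Annals of Math abstracts only appear in those two languages; a smaller
    # candidate set makes lingua more reliable on short texts.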

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:  # coverage: condition was always true
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:  # coverage: condition was always true
                    xissues.append(xissue)

        return xissues
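
    # Shape of the collection page parse_collection_content expects, inferred
    # from the selectors above (illustrative sketch, not a saved fixture):
    #
    #   <div class="cat-item-2">
    #     <a href="https://annals.math.princeton.edu/2015/181-1">Volume 181, Issue 1</a>
    #   </div>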

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
        if len(exceptions) > 2:
            # Exceptional URLs whose last segment bundles "year-volume-number"
            year = exceptions[0]
            volume = exceptions[1]
            number = exceptions[2]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different with volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        return self.create_xissue(url, year, volume, number)
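
    # Worked examples for create_amp_xissue (URL shapes inferred from the
    # parsing above; the concrete URLs are illustrative):
    #   .../2015/181-1   -> year="2015", volume="181", number="1"
    #   .../2003-157-2   -> hyphenated form: year="2003", volume="157", number="2"
    # When parts[-2] is not a 4-character year, the pre-2015 link layout puts
    # the year one segment earlier, hence the parts[-3] fallback.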

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition was always true
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = url
                xissue.articles.append(xarticle)
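
    # Articles are assigned PIDs "a0", "a1", ... in page order, so they stay
    # stable only as long as the issue page keeps the same article ordering.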

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "abstract", "page"]

        # The DOI of a <meta> tag is in its "content" attribute
        # (.text is always empty for a void tag like <meta>)
        doi_tag = soup.select_one("meta[name='citation_doi']")
        if doi_tag and not doi_tag.get("content", "").startswith("https://doi.org/"):
            what.append("pdf")

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:  # coverage: condition was always true
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":
            # Exception with Annals of Math: 2 articles share the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/ZBMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
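
        # `what` selects which fields get_metadata_using_citation_meta, a
        # BaseCollectionCrawler helper not shown here, extracts; it presumably
        # reads the matching <meta name="citation_*"> tags (citation_author,
        # citation_pdf_url, ...).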

        abstract_node = soup.select_one("div.entry-content")
        if abstract_node is not None:  # coverage: condition was always true
            abstract_section_node = abstract_node.select_one("p")
            if abstract_section_node:  # coverage: condition was always true
                abstract = str(abstract_section_node)
                xarticle.abstracts.append(
                    {
                        "tag": "abstract",
                        "value_html": "",
                        "value_tex": abstract,
                        "value_xml": "",
                        "lang": self.detect_language(abstract, xarticle),
                    }
                )

        # zbMATH / MR identifiers from the metadata headers
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: condition was always true
                    zblid = link_node.get("href")
                    pos = zblid.find("?q=an:")
                    if pos > 0:  # coverage: condition was always true
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))
            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: condition was always true
                    mrid = link_node.get("href")
                    pos = mrid.find("?mr=")
                    if pos > 0:  # coverage: condition was always true
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))

        return xarticle
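
# Minimal usage sketch. How a crawler is instantiated and how HTML is fetched
# is defined by BaseCollectionCrawler elsewhere, so the calls below are
# placeholders rather than the real pipeline:
#
#   crawler = AmpCrawler(...)
#   xissues = crawler.parse_collection_content(collection_html)
#   for xissue in xissues:
#       crawler.parse_issue_content(issue_html, xissue)
#       for xarticle in xissue.articles:
#           crawler.parse_article_content(article_html, xissue, xarticle, xarticle.url)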