Coverage for src/crawler/by_source/amp_crawler.py: 94%

92 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.types import CitationLiteral


class AmpCrawler(BaseCollectionCrawler):
    source_name = "Annals of Mathematics website"
    source_domain = "AMP"
    source_website = "https://annals.math.princeton.edu"

    periode_begin = 2003
    periode_end = 2017
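    # Only issues published between periode_begin and periode_end (inclusive)
    # are kept; create_amp_xissue() returns None for anything outside.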

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Annals of Math and return a list of xissues.
        Each xissue has its volume/number/year metadata and its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        # Extract the list of issues
        issue_nodes = soup.find_all("div", {"class": "cat-item-2"})

        for issue_node in issue_nodes:
            issue_link_node = issue_node.find("a")
            if issue_link_node:  # coverage: condition always true in this run
                url = issue_link_node.get("href")
                xissue = self.create_amp_xissue(url)
                if xissue:
                    xissues.append(xissue)

        return xissues

    def create_amp_xissue(self, url):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        last_part = parts[-1]
        exceptions = last_part.split("-")
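        # Assumed issue-URL shapes, inferred from the parsing below (not
        # verified against the live site):
        #   .../<year>/<volume>-<number>, e.g. .../2017/186-1
        #   .../<year>-<volume>-<number> as the last segment in the rarer case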

        if len(exceptions) > 2:
            year = exceptions[0]
            volume = exceptions[1]
            number = exceptions[2]
        else:
            year = parts[-2]
            if len(year) < 4:
                # The links are different for volumes before 2015
                year = parts[-3]

            volume_number = parts[-1]
            volume_number_parts = volume_number.split("-")
            volume = volume_number_parts[0]
            number = volume_number_parts[1]

        year_int = int(year)
        if self.periode_begin <= year_int <= self.periode_end:
            xissue = create_issuedata()
            xissue.pid = f"{self.collection_id}_{year}__{volume}_{number}"
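            # pid pattern: "<collection_id>_<year>__<volume>_<number>",
            # e.g. "AMP_2017__186_1" if collection_id were "AMP" (illustrative);
            # collection_id is presumably set by BaseCollectionCrawler.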

            xissue.year = year
            xissue.volume = volume
            xissue.number = number
            xissue.url = url
        else:
            xissue = None

        return xissue

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("h2", {"class": "entry-title"})

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition always true in this run
                url = article_link_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
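                # Positional pid within the issue: "a0", "a1", ...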

                xarticle.url = url
                xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")

        what: list[CitationLiteral] = ["author", "pdf", "abstract", "page"]
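        # "what" lists the fields to pull from the page's citation meta tags
        # (presumably via BaseCollectionCrawler); "title" and "doi" are
        # appended below only when the corresponding checks pass.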

        title_node = soup.find("h1", {"class": "entry-title"})
        if title_node:  # coverage: condition always true in this run
            what.append("title")

        if url != "https://annals.math.princeton.edu/2010/172-3/p06":  # coverage: always true in this run
            # Exception with Annals of Math: two articles share the same DOI!
            # https://annals.math.princeton.edu/2010/172-3/p06 and https://annals.math.princeton.edu/2011/173-1/p14
            # We ignore DOI/zbMATH/MR for the first one.
            what.append("doi")

        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)
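        # Assumed behavior: this BaseCollectionCrawler helper fills xarticle
        # (and xissue) from the citation_* <meta> tags for the fields in "what".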

        # zbMATH
        metadata_header_nodes = soup.find_all("div", {"class": "metadata-headers"})
        for metadata_header_node in metadata_header_nodes:
            text = metadata_header_node.get_text()

            if text == "zbMATH":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: condition always true in this run
                    zblid = link_node.get("href")
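                    # Keep only the id after "?q=an:" (6 characters), e.g. an
                    # href ending in "?q=an:1234.56789" (illustrative) yields
                    # "1234.56789".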

                    pos = zblid.find("?q=an:")
                    if pos > 0:  # coverage: condition always true in this run
                        zblid = zblid[pos + 6 :]
                    xarticle.extids.append(("zbl-item-id", zblid))

            elif text == "MR":
                link_node = metadata_header_node.parent.find("a")
                if link_node:  # coverage: condition always true in this run
                    mrid = link_node.get("href")
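                    # Same idea for MathSciNet: keep the id after "?mr=" (4 characters).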

                    pos = mrid.find("?mr=")
                    if pos > 0:  # coverage: condition always true in this run
                        mrid = mrid[pos + 4 :]
                    xarticle.extids.append(("mr-item-id", mrid))

        return xarticle
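# Hypothetical usage sketch (not part of the module; constructor signature and
# orchestration are assumed to come from BaseCollectionCrawler):
#
#     crawler = AmpCrawler()
#     xissues = crawler.parse_collection_content(collection_html)
#     for xissue in xissues:
#         crawler.parse_issue_content(issue_html, xissue)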