Coverage for src/crawler/by_source/hdml_crawler.py: 84%

109 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle

from ptf.model_data import create_articledata
from ptf.model_data import create_contributor
from ptf.model_data import create_issuedata

class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"
    periode_begin = 0
    periode_end = 0
    pdf_href = "pdfs/journals"
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
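    # Reading of the two patterns above (an interpretation, not documented upstream):
    # issue links end with an issue number such as "3" or "3-4", and article links
    # look like en/item/Journals/<journal name in Greek letters>/<volume>/<num>.
    # The third-party `regex` module is needed for article_href because \p{Greek}
    # is not supported by the standard `re` module.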

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()
        # initialise the periode
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the HDML collection and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.

        self.periode is initialised in __init__ via get_or_create_periode().
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)

        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            issue_node_link = self.source_website + "/" + issue_node.get("href")
            dates = issue_node.find_all("strong")[1].get_text()
            xissue = self.create_xissue(issue_node_link, dates)
            if xissue:  # coverage: condition was always true in the recorded run
                xissues.append(xissue)

        return xissues
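    # crawl_one_issue_url (below) calls the base implementation, then re-sorts the
    # articles by first and last page; an empty fpage is mapped to -1 so that such
    # articles sort first.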

    def crawl_one_issue_url(self, xissue):
        xissue = super().crawl_one_issue_url(xissue)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: (int(-1 if x.fpage == "" else x.fpage), int(x.lpage))
        )

        return xissue
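    # create_xissue (below) takes the volume from the last URL segment and the year
    # from the dates string; issues dated before periode_begin are skipped, and the
    # pid follows the <collection_id>_<year>__<volume> convention.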

    def create_xissue(self, url, dates):
        if url.endswith("/"):  # coverage: condition was always true in the recorded run
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates
        xissue = None

        year_int = int(year.split("-")[0])
        if self.periode_begin <= year_int:  # coverage: condition was always true in the recorded run
            if self.periode_end == 0 or self.periode_begin <= self.periode_end:  # coverage: condition was always true in the recorded run
                xissue = create_issuedata()
                xissue.pid = f"{self.collection_id}_{year}__{volume}"
                xissue.year = year
                xissue.volume = volume
                xissue.url = url

        return xissue
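    # parse_issue_content (below) registers one ArticleData stub (pid "a<index>" and
    # absolute url) per link found in div#collectionResults; the metadata itself is
    # filled in by parse_article_content.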

    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:  # coverage: condition was always true in the recorded run
                url = article_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )

        return xissue
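    # parse_article_content (below) reads the metadata positionally from the <em>
    # tags of an item page (title in the first <em>, page range in the fifth) and
    # the contributor names from the <em> following an "Author"/"Authors" label.
    # The positional layout is an assumption about the HDML item-page markup; the
    # title/pages block is guarded by a broad try/except.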

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        try:
            if node_infos_em:  # coverage: condition was always true in the recorded run
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                xarticle.page_range = pages
                pages_infos = pages.split("-")

                if len(pages_infos) > 1:  # coverage: condition was always true in the recorded run
                    xarticle.fpage = pages_infos[0]
                else:
                    xarticle.fpage = pages
                if len(pages_infos) > 1:  # coverage: condition was always true in the recorded run
                    xarticle.lpage = pages_infos[1]

        except Exception:
            pass

        # AUTHORS
        contribs = []  # default when neither an "Authors" nor an "Author" label is found
        authors = soup.find("strong", text="Authors")
        if authors is not None:  # coverage: condition was never true in the recorded run
            contribs = authors.find_next("em").get_text()
            contribs = contribs.split(",")
        else:
            author = soup.find("strong", text="Author")
            if author is not None:  # coverage: condition was always true in the recorded run
                contribs = author.find_next("em").get_text()
                contribs = contribs.split(",")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_link = [a.get("href") for a in soup.find_all("a") if reg_pdf.search(a.get("href"))][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle