Coverage for src/crawler/by_source/hdml_crawler.py: 80%

104 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import re 

2from urllib.parse import unquote 

3 

4import regex 

5from bs4 import BeautifulSoup, Tag 

6from ptf.model_data import create_articledata, create_contributor, create_issuedata 

7 

8from crawler.base_crawler import BaseCollectionCrawler 

9from crawler.utils import add_pdf_link_to_xarticle 

10 

11 

12class HdmlCrawler(BaseCollectionCrawler): 

13 source_name = "Hellenic Digital Mathematics Library" 

14 source_domain = "HDML" 

15 source_website = "https://hdml.di.ionio.gr" 

16 periode_begin = 0 

17 periode_end = 0 

18 pdf_href = "pdfs/journals" 

19 issue_href = r"(?P<number>((\d+)-?)(\d+)?)" 

20 article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)" 
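# Note: article_href relies on \p{Greek}, which the stdlib re module does not
# support; this is presumably why the third-party regex package is imported
# alongside re (re is enough for the plain issue_href pattern).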

21 

22 def parse_collection_content(self, content): 

23 """ 

24 Parse the HTML page of the HDML collection and return a list of xissue.

25 Each xissue has its volume/number/year metadata + its url.

26 

27 periode_begin / periode_end are checked in create_hdml_xissue to decide whether an issue is kept.

28 """ 

29 soup = BeautifulSoup(content, "html5lib") 

30 xissues = [] 

31 

32 # Extract the list of issues 

33 base_url_collection = self.collection_url.replace(self.source_website, "") 

34 base_url_collection = unquote(base_url_collection[1:]) 

35 reg_issue = re.compile(base_url_collection + self.issue_href) 
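# reg_issue matches hrefs that contain the collection's decoded path followed
# by an issue number such as "7" or "7-8" (see issue_href above).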

36 

37 issue_nodes = [ 

38 a 

39 for a in soup.select("div#collectionResults a") 

40 if reg_issue.search(str(a.get("href"))) 

41 ] 

42 

43 for issue_node in issue_nodes: 

44 href = issue_node.get("href") 

45 if not isinstance(href, str):    [45 ↛ 46: didn't jump to line 46 because the condition on line 45 was never true]

46 raise ValueError("Cannot parse issue href") 

47 issue_node_link = self.source_website + "/" + href 

48 dates = issue_node.find_all("strong")[1].get_text() 
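# The second <strong> inside the issue link is expected to hold the year
# (or a "YYYY-YYYY" range); it is parsed further in create_hdml_xissue.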

49 xissue = self.create_hdml_xissue(issue_node_link, dates) 

50 if xissue:    [50 ↛ 43: didn't jump to line 43 because the condition on line 50 was always true]

51 xissues.append(xissue) 

52 

53 return xissues 

54 

55 def create_hdml_xissue(self, url, dates): 

56 if url.endswith("/"):    [56 ↛ 58: didn't jump to line 58 because the condition on line 56 was always true]

57 url = url[:-1] 

58 parts = url.split("/") 

59 

60 volume = parts[-1] 

61 year = dates 

62 xissue = None 

63 

64 year_int = int(year.split("-")[:1][0]) 
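# Keep only the first year when the date is a range such as "1950-1951"
# (year.split("-")[:1][0] is just year.split("-")[0]).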

65 if self.periode_begin <= year_int:    [65 ↛ 73: didn't jump to line 73 because the condition on line 65 was always true]

66 if self.periode_end == 0 or self.periode_begin <= self.periode_end:    [66 ↛ 73: didn't jump to line 73 because the condition on line 66 was always true]

67 xissue = create_issuedata() 

68 xissue.pid = f"{self.collection_id}_{year}__{volume}" 
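# Issue PID convention: <collection_id>_<year>__<volume> (double underscore
# before the volume).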

69 xissue.year = year 

70 xissue.volume = volume 

71 xissue.url = url 

72 

73 return xissue 

74 

75 def parse_issue_content(self, content, xissue): 

76 # xissue = self.create_xissue(url) 

77 

78 soup = BeautifulSoup(content, "html.parser") 

79 article_nodes = soup.find("div", {"id": "collectionResults"}) 

80 if not isinstance(article_nodes, Tag):    [80 ↛ 81: didn't jump to line 81 because the condition on line 80 was never true]

81 raise ValueError("Cannot find articles") 

82 for index_article, article_node in enumerate(article_nodes.find_all("a")): 

83 article_link_node = article_node.get("href") 

84 if article_link_node:    [84 ↛ 82: didn't jump to line 82 because the condition on line 84 was always true]

85 url = article_node.get("href") 

86 xarticle = create_articledata() 

87 xarticle.pid = "a" + str(index_article) 
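# Provisional article PID based on the position in the issue ("a0", "a1", ...);
# the definitive pid is supplied later to parse_article_content.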

88 xarticle.url = self.source_website + "/" + url 

89 

90 xissue.articles.append(xarticle) 

91 

92 xissue.articles = sorted( 

93 xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage) 

94 ) 
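# Order articles by first page, with an empty fpage sorting first via the -1
# sentinel (fpage is presumably filled in by set_pages during article parsing).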

95 

96 def parse_article_content(self, content, xissue, xarticle, url, pid): 

97 """ 

98 Parse the content with BeautifulSoup and return an ArticleData.

99 """ 

100 xarticle = create_articledata() 

101 xarticle.pid = pid 

102 xarticle.lang = "en" 

103 soup = BeautifulSoup(content, "html.parser") 

104 node_infos_em = soup.find_all("em") 

105 

106 try: 

107 if node_infos_em:    [107 ↛ 122: didn't jump to line 122 because the condition on line 107 was always true]

108 # TITLE 

109 title = node_infos_em[0].get_text() 

110 xarticle.title_tex = title 

111 xarticle.lang = "gr" 
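# An <em>-based layout is treated as a Greek-language article; note the value
# used here is "gr", while the ISO 639-1 code for Greek is "el".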

112 

113 # PAGES 

114 pages = node_infos_em[4].get_text() 

115 self.set_pages(xarticle, pages) 
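# The fifth <em> is assumed to hold the page range; if the layout differs,
# the resulting IndexError is swallowed by the bare except below.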

116 

117 except Exception: 

118 pass 

119 

120 # AUTHORS 

121 # TODO: shouldn't we handle multiple authors here?

122 contribs = None 

123 authors = soup.find("strong", text="Authors") 

124 if authors:    [124 ↛ 131: didn't jump to line 131 because the condition on line 124 was always true]

125 contribs_div = authors.find_next("em") 

126 if not contribs_div:    [126 ↛ 127: didn't jump to line 127 because the condition on line 126 was never true]

127 raise ValueError("Error finding Author") 

128 contribs = contribs_div.get_text().split(",") 

129 

130 else: 

131 author = soup.find("strong", text="Author") 

132 if author: 

133 contribs_div = author.find_next("em") 

134 if not contribs_div: 

135 raise ValueError("Error finding Author") 

136 contribs = contribs_div.get_text().split(",") 

137 

138 if contribs is None:    [138 ↛ 139: didn't jump to line 139 because the condition on line 138 was never true]

139 raise ValueError("No Contributors found") 

140 

141 for contrib in contribs: 

142 author = create_contributor() 

143 author["role"] = "author" 

144 author["string_name"] = contrib.replace("\xa0", "") 

145 author["string_name"] = author["string_name"].replace(",", "").replace("by", "") 
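# Clean up the raw author string: drop non-breaking spaces, commas and the
# substring "by" (note that "by" is removed anywhere in the name, not only
# as a leading "by ").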

146 xarticle.contributors.append(author) 

147 

148 # PDF 

149 reg_pdf = regex.compile(self.pdf_href) 

150 pdf_link = [a.get("href") for a in soup.find_all("a") if reg_pdf.search(a.get("href"))][0] 
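# Keep the first <a> whose href matches pdf_href ("pdfs/journals"); an
# IndexError here means no PDF link was found on the page.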

151 pdf_link = self.source_website + "/" + pdf_link 

152 add_pdf_link_to_xarticle(xarticle, pdf_link) 

153 

154 return xarticle
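
A minimal sketch of how the two href patterns defined on the class (issue_href and article_href) are meant to behave. The hrefs and the journal name "Δελτίον" below are hypothetical examples, not taken from the HDML site:

    import re
    import regex

    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

    # issue_href is plain enough for the stdlib re module
    m = re.search(issue_href, "en/item/Journals/Bulletin/7-8")       # hypothetical href
    assert m and m.group("number") == "7-8"

    # article_href needs the third-party regex package because of \p{Greek}
    m = regex.search(article_href, "en/item/Journals/Δελτίον/12/3")  # hypothetical href
    assert m and (m.group("volume"), m.group("num")) == ("12", "3")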