Coverage for src/crawler/by_source/hdml_crawler.py: 81%

99 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    pdf_href = "pdfs/journals"
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
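    # Note: the \p{Greek} class in article_href is a feature of the third-party
    # `regex` module imported above; the stdlib `re` module does not support it.
    # Illustrative only (this sample path is hypothetical, not taken from the site):
    #   regex.search(article_href, "en/item/Journals/Δελτίον/10/3")
    #   would match, with volume="10" and num="3".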

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the HDML collection and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)

        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            href = issue_node.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("Cannot parse issue href")
            issue_node_link = self.source_website + "/" + href
            dates = issue_node.find_all("strong")[1].get_text()
            xissue = self.create_hdml_xissue(issue_node_link, dates)
            if xissue:  # coverage: condition always true
                xissues.append(xissue)

        return xissues

    def create_hdml_xissue(self, url, dates):
        if url.endswith("/"):  # coverage: condition always true
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates
        xissue = None

        if year == "1985-86":
            year = "1985-1986"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume}"
        xissue.year = year
        xissue.volume = volume
        xissue.url = url

        return xissue
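    # Illustration of the pid scheme above, with a hypothetical collection_id:
    # collection_id "HDMLJ", dates "1950" and an issue URL ending in ".../4"
    # produce pid "HDMLJ_1950__4".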

    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):  # coverage: condition never true
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:  # coverage: condition always true
                url = article_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        try:
            if node_infos_em:  # coverage: condition always true
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                self.set_pages(xarticle, pages)

        except Exception:
            pass

        # AUTHORS
        # TODO: shouldn't we handle multiple authors here?
        contribs = None
        authors = soup.find("strong", text="Authors")
        if authors:  # coverage: condition always true
            contribs_div = authors.find_next("em")
            if not contribs_div:  # coverage: condition never true
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")

        else:
            author = soup.find("strong", text="Author")
            if author:
                contribs_div = author.find_next("em")
                if not contribs_div:
                    raise ValueError("Error finding Author")
                contribs = contribs_div.get_text().split(",")

        if contribs is None:  # coverage: condition never true
            raise ValueError("No Contributors found")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_link = [a.get("href") for a in soup.find_all("a") if reg_pdf.search(a.get("href"))][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle
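
A minimal, self-contained sketch of the fpage sort key used in parse_issue_content, with plain dicts as hypothetical stand-ins for the ArticleData objects (the real code accesses x.fpage as an attribute). Articles with an empty fpage are given the key -1 so they sort first:

    articles = [{"fpage": "45"}, {"fpage": ""}, {"fpage": "3"}]
    articles.sort(key=lambda x: int(-1 if x["fpage"] == "" else x["fpage"]))
    print([a["fpage"] for a in articles])  # prints ['', '3', '45']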