Coverage for src/crawler/by_source/hdml_crawler.py: 81%

100 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-10-23 10:17 +0000

1import re 

2from urllib.parse import unquote 

3 

4import regex 

5from bs4 import BeautifulSoup, Tag 

6from ptf.model_data import create_articledata, create_contributor, create_issuedata 

7 

8from crawler.base_crawler import BaseCollectionCrawler 

9from crawler.utils import add_pdf_link_to_xarticle 

10 

11 

class HdmlCrawler(BaseCollectionCrawler):
    """Crawler for the Hellenic Digital Mathematics Library (HDML).

    The collection page lists issues, an issue page lists articles, and an
    article page carries the title, page range, authors and PDF link. Each
    level is handled by the matching ``parse_*_content`` method.
    """

    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    # Pattern identifying the PDF link among the anchors of an article page.
    pdf_href = "pdfs/journals"
    # Trailing part of an issue link: a number, optionally followed by "-" and
    # a second number.
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    # Pattern of article links; ``\p{Greek}`` requires the ``regex`` module.
    # NOTE(review): not referenced by any method of this class - confirm
    # whether the base class or other code uses it.
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

    # NOTE(review): presumably read by the base crawler to turn off TLS
    # certificate verification for this site - confirm.
    verify = False

    # Position, among the <em> elements of an article page, of the one that
    # holds the page range.
    _pages_em_index = 4
    # A leading "by" (followed by whitespace, including non-breaking spaces)
    # in front of an author name.
    _by_prefix = re.compile(r"^\s*by\s+")

    def parse_collection_content(self, content):
        """Parse the HDML collection page and return the list of its issues.

        Every link inside ``div#collectionResults`` whose ``href`` matches the
        collection path followed by ``issue_href`` becomes one issue, built by
        ``create_hdml_xissue``. The text of the link's second ``<strong>``
        element is used as the issue's date string.

        Args:
            content: HTML of the collection page.

        Returns:
            The list of issues, in document order.

        Raises:
            ValueError: if a matching link has no string ``href`` or does not
                contain a second ``<strong>`` element.
        """
        soup = BeautifulSoup(content, "html5lib")

        # Path of the collection relative to the site root, without the
        # leading "/" and with percent-escapes decoded.
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        # The path is literal text: escape it so characters such as "." or
        # "(" are not interpreted as regex syntax.
        reg_issue = re.compile(re.escape(base_url_collection) + self.issue_href)

        xissues = []
        for issue_node in soup.select("div#collectionResults a"):
            href = issue_node.get("href")
            if not reg_issue.search(str(href)):
                continue
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue href")

            strong_nodes = issue_node.find_all("strong")
            if len(strong_nodes) < 2:
                raise ValueError("Cannot parse issue dates")
            dates = strong_nodes[1].get_text()

            issue_url = self.source_website + "/" + href
            xissues.append(self.create_hdml_xissue(issue_url, dates))

        return xissues

    def create_hdml_xissue(self, url, dates):
        """Build the issue data for one issue link.

        Args:
            url: absolute URL of the issue; its last path segment is used as
                the volume. A single trailing "/" is dropped.
            dates: year string of the issue as displayed on the collection
                page.

        Returns:
            A new issue with ``pid``, ``year``, ``volume`` and ``url`` set.
        """
        if url.endswith("/"):
            url = url[:-1]
        volume = url.rsplit("/", 1)[-1]

        # The site abbreviates this one range; store it with full years.
        year = "1985-1986" if dates == "1985-86" else dates

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume}"
        xissue.year = year
        xissue.volume = volume
        xissue.url = url
        return xissue

    def parse_issue_content(self, content, xissue):
        """Add one article stub per link of the issue page to ``xissue``.

        Each link with a non-empty ``href`` inside the element with id
        ``collectionResults`` yields an article whose ``pid`` is "a" followed
        by the position of the link and whose ``url`` is the absolute link
        target. ``xissue.articles`` is then sorted by first page; articles
        without a numeric first page sort first.

        Args:
            content: HTML of the issue page.
            xissue: issue to fill in place.

        Raises:
            ValueError: if the results container is missing.
        """
        soup = BeautifulSoup(content, "html.parser")
        results = soup.find("div", {"id": "collectionResults"})
        if not isinstance(results, Tag):
            raise ValueError("Cannot find articles")

        for position, link in enumerate(results.find_all("a")):
            href = link.get("href")
            if not href:
                continue
            xarticle = create_articledata()
            xarticle.pid = "a" + str(position)
            xarticle.url = self.source_website + "/" + href
            xissue.articles.append(xarticle)

        xissue.articles = sorted(xissue.articles, key=self._fpage_sort_key)

    @staticmethod
    def _fpage_sort_key(xarticle):
        """Return the first page of ``xarticle`` as an int, or -1 if it is not numeric."""
        try:
            return int(xarticle.fpage)
        except (TypeError, ValueError):
            return -1

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the HTML of its article page.

        Sets the title and language, the page range (best effort), the
        contributors and the PDF link.

        Args:
            content: HTML of the article page.
            xissue: issue the article belongs to (not used here).
            xarticle: article to fill in place.
            url: URL of the article page (not used here).

        Returns:
            The same ``xarticle`` object.

        Raises:
            ValueError: if no author block or no PDF link can be found.
        """
        soup = BeautifulSoup(content, "html.parser")
        xarticle.lang = "en"

        em_nodes = soup.find_all("em")
        if em_nodes:
            # TITLE
            xarticle.title_tex = em_nodes[0].get_text()
            # NOTE(review): "gr" is not the ISO 639-1 code for Greek ("el");
            # kept as is - confirm what downstream consumers expect.
            xarticle.lang = "gr"

            # PAGES
            if len(em_nodes) > self._pages_em_index:
                pages = em_nodes[self._pages_em_index].get_text()
                try:
                    self.set_pages(xarticle, pages)
                except Exception:
                    # Page metadata stays best effort: an unparsable page
                    # string must not prevent the article from being crawled.
                    pass

        # AUTHORS (several names may be listed, separated by commas)
        names = self._find_contributor_names(soup)
        if names is None:
            raise ValueError("No Contributors found")
        for name in names:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = name
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_href = None
        for link in soup.find_all("a"):
            href = link.get("href")
            # Anchors without an href would make the regex search fail.
            if isinstance(href, str) and reg_pdf.search(href):
                pdf_href = href
                break
        if pdf_href is None:
            raise ValueError("Cannot find PDF link")
        add_pdf_link_to_xarticle(xarticle, self.source_website + "/" + pdf_href)

        return xarticle

    def _find_contributor_names(self, soup):
        """Return the cleaned author names of an article page, or None if it has no author label.

        Looks for a ``<strong>`` element whose text is "Authors", then
        "Author", and splits the text of the next ``<em>`` element on commas.
        """
        for label in ("Authors", "Author"):
            label_node = soup.find("strong", string=label)
            if not label_node:
                continue
            names_node = label_node.find_next("em")
            if not names_node:
                raise ValueError("Error finding Author")
            cleaned = (
                self._clean_contributor_name(raw)
                for raw in names_node.get_text().split(",")
            )
            return [name for name in cleaned if name]
        return None

    @classmethod
    def _clean_contributor_name(cls, raw):
        """Drop a leading "by", non-breaking spaces and surrounding whitespace from a name."""
        # Only a leading "by" is removed, so names that merely contain the
        # letters "by" are left intact.
        name = cls._by_prefix.sub("", raw)
        return name.replace("\xa0", "").strip()