Coverage for src/crawler/by_source/hdml_crawler.py: 25%

98 statements  

coverage.py v7.12.0, created at 2025-12-23 15:27 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor
from unidecode import unidecode

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_base, regex_to_dict

class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    pdf_href = "pdfs/journals"
    issue_re = r"Issue number : (?P<volume>[\d-]+)(?P<issue>[\w]+)? Issue date : (?P<year>[\d-]+)"
    # \p{Greek} requires a Unicode-property-aware engine such as the
    # third-party `regex` package; the stdlib `re` module rejects it.
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

    verify = False
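    # Illustrative example of what issue_re should yield via regex_to_dict
    # (the heading text is hypothetical, not taken from the site):
    #   "Issue number : 12A Issue date : 1964"
    #   -> volume="12", issue="A", year="1964"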

    def parse_collection_content(self, content):
        """
        Parse the collection's HTML page and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        base = get_base(soup, self.collection_url)

        xissues = []

        for issue_node in soup.select("div#collectionResults a"):
            href = issue_node.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue href")
            issue_text = issue_node.get_text(" ", strip=True)
            issue_text = unidecode(issue_text)
            issue_data = regex_to_dict(self.issue_re, issue_text)
            xissue = self.create_xissue(
                urljoin(base, href),
                issue_data["year"],
                issue_data["volume"],
                issue_data.get("issue"),
            )
            xissues.append(xissue)
        xissues.sort(key=lambda i: i.pid)
        return xissues

    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("xissue url cannot be None")

        soup = BeautifulSoup(content, "html.parser")
        base = get_base(soup, xissue.url)

        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            url = article_node.get("href")
            if not url:
                continue
            if not isinstance(url, str):
                raise ValueError("Cannot parse article url")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = urljoin(base, url)

            xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: -1 if x.fpage == "" else int(x.fpage)
        )
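    # Note on the sort above: fpage appears to be filled in only later, by
    # parse_article_content via set_pages, so at this stage every article
    # typically still has an empty fpage and sorts on the -1 sentinel.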

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        base = get_base(soup, url)

        try:
            if node_infos_em:
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                set_pages(xarticle, pages)

        except Exception:
            # If the page does not expose all five <em> fields, keep
            # whatever was parsed before the failure.
            pass

        # AUTHORS
        contribs = None
        authors = soup.select_one("strong:-soup-contains-own('Author')")
        if authors:
            contribs_div = authors.find_next("em")
            if not contribs_div:
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")

        if contribs is None:
            raise ValueError("No Contributors found")

        if cleanup_str("".join(contribs)) != "":
            for contrib in contribs:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = contrib.replace("\xa0", "")
                author["string_name"] = author["string_name"].replace(",", "")
                author["string_name"] = cleanup_str(author["string_name"])
                if author["string_name"] == "":
                    continue
                xarticle.contributors.append(author)

        # PDF
        pdf_img = soup.select_one("img[src='images/pdf.png']")
        if not pdf_img:
            raise ValueError("Couldn't find pdf image")
        pdf_tag = pdf_img.parent
        if not pdf_tag:
            raise ValueError("Couldn't find pdf link")
        pdf_link = pdf_tag.get("href")
        if not isinstance(pdf_link, str):
            raise ValueError("Couldn't parse pdf link")
        add_pdf_link_to_xarticle(xarticle, urljoin(base, pdf_link))

        # ABSTRACT
        abstract_header = soup.select_one("strong:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("em")
            if abstract_tag:
                xarticle.abstracts.append(create_abstract(value_tex=abstract_tag.text))
        return xarticle
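For reference, a minimal standalone sketch of the author-string handling in parse_article_content, using only the standard library. split_authors below is a hypothetical stand-in, not part of the crawler, and it normalizes non-breaking spaces to regular spaces rather than removing them outright as the crawler does; the input string is illustrative.

import re


def split_authors(raw: str) -> list[str]:
    # Mirror parse_article_content: split on commas, normalize non-breaking
    # spaces, collapse runs of whitespace, and skip empty fragments.
    names = []
    for part in raw.split(","):
        name = part.replace("\xa0", " ")
        name = re.sub(r"\s+", " ", name).strip()
        if name:
            names.append(name)
    return names


print(split_authors("First\xa0Author ,  , Second Author"))
# ['First Author', 'Second Author']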