Coverage for src/crawler/by_source/hdml_crawler.py: 24%

97 statements  

coverage.py v7.12.0, created at 2025-12-03 10:24 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor
from unidecode import unidecode

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_base, regex_to_dict


class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    pdf_href = "pdfs/journals"
    issue_re = r"Issue number : (?P<volume>[\d-]+)(?P<issue>[\w]+)? Issue date : (?P<year>[\d-]+)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
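    # Note: \p{Greek} is a Unicode script property; Python's stdlib re module does not
    # support \p{...} classes, so matching article_href presumably goes through the
    # third-party regex package (assumption; the matching happens elsewhere in the crawler).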


    verify = False
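    # Assumption: verify = False is read by BaseCollectionCrawler and presumably turns off
    # TLS certificate verification when fetching pages from source_website.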


    def parse_collection_content(self, content):
        """
        Parse the HTML page of the HDML collection and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """

        soup = BeautifulSoup(content, "html5lib")
        base = get_base(soup, self.collection_url)

        xissues = []

        for issue_node in soup.select("div#collectionResults a"):
            href = issue_node.get("href")
            if not isinstance(href, str):  # coverage: the raise below was never reached in the recorded run
                raise ValueError("Cannot parse issue href")
            issue_text = issue_node.get_text(" ", strip=True)
            issue_text = unidecode(issue_text)
            issue_dict = regex_to_dict(self.issue_re, issue_text)
            xissue = self.create_xissue(
                urljoin(base, href),
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("issue", None),
            )
            xissues.append(xissue)
        xissues.sort(key=lambda i: i.pid)
        return xissues


    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("xissue url cannot be None")

        soup = BeautifulSoup(content, "html.parser")
        base = get_base(soup, xissue.url)

        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            url = article_node.get("href")
            if url:
                if not isinstance(url, str):
                    raise ValueError("Cannot parse article url")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = urljoin(base, url)

                xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )
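        # The key above maps an empty fpage to -1, so articles without a start page sort first.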


    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the article page with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")
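        # Assumption based on the code below: the article page lists its metadata as a
        # fixed sequence of <em> tags, the first holding the (Greek) title and the fifth
        # the page range.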


        base = get_base(soup, url)

        try:
            if node_infos_em:
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                self.set_pages(xarticle, pages)

        except Exception:
            pass

        # AUTHORS
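        # ':-soup-contains-own' is a soupsieve pseudo-class that matches tags whose own
        # text (excluding descendants' text) contains the given string.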

        contribs = None
        authors = soup.select_one("strong:-soup-contains-own('Author')")
        if authors:
            contribs_div = authors.find_next("em")
            if not contribs_div:
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")

        if contribs is None:
            raise ValueError("No Contributors found")

        if cleanup_str("".join(contribs)) != "":
            for contrib in contribs:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = contrib.replace("\xa0", "")
                author["string_name"] = author["string_name"].replace(",", "")
                author["string_name"] = cleanup_str(author["string_name"])
                if author["string_name"] == "":
                    continue
                xarticle.contributors.append(author)

        # PDF
        pdf_img = soup.select_one("img[src='images/pdf.png']")
        if not pdf_img:
            raise ValueError("Couldn't find pdf image")
        pdf_tag = pdf_img.parent
        if not pdf_tag:
            raise ValueError("Couldn't find pdf link")
        pdf_link = pdf_tag.get("href")
        if not isinstance(pdf_link, str):
            raise ValueError("Couldn't parse pdf link")
        add_pdf_link_to_xarticle(xarticle, urljoin(base, pdf_link))


        # Abstract
        abstract_header = soup.select_one("strong:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("em")
            if abstract_tag:
                xarticle.abstracts.append(create_abstract(value_tex=abstract_tag.text))
        return xarticle
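
A minimal standalone sketch of how the two patterns defined at the top of the class are meant to be applied. It assumes regex_to_dict is a thin wrapper around regex.search that returns the match's groupdict(); the sample strings below are hypothetical, not taken from the HDML site.

import regex

issue_re = r"Issue number : (?P<volume>[\d-]+)(?P<issue>[\w]+)? Issue date : (?P<year>[\d-]+)"
article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

# Hypothetical issue label as it might appear in the collection listing.
issue_text = "Issue number : 3A Issue date : 1952"
match = regex.search(issue_re, issue_text)
if match:
    print(match.groupdict())  # {'volume': '3', 'issue': 'A', 'year': '1952'}

# Hypothetical article URL path; \p{Greek} matches the Greek journal-name segment.
article_url = "en/item/Journals/Ανάλυση/5/12"
match = regex.search(article_href, article_url)
if match:
    print(match.group("volume"), match.group("num"))  # 5 12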