Coverage for src/crawler/by_source/hdml_crawler.py: 25%
98 statements
coverage.py v7.12.0, created at 2025-12-23 15:27 +0000
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor
from unidecode import unidecode

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, get_base, regex_to_dict

class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    pdf_href = "pdfs/journals"
    issue_re = r"Issue number : (?P<volume>[\d-]+)(?P<issue>[\w]+)? Issue date : (?P<year>[\d-]+)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
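    # Hedged examples (assumed sample strings, not captured from the site):
    # issue_re is meant to turn listing text such as
    # "Issue number : 12A Issue date : 1965" into volume="12", issue="A",
    # year="1965"; article_href matches item URLs shaped like
    # "en/item/Journals/<Greek journal name>/<volume>/<num>".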

    verify = False

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissues.
        Each xissue carries its volume/number/year metadata and its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        base = get_base(soup, self.collection_url)

        xissues = []

        for issue_node in soup.select("div#collectionResults a"):
            href = issue_node.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue href")
            issue_text = issue_node.get_text(" ", strip=True)
            issue_text = unidecode(issue_text)
            issue_dict = regex_to_dict(self.issue_re, issue_text)
            xissue = self.create_xissue(
                urljoin(base, href),
                issue_dict["year"],
                issue_dict["volume"],
                issue_dict.get("issue", None),
            )
            xissues.append(xissue)
        xissues.sort(key=lambda i: i.pid)
        return xissues

    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("xissue url cannot be None")

        soup = BeautifulSoup(content, "html.parser")
        base = get_base(soup, xissue.url)

        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):
            raise ValueError("Cannot find articles")
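        # Walk the article links in page order; each article gets a positional
        # pid ("a0", "a1", ...) and an absolute URL resolved against the page base.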
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:
                url = article_node.get("href")
                if not isinstance(url, str):
                    raise ValueError("Cannot parse article url")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = urljoin(base, url)

                xissue.articles.append(xarticle)
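        # Order articles by first page; articles without a first page sort first.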
        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        base = get_base(soup, url)
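        # Title and page range are read positionally from the page's <em> elements;
        # any failure (e.g. a missing <em>) is deliberately swallowed below.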
        try:
            if node_infos_em:
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                set_pages(xarticle, pages)

        except Exception:
            pass
        # AUTHORS
        contribs = None
        authors = soup.select_one("strong:-soup-contains-own('Author')")
        if authors:
            contribs_div = authors.find_next("em")
            if not contribs_div:
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")

        if contribs is None:
            raise ValueError("No Contributors found")

        if cleanup_str("".join(contribs)) != "":
            for contrib in contribs:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = contrib.replace("\xa0", "")
                author["string_name"] = author["string_name"].replace(",", "")
                author["string_name"] = cleanup_str(author["string_name"])
                if author["string_name"] == "":
                    continue
                xarticle.contributors.append(author)
        # PDF
        pdf_img = soup.select_one("img[src='images/pdf.png']")
        if not pdf_img:
            raise ValueError("Couldn't find pdf image")
        pdf_tag = pdf_img.parent
        if not pdf_tag:
            raise ValueError("Couldn't find pdf link")
        pdf_link = pdf_tag.get("href")
        if not isinstance(pdf_link, str):
            raise ValueError("Couldn't parse pdf link")
        add_pdf_link_to_xarticle(xarticle, urljoin(base, pdf_link))
        # Abstract
        abstract_header = soup.select_one("strong:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("em")
            if abstract_tag:
                xarticle.abstracts.append(create_abstract(value_tex=abstract_tag.text))
        return xarticle
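
# ---------------------------------------------------------------------------
# A minimal, hedged sketch (not part of hdml_crawler.py) of how the issue_re
# pattern above decomposes an issue heading. The sample string is an assumed
# reconstruction of the HDML listing text, not captured markup.
import re

ISSUE_RE = r"Issue number : (?P<volume>[\d-]+)(?P<issue>[\w]+)? Issue date : (?P<year>[\d-]+)"

match = re.search(ISSUE_RE, "Issue number : 12A Issue date : 1965")
if match:
    # Expected groups: volume="12", issue="A", year="1965"
    print(match.group("volume"), match.group("issue"), match.group("year"))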