Coverage for src/crawler/by_source/hdml_crawler.py: 81% (100 statements)
import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class HdmlCrawler(BaseCollectionCrawler):
13 source_name = "Hellenic Digital Mathematics Library"
14 source_domain = "HDML"
15 source_website = "https://hdml.di.ionio.gr"
17 pdf_href = "pdfs/journals"
18 issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
19 article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
    verify = False

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissues.
        Each xissue has its volume/number/year metadata and its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)

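        # Keep only the anchors in the collection results whose href matches the
        # collection-relative issue pattern.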
        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            href = issue_node.get("href")
            if not isinstance(href, str):  # coverage: condition never true
                raise ValueError("Cannot parse issue href")
            issue_node_link = self.source_website + "/" + href
            dates = issue_node.find_all("strong")[1].get_text()
            xissue = self.create_hdml_xissue(issue_node_link, dates)
            if xissue:  # coverage: condition always true
                xissues.append(xissue)

        return xissues

    def create_hdml_xissue(self, url, dates):
        if url.endswith("/"):  # coverage: condition always true
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates
        xissue = None

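        # Normalise the abbreviated year range used by HDML ("1985-86") to the
        # full form before it is embedded in the issue pid.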
63 if year == "1985-86":
64 year = "1985-1986"
66 xissue = create_issuedata()
67 xissue.pid = f"{self.collection_id}_{year}__{volume}"
68 xissue.year = year
69 xissue.volume = volume
70 xissue.url = url
72 return xissue
    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):  # coverage: condition never true
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:  # coverage: condition always true
                url = article_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)

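        # Order articles by first page; articles without an fpage sort first (-1).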
        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

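        # Article metadata is laid out in a sequence of <em> tags: the code below
        # assumes the first <em> holds the (Greek) title and the fifth holds the
        # page range.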
        try:
            if node_infos_em:  # coverage: condition always true
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
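                # A title was found, so switch the language to Greek ("gr" is kept
                # as-is here, although the ISO 639-1 code for Greek is "el").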
108 xarticle.lang = "gr"
110 # PAGES
111 pages = node_infos_em[4].get_text()
112 self.set_pages(xarticle, pages)
114 except Exception:
115 pass
        # AUTHORS
        # WTF: Shouldn't we handle multiple authors here?
        contribs = None
        authors = soup.find("strong", text="Authors")
        if authors:  # coverage: condition always true
            contribs_div = authors.find_next("em")
            if not contribs_div:  # coverage: condition never true
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")
        else:
            author = soup.find("strong", text="Author")
            if author:
                contribs_div = author.find_next("em")
                if not contribs_div:
                    raise ValueError("Error finding Author")
                contribs = contribs_div.get_text().split(",")

        if contribs is None:  # coverage: condition never true
            raise ValueError("No Contributors found")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_link = [a.get("href") for a in soup.find_all("a") if reg_pdf.search(a.get("href"))][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle