Coverage for src/crawler/by_source/hdml_crawler.py: 81%
99 statements
import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"

    pdf_href = "pdfs/journals"
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
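    # Note: article_href uses \p{Greek}, which the standard-library re module
    # does not support, so it has to be matched with the third-party regex
    # module imported above. A matching href would look roughly like
    # en/item/Journals/<Greek journal name>/<volume>/<article number>
    # (illustrative shape inferred from the pattern, not a verified HDML URL).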

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)
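        # reg_issue matches the collection path (made relative and URL-decoded)
        # followed by an issue number such as "5" or a range such as "5-6"
        # (example values are illustrative, derived from issue_href).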

        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            href = issue_node.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue href")
            issue_node_link = self.source_website + "/" + href
            dates = issue_node.find_all("strong")[1].get_text()
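            # find_all("strong")[1]: the second <strong> inside the issue entry
            # is used as the issue's year(s) (an assumption about the HDML page
            # layout).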
            xissue = self.create_hdml_xissue(issue_node_link, dates)
            if xissue:
                xissues.append(xissue)

        return xissues

    def create_hdml_xissue(self, url, dates):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates

        if year == "1985-86":
            year = "1985-1986"

        xissue = create_issuedata()
        xissue.pid = f"{self.collection_id}_{year}__{volume}"
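        # Example pid (illustrative values only): collection_id "GDML", year
        # "1950" and volume "12" would yield "GDML_1950__12".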
        xissue.year = year
        xissue.volume = volume
        xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:
                url = article_link_node
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )
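        # At this point fpage has normally not been filled in yet (set_pages is
        # only called later, in parse_article_content), so every sort key is -1
        # and the original order is preserved; the int(x.fpage) branch only
        # applies when fpage is already populated.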

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        try:
            if node_infos_em:
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                self.set_pages(xarticle, pages)

        except Exception:
            pass
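        # The block above relies on the position of the <em> elements on the
        # page (first <em> = title, fifth <em> = page range); the bare
        # try/except lets the crawler continue when a page deviates from that
        # layout.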

        # AUTHORS
        # TODO: shouldn't we handle multiple authors here?
        contribs = None
        authors = soup.find("strong", text="Authors")
        if authors:
            contribs_div = authors.find_next("em")
            if not contribs_div:
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")
        else:
            author = soup.find("strong", text="Author")
            if author:
                contribs_div = author.find_next("em")
                if not contribs_div:
                    raise ValueError("Error finding Author")
                contribs = contribs_div.get_text().split(",")

        if contribs is None:
            raise ValueError("No Contributors found")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
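            # Caveat: replace("by", "") removes every occurrence of "by" from
            # the name string, not just a leading "by " marker.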
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_link = [
            a.get("href")
            for a in soup.find_all("a", href=True)
            if reg_pdf.search(a.get("href"))
        ][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle
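

# The sketch below is not part of the crawler: it is a minimal, self-contained
# check of the two href patterns defined on HdmlCrawler, run only when the
# module is executed directly. The sample hrefs are made-up values shaped after
# the patterns, not real HDML URLs.
if __name__ == "__main__":
    sample_issue_href = "en/item/Journals/Παράδειγμα/12"  # hypothetical issue link
    sample_article_href = "en/item/Journals/Παράδειγμα/12/3"  # hypothetical article link

    # issue_href only requires a digit group (optionally a range like "5-6"),
    # while article_href also requires the Greek journal-name segment, which is
    # why the third-party regex module (supporting \p{Greek}) is used.
    print(bool(regex.search(HdmlCrawler.issue_href, sample_issue_href)))  # True
    print(bool(regex.search(HdmlCrawler.article_href, sample_article_href)))  # True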