Coverage for src/crawler/by_source/hdml_crawler.py: 80%
103 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_articledata, create_contributor, create_issuedata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class HdmlCrawler(BaseCollectionCrawler):
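    """Crawler for the Hellenic Digital Mathematics Library (https://hdml.di.ionio.gr)."""
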
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"
    periode_begin = 0
    periode_end = 0
    pdf_href = "pdfs/journals"
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
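    # NOTE: the \p{Greek} class in article_href below requires the third-party
    # `regex` module; the standard-library `re` does not support Unicode
    # property classes.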
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"
    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissue.
        Each xissue has its volume/number/year metadata + its url.

        self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
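        # Build the issue-link pattern relative to the site root: strip the
        # domain and the leading slash from collection_url, undo any
        # percent-encoding, then append issue_href.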
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)

        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            href = issue_node.get("href")
            if not isinstance(href, str):
                raise ValueError("Cannot parse issue href")
            issue_node_link = self.source_website + "/" + href
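            # The second <strong> inside the issue link is expected to hold
            # the year (or a dash-separated year range).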
            dates = issue_node.find_all("strong")[1].get_text()
            xissue = self.create_hdml_xissue(issue_node_link, dates)
            if xissue:
                xissues.append(xissue)

        return xissues

    def create_hdml_xissue(self, url, dates):
        if url.endswith("/"):
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates
        xissue = None
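        # Keep only the first year of a dash-separated range; with the default
        # periode_begin/periode_end of 0, every issue passes the filter below.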
        year_int = int(year.split("-")[:1][0])
        if self.periode_begin <= year_int:
            if self.periode_end == 0 or self.periode_begin <= self.periode_end:
                xissue = create_issuedata()
                xissue.pid = f"{self.collection_id}_{year}__{volume}"
                xissue.year = year
                xissue.volume = volume
                xissue.url = url

        return xissue

    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)

        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        if not isinstance(article_nodes, Tag):
            raise ValueError("Cannot find articles")
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:
                url = article_node.get("href")
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)
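        # Order articles by first page; articles without a first page sort first.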
        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )
    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")
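        # The parser assumes the first <em> holds the (Greek) title and the
        # fifth one the page range; extraction is best-effort and any failure
        # is silently ignored.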
        try:
            if node_infos_em:
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                self.set_pages(xarticle, pages)

        except Exception:
            pass
        # AUTHORS
        # Handle both the plural "Authors" and singular "Author" labels;
        # multiple names are split on commas below.
        contribs = None
        authors = soup.find("strong", text="Authors")
        if authors:
            contribs_div = authors.find_next("em")
            if not contribs_div:
                raise ValueError("Error finding Author")
            contribs = contribs_div.get_text().split(",")

        else:
            author = soup.find("strong", text="Author")
            if author:
                contribs_div = author.find_next("em")
                if not contribs_div:
                    raise ValueError("Error finding Author")
                contribs = contribs_div.get_text().split(",")

        if contribs is None:
            raise ValueError("No Contributors found")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
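        # Take the first link whose href matches pdf_href ("pdfs/journals").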
        pdf_link = [a.get("href") for a in soup.find_all("a") if reg_pdf.search(a.get("href"))][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle