Coverage for src/crawler/by_source/hdml_crawler.py: 84%
109 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
import re
from urllib.parse import unquote

import regex
from bs4 import BeautifulSoup
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle

from ptf.model_data import create_articledata
from ptf.model_data import create_contributor
from ptf.model_data import create_issuedata


class HdmlCrawler(BaseCollectionCrawler):
    source_name = "Hellenic Digital Mathematics Library"
    source_domain = "HDML"
    source_website = "https://hdml.di.ionio.gr"
    periode_begin = 0
    periode_end = 0
    pdf_href = "pdfs/journals"
    issue_href = r"(?P<number>((\d+)-?)(\d+)?)"
    # \p{Greek} requires the third-party `regex` module; see the example below the listing
    article_href = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()
        # initialise the periode (publication period) for this collection
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the collection and return a list of xissues.
        Each xissue has its volume/number/year metadata and its url.

        self.periode is set during the parsing, from the <meta name="citation_year"> of the HTML page.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        # Extract the list of issues
        base_url_collection = self.collection_url.replace(self.source_website, "")
        base_url_collection = unquote(base_url_collection[1:])
        reg_issue = re.compile(base_url_collection + self.issue_href)

        issue_nodes = [
            a
            for a in soup.select("div#collectionResults a")
            if reg_issue.search(str(a.get("href")))
        ]

        for issue_node in issue_nodes:
            issue_node_link = self.source_website + "/" + issue_node.get("href")
            dates = issue_node.find_all("strong")[1].get_text()
            xissue = self.create_xissue(issue_node_link, dates)
            if xissue:  # coverage: always true on the recorded crawl
                xissues.append(xissue)

        return xissues

    def crawl_one_issue_url(self, xissue):
        xissue = super().crawl_one_issue_url(xissue)

        # Sort the articles by page numbers (articles with no page info sort first)
        xissue.articles = sorted(
            xissue.articles,
            key=lambda x: (
                int(-1 if x.fpage == "" else x.fpage),
                int(-1 if x.lpage == "" else x.lpage),
            ),
        )

        return xissue

    def create_xissue(self, url, dates):
        if url.endswith("/"):  # coverage: always true on the recorded crawl
            url = url[:-1]
        parts = url.split("/")

        volume = parts[-1]
        year = dates
        xissue = None

        # "dates" is either a single year or a range such as "1950-1951"
        year_int = int(year.split("-")[0])
        if self.periode_begin <= year_int:  # coverage: always true
            if self.periode_end == 0 or self.periode_begin <= self.periode_end:  # coverage: always true
                xissue = create_issuedata()
                xissue.pid = f"{self.collection_id}_{year}__{volume}"
                xissue.year = year
                xissue.volume = volume
                xissue.url = url

        return xissue
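
    # Worked example (hypothetical values): create_xissue("https://hdml.di.ionio.gr/en/item/Journals/Δελτίον/12",
    # "1950-1951") yields volume "12", year "1950-1951" and, for a collection_id of "XYZ",
    # the pid "XYZ_1950-1951__12".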

    def parse_issue_content(self, content, xissue):
        # xissue = self.create_xissue(url)
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find("div", {"id": "collectionResults"})
        for index_article, article_node in enumerate(article_nodes.find_all("a")):
            article_link_node = article_node.get("href")
            if article_link_node:  # coverage: always true on the recorded crawl
                url = article_link_node
                xarticle = create_articledata()
                xarticle.pid = "a" + str(index_article)
                xarticle.url = self.source_website + "/" + url

                xissue.articles.append(xarticle)

        xissue.articles = sorted(
            xissue.articles, key=lambda x: int(-1 if x.fpage == "" else x.fpage)
        )

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html.parser")
        node_infos_em = soup.find_all("em")

        try:
            if node_infos_em:  # coverage: always true on the recorded crawl
                # TITLE
                title = node_infos_em[0].get_text()
                xarticle.title_tex = title
                xarticle.lang = "gr"

                # PAGES
                pages = node_infos_em[4].get_text()
                xarticle.page_range = pages
                pages_infos = pages.split("-")

                if len(pages_infos) > 1:  # coverage: always true on the recorded crawl
                    xarticle.fpage = pages_infos[0]
                    xarticle.lpage = pages_infos[1]
                else:
                    xarticle.fpage = pages

        except Exception:
            pass

        # AUTHORS
        contribs = []  # fallback when neither an "Authors" nor an "Author" label is found
        authors = soup.find("strong", text="Authors")
        if authors is not None:  # coverage: never true on the recorded crawl
            contribs = authors.find_next("em").get_text()
            contribs = contribs.split(",")
        else:
            author = soup.find("strong", text="Author")
            if author is not None:  # coverage: always true on the recorded crawl
                contribs = author.find_next("em").get_text()
                contribs = contribs.split(",")

        for contrib in contribs:
            author = create_contributor()
            author["role"] = "author"
            author["string_name"] = contrib.replace("\xa0", "")
            author["string_name"] = author["string_name"].replace(",", "").replace("by", "")
            xarticle.contributors.append(author)

        # PDF
        reg_pdf = regex.compile(self.pdf_href)
        pdf_link = [
            a.get("href") for a in soup.find_all("a", href=True) if reg_pdf.search(a.get("href"))
        ][0]
        pdf_link = self.source_website + "/" + pdf_link
        add_pdf_link_to_xarticle(xarticle, pdf_link)

        return xarticle
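
A minimal sketch of how the article_href pattern is intended to behave. The sample path and journal name are hypothetical, not taken from hdml.di.ionio.gr, and the pattern is copied from the class attribute above; \p{Greek} is only understood by the third-party regex module, which is why the crawler imports regex alongside re.

import regex

ARTICLE_HREF = r"(?P<base>en/item/Journals)/(\p{Greek}+|s|\s)+/(?P<volume>\d+)/(?P<num>\d+)"

sample = "en/item/Journals/Δελτίον/12/3"  # hypothetical article path
match = regex.search(ARTICLE_HREF, sample)
if match:
    print(match.group("volume"), match.group("num"))  # prints: 12 3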