Coverage for src/crawler/by_source/dmlcz_crawler.py: 80%
133 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
import re

from bs4 import BeautifulSoup
from bs4 import Tag
from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.crawler_types import CitationLiteral

from ptf.model_data import create_articledata
from ptf.model_data import create_issuedata
from ptf.model_data import create_subj


class DmlczCrawler(BaseCollectionCrawler):
    source_name = "Czech Digital Mathematics Library"
    source_domain = "DMLCZ"
    source_website = "https://dml.cz"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: creates a cols.csv that supersedes cols_eudml.csv with the entire collection catalogue.

        self.source = self.get_or_create_source()

        self.issue_href = r"/handle/\d+.dmlcz/\d+"

    def parse_collection_content(self, content):
29 """
30 Parse the HTML page of Annals of Math and returns a list of xissue.
31 Each xissue has its pid/volume/number/year metadata + its url
33 self.periode is set at the end based on the xissue years of the HTML page
34 """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issue_nodes = list(soup.find_all("td", {"class": "volume"}))

        for issue_node in issue_nodes:
            reg_year = re.compile(r"\d{4}")
            reg_volume = re.compile(r"Volume \d+")
            issue_text = issue_node.get_text()
            if re.compile(r"\d+").search(issue_text):  # coverage: condition was always true
                elem = issue_node.find("a")
                dates = reg_year.search(issue_text)
                volume = reg_volume.search(elem.get_text())
                issues = issue_node.find_next("td")
                issues = issues.find_all("a")
                if volume:  # coverage: condition was always true
                    volume = volume[0].replace("Volume ", "")
                if dates:  # coverage: condition was always true
                    search = reg_year.search(issue_text)
                    if search is not None:  # coverage: condition was always true
                        dates = search[0]
                for issue in issues:
                    link = issue.get("href")
                    number = issue.get_text()
                    xissue = self.create_xissue(link, volume, number, dates)
                    if xissue:  # coverage: condition was always true
                        xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)
        self.periode = self.get_or_create_periode()

        return xissues

    def get_year(self, year):
        if "/" in year:  # coverage: condition was never true
            year = year.split("/")[0]

        return year

    def create_xissue(self, url, volume, number, dates):
        year = dates.replace("/", "-")

        # volume might not be an integer. eLibM puts special issue titles as volume number.
        try:
            volume_for_pid = int(volume)
        except ValueError:
            print("error parsing volume")
            # Without a fallback, volume_for_pid would be undefined below; keep the raw string.
            volume_for_pid = volume

        xissue = create_issuedata()
        number = number.replace(",", "-")
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
        xissue.year = year
        xissue.volume = volume
        xissue.number = number
        xissue.url = self.source_website + url

        return xissue
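
    # Example (hypothetical values): with collection_id "CMJ",
    # create_xissue("/handle/10338.dmlcz/149887", "62", "1", "2012") would return an xissue with
    # pid "CMJ_2012__62_1" and url "https://dml.cz/handle/10338.dmlcz/149887".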

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("td", {"class": "article"})

        # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887)
        # We need to ignore the articles already crawled
        article_urls = []

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition was always true
                url = article_link_node.get("href")
                if url not in article_urls:  # coverage: condition was always true
                    article_urls.append(url)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(index_article)
                    xarticle.url = self.source_website + url

                    xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")
        bloc_ref_ids = soup.find("div", {"class": "item-refids"})
        # TITLE
        title_node = soup.find("span", {"class": "item-title"})
        if title_node:  # coverage: condition was always true
            xarticle.title_tex = title_node.get_text()

        # ABSTRACT
        abstract_section_node = soup.find("dim:field")
        if abstract_section_node:  # coverage: condition was never true
            abstract = str(abstract_section_node.get_text())
            xabstract = {
                "tag": "abstract",
                "value_html": "",
                "value_tex": abstract,
                "value_xml": "",
                "lang": "en",
            }
            xarticle.abstracts.append(xabstract)

        # PDF
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            pdf_url = link_node.get("href")
            if pdf_url.startswith("/bitstream/"):
                add_pdf_link_to_xarticle(xarticle, pdf_url)
        reg_msc = re.compile("/browse-subject")
        subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))]

        # MSC
        for subj in subjs_nodes:  # coverage: loop body never executed
            subject = create_subj()
            subject["value"] = subj
            subject["type"] = "msc"
            subject["lang"] = "en"
            xarticle.kwds.append(subject)

        # PAGES (see the standalone regex sketch after this listing)
        pages = soup.find("span", {"class": "item-pp"})
        if pages:  # coverage: condition was always true
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:  # coverage: condition was always true
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:  # coverage: condition was always true
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page

                xarticle.fpage = first_page

        # Biblio
        # bibitems_tags = soup.select("div.references-inside div.reference")
        # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
        # if len(bibitems) > 0:
        #     xarticle.abstracts.append(self.create_bibliography(bibitems))

        # DOI
        reg_doi = re.compile("dx.doi.org")

        what: list[CitationLiteral] = [
            "lang",
            "title",
            "author",
            "pdf",
            "abstract",
            "page",
            "mr",
            "zbl",
            "publisher",
            "keywords",
        ]
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, what)

        if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):  # coverage: condition was always true
            doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))]
            if len(doi_node) > 0:  # coverage: condition was always true
                doi = doi_node[0].get_text()
                pos = doi.find("10.")
                if pos > 0:  # coverage: condition was never true
                    doi = doi[pos:]
                xarticle.doi = doi

                # fix wrong doi attribution for article a14 of volume 62 number 1
                # 10.1007/s10587-012-0005-x:
                if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]:  # coverage: condition was never true
                    xarticle.doi = None
                else:
                    xarticle.pid = (
                        doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_")
                    )

        # Hack to handle articles with no titles
        if not xarticle.title_tex:  # coverage: condition was never true
            xarticle.title_tex = " "

        return xarticle
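
A minimal standalone sketch of the page-range regex used in parse_article_content above; the sample strings are hypothetical, not taken from DML-CZ pages:

import re

reg_pages = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)")

for text in ["123-145", "(1)-(20)", "5"]:
    match = reg_pages.search(text)
    if match:
        # Same post-processing as the crawler: split on "-" and strip parentheses.
        parts = match[0].split("-")
        first_page = parts[0].replace("(", "").replace(")", "")
        last_page = parts[1].replace("(", "").replace(")", "") if len(parts) > 1 else None
        print(text, "->", first_page, last_page)

# Expected output:
# 123-145 -> 123 145
# (1)-(20) -> 1 20
# 5 -> 5 None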