Coverage for src/crawler/by_source/dmlcz_crawler.py: 71%
106 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import re
3from bs4 import BeautifulSoup, Tag
4from ptf.model_data import create_abstract, create_articledata, create_subj
6from crawler.base_crawler import BaseCollectionCrawler
9class DmlczCrawler(BaseCollectionCrawler):
10 source_name = "Czech Digital Mathematics Library"
11 source_domain = "DMLCZ"
12 source_website = "https://dml.cz"
14 issue_href = r"/handle/\d+.dmlcz/\d+"
16 def parse_collection_content(self, content):
17 """
18 Parse the HTML page of Annals of Math and returns a list of xissue.
19 Each xissue has its pid/volume/number/year metadata + its url
20 """
21 soup = BeautifulSoup(content, "html.parser")
22 xissues = []
24 issue_nodes = [volume for volume in soup.find_all("td", {"class": "volume"})]
26 for issue_node in issue_nodes:
27 reg_year = re.compile(r"\d{4}")
28 reg_volume = re.compile(r"Volume \d+")
29 issue_text = issue_node.get_text()
30 if re.compile(r"\d+").search(issue_text): 30 ↛ 26line 30 didn't jump to line 26 because the condition on line 30 was always true
31 elem = issue_node.find("a")
32 dates = reg_year.search(issue_text)
33 volume = reg_volume.search(elem.get_text())
34 issues = issue_node.findNext("td")
35 issues = issues.findAll("a")
36 if volume: 36 ↛ 38line 36 didn't jump to line 38 because the condition on line 36 was always true
37 volume = volume[0].replace("Volume ", "")
38 if dates: 38 ↛ 42line 38 didn't jump to line 42 because the condition on line 38 was always true
39 search = reg_year.search(issue_text)
40 if search is not None: 40 ↛ 42line 40 didn't jump to line 42 because the condition on line 40 was always true
41 dates = search[0]
42 for issue in issues:
43 link = issue.get("href")
44 number = issue.get_text()
45 xissue = self.create_dmlcz_xissue(link, volume, number, dates)
46 if xissue: 46 ↛ 42line 46 didn't jump to line 42 because the condition on line 46 was always true
47 xissues.append(xissue)
49 return xissues
51 def get_year(self, year):
52 if "/" in year:
53 year = year.split("/")[0]
55 return year
57 def create_dmlcz_xissue(self, url, volume, number, dates):
58 year = dates.replace("/", "-")
59 number = number.replace(",", "-")
61 # volume might not be an integer. eLibM puts special issue titles as volume number.
63 try:
64 volume_for_pid = int(volume)
65 except ValueError:
66 print("error parsing volume")
68 xissue = super().create_xissue(self.source_website + url, year, volume, number)
69 xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"
71 return xissue
73 def parse_issue_content(self, content, xissue):
74 soup = BeautifulSoup(content, "html.parser")
75 article_nodes = soup.find_all("td", {"class": "article"})
77 # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887)
78 # We need to ignore the articles already crawled
79 article_urls = []
81 for index_article, article_node in enumerate(article_nodes):
82 article_link_node = article_node.find("a")
83 if article_link_node: 83 ↛ 81line 83 didn't jump to line 81 because the condition on line 83 was always true
84 url = article_link_node.get("href")
85 if url not in article_urls: 85 ↛ 81line 85 didn't jump to line 81 because the condition on line 85 was always true
86 article_urls.append(url)
88 xarticle = create_articledata()
89 xarticle.pid = "a" + str(index_article)
90 xarticle.url = self.source_website + url
92 xissue.articles.append(xarticle)
94 def parse_article_content(self, content, xissue, xarticle, url):
95 """
96 Parse the content with Beautifulsoup and returns an ArticleData
97 """
98 soup = BeautifulSoup(content, "html.parser")
99 self.get_metadata_using_citation_meta(
100 xarticle,
101 xissue,
102 soup,
103 [
104 "lang",
105 "title",
106 "author",
107 "pdf",
108 "abstract",
109 "page",
110 "mr",
111 "zbl",
112 "publisher",
113 "keywords",
114 ],
115 )
117 bloc_ref_ids = soup.find("div", {"class": "item-refids"})
118 # TITLE
119 title_node = soup.find("span", {"class": "item-title"})
120 if title_node: 120 ↛ 124line 120 didn't jump to line 124 because the condition on line 120 was always true
121 xarticle.title_tex = title_node.get_text()
123 # ABSTRACT
124 abstract_section_node = soup.find("dim:field")
125 if abstract_section_node: 125 ↛ 126line 125 didn't jump to line 126 because the condition on line 125 was never true
126 abstract = str(abstract_section_node.get_text())
127 xabstract = create_abstract(
128 tag="abstract",
129 value_tex=abstract,
130 lang=xarticle.lang,
131 )
132 xarticle.abstracts.append(xabstract)
134 # PDF
135 # link_nodes = soup.find_all("a")
136 # for link_node in link_nodes:
137 # pdf_url = link_node.get("href")
138 # if pdf_url.startswith("/bitstream/"):
139 # add_pdf_link_to_xarticle(xarticle, pdf_url)
140 reg_msc = re.compile("/browse-subject")
141 subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))]
143 # MSC
144 for subj in subjs_nodes: 144 ↛ 145line 144 didn't jump to line 145 because the loop on line 144 never started
145 subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
146 xarticle.kwds.append(subject)
148 # PAGES
149 pages = soup.find("span", {"class": "item-pp"})
150 if pages: 150 ↛ 168line 150 didn't jump to line 168 because the condition on line 150 was always true
151 pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
152 if pages_to: 152 ↛ 168line 152 didn't jump to line 168 because the condition on line 152 was always true
153 parts = pages_to[0].split("-")
154 first_page = parts[0].replace("(", "").replace(")", "")
155 if len(parts) > 1: 155 ↛ 159line 155 didn't jump to line 159 because the condition on line 155 was always true
156 last_page = parts[1].replace("(", "").replace(")", "")
157 xarticle.lpage = last_page
159 xarticle.fpage = first_page
161 # Biblio
162 # bibitems_tags = soup.select("div.references-inside div.reference")
163 # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
164 # if len(bibitems) > 0:
165 # xarticle.abstracts.append(self.create_bibliography(bibitems))
167 # DOI
168 reg_doi = re.compile("dx.doi.org")
170 if bloc_ref_ids and isinstance(bloc_ref_ids, Tag): 170 ↛ 171line 170 didn't jump to line 171 because the condition on line 170 was never true
171 doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))]
172 if len(doi_node) > 0:
173 doi = doi_node[0].get_text()
174 pos = doi.find("10.")
175 if pos > 0:
176 doi = doi[pos:]
177 xarticle.doi = doi
179 # fix wrong doi attribution for article a14 of volume 62 number 1
180 # 10.1007/s10587-012-0005-x:
181 if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]:
182 xarticle.doi = None
183 else:
184 xarticle.pid = (
185 doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_")
186 )
188 # Hack to handle articles with no titles
189 if not xarticle.title_tex: 189 ↛ 190line 189 didn't jump to line 190 because the condition on line 189 was never true
190 xarticle.title_tex = " "
192 return xarticle