Coverage for src/crawler/by_source/dmlcz_crawler.py: 80%
133 statements
coverage.py v7.12.0, created at 2025-12-03 10:24 +0000
1 import re
3 from bs4 import BeautifulSoup, Tag
4 from ptf.cmds.xml.xml_utils import escape
5 from ptf.model_data import create_abstract, create_articledata, create_subj
7 from crawler.base_crawler import BaseCollectionCrawler
8 from crawler.cmds.mixed_citation import ExtLinkXml, GenericRefElement, MixedCitation
11 class DmlczCrawler(BaseCollectionCrawler):
12 source_name = "Czech Digital Mathematics Library"
13 source_domain = "DMLCZ"
14 source_website = "https://dml.cz"
16 issue_href = r"/handle/\d+.dmlcz/\d+"
18 def parse_collection_content(self, content):
19 """
20 Parse the HTML page of the DML-CZ collection and return a list of xissues.
21 Each xissue has its pid/volume/number/year metadata + its url
22 """
23 soup = BeautifulSoup(content, "html.parser")
24 xissues = []
26 issue_nodes = soup.find_all("td", {"class": "volume"})
28 for issue_node in issue_nodes:
29 reg_year = re.compile(r"\d{4}")
30 reg_volume = re.compile(r"Volume \d+")
31 issue_text = issue_node.get_text()
32 if re.compile(r"\d+").search(issue_text):  # line 32 didn't jump to line 28 because the condition on line 32 was always true
33 elem = issue_node.find("a")
34 dates = reg_year.search(issue_text)
35 volume = reg_volume.search(elem.get_text())
36 issues = issue_node.find_next("td")
37 issues = issues.find_all("a")
38 if volume:  # line 38 didn't jump to line 40 because the condition on line 38 was always true
39 volume = volume[0].replace("Volume ", "")
40 if dates:  # line 40 didn't jump to line 44 because the condition on line 40 was always true
41 search = reg_year.search(issue_text)
42 if search is not None:  # line 42 didn't jump to line 44 because the condition on line 42 was always true
43 dates = search[0]
44 for issue in issues:
45 link = issue.get("href")
46 number = issue.get_text()
47 xissue = self.create_dmlcz_xissue(link, volume, number, dates)
48 if xissue:  # line 48 didn't jump to line 44 because the condition on line 48 was always true
49 xissues.append(xissue)
51 return xissues
53 def get_year(self, year):
54 if "/" in year:
55 year = year.split("/")[0]
57 return year
59 def create_dmlcz_xissue(self, url, volume_str: str, number, dates):
60 year = dates.replace("/", "-")
61 number = number.replace(",", "-")
63 volume = volume_str
64 if not volume_str.isnumeric():  # line 64 didn't jump to line 65 because the condition on line 64 was never true
65 volume = None
66 self.logger.debug("Couldn't parse volume string", extra={"url": url})
67 else:
68 volume = str(int(volume_str))
70 xissue = super().create_xissue(self.source_website + url, year, volume, number)
72 return xissue
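# Illustrative call (argument values assumed, not taken from a real issue):
#   create_dmlcz_xissue("/handle/10338.dmlcz/149887", "62", "1,2", "2012")
# would register the issue at https://dml.cz/handle/10338.dmlcz/149887 with
# year "2012", volume "62" and number "1-2".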
74 def parse_issue_content(self, content, xissue):
75 soup = BeautifulSoup(content, "html.parser")
76 article_nodes = soup.find_all("td", {"class": "article"})
78 # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887)
79 # so we skip article URLs that have already been added for this issue
80 article_urls = []
82 for index_article, article_node in enumerate(article_nodes):
83 article_link_node = article_node.find("a")
84 if article_link_node:  # line 84 didn't jump to line 82 because the condition on line 84 was always true
85 url = article_link_node.get("href")
86 if url not in article_urls:  # line 86 didn't jump to line 82 because the condition on line 86 was always true
87 article_urls.append(url)
89 xarticle = create_articledata()
90 xarticle.pid = "a" + str(index_article)
91 xarticle.url = self.source_website + url
93 xissue.articles.append(xarticle)
95 def parse_article_content(self, content, xissue, xarticle, url):
96 """
97 Parse the article page with BeautifulSoup and return an ArticleData
98 """
99 soup = BeautifulSoup(content, "html.parser")
100 self.get_metadata_using_citation_meta(
101 xarticle,
102 xissue,
103 soup,
104 [
105 "lang",
106 "title",
107 "author",
108 "pdf",
109 "abstract",
110 "page",
111 "mr",
112 "zbl",
113 "publisher",
114 "keywords",
115 ],
116 )
118 bloc_ref_ids = soup.find("div", {"class": "item-refids"})
119 # TITLE
120 title_node = soup.find("span", {"class": "item-title"})
121 if title_node:  # line 121 didn't jump to line 125 because the condition on line 121 was always true
122 xarticle.title_tex = title_node.get_text()
124 # ABSTRACT
125 abstract_section_node = soup.find("dim:field")
126 if abstract_section_node:
127 abstract = str(abstract_section_node.get_text())
129 xarticle.abstracts.append(
130 create_abstract(
131 value_tex=abstract,
132 lang=xarticle.lang,
133 )
134 )
136 # PDF
137 # link_nodes = soup.find_all("a")
138 # for link_node in link_nodes:
139 # pdf_url = link_node.get("href")
140 # if pdf_url.startswith("/bitstream/"):
141 # add_pdf_link_to_xarticle(xarticle, pdf_url)
142 reg_msc = re.compile("/browse-subject")
143 subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))]
145 # MSC
146 for subj in subjs_nodes:
147 subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
148 xarticle.kwds.append(subject)
150 # PAGES
151 pages = soup.find("span", {"class": "item-pp"})
152 if pages:  # line 152 didn't jump to line 164 because the condition on line 152 was always true
153 pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
154 if pages_to:  # line 154 didn't jump to line 164 because the condition on line 154 was always true
155 parts = pages_to[0].split("-")
156 first_page = parts[0].replace("(", "").replace(")", "")
157 if len(parts) > 1:  # line 157 didn't jump to line 161 because the condition on line 157 was always true
158 last_page = parts[1].replace("(", "").replace(")", "")
159 xarticle.lpage = last_page
161 xarticle.fpage = first_page
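# With an item-pp span such as "475-491" the regex above matches "475-491",
# so fpage/lpage become "475"/"491"; a lone value like "(27)" only sets fpage.
# (Values are illustrative, not taken from a specific article.)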
163 # Biblio
164 bibitems_tags = soup.select("div.references-inside div.reference")
165 bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
166 xarticle.bibitems = bibitems
168 # DOI
169 reg_doi = re.compile("dx.doi.org")
171 if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):  # line 171 didn't jump to line 190 because the condition on line 171 was always true
172 doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))]
173 if len(doi_node) > 0:  # line 173 didn't jump to line 174 because the condition on line 173 was never true
174 doi = doi_node[0].get_text()
175 pos = doi.find("10.")
176 if pos > 0:
177 doi = doi[pos:]
178 xarticle.doi = doi
180 # fix wrong doi attribution for article a14 of volume 62 number 1
181 # 10.1007/s10587-012-0005-x:
182 if xarticle.pid in ["CMJ_2012_62_1_a14", "ZCSUT_2012_22_3_a3"]:
183 xarticle.doi = None
184 else:
185 xarticle.pid = (
186 doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_")
187 )
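# E.g. the DOI 10.1007/s10587-012-0005-x mentioned above would become the pid
# "10_1007_s10587_012_0005_x" ("/", ".", "-" and ":" all mapped to "_").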
189 # Hack to handle articles with no titles
190 if not xarticle.title_tex:  # line 190 didn't jump to line 191 because the condition on line 190 was never true
191 xarticle.title_tex = " "
193 return xarticle
195 def parse_bibitem_tag(self, tag: Tag):
196 citation_builder = MixedCitation()
197 for child in tag.children:
198 if isinstance(child, str):
199 if child.strip() == "|":
200 continue
201 citation_builder.elements.append(child)
202 continue
203 if isinstance(child, Tag):  # line 203 didn't jump to line 197 because the condition on line 203 was always true
204 if child.name == "b":
205 el = GenericRefElement()
206 el.name = "article-title"
207 el.elements.append(child.text)
208 citation_builder.elements.append(el)
209 continue
210 if child.name == "a":  # line 210 didn't jump to line 197 because the condition on line 210 was always true
211 href = child.get("href")
212 if not isinstance(href, str):  # line 212 didn't jump to line 213 because the condition on line 212 was never true
213 continue
214 el = ExtLinkXml(escape(href), escape(child.text))
215 citation_builder.elements.append(el)
216 continue
218 return citation_builder.get_jats_ref()
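# Sketch of the reference markup this conversion assumes (illustrative only):
#   <div class="reference">[1] <b>Some article title</b> | <a href="https://doi.org/...">DOI</a></div>
# "|" separators are dropped, other plain strings are kept as citation text,
# <b> is emitted as a JATS <article-title> and <a> as an <ext-link>.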