Coverage for src/crawler/by_source/dmlcz_crawler.py: 73%
110 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import re

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_subj

from crawler.base_crawler import BaseCollectionCrawler


class DmlczCrawler(BaseCollectionCrawler):
    source_name = "Czech Digital Mathematics Library"
    source_domain = "DMLCZ"
    source_website = "https://dml.cz"

    issue_href = r"/handle/\d+.dmlcz/\d+"
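    # Illustrative note: DML-CZ issue pages live under paths such as
    # /handle/10338.dmlcz/149887 (the example URL quoted in parse_issue_content
    # below), which is presumably what this pattern lets the base crawler match.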

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the DML-CZ collection and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata + its url.

        self.periode is set at the end based on the xissue years of the HTML page.
        """
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issue_nodes = soup.find_all("td", {"class": "volume"})

        for issue_node in issue_nodes:
            reg_year = re.compile(r"\d{4}")
            reg_volume = re.compile(r"Volume \d+")
            issue_text = issue_node.get_text()
            if re.compile(r"\d+").search(issue_text):  # coverage: condition was always true
                elem = issue_node.find("a")
                dates = reg_year.search(issue_text)
                volume = reg_volume.search(elem.get_text())
                issues = issue_node.find_next("td")
                issues = issues.find_all("a")
                if volume:  # coverage: condition was always true
                    volume = volume[0].replace("Volume ", "")
                if dates:  # coverage: condition was always true
                    search = reg_year.search(issue_text)
                    if search is not None:  # coverage: condition was always true
                        dates = search[0]
                for issue in issues:
                    link = issue.get("href")
                    number = issue.get_text()
                    xissue = self.create_dmlcz_xissue(link, volume, number, dates)
                    if xissue:  # coverage: condition was always true
                        xissues.append(xissue)

        self.periode_begin = self.get_year(xissues[0].year)
        self.periode_end = self.get_year(xissues[-1].year)
        self.periode = self.get_or_create_periode()

        return xissues

    def get_year(self, year):
        if "/" in year:  # coverage: condition was never true
            year = year.split("/")[0]

        return year

    def create_dmlcz_xissue(self, url, volume, number, dates):
        year = dates.replace("/", "-")
        number = number.replace(",", "-")

        # volume might not be an integer. eLibM puts special issue titles as volume number.
        try:
            volume_for_pid = int(volume)
        except ValueError:
            print("error parsing volume")
            # Assumed fallback: keep the raw label so volume_for_pid is defined
            # even when the volume is not an integer.
            volume_for_pid = volume

        xissue = super().create_xissue(self.source_website + url, year, volume, number)
        xissue.pid = f"{self.collection_id}_{year}__{volume_for_pid}_{number}"

        return xissue
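    # Illustrative example: with collection_id "CMJ", year "2012", volume 62 and
    # number "1", the pid built above is "CMJ_2012__62_1"; article pids such as
    # "CMJ_2012__62_1_a14" quoted further down appear to be this issue pid with
    # the positional article id appended.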

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        article_nodes = soup.find_all("td", {"class": "article"})

        # DML-CZ may list the same article multiple times (ex: https://dml.cz/handle/10338.dmlcz/149887),
        # so we skip URLs that have already been added.
        article_urls = []

        for index_article, article_node in enumerate(article_nodes):
            article_link_node = article_node.find("a")
            if article_link_node:  # coverage: condition was always true
                url = article_link_node.get("href")
                if url not in article_urls:  # coverage: condition was always true
                    article_urls.append(url)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(index_article)
                    xarticle.url = self.source_website + url

                    xissue.articles.append(xarticle)
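    # Note: the pid above reuses the enumerate index, so skipping a duplicate URL
    # leaves a gap in the "aN" numbering (e.g. a0, a2 with no a1).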

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.pid = pid
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle,
            xissue,
            soup,
            [
                "lang",
                "title",
                "author",
                "pdf",
                "abstract",
                "page",
                "mr",
                "zbl",
                "publisher",
                "keywords",
            ],
        )
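        # The shared helper above presumably reads the page's citation_* <meta> tags
        # and fills the requested fields (title, authors, PDF link, abstract, pages,
        # MR/zbMATH ids, publisher, keywords) into xarticle/xissue.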

        bloc_ref_ids = soup.find("div", {"class": "item-refids"})

        # TITLE
        title_node = soup.find("span", {"class": "item-title"})
        if title_node:  # coverage: condition was always true
            xarticle.title_tex = title_node.get_text()

        # ABSTRACT
        abstract_section_node = soup.find("dim:field")
        if abstract_section_node:  # coverage: condition was never true
            abstract = str(abstract_section_node.get_text())
            xabstract = create_abstract(
                tag="abstract",
                value_tex=abstract,
                lang=xarticle.lang,
            )
            xarticle.abstracts.append(xabstract)

        # PDF
        # link_nodes = soup.find_all("a")
        # for link_node in link_nodes:
        #     pdf_url = link_node.get("href")
        #     if pdf_url.startswith("/bitstream/"):
        #         add_pdf_link_to_xarticle(xarticle, pdf_url)

        # MSC
        reg_msc = re.compile("/browse-subject")
        subjs_nodes = [a.get_text() for a in soup.find_all("a") if reg_msc.search(a.get("href"))]
        for subj in subjs_nodes:  # coverage: loop never started
            subject = create_subj(value=subj, type="msc", lang=xarticle.lang)
            xarticle.kwds.append(subject)

        # PAGES
        pages = soup.find("span", {"class": "item-pp"})
        if pages:  # coverage: condition was always true
            pages_to = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)").search(pages.get_text())
            if pages_to:  # coverage: condition was always true
                parts = pages_to[0].split("-")
                first_page = parts[0].replace("(", "").replace(")", "")
                if len(parts) > 1:  # coverage: condition was always true
                    last_page = parts[1].replace("(", "").replace(")", "")
                    xarticle.lpage = last_page
                xarticle.fpage = first_page

        # Biblio
        # bibitems_tags = soup.select("div.references-inside div.reference")
        # bibitems = [self.parse_bibitem_tag(item) for item in bibitems_tags]
        # if len(bibitems) > 0:
        #     xarticle.abstracts.append(self.create_bibliography(bibitems))

        # DOI
        reg_doi = re.compile("dx.doi.org")
        if bloc_ref_ids and isinstance(bloc_ref_ids, Tag):  # coverage: condition was never true
            doi_node = [a for a in bloc_ref_ids.find_all("a") if reg_doi.search(a.get("href"))]
            if len(doi_node) > 0:
                doi = doi_node[0].get_text()
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                xarticle.doi = doi

                # fix wrong doi attribution for article a14 of volume 62 number 1
                # (10.1007/s10587-012-0005-x):
                if xarticle.pid in ["CMJ_2012__62_1_a14", "ZCSUT_2012__22_3_a3"]:
                    xarticle.doi = None
                else:
                    xarticle.pid = (
                        doi.replace("/", "_").replace(".", "_").replace("-", "_").replace(":", "_")
                    )

        # Hack to handle articles with no titles
        if not xarticle.title_tex:  # coverage: condition was never true
            xarticle.title_tex = " "

        return xarticle
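
As a quick sanity check of the page-range handling above, here is a minimal standalone sketch (not part of the crawler; the page strings are made up) that applies the same regular expression and mirrors the fpage/lpage extraction in parse_article_content:

import re

reg_pp = re.compile(r"(\(?\d+\)?)?-?(\(?\d+\)?)")  # same pattern as above

for text in ["123-145", "(1)-(27)", "58"]:  # made-up item-pp strings
    match = reg_pp.search(text)
    if match:
        parts = match[0].split("-")
        fpage = parts[0].replace("(", "").replace(")", "")
        lpage = parts[1].replace("(", "").replace(")", "") if len(parts) > 1 else None
        print(text, "->", fpage, lpage)  # 123/145, 1/27, 58/None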