Coverage for src / crawler / by_source / episciences_crawler.py: 68%
141 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1import json
2import math
3from urllib.parse import urljoin
5from ptf.model_data import (
6 create_abstract,
7 create_articledata,
8 create_contributor,
9 create_issuedata,
10 create_subj,
11)
12from pylatexenc.latex2text import LatexNodes2Text
14from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler
15from crawler.crawler_utils import get_issue_pid
16from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
18# TODO : Episciences provides a JATS api (currently unused in Geodesic)
19# https://arima.episciences.org/1978/zbjats
22# We could improve our data further by augmenting the articles using arxiv
23# (references)
class EpisciencesCrawler(ThreadedCrawler):
    """Crawler for collections served through the Episciences JSON API.

    The collection URL is expected to match ``episciences_id_re``; the journal
    code (``rvcode``) captured from it is reused to query individual volumes.
    """

    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    # Captures the journal code (``rvcode``) from the collection URL.
    episciences_id_re = r"https://api\.episciences\.org/api/volumes\?rvcode=(?P<episciences_id>\w+)&pagination=false"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        """Set up the LaTeX converter and extract the journal code from the collection URL."""
        super().__init__(*args, **kwargs)
        # Math is kept verbatim so formulas are not flattened to plain text.
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
        url_parts = regex_to_dict(self.episciences_id_re, self.collection_url)
        self.episciences_id = url_parts["episciences_id"]

    def parse_collection_content(self, content):
        """Build the list of issues from the JSON volume listing of the collection.

        Issues sharing the same ``volume`` and spanning several years get their
        ``year`` rewritten as ``"<first>-<last>"``.

        Args:
            content: JSON text of the volume listing.

        Returns:
            The list of prefetched issues, in listing order.
        """
        volumes = json.loads(content)

        # Would a dedicated class be preferable ? Or maybe a smarter crawler overall
        if self.collection_id == "DMTCS":
            xissues = [self.prefetch_dmcts_issue(volume) for volume in volumes]
        else:
            xissues = []
            for volume in volumes:
                url = f"https://api.episciences.org/api/volumes/{volume['vid']}?rvcode={self.episciences_id}&pagination=false"
                issue_content = json.loads(self.download_file(url))
                xissues.append(self.prefetch_episciences_issue(issue_content))

        # NOTE(review): issues whose title could not be parsed may all share the
        # same empty volume and therefore be grouped together here - confirm
        # that this is intended.
        issues_by_volume = {}
        for xissue in xissues:
            issues_by_volume.setdefault(xissue.volume, []).append(xissue)

        for volume_issues in issues_by_volume.values():
            try:
                years = [int(xissue.year) for xissue in volume_issues]
            except ValueError:
                # At least one year is not numeric: leave this volume untouched
                # instead of reusing the years computed for a previous volume.
                continue
            first_year = min(years)
            last_year = max(years)
            if first_year != last_year:
                for xissue in volume_issues:
                    xissue.year = f"{first_year}-{last_year}"

        return xissues

    def _fetch_mid_article_year(self, issue: dict):
        """Download the middle paper of ``issue`` and return its publication year.

        Raises:
            IndexError: if ``issue["papers"]`` is empty.
        """
        papers = issue["papers"]
        mid_art = papers[len(papers) // 2]
        mid_art_url = urljoin(self.collection_url, mid_art["@id"])
        mid_art_data = json.loads(self.download_file(mid_art_url))
        return mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

    def _resolve_issue_year(self, issue: dict, year_keys=("vol_year",)):
        """Return the year of ``issue``.

        The first key of ``year_keys`` holding a non-null value wins (converted
        to ``str``); otherwise the year is read from the middle paper.
        """
        for key in year_keys:
            value = issue.get(key)
            if value is not None:
                return str(value)
        return self._fetch_mid_article_year(issue)

    def _add_placeholder_articles(self, xissue, issue: dict):
        """Append one article stub (url + positional pid) per paper of ``issue``."""
        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def prefetch_dmcts_issue(self, issue: dict):
        """Create the issue (with article stubs) for one DMTCS volume entry.

        Titles containing "DMTCS Proceedings" are treated as special issues
        identified by their ``vid``; other titles are parsed into
        volume / number / optional title.
        """
        year = self._resolve_issue_year(issue, ("vol_year",))
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = get_issue_pid(self.collection_id, year, "special_" + str(issue["vid"]))
            # HACK : handle this elsewhere ? transform title_tex into title_xml
            # Is title_xml here even valid ?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title
        else:
            # Known title shapes:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number")
            )
            parsed_title = title_dict["title"]
            if parsed_title is not None:
                # HACK : handle this elsewhere ? transform title_tex into title_xml
                # Is title_xml here even valid ?
                xissue.title_tex = parsed_title
                xissue.title_html = parsed_title
                xissue.title_xml = parsed_title

        self._add_placeholder_articles(xissue, issue)
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """Create the issue (with article stubs) for one Episciences volume.

        The year is taken from ``vol_year`` or ``year`` when one of them is set.
        Otherwise it is parsed from the publication date of the article in the
        middle of the issue.
        """
        year = self._resolve_issue_year(issue, ("vol_year", "year"))

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)(?:, Issue (?P<issue>\d+))?(?:, (?P<title>[\w ]+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
        except ValueError:
            # Title could not be parsed: keep it whole as the issue title.
            xissue.title_tex = issue_title
        else:
            xissue.volume = title_dict["number"]
            if title_dict["issue"]:
                xissue.number = title_dict["issue"]
            if title_dict["title"]:
                xissue.title_tex = title_dict["title"]

        if "fr" in issue["titles"]:
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=issue["titles"]["fr"],
                    lang="fr",
                )
            )

        xissue.pid = get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        self._add_placeholder_articles(xissue, issue)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the JSON description of an article.

        Sets the PDF link, language, title, authors, first abstract, MSC 2020
        codes, DOI and URL.

        Raises:
            NotImplementedError: if a contributor is not an author.

        Returns:
            The populated ``xarticle``.
        """
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]
        current = data["document"]["database"]["current"]

        add_pdf_link_to_xarticle(xarticle, current["files"]["link"])
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])

        # "person_name" is either a list of contributors or a single contributor.
        contributors = journal_data["contributors"]["person_name"]
        if not isinstance(contributors, list):
            contributors = [contributors]
        for contrib in contributors:
            if contrib["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contrib["given_name"],
                    last_name=contrib["surname"],
                    role="author",
                )
            )

        if "abstract" in journal_data:
            xabstract = create_abstract(value_tex="")

            abstract = journal_data["abstract"]["value"]
            if isinstance(abstract, list):
                # Several abstracts may be provided: only the first one is kept.
                abstract = abstract[0]
            if isinstance(abstract, dict):
                if "@xml:lang" in abstract:
                    xabstract["lang"] = abstract["@xml:lang"]
                abstract = abstract["value"]

            xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)

            xarticle.abstracts.append(xabstract)

        if "msc2020" in current["classifications"]:
            for msc in current["classifications"]["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = current["url"]
        return xarticle