Coverage for src/crawler/by_source/episciences_crawler.py: 72%
126 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1import json
2import math
3from urllib.parse import urljoin
5from ptf.model_data import (
6 create_abstract,
7 create_articledata,
8 create_contributor,
9 create_issuedata,
10 create_subj,
11)
12from pylatexenc.latex2text import LatexNodes2Text
14from crawler.base_crawler import BaseCollectionCrawler
15from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on the Episciences platform."""

    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    # Request JSON-LD payloads from the Episciences API.
    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # "verbatim" keeps TeX math untouched when converting LaTeX to text.
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
32 def parse_collection_content(self, content):
33 data = json.loads(content)
34 xissues = []
36 # Would a dedicated class be preferable ? Or maybe a smarter crawler overall
37 if self.collection_id == "DMTCS": 37 ↛ 41line 37 didn't jump to line 41 because the condition on line 37 was always true
38 for issue in data:
39 xissues.append(self.prefetch_dmcts_issue(issue))
40 else:
41 for issue in data:
42 xissues.append(self.prefetch_episciences_issue(issue))
44 issues_by_volume = {}
45 for issue in xissues:
46 if issue.volume not in issues_by_volume:
47 issues_by_volume[issue.volume] = []
48 issues_by_volume[issue.volume].append(issue)
50 for volume_issues in issues_by_volume.values():
51 year_iterable = [int(i.year) for i in volume_issues]
52 firstyear = min(year_iterable)
53 lastyear = max(year_iterable)
54 if firstyear != lastyear:
55 for i in volume_issues:
56 i.year = f"{firstyear}-{lastyear}"
58 return xissues
60 def prefetch_dmcts_issue(self, issue: dict):
61 if "vol_year" in issue:
62 year = str(issue["vol_year"])
63 else:
64 mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
65 mid_art_url = urljoin(self.collection_url, mid_art["@id"])
66 mid_art_content = self.download_file(mid_art_url)
67 mid_art_data = json.loads(mid_art_content)
68 year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
69 issue_title = issue["titles"]["en"]
71 # parsed_url = urlparse(self.collection_url)
72 # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
74 if "DMTCS Proceedings" in issue_title:
75 xissue = create_issuedata()
76 xissue.url = None
77 xissue.year = year
78 xissue.pid = self.get_issue_pid(
79 self.collection_id, year, "special_" + str(issue["vid"])
80 )
81 # HACK : handle this elsewhere ? transform title_tex into title_xml
82 # Is title_xml here even valid ?
83 xissue.title_tex = issue_title
84 xissue.title_html = issue_title
85 xissue.title_xml = issue_title
86 xissue.volume = issue_title
88 else:
89 # Vol. 1
90 # Vol. 3 no. 2
91 # Vol. 17 no.2
92 # vol. 24, no 2
93 # Vol. 18 no. 2, Permutation Patterns 2015
94 # Vol. 19 no. 4, FCT '15
95 # vol. 27:2
96 # vol. 25:3 special issue ICGT'22
97 # vol. 26:1, Permutation Patterns 2023
98 title_dict = regex_to_dict(
99 r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
100 issue_title,
101 error_msg="Couldn't parse issue title",
102 )
103 xissue = self.create_xissue(
104 None, year, title_dict["volume"], title_dict.get("number", None)
105 )
106 if title_dict["title"] is not None:
107 # HACK : handle this elsewhere ? transform title_tex into title_xml
108 # Is title_xml here even valid ?
109 xissue.title_tex = title_dict["title"]
110 xissue.title_html = title_dict["title"]
111 xissue.title_xml = title_dict["title"]
113 for index, paper in enumerate(issue["papers"]):
114 xarticle = create_articledata()
115 xarticle.url = urljoin(self.collection_url, paper["@id"])
116 xarticle.pid = f"a{index}"
117 xissue.articles.append(xarticle)
118 return xissue
120 def prefetch_episciences_issue(self, issue: dict):
121 """
122 Episciences doesn't provides issue years.
123 We have to parse the year from the first article of the issue (publication date).
124 """
126 if "vol_year" in issue:
127 year = str(issue["vol_year"])
128 if "year" in issue:
129 year = str(issue["year"])
130 else:
131 mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
132 mid_art_url = urljoin(self.collection_url, mid_art["@id"])
133 mid_art_content = self.download_file(mid_art_url)
134 mid_art_data = json.loads(mid_art_content)
135 year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
137 # parsed_url = urlparse(self.collection_url)
138 # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
140 issue_title = issue["titles"]["en"]
141 xissue = self.create_xissue(None, year, None, None)
142 xissue.lang = "en"
143 try:
144 title_dict = regex_to_dict(
145 r"Volume (?<number>\d+)",
146 issue_title,
147 error_msg="Couldn't parse issue title",
148 )
150 xissue.volume = title_dict["number"]
151 except ValueError:
152 xissue.title_tex = issue_title
154 if "fr" in issue["titles"]:
155 title_trans = issue["titles"]["fr"]
156 xissue.titles.append(
157 self.create_trans_title(
158 xresource_lang=xissue.lang,
159 resource_type="issue",
160 title_tex=title_trans,
161 lang="fr",
162 )
163 )
165 xissue.pid = self.get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)
167 for index, paper in enumerate(issue["papers"]):
168 xarticle = create_articledata()
169 xarticle.url = urljoin(self.collection_url, paper["@id"])
170 xarticle.pid = f"a{index}"
171 xissue.articles.append(xarticle)
173 return xissue
175 def parse_article_content(self, content, xissue, xarticle, url):
176 data = json.loads(content)
178 journal_data = data["document"]["journal"]["journal_article"]
180 add_pdf_link_to_xarticle(
181 xarticle, data["document"]["database"]["current"]["files"]["link"]
182 )
183 xarticle.lang = journal_data["@language"]
184 xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])
185 contributors = journal_data["contributors"]["person_name"]
186 if isinstance(contributors, list): 186 ↛ 198line 186 didn't jump to line 198 because the condition on line 186 was always true
187 for contrib in journal_data["contributors"]["person_name"]:
188 if contrib["@contributor_role"] != "author": 188 ↛ 189line 188 didn't jump to line 189 because the condition on line 188 was never true
189 raise NotImplementedError("Contributor type not implemented")
190 xarticle.contributors.append(
191 create_contributor(
192 first_name=contrib["given_name"],
193 last_name=contrib["surname"],
194 role="author",
195 )
196 )
197 else:
198 if not contributors["@contributor_role"] == "author":
199 raise NotImplementedError("Contributor type not implemented")
200 xarticle.contributors.append(
201 create_contributor(
202 first_name=contributors["given_name"],
203 last_name=contributors["surname"],
204 role="author",
205 )
206 )
208 xabstract = create_abstract(value_tex="")
209 abstract = journal_data["abstract"]["value"]
210 if isinstance(abstract, list):
211 abstract = abstract[0]
212 if isinstance(abstract, dict):
213 if "@xml:lang" in abstract: 213 ↛ 215line 213 didn't jump to line 215 because the condition on line 213 was always true
214 xabstract["lang"] = abstract["@xml:lang"]
215 abstract = abstract["value"]
217 xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)
219 xarticle.abstracts.append(xabstract)
221 if "msc2020" in data["document"]["database"]["current"]["classifications"]: 221 ↛ 225line 221 didn't jump to line 225 because the condition on line 221 was always true
222 for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
223 xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))
225 xarticle.doi = journal_data["doi_data"]["doi"].strip()
226 xarticle.url = data["document"]["database"]["current"]["url"]
227 return xarticle