Coverage for src/crawler/by_source/episciences_crawler.py: 72%
127 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import json
import math
from urllib.parse import urljoin

from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
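    """Crawler for collections hosted on the Episciences platform.

    Collection and article metadata are fetched as JSON-LD documents.
    """
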
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # math_mode="verbatim" keeps LaTeX math untouched when converting to text
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")

    def parse_collection_content(self, content):
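        """Parse the collection's JSON listing into a list of xissues.

        Issues belonging to the same volume but spanning several years are
        normalized to a "firstyear-lastyear" year range.
        """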
        data = json.loads(content)
        xissues = []

        # Would a dedicated class be preferable? Or maybe a smarter crawler overall?
        if self.collection_id == "DMTCS":
            for issue in data:
                xissues.append(self.prefetch_dmcts_issue(issue))
        else:
            for issue in data:
                xissues.append(self.prefetch_episciences_issue(issue))

        # Group issues by volume so multi-year volumes can be given a year range
        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)

        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def prefetch_dmcts_issue(self, issue: dict):
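        """Build an xissue from a DMTCS volume entry.

        Proceedings volumes keep their full title as-is; regular volumes get
        their volume, number and optional title parsed out of the title string.
        """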
62 if "vol_year" in issue:
63 year = str(issue["vol_year"])
64 else:
65 mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
66 mid_art_url = urljoin(self.collection_url, mid_art["@id"])
67 mid_art_content = self.download_file(mid_art_url)
68 mid_art_data = json.loads(mid_art_content)
69 year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
70 issue_title = issue["titles"]["en"]
72 # parsed_url = urlparse(self.collection_url)
73 # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
75 if "DMTCS Proceedings" in issue_title:
76 xissue = create_issuedata()
77 xissue.url = None
78 xissue.year = year
79 xissue.pid = get_issue_pid(self.collection_id, year, "special_" + str(issue["vid"]))
80 # HACK : handle this elsewhere ? transform title_tex into title_xml
81 # Is title_xml here even valid ?
82 xissue.title_tex = issue_title
83 xissue.title_html = issue_title
84 xissue.title_xml = issue_title
85 xissue.volume = issue_title
        else:
            # Examples of issue titles this regex must parse:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml even valid here?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

        return xissue

    def prefetch_episciences_issue(self, issue: dict):
120 """
121 Episciences doesn't provides issue years.
122 We have to parse the year from the first article of the issue (publication date).
123 """
125 if "vol_year" in issue:
126 year = str(issue["vol_year"])
127 if "year" in issue:
128 year = str(issue["year"])
129 else:
130 mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
131 mid_art_url = urljoin(self.collection_url, mid_art["@id"])
132 mid_art_content = self.download_file(mid_art_url)
133 mid_art_data = json.loads(mid_art_content)
134 year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue.volume = title_dict["number"]
        except ValueError:
            xissue.title_tex = issue_title

153 if "fr" in issue["titles"]:
154 title_trans = issue["titles"]["fr"]
155 xissue.titles.append(
156 self.create_trans_title(
157 xresource_lang=xissue.lang,
158 resource_type="issue",
159 title_tex=title_trans,
160 lang="fr",
161 )
162 )
164 xissue.pid = get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)
166 for index, paper in enumerate(issue["papers"]):
167 xarticle = create_articledata()
168 xarticle.url = urljoin(self.collection_url, paper["@id"])
169 xarticle.pid = f"a{index}"
170 xissue.articles.append(xarticle)
172 return xissue
    def parse_article_content(self, content, xissue, xarticle, url):
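        """Populate xarticle from an article's JSON-LD document: PDF link,
        language, title, contributors, abstract, MSC 2020 classifications,
        DOI and URL.
        """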
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])
        contributors = journal_data["contributors"]["person_name"]
        if isinstance(contributors, list):
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )
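
        # The abstract may arrive as a plain string, a list of abstracts, or a
        # dict carrying an @xml:lang attribute alongside its value.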
        xabstract = create_abstract(value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)

        xarticle.abstracts.append(xabstract)

220 if "msc2020" in data["document"]["database"]["current"]["classifications"]: 220 ↛ 224line 220 didn't jump to line 224 because the condition on line 220 was always true
221 for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
222 xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))
224 xarticle.doi = journal_data["doi_data"]["doi"].strip()
225 xarticle.url = data["document"]["database"]["current"]["url"]
226 return xarticle
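
# Minimal usage sketch (hypothetical: BaseCollectionCrawler's constructor
# arguments are not shown in this file, so the collection_id/collection_url
# keyword arguments below are assumptions):
#
#     crawler = EpisciencesCrawler(collection_id="DMTCS", collection_url="https://...")
#     content = crawler.download_file(crawler.collection_url)
#     xissues = crawler.parse_collection_content(content)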