Coverage for src/crawler/by_source/episciences_crawler.py: 71%
121 statements
import json
import math
from urllib.parse import urljoin

from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


# We could improve our data further by augmenting the articles using arXiv
# (references).
class EpisciencesCrawler(BaseCollectionCrawler):
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"
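
    # Ask the Episciences API for JSON-LD responses.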
    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}

    def parse_collection_content(self, content):
        data = json.loads(content)
        xissues = []

        # Would a dedicated class be preferable? Or maybe a smarter crawler overall?
        if self.collection_id == "DMTCS":
            for issue in data:
                xissues.append(self.prefetch_dmtcs_issue(issue))
        else:
            for issue in data:
                xissues.append(self.prefetch_episciences_issue(issue))
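
        # Group issues by volume so that a volume spanning several years can be
        # labelled with its full year range (e.g. "2015-2016").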
        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)

        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def prefetch_dmtcs_issue(self, issue: dict):
        if "vol_year" in issue:
            year = str(issue["vol_year"])
        else:
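            # "vol_year" is missing: fetch an article from the middle of the
            # issue and use its publication year for the whole issue.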
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = self.get_issue_pid(
                self.collection_id, year, "special_" + str(issue["vid"])
            )
            # HACK: handle this elsewhere? Transform title_tex into title_xml.
            # Is title_xml even valid here?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title
        else:
            # Examples of observed issue titles:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
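            # e.g. "Vol. 19 no. 4, FCT '15" parses to
            # {"volume": "19", "number": "4", "title": "FCT '15"}.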
            title_dict = regex_to_dict(
                r"[Vv]ol\. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml even valid here?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]
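
        # Articles are only stubbed here (URL and pid); parse_article_content
        # fills in the metadata when each article page is downloaded.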
        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """
        Episciences doesn't provide issue years.
        We have to parse the year from an article in the middle of the issue
        (its publication date).
        """

        if "vol_year" in issue:
            year = str(issue["vol_year"])
        elif "year" in issue:
            year = str(issue["year"])
        else:
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
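        # Regular issues are titled "Volume <n>"; anything else is kept
        # verbatim as the issue title.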
        try:
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)",
                issue_title,
                error_msg="Couldn't parse issue title",
            )

            xissue.volume = title_dict["number"]
        except ValueError:
            xissue.title_tex = issue_title

        if "fr" in issue["titles"]:
            title_trans = issue["titles"]["fr"]
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=title_trans,
                    lang="fr",
                )
            )

        xissue.pid = self.get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = journal_data["titles"]["title"]
        contributors = journal_data["contributors"]["person_name"]
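        # "person_name" is a single dict when the article has one contributor
        # and a list when it has several.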
        if isinstance(contributors, list):
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )
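
        # The abstract value may be a plain string, a list of variants, or a
        # dict carrying an @xml:lang attribute.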
        xabstract = create_abstract(tag="abstract", value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = abstract

        xarticle.abstracts.append(xabstract)

        if "msc2020" in data["document"]["database"]["current"]["classifications"]:
            for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = data["document"]["database"]["current"]["url"]
        return xarticle