Coverage for src/crawler/by_source/episciences_crawler.py: 93% (91 statements)

import json
import math
from urllib.parse import urljoin

from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}

    # Vol. 1
    # Vol. 3 no. 2
    # Vol. 17 no.2
    # vol. 24, no 2
    # Vol. 18 no. 2, Permutation Patterns 2015
    # Vol. 19 no. 4, FCT '15
    # vol. 27:2
    # vol. 25:3 special issue ICGT'22
    # vol. 26:1, Permutation Patterns 2023
    issue_title_re = r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?"
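    # Illustrative captures for the formats above (hypothetical values):
    #   "Vol. 1"                          -> volume="1",  number=None, title=None
    #   "vol. 24, no 2"                   -> volume="24", number="2",  title=None
    #   "vol. 25:3 special issue ICGT'22" -> volume="25", number="3",  title="special issue ICGT'22"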

    def parse_collection_content(self, content):
        data = json.loads(content)
        xissues = []
        for issue in data:
            xissues.append(self.prefetch_episciences_issue(issue))

        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)
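
        # If a volume's issues span several years, label every issue with the
        # full range, e.g. issues from 2015 and 2017 all get year "2015-2017".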
        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def prefetch_episciences_issue(self, issue: dict):
        """
        Episciences doesn't provide issue years.
        We have to parse the year from one of the issue's articles (we use the
        middle one) via its publication date.
        """
        if "vol_year" in issue:
            year = str(issue["vol_year"])
        else:
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = self.get_issue_pid(
                self.collection_id, year, "special_" + str(issue["vid"])
            )
            # HACK: handle this elsewhere? Transform title_tex into title_xml.
            # Is title_xml here even valid?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title

        else:
            title_dict = regex_to_dict(
                self.issue_title_re, issue_title, error_msg="Couldn't parse issue title"
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml here even valid?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = journal_data["titles"]["title"]
        contributors = journal_data["contributors"]["person_name"]
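        # person_name is either a list of contributor dicts or, apparently
        # when there is a single contributor, one bare dict.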
        if isinstance(contributors, list):  # per the coverage report, always true in tests
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":  # never hit in tests
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )
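
        # The abstract value may be a plain string, a list of language
        # variants, or a dict carrying an @xml:lang attribute.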
        xabstract = create_abstract(tag="abstract", value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:  # per the coverage report, always true in tests
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = abstract

        xarticle.abstracts.append(xabstract)
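
        # msc2020 lists Mathematics Subject Classification (2020) codes.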
        if "msc2020" in data["document"]["database"]["current"]["classifications"]:
            for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = data["document"]["database"]["current"]["url"]
        return xarticle
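

if __name__ == "__main__":
    # Minimal sanity check of issue_title_re; an illustrative sketch, not part
    # of the crawler's runtime path. The sample titles mirror the comment
    # block above the regex.
    import re

    m = re.fullmatch(EpisciencesCrawler.issue_title_re, "Vol. 19 no. 4, FCT '15")
    assert m is not None
    assert m.group("volume") == "19"
    assert m.group("number") == "4"
    assert m.group("title") == "FCT '15"

    m = re.fullmatch(EpisciencesCrawler.issue_title_re, "vol. 27:2")
    assert m is not None
    assert m.group("volume") == "27"
    assert m.group("number") == "2"
    assert m.group("title") is None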