Coverage for src/crawler/by_source/episciences_crawler.py: 92%
95 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import json
import math
from urllib.parse import urljoin

import regex
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}

    # Vol. 1
    # Vol. 3 no. 2
    # Vol. 17 no.2
    # vol. 24, no 2
    # Vol. 18 no. 2, Permutation Patterns 2015
    # Vol. 19 no. 4, FCT '15
    # vol. 27:2
    # vol. 25:3 special issue ICGT'22
    # vol. 26:1, Permutation Patterns 2023
    issue_title_re = r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?"
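    # For example, "Vol. 18 no. 2, Permutation Patterns 2015" parses to
    # {"volume": "18", "number": "2", "title": "Permutation Patterns 2015"},
    # and "vol. 27:2" parses to {"volume": "27", "number": "2", "title": None}.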

    def parse_collection_content(self, content):
        data = json.loads(content)
        xissues = []
        for issue in data:
            xissues.append(self.prefetch_episciences_issue(issue))

        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)
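
        # If a volume's issues were published across several years, relabel every
        # issue in that volume with the full "firstyear-lastyear" range.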
        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def prefetch_episciences_issue(self, issue: dict):
        """
        Episciences doesn't always provide issue years (vol_year).
        When it is missing, we parse the year from the publication date of
        one of the issue's papers (the one in the middle of the issue).
        """
66 if "vol_year" in issue:
67 year = str(issue["vol_year"])
68 else:
69 mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
70 mid_art_url = urljoin(self.collection_url, mid_art["@id"])
71 mid_art_content = self.download_file(mid_art_url)
72 mid_art_data = json.loads(mid_art_content)
73 year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
74 issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
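
        # "DMTCS Proceedings" volumes (Discrete Mathematics & Theoretical Computer
        # Science) don't follow the "Vol. N" title scheme, so they are treated as
        # special issues keyed by their Episciences volume id.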
79 if "DMTCS Proceedings" in issue_title:
80 xissue = create_issuedata()
81 xissue.url = None
82 xissue.year = year
83 xissue.pid = self.get_issue_pid(
84 self.collection_id, year, "special_" + str(issue["vid"])
85 )
86 # HACK : handle this elsewhere ? transform title_tex into title_xml
87 # Is title_xml here even valid ?
88 xissue.title_tex = issue_title
89 xissue.title_html = issue_title
90 xissue.title_xml = issue_title
91 xissue.volume = issue_title

        else:
            title_search = regex.search(self.issue_title_re, issue_title)
            if not title_search:
                raise ValueError("Couldn't parse issue title")
            title_dict = title_search.groupdict()
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml here even valid?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        data = json.loads(content)
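
        # The "journal" block appears to follow the Crossref journal deposit
        # schema (journal_article, contributors/person_name, doi_data, ...).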
        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = journal_data["titles"]["title"]
        contributors = journal_data["contributors"]["person_name"]
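        # "person_name" may be either a list of contributors or a single dict
        # (single-author articles), so both shapes are handled below.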
        if isinstance(contributors, list):
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )

        xabstract = create_abstract(tag="abstract", value_tex="")
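        # The abstract value may be a plain string, a list of per-language
        # entries, or a single dict carrying an "@xml:lang" attribute.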
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = abstract

        xarticle.abstracts.append(xabstract)
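
        # MSC 2020 classification codes, when present, are recorded as "msc"
        # keyword subjects on the article.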
161 if "msc2020" in data["document"]["database"]["current"]["classifications"]:
162 for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
163 xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))
165 xarticle.doi = journal_data["doi_data"]["doi"].strip()
166 xarticle.url = data["document"]["database"]["current"]["url"]
167 return super().parse_article_content(content, xissue, xarticle, url, pid)