Coverage for src / crawler / by_source / episciences_crawler.py: 68%

141 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import json 

2import math 

3from urllib.parse import urljoin 

4 

5from ptf.model_data import ( 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11) 

12from pylatexenc.latex2text import LatexNodes2Text 

13 

14from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler 

15from crawler.crawler_utils import get_issue_pid 

16from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict 

17 

18# TODO : Episciences provides a JATS api (currently unused in Geodesic) 

19# https://arima.episciences.org/1978/zbjats 

20 

21 

22# We could improve our data further by augmenting the articles using arxiv 

23# (references) 

class EpisciencesCrawler(ThreadedCrawler):
    """Crawler for journals served by the Episciences JSON API.

    The collection URL is expected to match ``episciences_id_re``; the
    ``rvcode`` query value captured from it is stored as ``episciences_id``
    and reused to build per-volume API URLs.
    """

    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    episciences_id_re = r"https://api\.episciences\.org/api/volumes\?rvcode=(?P<episciences_id>\w+)&pagination=false"

    # NOTE(review): "accept_encoding" is not a standard HTTP header name
    # ("Accept-Encoding" is) - left untouched, confirm how the base crawler
    # consumes this mapping before changing it.
    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        """Set up the LaTeX converter and extract the journal code from the collection URL."""
        super().__init__(*args, **kwargs)
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
        url_parts = regex_to_dict(self.episciences_id_re, self.collection_url)
        self.episciences_id = url_parts["episciences_id"]

    def parse_collection_content(self, content):
        """Build the issues of the collection from the volumes listing.

        ``content`` is the JSON text of the volumes listing. Each entry is
        turned into an issue holding article stubs (url + pid only). Issues
        sharing the same ``volume`` but published in different years get
        their ``year`` rewritten as ``"<first>-<last>"``.

        Returns the list of issues, in listing order.
        """
        volumes = json.loads(content)

        # Would a dedicated class be preferable ? Or maybe a smarter crawler overall
        if self.collection_id == "DMTCS":
            xissues = [self.prefetch_dmcts_issue(volume) for volume in volumes]
        else:
            xissues = []
            for volume in volumes:
                url = f"https://api.episciences.org/api/volumes/{volume['vid']}?rvcode={self.episciences_id}&pagination=false"
                issue_content = json.loads(self.download_file(url))
                xissues.append(self.prefetch_episciences_issue(issue_content))

        issues_by_volume = {}
        for xissue in xissues:
            issues_by_volume.setdefault(xissue.volume, []).append(xissue)

        for volume_issues in issues_by_volume.values():
            try:
                years = [int(xissue.year) for xissue in volume_issues]
            except ValueError:
                # At least one year is not a plain integer: leave this volume
                # untouched instead of reusing the years of a previous volume
                # (or failing on an unbound name for the first one).
                continue
            first_year = min(years)
            last_year = max(years)
            if first_year != last_year:
                for xissue in volume_issues:
                    xissue.year = f"{first_year}-{last_year}"

        return xissues

    def _year_from_middle_paper(self, issue: dict):
        """Download the middle paper of ``issue`` and return its publication year.

        Raises IndexError when the issue has no papers.
        """
        papers = issue["papers"]
        mid_art = papers[len(papers) // 2]
        mid_art_url = urljoin(self.collection_url, mid_art["@id"])
        mid_art_data = json.loads(self.download_file(mid_art_url))
        return mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

    def _append_article_stubs(self, xissue, papers):
        """Append one article stub (url and positional pid ``a<index>``) per paper to ``xissue``."""
        for index, paper in enumerate(papers):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def prefetch_dmcts_issue(self, issue: dict):
        """Build an issue (with article stubs) from one DMTCS volume entry.

        The year comes from ``vol_year`` when it is set, otherwise from the
        publication date of the middle paper. Titles containing
        "DMTCS Proceedings" become special issues whose volume is the title
        itself; other titles are parsed as "Vol. N [no. M][, title]".
        """
        # A null "vol_year" is treated like a missing one (as in
        # prefetch_episciences_issue) rather than producing the year "None".
        if issue.get("vol_year") is not None:
            year = str(issue["vol_year"])
        else:
            year = self._year_from_middle_paper(issue)
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = get_issue_pid(self.collection_id, year, "special_" + str(issue["vid"]))
            # HACK : handle this elsewhere ? transform title_tex into title_xml
            # Is title_xml here even valid ?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title
        else:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(None, year, title_dict["volume"], title_dict.get("number"))
            special_title = title_dict["title"]
            if special_title is not None:
                # HACK : handle this elsewhere ? transform title_tex into title_xml
                # Is title_xml here even valid ?
                xissue.title_tex = special_title
                xissue.title_html = special_title
                xissue.title_xml = special_title

        self._append_article_stubs(xissue, issue["papers"])
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """Build an issue (with article stubs) from one Episciences volume.

        Episciences does not always provide the issue year: ``vol_year`` is
        used when set, then ``year``; otherwise the year is read from the
        publication date of the middle paper of the issue.
        """
        if issue.get("vol_year") is not None:
            year = str(issue["vol_year"])
        elif issue.get("year") is not None:
            year = str(issue["year"])
        else:
            year = self._year_from_middle_paper(issue)

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        titles = issue["titles"]
        issue_title = titles["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)(?:, Issue (?P<issue>\d+))?(?:, (?P<title>[\w ]+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )

            xissue.volume = title_dict["number"]
            if title_dict["issue"]:
                xissue.number = title_dict["issue"]
            if title_dict["title"]:
                xissue.title_tex = title_dict["title"]
        except ValueError:
            # Presumably raised by regex_to_dict when the title does not match
            # (verify against crawler.utils): keep the raw title instead.
            xissue.title_tex = issue_title

        if "fr" in titles:
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=titles["fr"],
                    lang="fr",
                )
            )

        xissue.pid = get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        self._append_article_stubs(xissue, issue["papers"])
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """Fill ``xarticle`` from the JSON description of a paper and return it.

        Sets the PDF link, language, title, authors, first abstract (when
        present), MSC 2020 subjects (when present), DOI and url.

        Raises NotImplementedError for any contributor whose role is not
        "author".
        """
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]
        current = data["document"]["database"]["current"]

        add_pdf_link_to_xarticle(xarticle, current["files"]["link"])
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])

        # "person_name" is a list for several contributors and a single
        # object otherwise: normalise to a list.
        persons = journal_data["contributors"]["person_name"]
        if not isinstance(persons, list):
            persons = [persons]
        for person in persons:
            if person["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=person["given_name"],
                    last_name=person["surname"],
                    role="author",
                )
            )

        if "abstract" in journal_data:
            xabstract = create_abstract(value_tex="")

            abstract = journal_data["abstract"]["value"]
            # Only the first abstract is kept when several are provided.
            if isinstance(abstract, list):
                abstract = abstract[0]
            if isinstance(abstract, dict):
                if "@xml:lang" in abstract:
                    xabstract["lang"] = abstract["@xml:lang"]
                abstract = abstract["value"]

            xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)

            xarticle.abstracts.append(xabstract)

        classifications = current["classifications"]
        if "msc2020" in classifications:
            for msc in classifications["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = current["url"]
        return xarticle