Coverage for src/crawler/by_source/episciences_crawler.py: 71%

121 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import json
import math
from urllib.parse import urljoin

from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}

    def parse_collection_content(self, content):
        data = json.loads(content)
        xissues = []

        # Would a dedicated class be preferable? Or maybe a smarter crawler overall?
        if self.collection_id == "DMTCS":
            for issue in data:
                xissues.append(self.prefetch_dmcts_issue(issue))
        else:
            for issue in data:
                xissues.append(self.prefetch_episciences_issue(issue))
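
        # Issues of the same volume can be published across several years;
        # display the shared year range (e.g. "2015-2016") on each of them.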
        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)

        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def prefetch_dmcts_issue(self, issue: dict):
        if "vol_year" in issue:
            year = str(issue["vol_year"])
        else:
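            # No year in the issue payload: fetch one of its papers (the middle
            # one, presumably to avoid atypical first or last entries) and read
            # the year from its publication metadata.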
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = self.get_issue_pid(
                self.collection_id, year, "special_" + str(issue["vid"])
            )
            # HACK: handle this elsewhere? Transform title_tex into title_xml.
            # Is title_xml even valid here?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title

        else:
            # Known title formats:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
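            # e.g. "Vol. 18 no. 2, Permutation Patterns 2015" yields
            # {"volume": "18", "number": "2", "title": "Permutation Patterns 2015"}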
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml even valid here?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]
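
        # Only article stubs (URL and a provisional pid) are registered here;
        # the full metadata is parsed later by parse_article_content.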
        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """
        Episciences doesn't provide issue years.
        We have to parse the year from one of the issue's articles (publication date).
        """

        if "vol_year" in issue:
            year = str(issue["vol_year"])
        elif "year" in issue:
            year = str(issue["year"])
        else:
            # Same fallback as in prefetch_dmcts_issue: read the year from the
            # issue's middle paper.
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
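            # e.g. "Volume 12" yields {"number": "12"}; if the title doesn't
            # match, regex_to_dict raises ValueError and we fall back to title_tex.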
            xissue.volume = title_dict["number"]
        except ValueError:
            xissue.title_tex = issue_title

        if "fr" in issue["titles"]:
            title_trans = issue["titles"]["fr"]
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=title_trans,
                    lang="fr",
                )
            )

        xissue.pid = self.get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = journal_data["titles"]["title"]
        contributors = journal_data["contributors"]["person_name"]
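        # person_name may be a single dict (one contributor) or a list of dicts
        # (several contributors); handle both shapes.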
        if isinstance(contributors, list):
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )
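
        # The abstract value may be a plain string, a dict carrying an
        # @xml:lang attribute, or a list of such values; only the first is kept.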
        xabstract = create_abstract(tag="abstract", value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = abstract

        xarticle.abstracts.append(xabstract)
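
        # Record MSC 2020 classification codes as "msc" subjects when present.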
        if "msc2020" in data["document"]["database"]["current"]["classifications"]:
            for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = data["document"]["database"]["current"]["url"]
        return xarticle