Coverage for src/crawler/by_source/episciences_crawler.py: 72%

126 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1import json 

2import math 

3from urllib.parse import urljoin 

4 

5from ptf.model_data import ( 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11) 

12from pylatexenc.latex2text import LatexNodes2Text 

13 

14from crawler.base_crawler import BaseCollectionCrawler 

15from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict 

16 

17 

18# We could improve our data further by augmenting the articles using arxiv 

19# (references) 

class EpisciencesCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on the Episciences platform.

    Issues and articles are fetched as JSON ("application/ld+json"). The DMTCS
    collection gets a dedicated issue-parsing path because its issue titles
    encode the volume/number (e.g. "Vol. 3 no. 2", "vol. 27:2").
    """

    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # math_mode="verbatim" keeps $...$ spans untouched during conversion.
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")

    def parse_collection_content(self, content):
        """Parse a collection JSON payload into a list of issue data objects.

        When the issues of one volume span several years, every issue of that
        volume gets the year range "firstyear-lastyear".
        """
        data = json.loads(content)

        # Would a dedicated class be preferable ? Or maybe a smarter crawler overall
        if self.collection_id == "DMTCS":
            xissues = [self.prefetch_dmcts_issue(issue) for issue in data]
        else:
            xissues = [self.prefetch_episciences_issue(issue) for issue in data]

        issues_by_volume = {}
        for issue in xissues:
            issues_by_volume.setdefault(issue.volume, []).append(issue)

        for volume_issues in issues_by_volume.values():
            years = [int(i.year) for i in volume_issues]
            firstyear = min(years)
            lastyear = max(years)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"

        return xissues

    def _resolve_issue_year(self, issue: dict) -> str:
        """Return the year of `issue` as a string.

        Prefers the explicit "vol_year"/"year" fields. Episciences doesn't
        always provide issue years, so as a fallback a middle article of the
        issue is downloaded and its publication year is used.
        """
        if "vol_year" in issue:
            return str(issue["vol_year"])
        if "year" in issue:
            return str(issue["year"])
        mid_art = issue["papers"][len(issue["papers"]) // 2]
        mid_art_url = urljoin(self.collection_url, mid_art["@id"])
        mid_art_content = self.download_file(mid_art_url)
        mid_art_data = json.loads(mid_art_content)
        return mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

    def _append_article_stubs(self, xissue, papers):
        """Append one minimal article (url + positional pid) per paper entry."""
        for index, paper in enumerate(papers):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    # NOTE(review): the method name keeps the historical "dmcts" spelling
    # (for "DMTCS") so existing callers are unaffected.
    def prefetch_dmcts_issue(self, issue: dict):
        """Build an issue (with article URL stubs) from a DMTCS volume entry.

        Regular volumes encode volume/number in the English title; proceedings
        volumes ("DMTCS Proceedings ...") keep the whole title as volume label.
        """
        year = self._resolve_issue_year(issue)
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = self.get_issue_pid(
                self.collection_id, year, "special_" + str(issue["vid"])
            )
            # HACK : handle this elsewhere ? transform title_tex into title_xml
            # Is title_xml here even valid ?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title
        else:
            # Observed title formats:
            #   Vol. 1 / Vol. 3 no. 2 / Vol. 17 no.2 / vol. 24, no 2
            #   Vol. 18 no. 2, Permutation Patterns 2015 / Vol. 19 no. 4, FCT '15
            #   vol. 27:2 / vol. 25:3 special issue ICGT'22
            #   vol. 26:1, Permutation Patterns 2023
            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK : handle this elsewhere ? transform title_tex into title_xml
                # Is title_xml here even valid ?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]

        self._append_article_stubs(xissue, issue["papers"])
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """Build an issue (with article URL stubs) from a generic Episciences entry.

        Episciences doesn't always provide issue years: when absent, the year
        is read from the publication date of one of the issue's articles.
        """
        # FIX(review): the original used two separate `if` statements
        # ("vol_year" then "year"/else), so a present "vol_year" with a missing
        # "year" still triggered the fallback download and overwrote the year.
        year = self._resolve_issue_year(issue)

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            # FIX(review): named group was written "(?<number>...)", which
            # Python's `re` rejects; "(?P<number>...)" is the correct syntax.
            title_dict = regex_to_dict(
                r"Volume (?P<number>\d+)",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue.volume = title_dict["number"]
        except ValueError:
            # Title is not "Volume N": keep it as a free-form issue title.
            xissue.title_tex = issue_title

        if "fr" in issue["titles"]:
            title_trans = issue["titles"]["fr"]
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=title_trans,
                    lang="fr",
                )
            )

        xissue.pid = self.get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        self._append_article_stubs(xissue, issue["papers"])
        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """Populate `xarticle` from an Episciences article JSON document.

        Fills PDF link, language, title, authors, abstract, MSC classifications,
        DOI and canonical URL.

        Raises NotImplementedError for contributors whose role is not "author".
        """
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]
        current = data["document"]["database"]["current"]

        add_pdf_link_to_xarticle(xarticle, current["files"]["link"])
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])

        # "person_name" is a single dict when there is one contributor and a
        # list otherwise: normalize to a list to avoid duplicated handling.
        contributors = journal_data["contributors"]["person_name"]
        if not isinstance(contributors, list):
            contributors = [contributors]
        for contrib in contributors:
            if contrib["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contrib["given_name"],
                    last_name=contrib["surname"],
                    role="author",
                )
            )

        xabstract = create_abstract(value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            # Multiple abstracts (translations): keep only the first one.
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)

        xarticle.abstracts.append(xabstract)

        if "msc2020" in current["classifications"]:
            for msc in current["classifications"]["msc2020"]:
                xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))

        xarticle.doi = journal_data["doi_data"]["doi"].strip()
        xarticle.url = current["url"]
        return xarticle