Coverage for src/crawler/by_source/episciences_crawler.py: 72%

127 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import json
import math
from urllib.parse import urljoin

from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)
from pylatexenc.latex2text import LatexNodes2Text

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


# We could improve our data further by augmenting the articles using arxiv
# (references)
class EpisciencesCrawler(BaseCollectionCrawler):
    source_name = "Episciences"
    source_domain = "EPISCIENCES"
    source_website = "https://www.episciences.org/"

    headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
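    # The "accept" header asks the Episciences API for JSON-LD responses.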

    latex_converter: LatexNodes2Text

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.latex_converter = LatexNodes2Text(math_mode="verbatim")
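        # math_mode="verbatim" keeps the LaTeX source of math content as-is
        # instead of converting it to plain text.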

    def parse_collection_content(self, content):
        data = json.loads(content)
        xissues = []

        # Would a dedicated class be preferable? Or maybe a smarter crawler overall
        if self.collection_id == "DMTCS":
            for issue in data:
                xissues.append(self.prefetch_dmtcs_issue(issue))
        else:
            for issue in data:
                xissues.append(self.prefetch_episciences_issue(issue))

        issues_by_volume = {}
        for issue in xissues:
            if issue.volume not in issues_by_volume:
                issues_by_volume[issue.volume] = []
            issues_by_volume[issue.volume].append(issue)

        for volume_issues in issues_by_volume.values():
            year_iterable = [int(i.year) for i in volume_issues]
            firstyear = min(year_iterable)
            lastyear = max(year_iterable)
            if firstyear != lastyear:
                for i in volume_issues:
                    i.year = f"{firstyear}-{lastyear}"
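                    # e.g. a volume with issues from 2003 and 2005 is labelled "2003-2005"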

        return xissues

    def prefetch_dmtcs_issue(self, issue: dict):
        if "vol_year" in issue:
            year = str(issue["vol_year"])
        else:
            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
        issue_title = issue["titles"]["en"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        if "DMTCS Proceedings" in issue_title:
            xissue = create_issuedata()
            xissue.url = None
            xissue.year = year
            xissue.pid = get_issue_pid(self.collection_id, year, "special_" + str(issue["vid"]))
            # HACK: handle this elsewhere? Transform title_tex into title_xml.
            # Is title_xml here even valid?
            xissue.title_tex = issue_title
            xissue.title_html = issue_title
            xissue.title_xml = issue_title
            xissue.volume = issue_title

        else:
            # Vol. 1
            # Vol. 3 no. 2
            # Vol. 17 no.2
            # vol. 24, no 2
            # Vol. 18 no. 2, Permutation Patterns 2015
            # Vol. 19 no. 4, FCT '15
            # vol. 27:2
            # vol. 25:3 special issue ICGT'22
            # vol. 26:1, Permutation Patterns 2023
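            # For instance, "Vol. 18 no. 2, Permutation Patterns 2015" parses to
            # volume="18", number="2", title="Permutation Patterns 2015"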

            title_dict = regex_to_dict(
                r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?",
                issue_title,
                error_msg="Couldn't parse issue title",
            )
            xissue = self.create_xissue(
                None, year, title_dict["volume"], title_dict.get("number", None)
            )
            if title_dict["title"] is not None:
                # HACK: handle this elsewhere? Transform title_tex into title_xml.
                # Is title_xml here even valid?
                xissue.title_tex = title_dict["title"]
                xissue.title_html = title_dict["title"]
                xissue.title_xml = title_dict["title"]

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
        return xissue

    def prefetch_episciences_issue(self, issue: dict):
        """
        Episciences doesn't always provide issue years.
        When the year is missing, we parse it from one of the issue's articles
        (using its publication date).
        """
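        # Prefer the volume-level "vol_year", then the issue's own "year";
        # otherwise fall back to a paper's publication date.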

125 if "vol_year" in issue: 

126 year = str(issue["vol_year"]) 

127 if "year" in issue: 

128 year = str(issue["year"]) 

129 else: 

            mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
            mid_art_url = urljoin(self.collection_url, mid_art["@id"])
            mid_art_content = self.download_file(mid_art_url)
            mid_art_data = json.loads(mid_art_content)
            year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]

        # parsed_url = urlparse(self.collection_url)
        # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"

        issue_title = issue["titles"]["en"]
        xissue = self.create_xissue(None, year, None, None)
        xissue.lang = "en"
        try:
            title_dict = regex_to_dict(

144 r"Volume (?<number>\d+)", 

                issue_title,
                error_msg="Couldn't parse issue title",
            )

            xissue.volume = title_dict["number"]
        except ValueError:
            xissue.title_tex = issue_title

        if "fr" in issue["titles"]:
            title_trans = issue["titles"]["fr"]
            xissue.titles.append(
                self.create_trans_title(
                    xresource_lang=xissue.lang,
                    resource_type="issue",
                    title_tex=title_trans,
                    lang="fr",
                )
            )

        xissue.pid = get_issue_pid(self.collection_id, year, xissue.volume, xissue.number)

        for index, paper in enumerate(issue["papers"]):
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, paper["@id"])
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        data = json.loads(content)

        journal_data = data["document"]["journal"]["journal_article"]

        add_pdf_link_to_xarticle(
            xarticle, data["document"]["database"]["current"]["files"]["link"]
        )
        xarticle.lang = journal_data["@language"]
        xarticle.title_tex = self.latex_converter.latex_to_text(journal_data["titles"]["title"])
        contributors = journal_data["contributors"]["person_name"]
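        # person_name may be a single dict or a list of dicts; handle both shapes.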

        if isinstance(contributors, list):
            for contrib in contributors:
                if contrib["@contributor_role"] != "author":
                    raise NotImplementedError("Contributor type not implemented")
                xarticle.contributors.append(
                    create_contributor(
                        first_name=contrib["given_name"],
                        last_name=contrib["surname"],
                        role="author",
                    )
                )
        else:
            if contributors["@contributor_role"] != "author":
                raise NotImplementedError("Contributor type not implemented")
            xarticle.contributors.append(
                create_contributor(
                    first_name=contributors["given_name"],
                    last_name=contributors["surname"],
                    role="author",
                )
            )
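        # The abstract value can be a plain string, a list (keep the first
        # entry) or a dict carrying "@xml:lang" and "value".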

        xabstract = create_abstract(value_tex="")
        abstract = journal_data["abstract"]["value"]
        if isinstance(abstract, list):
            abstract = abstract[0]
        if isinstance(abstract, dict):
            if "@xml:lang" in abstract:
                xabstract["lang"] = abstract["@xml:lang"]
            abstract = abstract["value"]

        xabstract["value_tex"] = self.latex_converter.latex_to_text(abstract)

        xarticle.abstracts.append(xabstract)
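        # Record MSC 2020 (Mathematics Subject Classification) codes as "msc" subjects.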

220 if "msc2020" in data["document"]["database"]["current"]["classifications"]: 220 ↛ 224line 220 didn't jump to line 224 because the condition on line 220 was always true

221 for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]: 

222 xarticle.kwds.append(create_subj(type="msc", value=msc["code"])) 

223 

224 xarticle.doi = journal_data["doi_data"]["doi"].strip() 

225 xarticle.url = data["document"]["database"]["current"]["url"] 

226 return xarticle