Coverage for src/crawler/by_source/episciences_crawler.py: 93%

91 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

  1  import json
  2  import math
  3  from urllib.parse import urljoin
  4
  5  from ptf.model_data import (
  6      create_abstract,
  7      create_articledata,
  8      create_contributor,
  9      create_issuedata,
 10      create_subj,
 11  )
 12
 13  from crawler.base_crawler import BaseCollectionCrawler
 14  from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
 15
 16
 17  # We could improve our data further by augmenting the articles using arxiv
 18  # (references)
 19  class EpisciencesCrawler(BaseCollectionCrawler):
 20      source_name = "Episciences"
 21      source_domain = "EPISCIENCES"
 22      source_website = "https://www.episciences.org/"
 23
 24      headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
 25
 26      # Vol. 1
 27      # Vol. 3 no. 2
 28      # Vol. 17 no.2
 29      # vol. 24, no 2
 30      # Vol. 18 no. 2, Permutation Patterns 2015
 31      # Vol. 19 no. 4, FCT '15
 32      # vol. 27:2
 33      # vol. 25:3 special issue ICGT'22
 34      # vol. 26:1, Permutation Patterns 2023
 35      issue_title_re = r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?"
 36
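As a sanity check, the regex above can be run against a few of the sample titles from the comment block. A minimal sketch: `regex_to_dict` is not shown in this file, so plain `re.match` stands in for it, and the expected captures are inferred from the pattern itself, not from the project's tests.

    import re

    ISSUE_TITLE_RE = r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?"

    for title in ["Vol. 1", "vol. 27:2", "Vol. 18 no. 2, Permutation Patterns 2015"]:
        match = re.match(ISSUE_TITLE_RE, title)
        print(match.groupdict() if match else None)
    # {'volume': '1', 'number': None, 'title': None}
    # {'volume': '27', 'number': '2', 'title': None}
    # {'volume': '18', 'number': '2', 'title': 'Permutation Patterns 2015'}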

 37      def parse_collection_content(self, content):
 38          data = json.loads(content)
 39          xissues = []
 40          for issue in data:
 41              xissues.append(self.prefetch_episciences_issue(issue))
 42
 43          issues_by_volume = {}  # group issues by volume to compute per-volume year ranges
 44          for issue in xissues:
 45              if issue.volume not in issues_by_volume:
 46                  issues_by_volume[issue.volume] = []
 47              issues_by_volume[issue.volume].append(issue)
 48
 49          for volume_issues in issues_by_volume.values():
 50              year_iterable = [int(i.year) for i in volume_issues]
 51              firstyear = min(year_iterable)
 52              lastyear = max(year_iterable)
 53              if firstyear != lastyear:  # volume spans several years
 54                  for i in volume_issues:
 55                      i.year = f"{firstyear}-{lastyear}"
 56
 57          return xissues
 58
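The year normalization in parse_collection_content is easiest to see on a toy example. A minimal sketch: SimpleNamespace stands in for the issue objects the crawler actually builds.

    from types import SimpleNamespace

    volume_issues = [SimpleNamespace(volume="18", year="2015"),
                     SimpleNamespace(volume="18", year="2017")]

    years = [int(i.year) for i in volume_issues]
    firstyear, lastyear = min(years), max(years)
    if firstyear != lastyear:
        for i in volume_issues:
            i.year = f"{firstyear}-{lastyear}"

    print([i.year for i in volume_issues])  # ['2015-2017', '2015-2017']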

 59      def prefetch_episciences_issue(self, issue: dict):
 60          """
 61          Episciences doesn't provide issue years.
 62          We have to parse the year from one of the issue's articles (the middle one) via its publication date.
 63          """
 64
 65          if "vol_year" in issue:
 66              year = str(issue["vol_year"])
 67          else:
 68              mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]  # middle article
 69              mid_art_url = urljoin(self.collection_url, mid_art["@id"])
 70              mid_art_content = self.download_file(mid_art_url)
 71              mid_art_data = json.loads(mid_art_content)
 72              year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
 73          issue_title = issue["titles"]["en"]
 74
 75          # parsed_url = urlparse(self.collection_url)
 76          # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
 77
 78          if "DMTCS Proceedings" in issue_title:
 79              xissue = create_issuedata()
 80              xissue.url = None
 81              xissue.year = year
 82              xissue.pid = self.get_issue_pid(
 83                  self.collection_id, year, "special_" + str(issue["vid"])
 84              )
 85              # HACK: handle this elsewhere? Transform title_tex into title_xml.
 86              # Is title_xml even valid here?
 87              xissue.title_tex = issue_title
 88              xissue.title_html = issue_title
 89              xissue.title_xml = issue_title
 90              xissue.volume = issue_title
 91
 92          else:
 93              title_dict = regex_to_dict(
 94                  self.issue_title_re, issue_title, error_msg="Couldn't parse issue title"
 95              )
 96              xissue = self.create_xissue(
 97                  None, year, title_dict["volume"], title_dict.get("number", None)
 98              )
 99              if title_dict["title"] is not None:
100                  # HACK: handle this elsewhere? Transform title_tex into title_xml.
101                  # Is title_xml even valid here?
102                  xissue.title_tex = title_dict["title"]
103                  xissue.title_html = title_dict["title"]
104                  xissue.title_xml = title_dict["title"]
105
106          for index, paper in enumerate(issue["papers"]):
107              xarticle = create_articledata()
108              xarticle.url = urljoin(self.collection_url, paper["@id"])
109              xarticle.pid = f"a{index}"
110              xissue.articles.append(xarticle)
111          return xissue
112
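The issue dictionaries this method consumes aren't documented in the file, but their shape can be read off the key accesses above. A hypothetical minimal input (all values are made up for illustration):

    issue = {
        "vid": 42,                 # volume id, used in the "special_" pid (value is made up)
        "vol_year": 2015,          # optional; when absent, the year is fetched from an article
        "titles": {"en": "Vol. 18 no. 2, Permutation Patterns 2015"},
        "papers": [
            {"@id": "/api/papers/4567"},   # resolved against collection_url (path is made up)
        ],
    }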

113      def parse_article_content(self, content, xissue, xarticle, url):
114          data = json.loads(content)
115
116          journal_data = data["document"]["journal"]["journal_article"]
117
118          add_pdf_link_to_xarticle(
119              xarticle, data["document"]["database"]["current"]["files"]["link"]
120          )
121          xarticle.lang = journal_data["@language"]
122          xarticle.title_tex = journal_data["titles"]["title"]
123          contributors = journal_data["contributors"]["person_name"]
124          if isinstance(contributors, list):  # 124 ↛ 136: condition was always true in this run
125              for contrib in contributors:
126                  if contrib["@contributor_role"] != "author":  # 126 ↛ 127: condition was never true in this run
127                      raise NotImplementedError("Contributor type not implemented")
128                  xarticle.contributors.append(
129                      create_contributor(
130                          first_name=contrib["given_name"],
131                          last_name=contrib["surname"],
132                          role="author",
133                      )
134                  )
135          else:
136              if contributors["@contributor_role"] != "author":
137                  raise NotImplementedError("Contributor type not implemented")
138              xarticle.contributors.append(
139                  create_contributor(
140                      first_name=contributors["given_name"],
141                      last_name=contributors["surname"],
142                      role="author",
143                  )
144              )
145
146          xabstract = create_abstract(tag="abstract", value_tex="")
147          abstract = journal_data["abstract"]["value"]
148          if isinstance(abstract, list):
149              abstract = abstract[0]
150          if isinstance(abstract, dict):
151              if "@xml:lang" in abstract:  # 151 ↛ 153: condition was always true in this run
152                  xabstract["lang"] = abstract["@xml:lang"]
153              abstract = abstract["value"]
154
155          xabstract["value_tex"] = abstract
156
157          xarticle.abstracts.append(xabstract)
158
159          if "msc2020" in data["document"]["database"]["current"]["classifications"]:
160              for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
161                  xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))
162
163          xarticle.doi = journal_data["doi_data"]["doi"].strip()
164          xarticle.url = data["document"]["database"]["current"]["url"]
165          return xarticle
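The dict-or-list handling for contributors (and similarly for abstract) likely stems from an XML-to-JSON conversion, where a single child element becomes a dict and repeated elements become a list. If that assumption holds, lines 123-144 could be collapsed by normalizing the shape up front, which would also remove the never-executed else branch flagged by the coverage run. A sketch of the rewritten body, not the crawler's current code:

    contributors = journal_data["contributors"]["person_name"]
    if isinstance(contributors, dict):
        # Single author: wrap it so one loop handles both shapes.
        contributors = [contributors]

    for contrib in contributors:
        if contrib["@contributor_role"] != "author":
            raise NotImplementedError("Contributor type not implemented")
        xarticle.contributors.append(
            create_contributor(
                first_name=contrib["given_name"],
                last_name=contrib["surname"],
                role="author",
            )
        )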