Coverage for src/crawler/by_source/episciences_crawler.py: 92%

95 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

  1  import json
  2  import math
  3  from urllib.parse import urljoin
  4
  5  import regex
  6  from ptf.model_data import (
  7      create_abstract,
  8      create_articledata,
  9      create_contributor,
 10      create_issuedata,
 11      create_subj,
 12  )
 13
 14  from crawler.base_crawler import BaseCollectionCrawler
 15  from crawler.utils import add_pdf_link_to_xarticle
 16
 17
 18  # We could improve our data further by augmenting the articles using arxiv
 19  # (references); see the sketch at the end of this listing.
 20  class EpisciencesCrawler(BaseCollectionCrawler):
 21      source_name = "Episciences"
 22      source_domain = "EPISCIENCES"
 23      source_website = "https://www.episciences.org/"
 24
 25      headers = {"accept_encoding": "utf-8", "accept": "application/ld+json"}
 26
 27      # Vol. 1
 28      # Vol. 3 no. 2
 29      # Vol. 17 no.2
 30      # vol. 24, no 2
 31      # Vol. 18 no. 2, Permutation Patterns 2015
 32      # Vol. 19 no. 4, FCT '15
 33      # vol. 27:2
 34      # vol. 25:3 special issue ICGT'22
 35      # vol. 26:1, Permutation Patterns 2023
 36      issue_title_re = r"[Vv]ol. (?P<volume>\d+)(?:(?:(?:,? no\.? ?)|(?:\:))?(?P<number>\d+))?(?:,? (?P<title>.+))?"
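    # Illustrative check of the pattern against two of the sample titles
    # above (doctest-style; not part of the measured file):
    #
    #   >>> regex.search(EpisciencesCrawler.issue_title_re,
    #   ...              "Vol. 18 no. 2, Permutation Patterns 2015").groupdict()
    #   {'volume': '18', 'number': '2', 'title': 'Permutation Patterns 2015'}
    #   >>> regex.search(EpisciencesCrawler.issue_title_re, "vol. 27:2").groupdict()
    #   {'volume': '27', 'number': '2', 'title': None}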

 37
 38      def parse_collection_content(self, content):
 39          data = json.loads(content)
 40          xissues = []
 41          for issue in data:
 42              xissues.append(self.prefetch_episciences_issue(issue))
 43
 44          issues_by_volume = {}
 45          for issue in xissues:
 46              if issue.volume not in issues_by_volume:
 47                  issues_by_volume[issue.volume] = []
 48              issues_by_volume[issue.volume].append(issue)
 49
 50          for volume_issues in issues_by_volume.values():
 51              year_iterable = [int(i.year) for i in volume_issues]
 52              firstyear = min(year_iterable)
 53              lastyear = max(year_iterable)
 54              if firstyear != lastyear:
 55                  for i in volume_issues:
 56                      i.year = f"{firstyear}-{lastyear}"
 57
 58          return xissues
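    # Worked example (illustrative): if volume 18 contains issues dated 2015
    # and 2016, firstyear != lastyear, so every issue of that volume is
    # relabelled with the range year "2015-2016".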

 59
 60      def prefetch_episciences_issue(self, issue: dict):
 61          """
 62          Episciences doesn't provide issue years.
 63          We have to parse the year from one of the issue's articles (the middle one is used below) via its publication date.
 64          """
 65
 66          if "vol_year" in issue:
 67              year = str(issue["vol_year"])
 68          else:
 69              mid_art = issue["papers"][math.floor(len(issue["papers"]) / 2)]
 70              mid_art_url = urljoin(self.collection_url, mid_art["@id"])
 71              mid_art_content = self.download_file(mid_art_url)
 72              mid_art_data = json.loads(mid_art_content)
 73              year = mid_art_data["document"]["journal"]["journal_issue"]["publication_date"]["year"]
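            # For reference, the lookup above assumes a payload roughly shaped
            # like this (inferred from the keys read on line 73, not from
            # Episciences API documentation):
            #
            #   {"document": {"journal": {"journal_issue":
            #       {"publication_date": {"year": "2016"}}}}}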

 74          issue_title = issue["titles"]["en"]
 75
 76          # parsed_url = urlparse(self.collection_url)
 77          # issue_view_url = f"https://{parse_qs(parsed_url.query)['rvcode'][0]}.episciences.org/volume/view/id/{issue['vid']}"
 78
 79          if "DMTCS Proceedings" in issue_title:
 80              xissue = create_issuedata()
 81              xissue.url = None
 82              xissue.year = year
 83              xissue.pid = self.get_issue_pid(
 84                  self.collection_id, year, "special_" + str(issue["vid"])
 85              )
 86              # HACK: handle this elsewhere? Transform title_tex into title_xml.
 87              # Is title_xml here even valid?
 88              xissue.title_tex = issue_title
 89              xissue.title_html = issue_title
 90              xissue.title_xml = issue_title
 91              xissue.volume = issue_title
 92
 93          else:
 94              title_search = regex.search(self.issue_title_re, issue_title)
 95              if not title_search:  # branch 95 ↛ 96: line 95 didn't jump to line 96 because the condition on line 95 was never true
 96                  raise ValueError("Couldn't parse issue title")
 97              title_dict = title_search.groupdict()
 98              xissue = self.create_xissue(
 99                  None, year, title_dict["volume"], title_dict.get("number", None)
100              )
101              if title_dict["title"] is not None:
102                  # HACK: handle this elsewhere? Transform title_tex into title_xml.
103                  # Is title_xml here even valid?
104                  xissue.title_tex = title_dict["title"]
105                  xissue.title_html = title_dict["title"]
106                  xissue.title_xml = title_dict["title"]
107
108          for index, paper in enumerate(issue["papers"]):
109              xarticle = create_articledata()
110              xarticle.url = urljoin(self.collection_url, paper["@id"])
111              xarticle.pid = f"a{index}"
112              xissue.articles.append(xarticle)
113          return xissue
114
115      def parse_article_content(self, content, xissue, xarticle, url, pid):
116          data = json.loads(content)
117
118          journal_data = data["document"]["journal"]["journal_article"]
119
120          add_pdf_link_to_xarticle(
121              xarticle, data["document"]["database"]["current"]["files"]["link"]
122          )
123          xarticle.lang = journal_data["@language"]
124          xarticle.title_tex = journal_data["titles"]["title"]
125          contributors = journal_data["contributors"]["person_name"]
126          if isinstance(contributors, list):  # branch 126 ↛ 138: line 126 didn't jump to line 138 because the condition on line 126 was always true
127              for contrib in contributors:
128                  if contrib["@contributor_role"] != "author":  # branch 128 ↛ 129: line 128 didn't jump to line 129 because the condition on line 128 was never true
129                      raise NotImplementedError("Contributor type not implemented")
130                  xarticle.contributors.append(
131                      create_contributor(
132                          first_name=contrib["given_name"],
133                          last_name=contrib["surname"],
134                          role="author",
135                      )
136                  )
137          else:
138              if contributors["@contributor_role"] != "author":
139                  raise NotImplementedError("Contributor type not implemented")
140              xarticle.contributors.append(
141                  create_contributor(
142                      first_name=contributors["given_name"],
143                      last_name=contributors["surname"],
144                      role="author",
145                  )
146              )
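        # Both branches above exist because the payload apparently serialises
        # "person_name" either as a list of persons or as a single mapping
        # when the article has one author. An illustrative single-author
        # mapping, using only the keys the code reads (inferred from this
        # file rather than from a schema):
        #
        #   {"@contributor_role": "author", "given_name": "Ada", "surname": "Lovelace"}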

147
148          xabstract = create_abstract(tag="abstract", value_tex="")
149          abstract = journal_data["abstract"]["value"]
150          if isinstance(abstract, list):
151              abstract = abstract[0]
152          if isinstance(abstract, dict):
153              if "@xml:lang" in abstract:  # branch 153 ↛ 155: line 153 didn't jump to line 155 because the condition on line 153 was always true
154                  xabstract["lang"] = abstract["@xml:lang"]
155              abstract = abstract["value"]
156
157          xabstract["value_tex"] = abstract
158
159          xarticle.abstracts.append(xabstract)
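        # The normalisation above tolerates three shapes for the abstract
        # value (a sketch inferred from the branches, not from a schema):
        # a plain string, a list whose first element wins, or a
        # language-tagged mapping such as:
        #
        #   {"@xml:lang": "en", "value": "We prove that ..."}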

160
161          if "msc2020" in data["document"]["database"]["current"]["classifications"]:
162              for msc in data["document"]["database"]["current"]["classifications"]["msc2020"]:
163                  xarticle.kwds.append(create_subj(type="msc", value=msc["code"]))
164
165          xarticle.doi = journal_data["doi_data"]["doi"].strip()
166          xarticle.url = data["document"]["database"]["current"]["url"]
167          return super().parse_article_content(content, xissue, xarticle, url, pid)
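
# A minimal sketch of the arXiv augmentation suggested in the comment above
# class EpisciencesCrawler (illustrative, not part of the measured file; the
# helper name and its argument are assumptions). It queries the public arXiv
# Atom API, which accepts an id_list parameter, and returns the matching
# Atom <entry> for further metadata extraction:
#
#   from urllib.request import urlopen
#   from xml.etree import ElementTree
#
#   ATOM_NS = "{http://www.w3.org/2005/Atom}"
#
#   def fetch_arxiv_entry(arxiv_id: str):
#       """Fetch the Atom entry for one arXiv identifier, e.g. "1501.00001"."""
#       url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
#       with urlopen(url) as resp:
#           feed = ElementTree.parse(resp)
#       # A single-id query returns a feed with at most one entry.
#       return feed.getroot().find(f"{ATOM_NS}entry")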