Coverage for src/crawler/by_source/isrp_crawler.py: 84%

142 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import os

import langcodes
import langcodes.tag_parser
import regex
from bs4 import BeautifulSoup, Tag
from lxml import etree
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsArticle
from ptf.model_data import ArticleData, IssueData, create_articledata, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class IsrpCrawler(BaseCollectionCrawler):
    source_name = "International Scientific Research Publications"
    source_domain = "ISRP"
    source_website = "https://www.isr-publications.com"

    delimiter_inline_formula = "\\("
    delimiter_disp_formula = "\\["

    issue_regex = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) pp. (?P<pages>[\d,\-(?:In Progress)]+) (?:(?P<special>\w+)? )?\((?P<year>\d+).*\)"
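    # Illustration of the issue headings this pattern targets (the sample strings below
    # are made up, not taken from the ISRP site):
    #   "Volume 12, Issue 3 pp. 1-250 (2024)"        -> volume="12", number="3", year="2024"
    #   "Volume 30, Issue 1 pp. In Progress (2025)"  -> the pages group also accepts the "In Progress" marker
    # The named groups feed create_xissue() in parse_collection_content() below.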

    biblinks_to_keep = {
        "https://doi.org/": lambda link: (link.removeprefix("https://doi.org/"), "doi"),
        "https://zbmath.org/?q=an:": lambda link: (
            link.removeprefix("https://zbmath.org/?q=an:"),
            "zbl-item-id",
        ),
        "https://zbmath.org/": lambda link: (
            link.removeprefix("https://zbmath.org/"),
            "zbl-item-id",
        ),
        # http://archive.numdam.org/article/AFST_1907_2_9__203_0.pdf
        "http://archive.numdam.org/": lambda link: (
            regex.search(r".+numdam.org\/.+\/(.+)\.pdf", link).group(1),
            "numdam-id",
        ),
        # https://eudml.org/serve/127518/accessibleLayeredPdf/0
        "https://eudml.org": lambda link: (
            regex.search(r".*:\/\/eudml.org\/[\.\w]+\/(?:[a-zA-Z:]+)?(\d+)", link).group(1),
            "eudml-item-id",
        ),
    }
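    # For illustration, each entry maps a recognised link prefix to an (id, id-type) pair
    # later turned into a bibliography <ext-link>; e.g. (the DOI below is hypothetical):
    #   biblinks_to_keep["https://doi.org/"]("https://doi.org/10.1234/example")
    #   -> ("10.1234/example", "doi")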

    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the collection's issues and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata plus its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        issue_tags = soup.select("ul.issues > li")
        issues = []

        for issue_tag in issue_tags:
            a_tag = issue_tag.select_one("a")
            if not a_tag:  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {self.collection_id} Cannot parse issue")
            issue_href = a_tag.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError(
                    f"{self.source_domain}] {self.collection_id} Cannot parse issue link"
                )
            text = cleanup_str(issue_tag.text)
            issue_rx = regex.search(self.issue_regex, text)
            if not issue_rx:  # coverage: condition never true
                raise ValueError(
                    f"{self.source_domain}] {self.collection_id} Cannot parse issue information"
                )
            issue_data = issue_rx.groupdict()
            if "number" not in issue_data:  # coverage: condition never true
                pass
            issues.append(
                self.create_xissue(
                    issue_href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return issues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("ul.articles-list > li.article-title")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()

            a_tag = article_tag.select_one("a")
            if not a_tag:  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot find article link")

            url = a_tag.get("href")
            if not isinstance(url, str):  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot parse article link")

            xarticle.url = url
            xarticle.pid = f"a{index}"

            xissue.articles.append(xarticle)

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        dom = etree.fromstring(content.encode("utf-8"), parser)
        # Fix for invalid self-uri tag
        xslt = etree.parse(
            os.path.dirname(os.path.realpath(__file__)) + "/isrp-article.xsl", parser
        )
        transform = etree.XSLT(xslt)
        tree = transform(dom)

        parsed_xarticle = JatsArticle(tree=tree.getroot())
        parsed_xarticle.ext_links = xarticle.ext_links
        parsed_xarticle.url = url
        parsed_xarticle.pid = cleanup_str(pid)

        # Sometimes the source DOI has white spaces at the end
        if parsed_xarticle.doi:  # coverage: condition always true
            parsed_xarticle.doi = parsed_xarticle.doi.strip()
        lang = langcodes.Language.get(parsed_xarticle.lang).language
        if lang:  # coverage: condition always true
            parsed_xarticle.lang = lang
        for abstract in parsed_xarticle.abstracts:
            abstract_lang = langcodes.Language.get(abstract["lang"]).language
            if abstract_lang:  # coverage: condition always true
                abstract["lang"] = abstract_lang
        for kwd in parsed_xarticle.kwds:
            kwd_lang = langcodes.Language.get(kwd["lang"]).language
            if kwd_lang:  # coverage: condition always true
                kwd["lang"] = kwd_lang
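        # Note: the normalization above keeps only the bare language subtag,
        # e.g. langcodes.Language.get("en-US").language == "en".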

        content = self.download_file(url.removesuffix("/xml"))
        soup = BeautifulSoup(content, "html.parser")

        main = soup.select_one(".simple-content")
        if not main:  # coverage: condition never true
            raise ValueError("Cannot parse HTML page")

        references_flag = False
        references_tag = None
        for c in main.findChildren(recursive=False):  # coverage: loop always exited via break
            if c.name != "h3" and c.name != "ul":
                continue
            if c.name == "h3" and c.text == "References":
                references_flag = True
                continue
            if references_flag and c.name == "ul":
                references_tag = c
                break
            references_flag = False
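        # The scan above pairs the <h3>References</h3> heading with the first <ul> that
        # follows it among the direct children of .simple-content; any other heading in
        # between resets the search.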

        if not references_tag:  # coverage: condition never true
            print("Couldn't find References tag")
        else:
            self.parse_bibitems(parsed_xarticle, references_tag)

        # And now let's hope the html page is consistent with the XML

        pdf_url = tree.find("/front/article-meta/uri[@specific-use='for-review']").text
        add_pdf_link_to_xarticle(parsed_xarticle, pdf_url)

        for contrib in parsed_xarticle.contributors:
            if (  # coverage: condition never true
                len(contrib["string_name"]) > 200
                or len(contrib["first_name"] + contrib["last_name"]) > 200
            ):
                pass
        return parsed_xarticle

    def parse_bibitems(self, xarticle, references_tag: Tag):
        for index, c in enumerate(references_tag.findChildren("li", recursive=False)):
            links: set[str] = set()

            # Sometimes, the same link gets referenced multiple times in an entry
            for a_tag in c.select("a"):
                ref_link = a_tag.get("href")
                if not ref_link:  # coverage: condition never true
                    continue
                links.add(ref_link)

            # We populate ext_links_xmls with PTF <ext-links> xmls
            ext_links_xmls = []
            for link in links:
                for url in self.biblinks_to_keep:
                    if link.startswith(url):
                        url, link_type = self.biblinks_to_keep[url](link)
                        ext_links_xmls.append(
                            get_ext_link_xml(
                                url,
                                url,
                                link_type,
                            )
                        )
                        break

            # We recreate bibitems while adding ext-links inside <element-citation>
            soup = BeautifulSoup(
                "<ref>" + xarticle.bibitems[index].citation_xml + "</ref>", "lxml-xml"
            )
            soup_ref = soup.select_one("ref")

            if not soup_ref:  # coverage: condition never true
                raise ValueError("Cannot find ref in xml")

            # add ext-links inside <element-citation>
            element_citation = soup_ref.select_one("element-citation")
            if not element_citation:  # coverage: condition never true
                raise ValueError("Cannot find element citation in xml")
            element_citation.extend(BeautifulSoup(c, "lxml-xml") for c in ext_links_xmls)
            del element_citation

            xarticle.bibitems[index].citation_xml = str("".join(str(c) for c in soup_ref.children))

            # Here we already have an <element-citation> tag, so we want to skip the <mixed-citation> creation
            xarticle.bibitems[index] = self.create_crawled_bibitem(xarticle.bibitems[index])

        if len(xarticle.bibitems) > 0:  # coverage: condition always true
            xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        xarticle.bibitems = []
        xarticle.bibitem = []

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # We crawl {article.url}/xml instead of article url

        if not xarticle.url:  # coverage: condition never true
            raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot parse article url")

        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = str(xarticle.url)
        ext_link["metadata"] = self.source_domain
        xarticle.ext_links.append(ext_link)

        xarticle.url = xarticle.url + "/xml"
        return super().crawl_article(xarticle, xissue)
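
# Inferred crawl flow, assuming these hooks are driven by BaseCollectionCrawler:
#   parse_collection_content() turns the issue listing into xissues,
#   parse_issue_content() fills each xissue with stub ArticleData entries,
#   crawl_article() records the original page as a "source" ext-link and switches the
#   url to the "/xml" endpoint, and parse_article_content() parses the JATS XML (after
#   the XSLT fix), scrapes the HTML page for the reference list, and takes the PDF link
#   from the XML's for-review <uri>.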