Coverage for src/crawler/by_source/isrp_crawler.py: 84%

142 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

import os

import langcodes
import langcodes.tag_parser
import regex
from bs4 import BeautifulSoup, Tag
from lxml import etree
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsArticle
from ptf.model_data import ArticleData, IssueData, create_articledata, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class IsrpCrawler(BaseCollectionCrawler):
    source_name = "International Scientific Research Publications"
    source_domain = "ISRP"
    source_website = "https://www.isr-publications.com"

    delimiter_inline_formula = "\\("
    delimiter_disp_formula = "\\["

    issue_regex = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) pp. (?P<pages>[\d,\-(?:In Progress)]+) (?:(?P<special>\w+)? )?\((?P<year>\d+).*\)"
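    # Illustration of the issue headings this pattern targets (the sample strings below
    # are made up, not taken from the ISRP site):
    #   "Volume 12, Issue 3 pp. 1-250 (2024)"        -> volume="12", number="3", year="2024"
    #   "Volume 30, Issue 1 pp. In Progress (2025)"  -> the pages group also accepts the "In Progress" marker
    # The named groups feed create_xissue() in parse_collection_content() below.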

    biblinks_to_keep = {
        "https://doi.org/": lambda link: (link.removeprefix("https://doi.org/"), "doi"),
        "https://zbmath.org/?q=an:": lambda link: (
            link.removeprefix("https://zbmath.org/?q=an:"),
            "zbl-item-id",
        ),
        "https://zbmath.org/": lambda link: (
            link.removeprefix("https://zbmath.org/"),
            "zbl-item-id",
        ),
        # http://archive.numdam.org/article/AFST_1907_2_9__203_0.pdf
        "http://archive.numdam.org/": lambda link: (
            regex.search(r".+numdam.org\/.+\/(.+)\.pdf", link).group(1),
            "numdam-id",
        ),
        # https://eudml.org/serve/127518/accessibleLayeredPdf/0
        "https://eudml.org": lambda link: (
            regex.search(r".*:\/\/eudml.org\/[\.\w]+\/(?:[a-zA-Z:]+)?(\d+)", link).group(1),
            "eudml-item-id",
        ),
    }
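    # For illustration, each entry maps a recognised link prefix to an (id, id-type) pair
    # later turned into a bibliography <ext-link>; e.g. (the DOI below is hypothetical):
    #   biblinks_to_keep["https://doi.org/"]("https://doi.org/10.1234/example")
    #   -> ("10.1234/example", "doi")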

    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the collection's issues and return a list of xissues.
        Each xissue has its pid/volume/number/year metadata plus its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        issue_tags = soup.select("ul.issues > li")
        issues = []

        for issue_tag in issue_tags:
            a_tag = issue_tag.select_one("a")
            if not a_tag:  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {self.collection_id} Cannot parse issue")
            issue_href = a_tag.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError(
                    f"{self.source_domain}] {self.collection_id} Cannot parse issue link"
                )
            text = cleanup_str(issue_tag.text)
            issue_rx = regex.search(self.issue_regex, text)
            if not issue_rx:  # coverage: condition never true
                raise ValueError(
                    f"{self.source_domain}] {self.collection_id} Cannot parse issue information"
                )
            issue_data = issue_rx.groupdict()
            if "number" not in issue_data:  # coverage: condition never true
                pass
            issues.append(
                self.create_xissue(
                    issue_href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return issues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("ul.articles-list > li.article-title")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()

            a_tag = article_tag.select_one("a")
            if not a_tag:  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot find article link")

            url = a_tag.get("href")
            if not isinstance(url, str):  # coverage: condition never true
                raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot parse article link")

            xarticle.url = url
            xarticle.pid = f"a{index}"

            xissue.articles.append(xarticle)

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        dom = etree.fromstring(content.encode("utf-8"), parser)
        # Fix for invalid self-uri tag
        xslt = etree.parse(
            os.path.dirname(os.path.realpath(__file__)) + "/isrp-article.xsl", parser
        )
        transform = etree.XSLT(xslt)
        tree = transform(dom)

        parsed_xarticle = JatsArticle(tree=tree.getroot())
        parsed_xarticle.ext_links = xarticle.ext_links
        parsed_xarticle.url = url
        parsed_xarticle.pid = cleanup_str(pid)

        # Sometimes the source DOI has white spaces at the end
        if parsed_xarticle.doi:  # coverage: condition always true
            parsed_xarticle.doi = parsed_xarticle.doi.strip()
        lang = langcodes.Language.get(parsed_xarticle.lang).language
        if lang:  # coverage: condition always true
            parsed_xarticle.lang = lang
        for abstract in parsed_xarticle.abstracts:
            abstract_lang = langcodes.Language.get(abstract["lang"]).language
            if abstract_lang:  # coverage: condition always true
                abstract["lang"] = abstract_lang
        for kwd in parsed_xarticle.kwds:
            kwd_lang = langcodes.Language.get(kwd["lang"]).language
            if kwd_lang:  # coverage: condition always true
                kwd["lang"] = kwd_lang
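        # Note: the normalization above keeps only the bare language subtag,
        # e.g. langcodes.Language.get("en-US").language == "en".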

        content = self.download_file(url.removesuffix("/xml"))
        soup = BeautifulSoup(content, "html.parser")

        main = soup.select_one(".simple-content")
        if not main:  # coverage: condition never true
            raise ValueError("Cannot parse HTML page")

        references_flag = False
        references_tag = None
        for c in main.findChildren(recursive=False):  # coverage: loop always exited via break
            if c.name != "h3" and c.name != "ul":
                continue
            if c.name == "h3" and c.text == "References":
                references_flag = True
                continue
            if references_flag and c.name == "ul":
                references_tag = c
                break
            references_flag = False
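        # The scan above pairs the <h3>References</h3> heading with the first <ul> that
        # follows it among the direct children of .simple-content; any other heading in
        # between resets the search.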

        if not references_tag:  # coverage: condition never true
            print("Couldn't find References tag")
        else:
            self.parse_bibitems(parsed_xarticle, references_tag)

        # And now let's hope the html page is consistent with the XML

        pdf_url = tree.find("/front/article-meta/uri[@specific-use='for-review']").text
        add_pdf_link_to_xarticle(parsed_xarticle, pdf_url)

        for contrib in parsed_xarticle.contributors:
            if (  # coverage: condition never true
                len(contrib["string_name"]) > 200
                or len(contrib["first_name"] + contrib["last_name"]) > 200
            ):
                pass
        return parsed_xarticle

    def parse_bibitems(self, xarticle, references_tag: Tag):
        for index, c in enumerate(references_tag.findChildren("li", recursive=False)):
            links: set[str] = set()

            # Sometimes, the same link gets referenced multiple times in an entry
            for a_tag in c.select("a"):
                ref_link = a_tag.get("href")
                if not ref_link:  # coverage: condition never true
                    continue
                links.add(ref_link)

            # We populate ext_links_xmls with PTF <ext-links> xmls
            ext_links_xmls = []
            for link in links:
                for url in self.biblinks_to_keep:
                    if link.startswith(url):
                        url, link_type = self.biblinks_to_keep[url](link)
                        ext_links_xmls.append(
                            get_ext_link_xml(
                                url,
                                url,
                                link_type,
                            )
                        )
                        break

            # We recreate bibitems while adding ext-links inside <element-citation>
            soup = BeautifulSoup(
                "<ref>" + xarticle.bibitems[index].citation_xml + "</ref>", "lxml-xml"
            )
            soup_ref = soup.select_one("ref")

            if not soup_ref:  # coverage: condition never true
                raise ValueError("Cannot find ref in xml")

            # add ext-links inside <element-citation>
            element_citation = soup_ref.select_one("element-citation")
            if not element_citation:  # coverage: condition never true
                raise ValueError("Cannot find element citation in xml")
            element_citation.extend(BeautifulSoup(c, "lxml-xml") for c in ext_links_xmls)
            del element_citation

            xarticle.bibitems[index].citation_xml = str("".join(str(c) for c in soup_ref.children))

            # Here we already have an <element-citation> tag, so we want to skip the <mixed-citation> creation
            xarticle.bibitems[index] = self.create_crawled_bibitem(xarticle.bibitems[index])

        if len(xarticle.bibitems) > 0:  # coverage: condition always true
            xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        xarticle.bibitems = []
        xarticle.bibitem = []

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # We crawl {article.url}/xml instead of article url

        if not xarticle.url:  # coverage: condition never true
            raise ValueError(f"{self.source_domain}] {xissue.pid} Cannot parse article url")

        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = str(xarticle.url)
        ext_link["metadata"] = self.source_domain
        xarticle.ext_links.append(ext_link)

        xarticle.url = xarticle.url + "/xml"
        return super().crawl_article(xarticle, xissue)
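
# Inferred crawl flow, assuming these hooks are driven by BaseCollectionCrawler:
#   parse_collection_content() turns the issue listing into xissues,
#   parse_issue_content() fills each xissue with stub ArticleData entries,
#   crawl_article() records the original page as a "source" ext-link and switches the
#   url to the "/xml" endpoint, and parse_article_content() parses the JATS XML (after
#   the XSLT fix), scrapes the HTML page for the reference list, and takes the PDF link
#   from the XML's for-review <uri>.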