Coverage for src/crawler/crawler_utils.py: 13% (208 statements)
# This file contains utility functions related to ArticleData or IssueData parsing and population.
# Some of the functions here were initially in base_crawler but were later moved here.

import logging
from email.policy import EmailPolicy
from typing import Callable

import regex
from bs4 import BeautifulSoup
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.references import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    create_abstract,
    create_contributor,
    create_issuedata,
    create_publisherdata,
)

from crawler.types import CitationLiteral
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}

logger = logging.getLogger(__name__)


def parse_content_type_charset(content_type: str):
    header = EmailPolicy.header_factory("content-type", content_type)
    if "charset" in header.params:
        return header.params.get("charset")
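# Usage sketch (hypothetical header values):
#   parse_content_type_charset("text/html; charset=utf-8")  -> "utf-8"
#   parse_content_type_charset("text/html")                 -> None (no charset parameter)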


def parse_meta_citation_reference(content: str, label=None):
    categories = content.split(";")

    # A content string without "key=value" pairs is baked as-is.
    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    # Split on the first "=" only, so values that themselves contain "=" stay intact.
    citation_data = [c.split("=", 1) for c in categories if "=" in c]
    del categories

    xml_string = ""
    authors_parsed = False
    authors_strings = []
    for data in citation_data:
        key = data[0].strip()
        citation_content = data[1]
        if key == "citation_author":
            authors_strings.append(get_author_xml(template_str=citation_content))
            continue
        elif not authors_parsed:
            # Join the collected authors the first time a non-author key is seen.
            xml_string += ", ".join(authors_strings)
            authors_parsed = True

        if key in references_mapping:
            xml_string += " " + references_mapping[key](citation_content)

    return JatsBase.bake_ref(xml_string, label=label)
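# Example "citation_reference" meta content (hypothetical reference data):
#   parse_meta_citation_reference(
#       "citation_author=Doe, J.; citation_title=On widgets; "
#       "citation_journal_title=J. Widgetry; citation_publication_date=1999; "
#       "citation_firstpage=1; citation_lastpage=10",
#       label="1",
#   )
# returns a JATS reference string built from the mapped fields via JatsBase.bake_ref.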


def set_pages(article: ArticleData, pages: str, separator: str = "-"):
    pages_split = pages.split(separator)
    # str.split never returns an empty list, so use the first character to
    # decide between structured pages and a raw page range.
    if not pages or not pages[0].isnumeric():
        # Non-numeric pagination (e.g. roman numerals): keep the raw string.
        article.page_range = pages
        return
    article.fpage = pages_split[0]
    if (
        len(pages_split) > 1
        and pages_split[0] != pages_split[1]
        and pages_split[1].isnumeric()
    ):
        article.lpage = pages_split[1]
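# Examples (hypothetical inputs, matching the fallback behaviour above):
#   set_pages(xarticle, "12-34")   # fpage = "12", lpage = "34"
#   set_pages(xarticle, "7")       # fpage = "7"
#   set_pages(xarticle, "xii-xv")  # page_range = "xii-xv"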


def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    pid = f"{collection_id}_{year}"
    if series is not None:
        pid += f"_{series}"
    if volume_number is not None:
        pid += f"_{volume_number}"
    if issue_number is not None:
        pid += f"_{issue_number}"
    # Replace any run of characters other than letters, digits or hyphens
    # with a single underscore.
    pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
    return pid
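# Examples (hypothetical collection id):
#   get_issue_pid("AMBP", "2005", "12", "1")       -> "AMBP_2005_12_1"
#   get_issue_pid("AMBP", "2005", "12 bis", None)  -> "AMBP_2005_12_bis"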


def create_xissue(
    collection_id: str,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = "1",
    vseries: str | None = None,
):
    if url is not None and url.endswith("/"):
        url = url[:-1]
    xissue = create_issuedata()
    xissue.url = url

    xissue.pid = get_issue_pid(collection_id, year, volume_number, issue_number, vseries)

    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")

    if vseries is not None:
        xissue.vseries = vseries
    return xissue
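# Example (hypothetical collection id and URL):
#   xissue = create_xissue("AMBP", "https://example.com/2005/12/", "2005", "12")
#   # xissue.pid == "AMBP_2005_12_1" (issue_number defaults to "1")
#   # xissue.url == "https://example.com/2005/12" (trailing slash stripped)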


def get_metadata_using_citation_meta(
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] = [],
    detect_language_fct: Callable[[str, ArticleData], str] | None = None,
):
    """
    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect
    :param detect_language_fct: optional fallback used to detect the abstract
        language when the meta tag does not declare one
    :return: None. The given article is modified
    """
156 if "title" in what:
157 # TITLE
158 citation_title_node = soup.select_one("meta[name='citation_title']")
159 if citation_title_node:
160 title = citation_title_node.get("content")
161 if isinstance(title, str):
162 xarticle.title_tex = title
164 if "author" in what:
165 # AUTHORS
166 citation_author_nodes = soup.select("meta[name^='citation_author']")
167 current_author: ContributorDict | None = None
168 for citation_author_node in citation_author_nodes:
169 if citation_author_node.get("name") == "citation_author":
170 text_author = citation_author_node.get("content")
171 if not isinstance(text_author, str):
172 raise ValueError("Cannot parse author")
173 if text_author == "":
174 current_author = None
175 continue
176 current_author = create_contributor(role="author", string_name=text_author)
177 xarticle.contributors.append(current_author)
178 continue
179 if current_author is None:
180 logger.warning("Couldn't parse citation author")
181 continue
182 if citation_author_node.get("name") == "citation_author_institution":
183 text_institution = citation_author_node.get("content")
184 if not isinstance(text_institution, str):
185 continue
186 current_author["addresses"].append(text_institution)
            if citation_author_node.get("name") == "citation_author_orcid":
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid
193 if "pdf" in what:
194 # PDF
195 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
196 if citation_pdf_node:
197 pdf_url = citation_pdf_node.get("content")
198 if isinstance(pdf_url, str):
199 add_pdf_link_to_xarticle(xarticle, pdf_url)
201 if "lang" in what:
202 # LANG
203 citation_lang_node = soup.select_one("meta[name='citation_language']")
204 if citation_lang_node:
205 # TODO: check other language code
206 content_text = citation_lang_node.get("content")
207 if isinstance(content_text, str):
208 xarticle.lang = standardize_tag(content_text)
210 if "abstract" in what:
211 # ABSTRACT
212 abstract_node = soup.select_one("meta[name='citation_abstract']")
213 if abstract_node is not None:
214 abstract = abstract_node.get("content")
215 if not isinstance(abstract, str):
216 raise ValueError("Couldn't parse abstract from meta")
217 abstract = BeautifulSoup(abstract, "html.parser").text
218 lang = abstract_node.get("lang")
219 if not isinstance(lang, str):
220 if not detect_language_fct:
221 return
222 lang = detect_language_fct(abstract, xarticle)
223 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
225 if "page" in what:
226 # PAGES
227 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
228 if citation_fpage_node:
229 page = citation_fpage_node.get("content")
230 if isinstance(page, str):
231 page = page.split("(")[0]
232 if len(page) < 32:
233 xarticle.fpage = page
235 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
236 if citation_lpage_node:
237 page = citation_lpage_node.get("content")
238 if isinstance(page, str):
239 page = page.split("(")[0]
240 if len(page) < 32:
241 xarticle.lpage = page
243 if "doi" in what:
244 # DOI
245 citation_doi_node = soup.select_one("meta[name='citation_doi']")
246 if citation_doi_node:
247 doi = citation_doi_node.get("content")
248 if isinstance(doi, str):
249 doi = doi.strip()
250 pos = doi.find("10.")
251 if pos > 0:
252 doi = doi[pos:]
253 xarticle.doi = doi
255 if "mr" in what:
256 # MR
257 citation_mr_node = soup.select_one("meta[name='citation_mr']")
258 if citation_mr_node:
259 mr = citation_mr_node.get("content")
260 if isinstance(mr, str):
261 mr = mr.strip()
262 if mr.find("MR") == 0:
263 mr = mr[2:]
264 xarticle.extids.append(("mr-item-id", mr))
266 if "zbl" in what:
267 # ZBL
268 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
269 if citation_zbl_node:
270 zbl = citation_zbl_node.get("content")
271 if isinstance(zbl, str):
272 zbl = zbl.strip()
273 if zbl.find("Zbl") == 0:
274 zbl = zbl[3:].strip()
275 xarticle.extids.append(("zbl-item-id", zbl))
277 if "publisher" in what:
278 # PUBLISHER
279 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
280 if citation_publisher_node:
281 pub = citation_publisher_node.get("content")
282 if isinstance(pub, str):
283 pub = pub.strip()
284 if pub != "":
285 xpub = create_publisherdata()
286 xpub.name = pub
287 xissue.publisher = xpub
289 if "keywords" in what:
290 # KEYWORDS
291 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
292 for kwd_node in citation_kwd_nodes:
293 kwds = kwd_node.get("content")
294 if isinstance(kwds, str):
295 kwds = kwds.split(",")
296 for kwd in kwds:
297 if kwd == "":
298 continue
299 kwd = kwd.strip()
300 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
302 if "references" in what:
303 citation_references = soup.select("meta[name='citation_reference']")
304 for index, tag in enumerate(citation_references):
305 content = tag.get("content")
306 if not isinstance(content, str):
307 raise ValueError("Cannot parse citation_reference meta")
308 label = str(index + 1)
309 if regex.match(r"^\[\d+\].*", content):
310 label = None
311 xarticle.bibitems.append(parse_meta_citation_reference(content, label))
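# Usage sketch (hypothetical page; assumes the caller already fetched the HTML):
#   soup = BeautifulSoup(html_text, "html.parser")
#   get_metadata_using_citation_meta(
#       xarticle, xissue, soup,
#       ["title", "author", "pdf", "abstract", "page", "doi", "references"],
#   )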


def article_has_pdf(art: ArticleData | IssueData):
    return any(link["rel"] == "article-pdf" for link in art.ext_links)


def article_has_source(art: ArticleData | IssueData):
    return any(e_link["rel"] == "source" for e_link in art.ext_links)
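# Example (assumes add_pdf_link_to_xarticle registers an ext_link with
# rel="article-pdf", which is what article_has_pdf checks for):
#   add_pdf_link_to_xarticle(xarticle, "https://example.com/article.pdf")
#   article_has_pdf(xarticle)  # -> True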