Coverage for src/crawler/crawler_utils.py: 13%

208 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

# This file contains utility functions related to ArticleData or IssueData parsing and population.
# Some of these functions originally lived in base_crawler and were later moved here.


import logging
from email.policy import EmailPolicy
from typing import Callable

import regex
from bs4 import BeautifulSoup
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.references import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    create_abstract,
    create_contributor,
    create_issuedata,
    create_publisherdata,
)

from crawler.types import CitationLiteral
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

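# Maps a Highwire citation_* meta name to the ptf JATS builder used to render
# the matching piece of a bibliography entry.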
references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}

logger = logging.getLogger(__name__)


def parse_content_type_charset(content_type: str) -> str | None:
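    """Extract the charset parameter from an HTTP Content-Type header value.

    e.g. parse_content_type_charset("text/html; charset=utf-8") -> "utf-8".
    Returns None when no charset parameter is present.
    """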

    header = EmailPolicy.header_factory("content-type", content_type)
    if "charset" in header.params:
        return header.params.get("charset")
    return None


def parse_meta_citation_reference(content: str, label=None):
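    """Build a JATS reference (via JatsBase.bake_ref) from a citation_reference meta tag.

    The meta content is either a free-form reference string, or a semicolon-separated
    list of key=value pairs such as "citation_author=Doe, J.; citation_title=...".
    """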

    categories = content.split(";")

    # A single segment means a free-form reference: bake it as-is.
    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    # Split each segment on the first "=" only, so values containing "=" survive.
    citation_data = [c.split("=", 1) for c in categories if "=" in c]
    del categories

    xml_string = ""
    authors_parsed = False
    authors_strings = []
    for data in citation_data:
        key = data[0].strip()
        citation_content = data[1]
        if key == "citation_author":
            authors_strings.append(get_author_xml(template_str=citation_content))
            continue
        elif not authors_parsed:
            # Authors come first: flush them once a non-author key shows up.
            xml_string += ", ".join(authors_strings)
            authors_parsed = True

        if key in references_mapping:
            xml_string += " " + references_mapping[key](citation_content)

    return JatsBase.bake_ref(xml_string, label=label)


def set_pages(article: ArticleData, pages: str, separator: str = "-"):
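    """Set article.fpage/article.lpage from a page string such as "12-34".

    A value that does not start with a digit (e.g. roman numerals) is kept
    verbatim in article.page_range instead.
    """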

    pages_split = pages.split(separator)
    if not pages or not pages[0].isnumeric():
        # Non-numeric pagination: keep the raw string as the page range.
        article.page_range = pages
        return
    article.fpage = pages_split[0]
    if (
        len(pages_split) > 1
        and pages_split[0] != pages_split[1]
        and pages_split[1].isnumeric()
    ):
        article.lpage = pages_split[1]


def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
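    """Build the issue PID, e.g. get_issue_pid("AMBP", "2020", "27", "1") -> "AMBP_2020_27_1"."""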

    pid = f"{collection_id}_{year}"
    if series is not None:
        pid += f"_{series}"
    if volume_number is not None:
        pid += f"_{volume_number}"
    if issue_number is not None:
        pid += f"_{issue_number}"
    # Collapse any run of characters other than letters, digits or "-" into "_"
    pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
    return pid


def create_xissue(
    collection_id: str,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = "1",
    vseries: str | None = None,
):
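    """Create an IssueData and fill in its url, pid, year, volume, number and series."""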

    if url is not None and url.endswith("/"):
        url = url[:-1]
    xissue = create_issuedata()
    xissue.url = url

    xissue.pid = get_issue_pid(collection_id, year, volume_number, issue_number, vseries)

    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")

    if vseries is not None:
        xissue.vseries = vseries
    return xissue


def get_metadata_using_citation_meta(
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] = [],
    detect_language_fct: Callable[[str, ArticleData], str] | None = None,
):

    """
    :param xarticle: the ArticleData that will collect the metadata
    :param xissue: the IssueData that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_* items to collect
    :param detect_language_fct: optional fallback used to detect the abstract language
    :return: None. The given article is modified in place
    """
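    # Typical call from a crawler, as a sketch (the html variable and the chosen
    # "what" items are illustrative only):
    #   soup = BeautifulSoup(html, "html.parser")
    #   get_metadata_using_citation_meta(
    #       xarticle, xissue, soup, ["title", "author", "pdf", "abstract"]
    #   )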

    if "title" in what:
        # TITLE
        citation_title_node = soup.select_one("meta[name='citation_title']")
        if citation_title_node:
            title = citation_title_node.get("content")
            if isinstance(title, str):
                xarticle.title_tex = title

    if "author" in what:
        # AUTHORS
        citation_author_nodes = soup.select("meta[name^='citation_author']")
        current_author: ContributorDict | None = None
        for citation_author_node in citation_author_nodes:
            if citation_author_node.get("name") == "citation_author":
                text_author = citation_author_node.get("content")
                if not isinstance(text_author, str):
                    raise ValueError("Cannot parse author")
                if text_author == "":
                    current_author = None
                    continue
                current_author = create_contributor(role="author", string_name=text_author)
                xarticle.contributors.append(current_author)
                continue
            if current_author is None:
                logger.warning("Couldn't parse citation author")
                continue
            if citation_author_node.get("name") == "citation_author_institution":
                text_institution = citation_author_node.get("content")
                if not isinstance(text_institution, str):
                    continue
                current_author["addresses"].append(text_institution)
            if citation_author_node.get("name") == "citation_author_orcid":
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid

    if "pdf" in what:
        # PDF
        citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
        if citation_pdf_node:
            pdf_url = citation_pdf_node.get("content")
            if isinstance(pdf_url, str):
                add_pdf_link_to_xarticle(xarticle, pdf_url)

    if "lang" in what:
        # LANG
        citation_lang_node = soup.select_one("meta[name='citation_language']")
        if citation_lang_node:
            # TODO: check other language codes
            content_text = citation_lang_node.get("content")
            if isinstance(content_text, str):
                xarticle.lang = standardize_tag(content_text)

    if "abstract" in what:
        # ABSTRACT
        abstract_node = soup.select_one("meta[name='citation_abstract']")
        if abstract_node is not None:
            abstract = abstract_node.get("content")
            if not isinstance(abstract, str):
                raise ValueError("Couldn't parse abstract from meta")
            abstract = BeautifulSoup(abstract, "html.parser").text
            lang = abstract_node.get("lang")
            if not isinstance(lang, str) and detect_language_fct:
                lang = detect_language_fct(abstract, xarticle)
            # Only store the abstract when its language is known, and keep parsing
            # the rest of the metadata either way.
            if isinstance(lang, str):
                xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))

    if "page" in what:
        # PAGES
        citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
        if citation_fpage_node:
            page = citation_fpage_node.get("content")
            if isinstance(page, str):
                # Drop parenthesised additions and ignore implausibly long values
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.fpage = page

        citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
        if citation_lpage_node:
            page = citation_lpage_node.get("content")
            if isinstance(page, str):
                page = page.split("(")[0]
                if len(page) < 32:
                    xarticle.lpage = page

    if "doi" in what:
        # DOI
        citation_doi_node = soup.select_one("meta[name='citation_doi']")
        if citation_doi_node:
            doi = citation_doi_node.get("content")
            if isinstance(doi, str):
                doi = doi.strip()
                # Strip any prefix (such as "doi:") preceding the "10." part
                pos = doi.find("10.")
                if pos > 0:
                    doi = doi[pos:]
                xarticle.doi = doi

    if "mr" in what:
        # MR
        citation_mr_node = soup.select_one("meta[name='citation_mr']")
        if citation_mr_node:
            mr = citation_mr_node.get("content")
            if isinstance(mr, str):
                mr = mr.strip()
                if mr.startswith("MR"):
                    mr = mr[2:]
                xarticle.extids.append(("mr-item-id", mr))

    if "zbl" in what:
        # ZBL
        citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
        if citation_zbl_node:
            zbl = citation_zbl_node.get("content")
            if isinstance(zbl, str):
                zbl = zbl.strip()
                if zbl.startswith("Zbl"):
                    zbl = zbl[3:].strip()
                xarticle.extids.append(("zbl-item-id", zbl))

    if "publisher" in what:
        # PUBLISHER
        citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
        if citation_publisher_node:
            pub = citation_publisher_node.get("content")
            if isinstance(pub, str):
                pub = pub.strip()
                if pub != "":
                    xpub = create_publisherdata()
                    xpub.name = pub
                    xissue.publisher = xpub

    if "keywords" in what:
        # KEYWORDS
        citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
        for kwd_node in citation_kwd_nodes:
            kwds = kwd_node.get("content")
            if isinstance(kwds, str):
                for kwd in kwds.split(","):
                    # Strip before the emptiness test so whitespace-only entries are skipped
                    kwd = kwd.strip()
                    if kwd == "":
                        continue
                    xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})

    if "references" in what:
        citation_references = soup.select("meta[name='citation_reference']")
        for index, tag in enumerate(citation_references):
            content = tag.get("content")
            if not isinstance(content, str):
                raise ValueError("Cannot parse citation_reference meta")
            # References that already carry a "[n]" label keep it; otherwise number them.
            label = None if regex.match(r"^\[\d+\].*", content) else str(index + 1)
            xarticle.bibitems.append(parse_meta_citation_reference(content, label))


def article_has_pdf(art: ArticleData | IssueData) -> bool:
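    """Return True if the article (or issue) has an ext_link with rel "article-pdf"."""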

    return next((link for link in art.ext_links if link["rel"] == "article-pdf"), None) is not None


def article_has_source(art: ArticleData | IssueData) -> bool:
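    """Return True if the article (or issue) has an ext_link with rel "source"."""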

    return (
        next(
            (e_link for e_link in art.ext_links if e_link["rel"] == "source"),
            None,
        )
        is not None
    )