Coverage for src/crawler/crawler_utils.py: 13% (208 statements)
# This file contains utility functions related to ArticleData or IssueData parsing and population.
# Some of the functions here were initially in base_crawler but were later moved here.

import logging
from email.policy import EmailPolicy
from typing import Callable

import regex
from bs4 import BeautifulSoup
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.references import (
    get_article_title_xml,
    get_author_xml,
    get_fpage_xml,
    get_lpage_xml,
    get_source_xml,
    get_year_xml,
)
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ContributorDict,
    IssueData,
    create_abstract,
    create_contributor,
    create_issuedata,
    create_publisherdata,
)

from crawler.types import CitationLiteral
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

references_mapping = {
    "citation_title": get_article_title_xml,
    "citation_journal_title": get_source_xml,
    "citation_publication_date": get_year_xml,
    "citation_firstpage": get_fpage_xml,
    "citation_lastpage": get_lpage_xml,
}

logger = logging.getLogger(__name__)


def parse_content_type_charset(content_type: str):
    header = EmailPolicy.header_factory("content-type", content_type)
    if "charset" in header.params:
        return header.params.get("charset")
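# Usage sketch (hypothetical header values):
#   parse_content_type_charset("text/html; charset=utf-8")  -> "utf-8"
#   parse_content_type_charset("text/html")                 -> None (no charset parameter)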


def parse_meta_citation_reference(content: str, label=None):
    categories = content.split(";")

    # A content string without "key=value" pairs is baked as-is.
    if len(categories) == 1:
        return JatsBase.bake_ref(content, label=label)

    # Split on the first "=" only, so values that themselves contain "=" stay intact.
    citation_data = [c.split("=", 1) for c in categories if "=" in c]
    del categories

    xml_string = ""
    authors_parsed = False
    authors_strings = []
    for data in citation_data:
        key = data[0].strip()
        citation_content = data[1]
        if key == "citation_author":
            authors_strings.append(get_author_xml(template_str=citation_content))
            continue
        elif not authors_parsed:
            # Join the collected authors the first time a non-author key is seen.
            xml_string += ", ".join(authors_strings)
            authors_parsed = True

        if key in references_mapping:
            xml_string += " " + references_mapping[key](citation_content)

    return JatsBase.bake_ref(xml_string, label=label)
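# Example "citation_reference" meta content (hypothetical reference data):
#   parse_meta_citation_reference(
#       "citation_author=Doe, J.; citation_title=On widgets; "
#       "citation_journal_title=J. Widgetry; citation_publication_date=1999; "
#       "citation_firstpage=1; citation_lastpage=10",
#       label="1",
#   )
# returns a JATS reference string built from the mapped fields via JatsBase.bake_ref.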


def set_pages(article: ArticleData, pages: str, separator: str = "-"):
    pages_split = pages.split(separator)
    # str.split never returns an empty list, so use the first character to
    # decide between structured pages and a raw page range.
    if not pages or not pages[0].isnumeric():
        # Non-numeric pagination (e.g. roman numerals): keep the raw string.
        article.page_range = pages
        return
    article.fpage = pages_split[0]
    if (
        len(pages_split) > 1
        and pages_split[0] != pages_split[1]
        and pages_split[1].isnumeric()
    ):
        article.lpage = pages_split[1]
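# Examples (hypothetical inputs, matching the fallback behaviour above):
#   set_pages(xarticle, "12-34")   # fpage = "12", lpage = "34"
#   set_pages(xarticle, "7")       # fpage = "7"
#   set_pages(xarticle, "xii-xv")  # page_range = "xii-xv"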


def get_issue_pid(
    collection_id: str,
    year: str,
    volume_number: str | None = None,
    issue_number: str | None = None,
    series: str | None = None,
):
    pid = f"{collection_id}_{year}"
    if series is not None:
        pid += f"_{series}"
    if volume_number is not None:
        pid += f"_{volume_number}"
    if issue_number is not None:
        pid += f"_{issue_number}"
    # Replace any run of characters other than letters, digits or hyphens
    # with a single underscore.
    pid = regex.sub(r"[^a-zA-Z0-9-]+", "_", cleanup_str(pid))
    return pid
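# Examples (hypothetical collection id):
#   get_issue_pid("AMBP", "2005", "12", "1")       -> "AMBP_2005_12_1"
#   get_issue_pid("AMBP", "2005", "12 bis", None)  -> "AMBP_2005_12_bis"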


def create_xissue(
    collection_id: str,
    url: str | None,
    year: str,
    volume_number: str | None,
    issue_number: str | None = "1",
    vseries: str | None = None,
):
    if url is not None and url.endswith("/"):
        url = url[:-1]
    xissue = create_issuedata()
    xissue.url = url

    xissue.pid = get_issue_pid(collection_id, year, volume_number, issue_number, vseries)

    xissue.year = year

    if volume_number is not None:
        xissue.volume = regex.sub(r"[^a-zA-Z0-9-]+", "_", volume_number)

    if issue_number is not None:
        xissue.number = issue_number.replace(",", "-")

    if vseries is not None:
        xissue.vseries = vseries
    return xissue
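# Example (hypothetical collection id and URL):
#   xissue = create_xissue("AMBP", "https://example.com/2005/12/", "2005", "12")
#   # xissue.pid == "AMBP_2005_12_1" (issue_number defaults to "1")
#   # xissue.url == "https://example.com/2005/12" (trailing slash stripped)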


def get_metadata_using_citation_meta(
    xarticle: ArticleData,
    xissue: IssueData,
    soup: BeautifulSoup,
    what: list[CitationLiteral] = [],
    detect_language_fct: Callable[[str, ArticleData], str] | None = None,
):
    """
    :param xarticle: the xarticle that will collect the metadata
    :param xissue: the xissue that will collect the publisher
    :param soup: the BeautifulSoup object of the article page
    :param what: list of citation_ items to collect
    :param detect_language_fct: optional fallback used to detect the abstract
        language when the meta tag does not declare one
    :return: None. The given article is modified
    """
156 if "title" in what:
157 # TITLE
158 citation_title_node = soup.select_one("meta[name='citation_title']")
159 if citation_title_node:
160 title = citation_title_node.get("content")
161 if isinstance(title, str):
162 xarticle.title_tex = title
164 if "author" in what:
165 # AUTHORS
166 citation_author_nodes = soup.select("meta[name^='citation_author']")
167 current_author: ContributorDict | None = None
168 for citation_author_node in citation_author_nodes:
169 if citation_author_node.get("name") == "citation_author":
170 text_author = citation_author_node.get("content")
171 if not isinstance(text_author, str):
172 raise ValueError("Cannot parse author")
173 if text_author == "":
174 current_author = None
175 continue
176 current_author = create_contributor(role="author", string_name=text_author)
177 xarticle.contributors.append(current_author)
178 continue
179 if current_author is None:
180 logger.warning("Couldn't parse citation author")
181 continue
182 if citation_author_node.get("name") == "citation_author_institution":
183 text_institution = citation_author_node.get("content")
184 if not isinstance(text_institution, str):
185 continue
186 current_author["addresses"].append(text_institution)
            if citation_author_node.get("name") == "citation_author_orcid":
                text_orcid = citation_author_node.get("content")
                if not isinstance(text_orcid, str):
                    continue
                current_author["orcid"] = text_orcid
193 if "pdf" in what:
194 # PDF
195 citation_pdf_node = soup.select_one('meta[name="citation_pdf_url"]')
196 if citation_pdf_node:
197 pdf_url = citation_pdf_node.get("content")
198 if isinstance(pdf_url, str):
199 add_pdf_link_to_xarticle(xarticle, pdf_url)
201 if "lang" in what:
202 # LANG
203 citation_lang_node = soup.select_one("meta[name='citation_language']")
204 if citation_lang_node:
205 # TODO: check other language code
206 content_text = citation_lang_node.get("content")
207 if isinstance(content_text, str):
208 xarticle.lang = standardize_tag(content_text)
210 if "abstract" in what:
211 # ABSTRACT
212 abstract_node = soup.select_one("meta[name='citation_abstract']")
213 if abstract_node is not None:
214 abstract = abstract_node.get("content")
215 if not isinstance(abstract, str):
216 raise ValueError("Couldn't parse abstract from meta")
217 abstract = BeautifulSoup(abstract, "html.parser").text
218 lang = abstract_node.get("lang")
219 if not isinstance(lang, str):
220 if not detect_language_fct:
221 return
222 lang = detect_language_fct(abstract, xarticle)
223 xarticle.abstracts.append(create_abstract(lang=lang, value_tex=abstract))
225 if "page" in what:
226 # PAGES
227 citation_fpage_node = soup.select_one("meta[name='citation_firstpage']")
228 if citation_fpage_node:
229 page = citation_fpage_node.get("content")
230 if isinstance(page, str):
231 page = page.split("(")[0]
232 if len(page) < 32:
233 xarticle.fpage = page
235 citation_lpage_node = soup.select_one("meta[name='citation_lastpage']")
236 if citation_lpage_node:
237 page = citation_lpage_node.get("content")
238 if isinstance(page, str):
239 page = page.split("(")[0]
240 if len(page) < 32:
241 xarticle.lpage = page
243 if "doi" in what:
244 # DOI
245 citation_doi_node = soup.select_one("meta[name='citation_doi']")
246 if citation_doi_node:
247 doi = citation_doi_node.get("content")
248 if isinstance(doi, str):
249 doi = doi.strip()
250 pos = doi.find("10.")
251 if pos > 0:
252 doi = doi[pos:]
253 xarticle.doi = doi
255 if "mr" in what:
256 # MR
257 citation_mr_node = soup.select_one("meta[name='citation_mr']")
258 if citation_mr_node:
259 mr = citation_mr_node.get("content")
260 if isinstance(mr, str):
261 mr = mr.strip()
262 if mr.find("MR") == 0:
263 mr = mr[2:]
264 xarticle.extids.append(("mr-item-id", mr))
266 if "zbl" in what:
267 # ZBL
268 citation_zbl_node = soup.select_one("meta[name='citation_zbl']")
269 if citation_zbl_node:
270 zbl = citation_zbl_node.get("content")
271 if isinstance(zbl, str):
272 zbl = zbl.strip()
273 if zbl.find("Zbl") == 0:
274 zbl = zbl[3:].strip()
275 xarticle.extids.append(("zbl-item-id", zbl))
277 if "publisher" in what:
278 # PUBLISHER
279 citation_publisher_node = soup.select_one("meta[name='citation_publisher']")
280 if citation_publisher_node:
281 pub = citation_publisher_node.get("content")
282 if isinstance(pub, str):
283 pub = pub.strip()
284 if pub != "":
285 xpub = create_publisherdata()
286 xpub.name = pub
287 xissue.publisher = xpub
289 if "keywords" in what:
290 # KEYWORDS
291 citation_kwd_nodes = soup.select("meta[name='citation_keywords']")
292 for kwd_node in citation_kwd_nodes:
293 kwds = kwd_node.get("content")
294 if isinstance(kwds, str):
295 kwds = kwds.split(",")
296 for kwd in kwds:
297 if kwd == "":
298 continue
299 kwd = kwd.strip()
300 xarticle.kwds.append({"type": "", "lang": xarticle.lang, "value": kwd})
302 if "references" in what:
303 citation_references = soup.select("meta[name='citation_reference']")
304 for index, tag in enumerate(citation_references):
305 content = tag.get("content")
306 if not isinstance(content, str):
307 raise ValueError("Cannot parse citation_reference meta")
308 label = str(index + 1)
309 if regex.match(r"^\[\d+\].*", content):
310 label = None
311 xarticle.bibitems.append(parse_meta_citation_reference(content, label))
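# Usage sketch (hypothetical page; assumes the caller already fetched the HTML):
#   soup = BeautifulSoup(html_text, "html.parser")
#   get_metadata_using_citation_meta(
#       xarticle, xissue, soup,
#       ["title", "author", "pdf", "abstract", "page", "doi", "references"],
#   )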


def article_has_pdf(art: ArticleData | IssueData):
    return any(link["rel"] == "article-pdf" for link in art.ext_links)


def article_has_source(art: ArticleData | IssueData):
    return any(e_link["rel"] == "source" for e_link in art.ext_links)
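# Example (assumes add_pdf_link_to_xarticle registers an ext_link with
# rel="article-pdf", which is what article_has_pdf checks for):
#   add_pdf_link_to_xarticle(xarticle, "https://example.com/article.pdf")
#   article_has_pdf(xarticle)  # -> True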