Coverage for src / crawler / zbmath.py: 14%
138 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-17 12:56 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-02-17 12:56 +0000
1import logging
2import time
3from typing import TYPE_CHECKING
5import langcodes
6from django.conf import settings
7from matching.matching import get_article_params, zbmath_matching_query
8from ptf import model_data
9from ptf.cmds.xml.ckeditor.utils import (
10 build_jats_data_from_html_field,
11 get_html_and_xml_from_text_with_formulas,
12)
13from ptf.cmds.xml.xml_utils import escape
14from ptf.model_data_converter import get_contrib_xml
15from requests import HTTPError, ReadTimeout
17from crawler.utils import add_pdf_link_to_xarticle, get_session
19if TYPE_CHECKING:
20 from ptf.models import Article
22logger = logging.getLogger(__name__)
24last_zbmath_request = 0
25request_delay = 5
26_logger = logging.getLogger(__name__)
27ZBMATH_URL = "https://zbmath.org"
28ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
29ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
30ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)
33def parse_zbmath_article(zbmath_article: dict):
34 """
35 Parse the json response of the zbMATH OPEN article fetched given its zblid
36 zbmath_article: a json storing article data
37 returns an ArticleData, that has no pid
38 TODO: Move in ptf-back
39 """
40 xarticle = model_data.create_articledata()
41 for json_author in zbmath_article["contributors"]["authors"]:
42 author = model_data.create_contributor(role="author", string_name=json_author["name"])
43 author["contrib_xml"] = get_contrib_xml(author)
44 xarticle.contributors.append(author)
46 # extids
47 zbmath_id = zbmath_article["identifier"]
48 if zbmath_id is None:
49 zbmath_id = zbmath_article["id"]
50 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip()))
52 # Lang
53 languages = zbmath_article["language"]["languages"]
54 if len(languages) > 0:
55 xarticle.lang = zbmath_article["language"]["languages"][0]
57 # Title
58 xarticle.title_tex = zbmath_article["title"]["title"]
60 ckeditor_data = build_jats_data_from_html_field(
61 xarticle.title_tex,
62 tag="article-title",
63 text_lang=xarticle.lang,
64 delimiter_inline="$",
65 delimiter_disp="$",
66 )
67 xarticle.title_html = ckeditor_data["value_html"]
68 xarticle.title_xml = ckeditor_data["value_xml"]
70 # Abstract
71 zbl_abstract = next(
72 (
73 c
74 for c in zbmath_article["editorial_contributions"]
75 if c["contribution_type"] in "summary"
76 ),
77 None,
78 )
79 if zbl_abstract:
80 abstract_data = model_data.create_abstract(value_tex=escape(zbl_abstract["text"]))
81 if zbl_abstract["language"]:
82 lang = langcodes.Language.find(zbl_abstract["language"]).language
83 if lang and xarticle.lang and lang != xarticle.lang:
84 abstract_data["lang"] = lang
85 # Zbmath abstracts are sometimes impossible to parse
86 # (They do not escape html-reserved characters like < >)
87 try:
88 abstract_data["value_html"], abstract_data["value_xml"] = (
89 get_html_and_xml_from_text_with_formulas(
90 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
91 )
92 )
93 xarticle.abstracts.append(abstract_data)
94 except BaseException as e:
95 logger.error(f"Got exception while parsing abstract for {zbmath_id} : {e}")
97 # Keywords
98 for kwd in zbmath_article["keywords"]:
99 if not kwd:
100 continue
101 xarticle.kwds.append(model_data.create_subj(value=kwd))
102 # MSC
103 for msc in zbmath_article["msc"]:
104 if msc["scheme"] != "msc2020":
105 continue
106 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc"))
108 # Pages
109 pages: str = zbmath_article["source"]["pages"]
110 if pages:
111 pages.split("-")
112 if len(pages) > 0:
113 xarticle.fpage = pages[0]
114 if len(pages) > 1:
115 xarticle.lpage = pages[1]
117 pdf_link: str | None = None
118 for link in zbmath_article["links"]:
119 match link["type"]:
120 case "doi":
121 xarticle.doi = link["identifier"].strip()
122 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
123 xarticle.extids.append(("doi", link["identifier"].strip()))
124 case "eudml":
125 xarticle.extids.append(("eudml-item-id", link["identifier"].strip()))
126 case "arxiv":
127 xarticle.extids.append(("arxiv", link["identifier"].strip()))
128 case "emis_ft":
129 pdf_link = link["url"].strip()
131 case "emis":
132 pdf_link = (
133 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/")
134 )
135 if pdf_link:
136 from crawler.base_crawler import BaseCollectionCrawler
138 BaseCollectionCrawler.session = get_session()
139 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link)
140 if not isok:
141 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}")
142 else:
143 add_pdf_link_to_xarticle(xarticle, pdf_link)
145 return xarticle
def zbmath_request_article(zblid):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    Returns an ArticleData (that has no pid), or None on an HTTP error.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    headers = {**session.headers, "Content-Type": "text/json"}
    response = session.get(ZBMATH_API_URL.format(query=zblid), headers=headers)
    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return None
    payload = response.json()
    return parse_zbmath_article(payload["result"])
def zbmath_request_article_by_doi(doi: str):
    """
    Fetch an article from zbMATH OPEN by DOI via the structured search API.

    Returns an ArticleData, or None when the DOI is unknown or matches
    several documents. Raises HTTPError for API errors other than 404.
    """
    session = get_session()
    search = f"_structured_search?results_per_page=1&DOI={doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # Use the module logger (not the root logger) for consistency with the file
        logger.debug(f"ZBMATH API {doi} not found")
        return None
    response.raise_for_status()

    payload = response.json()
    nr_results = payload["status"]["nr_total_results"]
    if nr_results > 1:
        logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return None
    if nr_results == 0:
        logger.debug(f"ZBMATH API {doi} not found")
        return None
    return parse_zbmath_article(payload["result"][0])
def zbmath_request_article_by_extid(extid: str):
    """Not implemented yet: fetch an article from zbMATH by an external id."""
    return None
def _zbmath_query_retry(params: dict, timeout: int):
    """
    Query the zbMATH matching endpoint, retrying on ReadTimeout.

    params: query parameters forwarded to zbmath_matching_query
    timeout: per-request timeout, in seconds
    Retries up to 3 times, sleeping 60 seconds between attempts; the last
    ReadTimeout is re-raised to the caller.
    """
    # NOTE: the previous `global last_zbmath_request` was dead code (the
    # function never read nor assigned it) and has been removed.
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if attempt >= RETRIES - 1:
                # Bare raise preserves the original traceback
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)
    # The loop always returns or re-raises; raise (instead of `assert`) so the
    # guard survives `python -O`, which strips assert statements.
    raise AssertionError("unreachable")
def match_zbl_article(article: "Article"):
    """Finds article using matching score"""
    # query zbl with parameters derived from the article
    query_params = get_article_params(article)
    matching_response = _zbmath_query_retry(query_params, timeout=30)
    payload = matching_response.json()

    try:
        best_match = payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
    return best_match
def zbmath_get_pdfurl(zbmathid: str) -> str:
    """Return the direct zbMATH PDF download URL for the given document id."""
    return ZBMATH_PDF_URL.format(zbmathid=zbmathid)