Coverage for src / crawler / zbmath.py: 15%
152 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-03-19 14:59 +0000
1import logging
2import time
3from datetime import datetime, timedelta
4from typing import TYPE_CHECKING, Literal
6import langcodes
7import requests
8from django.conf import settings
9from matching.matching import get_article_params, zbmath_matching_query
10from ptf import model_data
11from ptf.cmds.xml.ckeditor.utils import (
12 build_jats_data_from_html_field,
13 get_html_and_xml_from_text_with_formulas,
14)
15from ptf.cmds.xml.xml_utils import escape
16from ptf.model_data_converter import get_contrib_xml
17from requests import HTTPError, ReadTimeout
19from crawler.utils import add_pdf_link_to_xarticle, get_session
21if TYPE_CHECKING:
22 from ptf.models import Article
logger = logging.getLogger(__name__)

# Presumably throttling state (timestamp of the last zbMATH call and the
# delay, in seconds, between calls) — neither is read in the code visible
# here; TODO confirm against the rest of the module.
last_zbmath_request = 0
request_delay = 5
# NOTE(review): duplicate of `logger` above (same __name__-based logger);
# both names are used in this module.
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
# REST endpoint returning the JSON record of document(s); {query} is either a
# zbl id or a structured-search query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
# Template for the direct PDF download URL of a zbMATH document.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
# Minimum interval between requests, overridable via Django settings.
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)
35def parse_zbmath_article(zbmath_article: dict):
36 """
37 Parse the json response of the zbMATH OPEN article fetched given its zblid
38 zbmath_article: a json storing article data
39 returns an ArticleData, that has no pid
40 TODO: Move in ptf-back
41 """
42 xarticle = model_data.create_articledata()
43 for json_author in zbmath_article["contributors"]["authors"]:
44 author = model_data.create_contributor(role="author", string_name=json_author["name"])
45 author["contrib_xml"] = get_contrib_xml(author)
46 xarticle.contributors.append(author)
48 # extids
49 zbmath_id = zbmath_article["identifier"]
50 if zbmath_id is None:
51 zbmath_id = zbmath_article["id"]
52 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip()))
54 # Lang
55 languages = zbmath_article["language"]["languages"]
56 if len(languages) > 0:
57 xarticle.lang = zbmath_article["language"]["languages"][0]
59 # Title
60 xarticle.title_tex = zbmath_article["title"]["title"]
62 ckeditor_data = build_jats_data_from_html_field(
63 xarticle.title_tex,
64 tag="article-title",
65 text_lang=xarticle.lang,
66 delimiter_inline="$",
67 delimiter_disp="$",
68 )
69 xarticle.title_html = ckeditor_data["value_html"]
70 xarticle.title_xml = ckeditor_data["value_xml"]
72 # Abstract
73 zbl_abstract = next(
74 (
75 c
76 for c in zbmath_article["editorial_contributions"]
77 if c["contribution_type"] in "summary"
78 ),
79 None,
80 )
81 if zbl_abstract:
82 abstract_data = model_data.create_abstract(value_tex=escape(zbl_abstract["text"]))
83 if zbl_abstract["language"]:
84 lang = langcodes.Language.find(zbl_abstract["language"]).language
85 if lang and xarticle.lang and lang != xarticle.lang:
86 abstract_data["lang"] = lang
87 # Zbmath abstracts are sometimes impossible to parse
88 # (They do not escape html-reserved characters like < >)
89 try:
90 abstract_data["value_html"], abstract_data["value_xml"] = (
91 get_html_and_xml_from_text_with_formulas(
92 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
93 )
94 )
95 xarticle.abstracts.append(abstract_data)
96 except BaseException as e:
97 logger.error(f"Got exception while parsing abstract for {zbmath_id} : {e}")
99 # Keywords
100 for kwd in zbmath_article["keywords"]:
101 if not kwd:
102 continue
103 xarticle.kwds.append(model_data.create_subj(value=kwd))
104 # MSC
105 for msc in zbmath_article["msc"]:
106 if msc["scheme"] != "msc2020":
107 continue
108 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc"))
110 # Pages
111 pages: str = zbmath_article["source"]["pages"]
112 if pages:
113 pages.split("-")
114 if len(pages) > 0:
115 xarticle.fpage = pages[0]
116 if len(pages) > 1:
117 xarticle.lpage = pages[1]
119 pdf_link: str | None = None
120 for link in zbmath_article["links"]:
121 match link["type"]:
122 case "doi":
123 xarticle.doi = link["identifier"].strip()
124 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
125 xarticle.extids.append(("doi", link["identifier"].strip()))
126 case "eudml":
127 xarticle.extids.append(("eudml-item-id", link["identifier"].strip()))
128 case "arxiv":
129 xarticle.extids.append(("arxiv", link["identifier"].strip()))
130 case "emis_ft":
131 pdf_link = link["url"].strip()
133 case "emis":
134 pdf_link = (
135 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/")
136 )
137 if pdf_link:
138 from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
140 BaseCollectionCrawler.session = get_session()
141 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link)
142 if not isok:
143 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}")
144 else:
145 add_pdf_link_to_xarticle(xarticle, pdf_link)
147 return xarticle
def zbmath_request_article(zblid, force_refresh=False):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    zblid: the zbMATH identifier to query
    force_refresh: bypass the requests_cache entry for this URL
    Returns an ArticleData (that has no pid), or None on HTTP error.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    # LOTs of cache-specific code here.
    # Maybe handle the caching logic elsewhere
    # so this code can be reused without the requests_cache module
    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**session.headers, "Content-Type": "text/json"},
        force_refresh=force_refresh,
    )
    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return

    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError as e:
        # Cannot invalidate: response is not taken from cache
        if force_refresh or not response.from_cache:
            # BUGFIX: requests.exceptions.JSONDecodeError subclasses
            # json.JSONDecodeError, whose __init__ requires (msg, doc, pos); the
            # original call passed only the message, so the raise itself failed
            # with a TypeError. Forward doc/pos from the caught exception.
            raise requests.exceptions.JSONDecodeError(
                f"Couldn't decode JSON from ZBMATH at address {response.url}",
                e.doc,
                e.pos,
                response=response,
            ) from e
        # Cached entry is garbled: wait a minute, then retry once bypassing the cache.
        logger.error(f"Couldn't decode JSON from ZBMATH at address {response.url}.")
        logger.info(f"Retrying in {60}s ({(datetime.now() + timedelta(minutes=1)).time()})")
        time.sleep(60)
        return zbmath_request_article(zblid, force_refresh=True)

    return parse_zbmath_article(data["result"])
# Maps our internal external-id kind to the zbMATH structured-search field
# name, already URL-encoded ("%20" is an escaped space).
id_type_mapping = {"doi": "DOI", "arXiv": "arXiv%20ID"}
def _zbmath_request_article_by_extid(extid: str, id_type: Literal["doi", "arXiv"]):
    """
    Fetch the single zbMATH document matching the given external identifier.

    extid: the identifier value (a DOI or an arXiv id)
    id_type: which structured-search field to query (see id_type_mapping)
    Returns an ArticleData, or None when the id is unknown or ambiguous.
    """
    query_search = id_type_mapping[id_type]
    session = get_session()
    search = f"_structured_search?results_per_page=1&{query_search}={extid}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # Consistency fix: use the module logger, not the root logger.
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return

    response.raise_for_status()
    payload = response.json()

    if payload["status"]["nr_total_results"] > 1:
        # Ambiguous match: refuse to guess which candidate is correct.
        logger.error(f"ZBMATH API found multiple candidates for {id_type} {extid}")
        return
    if payload["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return
    return parse_zbmath_article(payload["result"][0])
def zbmath_request_article_by_doi(doi: str):
    """Look up a zbMATH document by its DOI; returns an ArticleData or None."""
    return _zbmath_request_article_by_extid(doi, id_type="doi")
def zbmath_request_article_by_arxivId(id: str):
    """Look up a zbMATH document by its arXiv id; returns an ArticleData or None."""
    return _zbmath_request_article_by_extid(id, id_type="arXiv")
def _zbmath_query_retry(params: dict, timeout: int):
    """
    Run zbmath_matching_query, retrying up to 3 times on ReadTimeout.

    params: matching-query parameters (see get_article_params)
    timeout: per-request timeout in seconds
    Returns the response from zbmath_matching_query.
    Raises ReadTimeout if every attempt times out.
    """
    # Cleanup: the original declared `global last_zbmath_request` but never
    # read or wrote it in this function — removed.
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            # Last attempt: propagate with the original traceback (bare raise).
            if attempt >= RETRIES - 1:
                raise
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)
    # Defensive: raise explicitly instead of `assert False`, which is stripped
    # under `python -O` and would make the function silently return None.
    raise AssertionError("unreachable")
def match_zbl_article(article: "Article"):
    """Finds article using matching score"""
    # Query zbMATH with the article's matching parameters (with retries).
    response = _zbmath_query_retry(get_article_params(article), timeout=30)
    payload = response.json()

    # The first candidate of the first result group is the best match.
    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
def zbmath_get_pdfurl(zbmathid: str) -> str:
    """Return the direct PDF download URL on zbMATH for the given zbMATH id."""
    return ZBMATH_PDF_URL.format(zbmathid=zbmathid)