Coverage for src/crawler/zbmath.py: 14% (134 statements)
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
import logging
import time
from typing import TYPE_CHECKING

import langcodes
from django.conf import settings
from matching.matching import get_article_params, zbmath_matching_query
from ptf import model_data
from ptf.cmds.xml.ckeditor.utils import (
    build_jats_data_from_html_field,
    get_html_and_xml_from_text_with_formulas,
)
from ptf.model_data_converter import get_contrib_xml
from requests import HTTPError, ReadTimeout

from crawler.utils import add_pdf_link_to_xarticle, get_session

if TYPE_CHECKING:
    from ptf.models import Article

logger = logging.getLogger(__name__)

last_zbmath_request = 0
request_delay = 5
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)


def parse_zbmath_article(zbmath_article):
    """
    Parse the JSON response for a zbMATH OPEN article fetched by its zblid.
    zbmath_article: a dict holding the article data
    Returns an ArticleData that has no pid.
    TODO: Move to ptf-back
    """
    xarticle = model_data.create_articledata()
    for json_author in zbmath_article["contributors"]["authors"]:
        author = model_data.create_contributor(role="author", string_name=json_author["name"])
        author["contrib_xml"] = get_contrib_xml(author)
        xarticle.contributors.append(author)

    # Lang
    languages = zbmath_article["language"]["languages"]
    if len(languages) > 0:
        xarticle.lang = languages[0]

    # Title
    xarticle.title_tex = zbmath_article["title"]["title"]

    ckeditor_data = build_jats_data_from_html_field(
        xarticle.title_tex,
        tag="article-title",
        text_lang=xarticle.lang,
        delimiter_inline="$",
        delimiter_disp="$",
    )
    xarticle.title_html = ckeditor_data["value_html"]
    xarticle.title_xml = ckeditor_data["value_xml"]

    # Abstract
    zbl_abstract = next(
        (
            c
            for c in zbmath_article["editorial_contributions"]
            if c["contribution_type"] == "summary"
        ),
        None,
    )
    if zbl_abstract:
        abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"])
        if zbl_abstract["language"]:
            lang = langcodes.Language.find(zbl_abstract["language"]).language
            if lang and xarticle.lang and lang != xarticle.lang:
                abstract_data["lang"] = lang

        abstract_data["value_html"], abstract_data["value_xml"] = (
            get_html_and_xml_from_text_with_formulas(
                abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
            )
        )
        xarticle.abstracts.append(abstract_data)

    # Keywords
    for kwd in zbmath_article["keywords"]:
        if not kwd:
            continue
        xarticle.kwds.append(model_data.create_subj(value=kwd))
    # MSC
    for msc in zbmath_article["msc"]:
        if msc["scheme"] != "msc2020":
            continue
        xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc"))

    # Pages
    pages: str = zbmath_article["source"]["pages"]
    if pages:
        # Split a page range such as "12-34" into first and last page
        page_range = pages.split("-")
        if len(page_range) > 0:
            xarticle.fpage = page_range[0]
        if len(page_range) > 1:
            xarticle.lpage = page_range[1]

    # extids
    zbmath_id = zbmath_article["identifier"]
    if zbmath_id is None:
        zbmath_id = zbmath_article["id"]
    xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip()))

    pdf_link: str | None = None
    for link in zbmath_article["links"]:
        match link["type"]:
            case "doi":
                xarticle.doi = link["identifier"].strip()
                xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
                xarticle.extids.append(("doi", link["identifier"].strip()))
            case "eudml":
                xarticle.extids.append(("eudml-item-id", link["identifier"].strip()))
            case "arxiv":
                xarticle.extids.append(("arxiv", link["identifier"].strip()))
            case "emis_ft":
                pdf_link = link["url"].strip()
            case "emis":
                pdf_link = (
                    link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/")
                )
    if pdf_link:
        from crawler.base_crawler import BaseCollectionCrawler

        BaseCollectionCrawler.session = get_session()
        isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link)
        if not isok:
            logger.warning(f"This pdf link doesn't seem to work: {pdf_link}")
        else:
            add_pdf_link_to_xarticle(xarticle, pdf_link)

    return xarticle
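
# Illustrative sketch only (not used by the crawler): the minimal payload shape that
# parse_zbmath_article reads, reconstructed from the accesses above. All values are
# placeholders; real zbMATH OPEN documents carry many more fields.
#
#     {
#         "contributors": {"authors": [{"name": "Doe, Jane"}]},
#         "language": {"languages": ["en"]},
#         "title": {"title": "On some $L^2$ estimates"},
#         "editorial_contributions": [
#             {"contribution_type": "summary", "language": "English", "text": "We prove ..."}
#         ],
#         "keywords": ["elliptic equations"],
#         "msc": [{"scheme": "msc2020", "code": "35J10"}],
#         "source": {"pages": "1-20"},
#         "identifier": "1234.56789",
#         "id": 1234567,
#         "links": [{"type": "doi", "identifier": "10.1000/example"}],
#     }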


def zbmath_request_article(zblid):
    """
    Fetch an article from zbMATH OPEN given its zblid.
    Returns an ArticleData that has no pid, or None on HTTP error.
    TODO: Move the code to ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    # Silence HTTP errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return

    response = response.json()
    return parse_zbmath_article(response["result"])
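
# Usage sketch (illustrative; the zblid below is a placeholder, not a real identifier):
#
#     xarticle = zbmath_request_article("1234.56789")
#     if xarticle is not None:
#         print(xarticle.title_tex, xarticle.extids)
#
# None is returned when the zbMATH API answers with an HTTP error status.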


def zbmath_request_article_by_doi(doi: str):
    session = get_session()
    search = f"_structured_search?results_per_page=1&DOI={doi}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        logger.debug(f"ZBMATH API {doi} not found")
        return

    response.raise_for_status()
    response = response.json()

    if response["status"]["nr_total_results"] > 1:
        logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return
        # raise ValueError(f"ZBMATH API found multiple candidates for doi {doi}")
    if response["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {doi} not found")
        return
    return parse_zbmath_article(response["result"][0])
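
# Usage sketch (illustrative; the DOI below is a placeholder):
#
#     xarticle = zbmath_request_article_by_doi("10.1000/example")
#
# None is returned when the DOI is unknown to zbMATH (404 or zero results) or when
# the structured search reports more than one candidate.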


def zbmath_request_article_by_extid(extid: str):
    pass


def _zbmath_query_retry(params: dict, timeout: int):
    global last_zbmath_request
    RETRIES = 3
    for i in range(RETRIES):
        try:
            response = zbmath_matching_query(params, timeout, get_session())
            return response
        except ReadTimeout as e:
            if i >= RETRIES - 1:
                raise e
            _logger.warning("Encountered ReadTimeout while fetching zbMATH. Retrying in 60 secs")
            time.sleep(60)

    assert False, "unreachable"


def match_zbl_article(article: "Article"):
    """Find the zbMATH entry matching an article, using the matching score"""
    # query zbl
    params = get_article_params(article)
    response = _zbmath_query_retry(params, timeout=30)
    results = response.json()

    try:
        item = results["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return
    return item
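
# Usage sketch (illustrative; assumes `article` is a ptf Article instance):
#
#     item = match_zbl_article(article)
#     if item is not None:
#         ...  # item is the top-ranked candidate returned by the matching service
#
# None is returned when the matching service yields no candidate for the article.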


def zbmath_get_pdfurl(zbmathid: str):
    return ZBMATH_PDF_URL.format(zbmathid=zbmathid)
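
# Example (illustrative; the identifier is a placeholder):
#
#     zbmath_get_pdfurl("1234567")  # -> "https://zbmath.org/pdf/1234567.pdf"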