Coverage for src / crawler / zbmath.py: 0%
131 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-23 15:27 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-23 15:27 +0000
1# import shutil
2# import tempfile
3# import urllib
4# import xml.etree.ElementTree as ET
7import logging
8import time
9from typing import TYPE_CHECKING
11import langcodes
12from matching.matching import get_article_params, zbmath_matching_query
13from ptf import model_data
14from ptf.cmds.xml.ckeditor.utils import (
15 build_jats_data_from_html_field,
16 get_html_and_xml_from_text_with_formulas,
17)
18from ptf.model_data_converter import get_contrib_xml
19from requests import HTTPError, ReadTimeout
21from crawler.utils import add_pdf_link_to_xarticle, get_session
23if TYPE_CHECKING:
24 from ptf.models import Article
# Timestamp (time.time()) of the last zbMATH matching request that was not
# served from cache; written by _zbmath_query_retry. Starts at 0, so the first
# request never has to wait.
last_zbmath_request = 0
# Minimum number of seconds _zbmath_query_retry leaves between two such requests.
request_delay = 5
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# zbMATH website root.
ZBMATH_URL = "https://zbmath.org"
# zbMATH API document endpoint template; "{query}" is filled either with a
# document id or with a "_structured_search?..." query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
# Template of the PDF URL of a zbMATH document; filled by zbmath_get_pdfurl.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
35def parse_zbmath_article(zbmath_article):
36 """
37 Parse the json response of the zbMATH OPEN article fetched given its zblid
38 zbmath_article: a json storing article data
39 returns an ArticleData, that has no pid
40 TODO: Move in ptf-back
41 """
42 xarticle = model_data.create_articledata()
43 for json_author in zbmath_article["contributors"]["authors"]:
44 author = model_data.create_contributor(role="author", string_name=json_author["name"])
45 author["contrib_xml"] = get_contrib_xml(author)
46 xarticle.contributors.append(author)
48 # Lang
49 languages = zbmath_article["language"]["languages"]
50 if len(languages) > 0:
51 xarticle.lang = zbmath_article["language"]["languages"][0]
53 # Title
54 xarticle.title_tex = zbmath_article["title"]["title"]
56 ckeditor_data = build_jats_data_from_html_field(
57 xarticle.title_tex,
58 tag="article-title",
59 text_lang=xarticle.lang,
60 delimiter_inline="$",
61 delimiter_disp="$",
62 )
63 xarticle.title_html = ckeditor_data["value_html"]
64 xarticle.title_xml = ckeditor_data["value_xml"]
66 # Abstract
67 zbl_abstract = next(
68 (
69 c
70 for c in zbmath_article["editorial_contributions"]
71 if c["contribution_type"] in "summary"
72 ),
73 None,
74 )
75 if zbl_abstract:
76 abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"])
77 if zbl_abstract["language"]:
78 lang = langcodes.Language.find(zbl_abstract["language"]).language
79 if lang and xarticle.lang and lang != xarticle.lang:
80 abstract_data["lang"] = lang
82 abstract_data["value_html"], abstract_data["value_xml"] = (
83 get_html_and_xml_from_text_with_formulas(
84 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
85 )
86 )
87 xarticle.abstracts.append(abstract_data)
89 # Keywords
90 for kwd in zbmath_article["keywords"]:
91 if not kwd:
92 continue
93 xarticle.kwds.append(model_data.create_subj(value=kwd))
94 # MSC
95 for msc in zbmath_article["msc"]:
96 if msc["scheme"] != "msc2020":
97 continue
98 xarticle.kwds.append(model_data.create_subj(value=msc["code"]))
100 # Pages
101 pages: str = zbmath_article["source"]["pages"]
102 if pages:
103 pages.split("-")
104 if len(pages) > 0:
105 xarticle.fpage = pages[0]
106 if len(pages) > 1:
107 xarticle.lpage = pages[1]
109 # extids
110 zbmath_id = zbmath_article["identifier"]
111 if zbmath_id is None:
112 zbmath_id = zbmath_article["id"]
113 xarticle.extids.append(("zbl-item-id", zbmath_id.strip()))
115 for link in zbmath_article["links"]:
116 match link["type"]:
117 case "doi":
118 xarticle.doi = link["identifier"].strip()
119 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_")
120 xarticle.extids.append(("doi", link["identifier"].strip()))
121 case "eudml":
122 xarticle.extids.append(("eudml-item-id", link["identifier"].strip()))
123 case "arxiv":
124 xarticle.extids.append(("arxiv", link["identifier"].strip()))
125 case "emis_ft":
126 add_pdf_link_to_xarticle(xarticle, link["url"].strip())
127 return xarticle
def zbmath_request_article(zblid):
    """
    Fetch the article from zbMATH OPEN given its zblid

    zblid: identifier inserted in the API document URL
    returns the ArticleData built by parse_zbmath_article from the "result"
        field of the response, or None when the API answers with an HTTP
        error status
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    try:
        response.raise_for_status()
    except HTTPError as error:
        # Best effort: a failed lookup is not fatal, but leave a trace of it
        _logger.debug(f"ZBMATH API request for {zblid} failed: {error}")
        return None

    # Throttle only after a real network request: a response served from the
    # cache did not hit the API, so there is nothing to wait for. getattr keeps
    # this working when the session does not set `from_cache`, as done in
    # _zbmath_query_retry.
    if not getattr(response, "from_cache", False):
        time.sleep(10)

    payload = response.json()
    return parse_zbmath_article(payload["result"])
def zbmath_request_article_by_doi(doi: str):
    """
    Fetch the article from zbMATH OPEN given its DOI (structured search)

    doi: the DOI to search for
    returns the ArticleData built by parse_zbmath_article from the single
        search result, or None when the API answers 404 or reports zero or
        several candidates
    raises requests.HTTPError for any HTTP error status other than 404
    """
    from urllib.parse import quote

    session = get_session()
    # DOIs may contain characters such as "#", "&" or "<" that would otherwise
    # be read as URL syntax and truncate or corrupt the query.
    search = f"_structured_search?results_per_page=1&DOI={quote(doi, safe='/')}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        _logger.debug(f"ZBMATH API {doi} not found")
        return None

    response.raise_for_status()

    # Throttle only after a real network request: a response served from the
    # cache did not hit the API (same convention as _zbmath_query_retry).
    if not getattr(response, "from_cache", False):
        time.sleep(10)

    payload = response.json()
    total_results = payload["status"]["nr_total_results"]
    if total_results > 1:
        # Ambiguous match: refuse to pick one rather than raising
        _logger.error(f"ZBMATH API found multiple candidates for doi {doi}")
        return None
    if total_results == 0:
        _logger.debug(f"ZBMATH API {doi} not found")
        return None
    return parse_zbmath_article(payload["result"][0])
def zbmath_request_article_by_extid(extid: str):
    """Placeholder: lookup by external id is not implemented, always returns None."""
    return None
def _zbmath_query_retry(params: dict, timeout: int, *, retries: int = 3, retry_wait: float = 60):
    """
    Run zbmath_matching_query, spacing out requests and retrying on ReadTimeout

    params: query parameters forwarded to zbmath_matching_query
    timeout: timeout forwarded to zbmath_matching_query
    retries: total number of attempts, at least 1 (default 3)
    retry_wait: seconds slept after a ReadTimeout before the next attempt (default 60)
    returns the response of zbmath_matching_query
    raises ReadTimeout when the last attempt times out too, ValueError when
        retries is lower than 1
    """
    global last_zbmath_request
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(1, retries + 1):
        # Leave at least `request_delay` seconds since the last non-cached request
        delta = (last_zbmath_request + request_delay) - time.time()
        if delta > 0:
            _logger.info(f"Waiting {round(delta)}s before making another request")
            time.sleep(delta)

        try:
            response = zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if attempt >= retries:
                # Out of attempts: bare raise keeps the original traceback
                raise
            _logger.warning(
                f"Encountered ReadTimeout while fetching Zbmath. retrying in {retry_wait} secs"
            )
            time.sleep(retry_wait)
            continue

        # Cached responses did not hit the API, so they do not move the clock
        if not getattr(response, "from_cache", False):
            last_zbmath_request = time.time()
        return response

    # Every iteration either returns or raises; an explicit raise (unlike
    # `assert`) is still there under `python -O`.
    raise AssertionError("unreachable")
def match_zbl_article(article: "Article"):
    """
    Finds article using matching score

    Queries zbMATH with the parameters built by get_article_params and returns
    the first element of the first entry of the "results" field, or None when
    the response holds no such entry.
    """
    query_params = get_article_params(article)
    payload = _zbmath_query_retry(query_params, timeout=30).json()

    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
def zbmath_get_pdfurl(zbmathid: str):
    """Return the PDF URL of a zbMATH document, built from the ZBMATH_PDF_URL template."""
    pdf_url = ZBMATH_PDF_URL.format_map({"zbmathid": zbmathid})
    return pdf_url