Coverage for src/crawler/zbmath.py: 0%
108 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-11-21 14:41 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-11-21 14:41 +0000
1# import shutil
2# import tempfile
3# import urllib
4# import xml.etree.ElementTree as ET
7import logging
8import time
9from typing import TYPE_CHECKING
11import langcodes
12from matching.matching import get_article_params, zbmath_matching_query
13from ptf import model_data
14from ptf.cmds.xml.ckeditor.utils import (
15 build_jats_data_from_html_field,
16 get_html_and_xml_from_text_with_formulas,
17)
18from ptf.external import get_session
19from ptf.model_data_converter import get_contrib_xml
20from requests import ReadTimeout
22if TYPE_CHECKING:
23 from ptf.models import Article
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# Base URL of the zbMATH website.
ZBMATH_URL = "https://zbmath.org"
# Template for zbMATH document API requests; ``{query}`` is filled in with
# either a document id or a search expression by the request helpers below.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
# Template for the PDF URL of a zbMATH item; ``{zbmathid}`` is the item id.
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
def parse_zbmath_article(zbmath_article):
    """Convert one zbMATH API document into an article data object.

    Args:
        zbmath_article: Decoded JSON document. This function reads the keys
            ``contributors.authors[].name``, ``language.languages``,
            ``title.title``, ``editorial_contributions``, ``keywords``,
            ``msc``, ``source.pages``, ``identifier``, ``id`` and ``links``.

    Returns:
        The object created by ``model_data.create_articledata()``, filled
        with contributors, language, title, abstract, keywords, pages, DOI
        and external ids found in the document.
    """
    xarticle = model_data.create_articledata()

    # Authors
    for json_author in zbmath_article["contributors"]["authors"]:
        author = model_data.create_contributor(role="author", string_name=json_author["name"])
        author["contrib_xml"] = get_contrib_xml(author)
        xarticle.contributors.append(author)

    # Lang: only the first listed language is kept.
    xarticle.lang = zbmath_article["language"]["languages"][0]

    # Title
    xarticle.title_tex = zbmath_article["title"]["title"]

    ckeditor_data = build_jats_data_from_html_field(
        xarticle.title_tex,
        tag="article-title",
        text_lang=xarticle.lang,
        delimiter_inline="$",
        delimiter_disp="$",
    )
    xarticle.title_html = ckeditor_data["value_html"]
    xarticle.title_xml = ckeditor_data["value_xml"]

    # Abstract: the first editorial contribution of type "summary", if any.
    zbl_abstract = next(
        (
            c
            for c in zbmath_article["editorial_contributions"]
            if c["contribution_type"] == "summary"
        ),
        None,
    )
    if zbl_abstract:
        abstract_data = model_data.create_abstract(value_tex=zbl_abstract["text"])
        if zbl_abstract["language"]:
            lang = langcodes.Language.find(zbl_abstract["language"]).language
            # Only override the abstract language when it differs from the
            # article language.
            if lang and xarticle.lang and lang != xarticle.lang:
                abstract_data["lang"] = lang

        abstract_data["value_html"], abstract_data["value_xml"] = (
            get_html_and_xml_from_text_with_formulas(
                abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\["
            )
        )
        xarticle.abstracts.append(abstract_data)

    # Keywords
    for kwd in zbmath_article["keywords"]:
        xarticle.kwds.append(model_data.create_subj(value=kwd))
    # MSC: only codes from the "msc2020" scheme are kept.
    for msc in zbmath_article["msc"]:
        if msc["scheme"] != "msc2020":
            continue
        xarticle.kwds.append(model_data.create_subj(value=msc["code"]))

    # Pages: "first-last". str.split returns a new list, so its result must
    # be kept; indexing the string itself would yield single characters.
    pages = zbmath_article["source"]["pages"]
    if pages:  # skips both a missing (None) and an empty page range
        page_parts = pages.split("-")
        xarticle.fpage = page_parts[0]
        if len(page_parts) > 1:
            xarticle.lpage = page_parts[1]

    # extids: fall back to "id" when "identifier" is null.
    zbmath_id = zbmath_article["identifier"]
    if zbmath_id is None:
        zbmath_id = zbmath_article["id"]
    xarticle.extids.append(("zbl-item-id", zbmath_id))

    # Links: "doi" and "eudml" are used; "http" and any other type are ignored.
    for link in zbmath_article["links"]:
        link_type = link["type"]
        if link_type == "doi":
            xarticle.doi = link["identifier"]
        elif link_type == "eudml":
            xarticle.extids.append(("eudml-item-id", link["identifier"]))
    return xarticle
def zbmath_request_article(zblid):
    """Fetch a single zbMATH document by id and parse it.

    Args:
        zblid: Value substituted for ``{query}`` in ``ZBMATH_API_URL``.

    Returns:
        The result of ``parse_zbmath_article`` applied to the ``"result"``
        entry of the JSON response.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an HTTP error
        status.
    """
    http = get_session()
    document_url = ZBMATH_API_URL.format(query=zblid)
    # Session headers are kept and Content-Type is overridden for this call.
    request_headers = {**http.headers, "Content-Type": "text/json"}

    reply = http.get(document_url, headers=request_headers)
    reply.raise_for_status()

    payload = reply.json()
    return parse_zbmath_article(payload["result"])
def zbmath_request_article_by_doi(doi: str):
    """Search zbMATH for the document with the given DOI and parse it.

    Args:
        doi: DOI to search for.

    Returns:
        The parsed article data, or ``None`` when the API answers 404,
        reports no result, or reports more than one candidate.

    Raises:
        Whatever ``response.raise_for_status()`` raises on an HTTP error
        status other than 404.
    """
    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query="_structured_search"),
        # Passed as params so the DOI is URL-encoded: DOIs may contain
        # characters such as "&", "#", "+" or "<" that would otherwise
        # corrupt a hand-built query string.
        params={"results_per_page": 1, "DOI": doi},
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        _logger.debug("ZBMATH API %s not found", doi)
        return None

    response.raise_for_status()

    payload = response.json()
    total_results = payload["status"]["nr_total_results"]
    if total_results > 1:
        # An ambiguous match is logged and skipped rather than raised.
        _logger.error("ZBMATH API found multiple candidates for doi %s", doi)
        return None
    if total_results == 0:
        _logger.debug("ZBMATH API %s not found", doi)
        return None
    return parse_zbmath_article(payload["result"][0])
def zbmath_request_article_by_extid(extid: str):
    """Placeholder for a lookup by external id; currently does nothing.

    Args:
        extid: Ignored.

    Returns:
        Always ``None``.
    """
    return None
def _zbmath_query_retry(
    params: dict,
    timeout: int,
    *,
    retries: int = 3,
    retry_delay: float = 60,
):
    """Run ``zbmath_matching_query``, retrying when it raises ``ReadTimeout``.

    Args:
        params: Passed through to ``zbmath_matching_query``.
        timeout: Passed through to ``zbmath_matching_query``.
        retries: Total number of attempts (must be at least 1).
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        Whatever ``zbmath_matching_query`` returns on the first attempt that
        does not time out.

    Raises:
        ValueError: If ``retries`` is smaller than 1.
        ReadTimeout: If the last attempt also times out.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    for attempt in range(1, retries + 1):
        try:
            return zbmath_matching_query(params, timeout)
        except ReadTimeout:
            if attempt == retries:
                # Out of attempts: re-raise with the original traceback.
                raise
            _logger.warning(
                "Encountered ReadTimeout while fetching Zbmath. retrying in %s secs",
                retry_delay,
            )
            time.sleep(retry_delay)

    # Every loop iteration either returns or raises; an explicit raise is
    # used instead of ``assert`` so the guard survives ``python -O``.
    raise AssertionError("unreachable")
def match_zbl_article(article: "Article"):
    """Query zbMATH matching for ``article`` and return the first hit.

    Args:
        article: Article whose parameters are built with
            ``get_article_params``.

    Returns:
        ``results["results"][0][0]`` from the JSON response, or ``None``
        when that entry does not exist.
    """
    # Query zbl (30 second timeout per attempt).
    query_params = get_article_params(article)
    payload = _zbmath_query_retry(query_params, timeout=30).json()

    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return None
def zbmath_get_pdfurl(zbmathid: str):
    """Build the PDF URL of a zbMATH item.

    Args:
        zbmathid: Value substituted for ``{zbmathid}`` in ``ZBMATH_PDF_URL``.

    Returns:
        The formatted URL string.
    """
    pdf_url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return pdf_url