Coverage for src/crawler/zbmath.py: 0%
57 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-24 10:35 +0000
1# import shutil
2# import tempfile
3# import urllib
4# import xml.etree.ElementTree as ET
6import requests
7from bs4 import BeautifulSoup
8from ptf import model_data
9from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
10from ptf.cmds.xml.xml_utils import get_contrib_xml
# Base URL of the zbMATH web site; its HTML pages are scraped below.
12ZBMATH_URL = "https://zbmath.org"
# REST endpoint returning JSON metadata for a single zbMATH document id.
13ZBMATH_API_URL = "https://api.zbmath.org/v1/document/"
def zbmath_request_article(zblid):
    """Fetch article metadata from zbmath.org for the given zbMATH id.

    Scrapes the zbMATH HTML result page for authors, title and abstract,
    then queries the zbMATH REST API to detect whether the scraped
    "abstract" is in fact a signed review, in which case it is discarded.

    :param zblid: zbMATH identifier (the value used in an "an:" query)
    :return: a ptf ``article_data`` object, or ``None`` when the zbMATH
        page could not be fetched (non-200 response).
    """
    params = {"q": "an:" + zblid}
    headers = {"Content-Type": "text/html"}
    response = requests.get(ZBMATH_URL, params=params, headers=headers, timeout=2.0)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # TODO:
    # 1. call cited_by.get_zbmath_bibtex(params)
    # 2. extract the zbMATH number
    # 3. call the zbMATH OAI to get the article XML
    # 4. parse the XML to create the article_data
    # The XML does not always have metadata so the following is the fallback method

    article_data = model_data.create_articledata()

    # Authors: one <a> per author inside each <div class="author">.
    for div_author in soup.find_all("div", {"class": "author"}):
        for link in div_author.find_all("a"):
            author = model_data.create_contributor()
            author["role"] = "author"
            author["string_name"] = link.get_text()
            author["contrib_xml"] = get_contrib_xml(author)
            article_data.contributors.append(author)

    # Title: <h2 class="title"><strong>...</strong></h2>
    title_h2 = soup.find("h2", {"class": "title"})
    if title_h2:
        title_h2 = title_h2.find("strong")
        if title_h2:
            value_tex = str(title_h2)[
                8:-9
            ]  # Get the zbmath text keeping the tags, except the surrounding <strong>
            # FIX: value_tex[-1] raised IndexError on an empty <strong> tag;
            # str.endswith is safe for the empty string.
            if value_tex.endswith("."):  # Remove a trailing "."
                value_tex = value_tex[:-1]
            value_html, value_xml = get_html_and_xml_from_text_with_formulas(
                value_tex, delimiter_inline="\\(", delimiter_disp="\\["
            )

            article_data.title_tex = value_tex
            article_data.title_html = value_html
            article_data.title_xml = (
                f"<title-group><article-title>{value_xml}</article-title></title-group>"
            )

    # Abstract: <div class="abstract">, kept as (converted) HTML.
    abstract_elt = soup.find("div", {"class": "abstract"})
    if abstract_elt:
        value_tex = str(abstract_elt)
        # Make relative zbMATH links absolute.
        value_tex = value_tex.replace('a href="/', 'a href="https://zbmath.org/')
        value_html, value_xml = get_html_and_xml_from_text_with_formulas(
            value_tex, delimiter_inline="\\(", delimiter_disp="\\["
        )

        value_xml = f'<abstract xml:lang="en">{value_xml}</abstract>'

        abstract_data = {
            "tag": "abstract",
            "lang": "en",
            "value_xml": value_xml,
            "value_html": value_html,
            "value_tex": value_tex,
        }

        article_data.abstracts.append(abstract_data)

    # Use api.zbmath.org to find more info.
    # First find the zbMATH internal id from the OAI export link
    # (its href ends with "...%3A<internal-id>").
    oai_node = soup.find("a", {"class": "btn btn-default btn-xs xml"})
    if oai_node:
        oai_url = oai_node.get("href")
        if "%3A" in oai_url:
            doc_id = oai_url.split("%3A")[-1]
            url = ZBMATH_API_URL + doc_id

            # FIX: "application.json" was not a valid media type.
            headers = {"Content-Type": "application/json"}
            try:
                # FIX: the request/json-decode now sits inside the try so a
                # network error or non-JSON body no longer crashes the whole
                # function and loses the data already scraped above.
                response = requests.get(url, headers=headers, timeout=2.0)
                data = response.json()
                reviewer = data["result"]["editorial_contributions"][0]["reviewer"]["name"]
                if reviewer is not None:
                    # The abstract is in fact a review, remove it
                    article_data.abstracts = []
            except Exception:
                # Best effort: keep whatever was scraped when the API call
                # fails or the JSON structure does not match expectations.
                pass

    return article_data