Coverage for src/crawler/zbmath.py: 0%
57 statements

# import shutil
# import tempfile
# import urllib
# import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup

from ptf import model_data
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.xml_utils import get_contrib_xml

ZBMATH_URL = "https://zbmath.org"
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/"
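
# The API addresses a document by zbMATH's internal id appended to this root,
# e.g. (illustrative) https://api.zbmath.org/v1/document/<internal-id>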


def zbmath_request_article(zblid):
    """Fetch an article page from zbMATH by its Zbl identifier and scrape its metadata."""
    params = {"q": "an:" + zblid}
    # Ask for HTML: "Accept" is the right request header here ("Content-Type"
    # only describes a request body, and a GET has none).
    headers = {"Accept": "text/html"}
    response = requests.get(ZBMATH_URL, params=params, headers=headers, timeout=2.0)
    if response.status_code != 200:
        return None
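
    # The request above resolves to a URL of this shape (illustrative; the Zbl
    # id is made up and requests percent-encodes the "q" parameter):
    #   https://zbmath.org/?q=an%3A1234.56789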

    soup = BeautifulSoup(response.text, "html.parser")

    # TODO:
    # 1. call cited_by.get_zbmath_bibtex(params)
    # 2. extract the zbMATH number
    # 3. call the zbMATH OAI to get the article XML
    # 4. parse the XML to create the article_data
    # The XML does not always have metadata, so the following is the fallback method.

    article_data = model_data.create_articledata()

    for div_author in soup.find_all("div", {"class": "author"}):
        for link in div_author.find_all("a"):
            author = model_data.create_contributor()
            author["role"] = "author"
            author["string_name"] = link.get_text()
            author["contrib_xml"] = get_contrib_xml(author)
            article_data.contributors.append(author)
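
    # Rough shape of the markup parsed above (illustrative, not verbatim zbMATH
    # output; the href targets are assumptions):
    #   <div class="author">
    #     <a href="/authors/...">Doe, Jane</a>; <a href="/authors/...">Smith, John</a>
    #   </div>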

    title_h2 = soup.find("h2", {"class": "title"})
    if title_h2:
        title_h2 = title_h2.find("strong")
        if title_h2:
            # Keep the zbmath text with its inner tags, stripping only the
            # surrounding "<strong>" (8 chars) and "</strong>" (9 chars).
            value_tex = str(title_h2)[8:-9]
            if value_tex.endswith("."):  # Remove a trailing "."
                value_tex = value_tex[:-1]
            value_html, value_xml = get_html_and_xml_from_text_with_formulas(
                value_tex, delimiter_inline="\\(", delimiter_disp="\\["
            )

            article_data.title_tex = value_tex
            article_data.title_html = value_html
            article_data.title_xml = (
                f"<title-group><article-title>{value_xml}</article-title></title-group>"
            )
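
    # Illustrative transformation of a made-up title (documentation only; the
    # exact XML produced by get_html_and_xml_from_text_with_formulas may differ):
    #   <strong>On \(L^2\) estimates.</strong>
    #   -> value_tex = "On \(L^2\) estimates"
    #   -> title_xml = "<title-group><article-title>" + value_xml + "</article-title></title-group>"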

    abstract_elt = soup.find("div", {"class": "abstract"})
    if abstract_elt:
        value_tex = str(abstract_elt)
        # Turn relative links into absolute zbMATH links.
        value_tex = value_tex.replace('a href="/', f'a href="{ZBMATH_URL}/')
        value_html, value_xml = get_html_and_xml_from_text_with_formulas(
            value_tex, delimiter_inline="\\(", delimiter_disp="\\["
        )

        value_xml = f'<abstract xml:lang="en">{value_xml}</abstract>'

        abstract_data = {
            "tag": "abstract",
            "lang": "en",
            "value_xml": value_xml,
            "value_html": value_html,
            "value_tex": value_tex,
        }

        article_data.abstracts.append(abstract_data)
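
    # Example of the link rewrite above: 'a href="/authors/..."' becomes
    # 'a href="https://zbmath.org/authors/..."', so relative zbMATH links stay
    # valid outside the site.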

    # Use api.zbmath.org to find more info.
    # First find the zbMATH internal id: it is the last segment of the OAI
    # link's href, after the percent-encoded ":" ("%3A").
    oai_node = soup.find("a", {"class": "btn btn-default btn-xs xml"})
    if oai_node:
        oai_url = oai_node.get("href")
        if "%3A" in oai_url:
            zbmath_id = oai_url.split("%3A")[-1]
            url = ZBMATH_API_URL + zbmath_id
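
            # Expected JSON shape, inferred from the lookup below (the real
            # response may contain more fields):
            #   {"result": {"editorial_contributions": [{"reviewer": {"name": "..."}}]}}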
            headers = {"Accept": "application/json"}
            response = requests.get(url, headers=headers, timeout=2.0)
            try:
                data = response.json()
                reviewer = data["result"]["editorial_contributions"][0]["reviewer"]["name"]
                if reviewer is not None:
                    # The abstract is in fact a review, remove it
                    article_data.abstracts = []
            except Exception:
                pass

    # OLD CODE to request and parse the zbmath OAI
    # with urllib.request.urlopen(OAI_URL) as response:
    #     with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp_file:
    #         shutil.copyfileobj(response, tmp_file)
    #     with open(tmp_file.name) as article_oai:
    #         body = f"""{article_oai.read()}"""
    #
    # tree = ET.fromstring(body)
    # records = tree[2]
    # for record in records:
    #     # tag = record.tag
    #     metadata = record[1]
    #     """for node in metadata[0]:
    #         text = node.text
    #     """
    #
    #     xarticle = jats_parser.JatsArticle(tree=metadata[0])

    return article_data
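

if __name__ == "__main__":
    # Minimal manual smoke test; the Zbl identifier below is made up, replace
    # it with a real one before running.
    article = zbmath_request_article("1234.56789")
    if article is None:
        print("zbMATH request failed")
    else:
        print(getattr(article, "title_tex", ""))
        print(len(article.contributors), "contributor(s)")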