Coverage for src / crawler / zbmath.py: 15%

152 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-03-19 14:59 +0000

1import logging 

2import time 

3from datetime import datetime, timedelta 

4from typing import TYPE_CHECKING, Literal 

5 

6import langcodes 

7import requests 

8from django.conf import settings 

9from matching.matching import get_article_params, zbmath_matching_query 

10from ptf import model_data 

11from ptf.cmds.xml.ckeditor.utils import ( 

12 build_jats_data_from_html_field, 

13 get_html_and_xml_from_text_with_formulas, 

14) 

15from ptf.cmds.xml.xml_utils import escape 

16from ptf.model_data_converter import get_contrib_xml 

17from requests import HTTPError, ReadTimeout 

18 

19from crawler.utils import add_pdf_link_to_xarticle, get_session 

20 

if TYPE_CHECKING:
    # Import only for type annotations to avoid a runtime dependency cycle.
    from ptf.models import Article

logger = logging.getLogger(__name__)

# Throttling state for zbMATH requests.
# NOTE(review): `last_zbmath_request` is declared `global` in _zbmath_query_retry
# but never assigned anywhere in this file — presumably meant to hold the time of
# the last request; confirm before relying on it.
last_zbmath_request = 0
request_delay = 5
# NOTE(review): `_logger` is the exact same logger object as `logger` above
# (both getLogger(__name__)); the two names look redundant — consider unifying.
_logger = logging.getLogger(__name__)
ZBMATH_URL = "https://zbmath.org"
# {query} is either a zbMATH document id or a "_structured_search?..." query string.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/{query}"
ZBMATH_PDF_URL = "https://zbmath.org/pdf/{zbmathid}.pdf"
ZBMATH_REQUEST_INTERVAL = getattr(settings, "REQUESTS_INTERVAL", 10)

33 

34 

35def parse_zbmath_article(zbmath_article: dict): 

36 """ 

37 Parse the json response of the zbMATH OPEN article fetched given its zblid 

38 zbmath_article: a json storing article data 

39 returns an ArticleData, that has no pid 

40 TODO: Move in ptf-back 

41 """ 

42 xarticle = model_data.create_articledata() 

43 for json_author in zbmath_article["contributors"]["authors"]: 

44 author = model_data.create_contributor(role="author", string_name=json_author["name"]) 

45 author["contrib_xml"] = get_contrib_xml(author) 

46 xarticle.contributors.append(author) 

47 

48 # extids 

49 zbmath_id = zbmath_article["identifier"] 

50 if zbmath_id is None: 

51 zbmath_id = zbmath_article["id"] 

52 xarticle.extids.append(("zbl-item-id", str(zbmath_id).strip())) 

53 

54 # Lang 

55 languages = zbmath_article["language"]["languages"] 

56 if len(languages) > 0: 

57 xarticle.lang = zbmath_article["language"]["languages"][0] 

58 

59 # Title 

60 xarticle.title_tex = zbmath_article["title"]["title"] 

61 

62 ckeditor_data = build_jats_data_from_html_field( 

63 xarticle.title_tex, 

64 tag="article-title", 

65 text_lang=xarticle.lang, 

66 delimiter_inline="$", 

67 delimiter_disp="$", 

68 ) 

69 xarticle.title_html = ckeditor_data["value_html"] 

70 xarticle.title_xml = ckeditor_data["value_xml"] 

71 

72 # Abstract 

73 zbl_abstract = next( 

74 ( 

75 c 

76 for c in zbmath_article["editorial_contributions"] 

77 if c["contribution_type"] in "summary" 

78 ), 

79 None, 

80 ) 

81 if zbl_abstract: 

82 abstract_data = model_data.create_abstract(value_tex=escape(zbl_abstract["text"])) 

83 if zbl_abstract["language"]: 

84 lang = langcodes.Language.find(zbl_abstract["language"]).language 

85 if lang and xarticle.lang and lang != xarticle.lang: 

86 abstract_data["lang"] = lang 

87 # Zbmath abstracts are sometimes impossible to parse 

88 # (They do not escape html-reserved characters like < >) 

89 try: 

90 abstract_data["value_html"], abstract_data["value_xml"] = ( 

91 get_html_and_xml_from_text_with_formulas( 

92 abstract_data["value_tex"], delimiter_inline="\\(", delimiter_disp="\\[" 

93 ) 

94 ) 

95 xarticle.abstracts.append(abstract_data) 

96 except BaseException as e: 

97 logger.error(f"Got exception while parsing abstract for {zbmath_id} : {e}") 

98 

99 # Keywords 

100 for kwd in zbmath_article["keywords"]: 

101 if not kwd: 

102 continue 

103 xarticle.kwds.append(model_data.create_subj(value=kwd)) 

104 # MSC 

105 for msc in zbmath_article["msc"]: 

106 if msc["scheme"] != "msc2020": 

107 continue 

108 xarticle.kwds.append(model_data.create_subj(value=msc["code"], type="msc")) 

109 

110 # Pages 

111 pages: str = zbmath_article["source"]["pages"] 

112 if pages: 

113 pages.split("-") 

114 if len(pages) > 0: 

115 xarticle.fpage = pages[0] 

116 if len(pages) > 1: 

117 xarticle.lpage = pages[1] 

118 

119 pdf_link: str | None = None 

120 for link in zbmath_article["links"]: 

121 match link["type"]: 

122 case "doi": 

123 xarticle.doi = link["identifier"].strip() 

124 xarticle.pid = xarticle.doi.replace("/", "_").replace(".", "_").replace("-", "_") 

125 xarticle.extids.append(("doi", link["identifier"].strip())) 

126 case "eudml": 

127 xarticle.extids.append(("eudml-item-id", link["identifier"].strip())) 

128 case "arxiv": 

129 xarticle.extids.append(("arxiv", link["identifier"].strip())) 

130 case "emis_ft": 

131 pdf_link = link["url"].strip() 

132 

133 case "emis": 

134 pdf_link = ( 

135 link["url"].strip().replace("http://www.emis.de/", "http://www.emis.de/ft/") 

136 ) 

137 if pdf_link: 

138 from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

139 

140 BaseCollectionCrawler.session = get_session() 

141 isok, *_ = BaseCollectionCrawler.check_pdf_link_validity(url=pdf_link) 

142 if not isok: 

143 logger.warning(f"This pdf link doesn't seems to work : {pdf_link}") 

144 else: 

145 add_pdf_link_to_xarticle(xarticle, pdf_link) 

146 

147 return xarticle 

148 

149 

def zbmath_request_article(zblid, force_refresh=False):
    """
    Fetch the article from zbMATH OPEN given its zblid.

    Returns an ArticleData (with no pid), or None when the HTTP request failed.
    TODO: Move the code in ptf-back and refactor with zbmath_request in the matching module
    """
    # LOTs of cache-specific code here.
    # Maybe handle the caching logic elsewhere
    # so this code can be reused without the requests_cache module

    session = get_session()
    response = session.get(
        ZBMATH_API_URL.format(query=zblid),
        headers={**session.headers, "Content-Type": "text/json"},
        force_refresh=force_refresh,  # requests_cache-specific keyword
    )
    # Silence errors, but continue operation
    try:
        response.raise_for_status()
    except HTTPError:
        return

    try:
        payload = response.json()
    except requests.exceptions.JSONDecodeError as e:
        # Cannot invalidate: response is not taken from cache
        if force_refresh or not response.from_cache:
            # BUGFIX: JSONDecodeError requires (msg, doc, pos) positionally; the
            # original passed only a message plus a `response=` keyword, which made
            # the underlying json.JSONDecodeError.__init__ raise a TypeError.
            raise requests.exceptions.JSONDecodeError(
                f"Couldn't decode JSON from ZBMATH at address {response.url}",
                e.doc,
                e.pos,
                response=response,
            ) from e
        logger.error(f"Couldn't decode JSON from ZBMATH at address {response.url}.")
        logger.info(f"Retrying in {60}s ({(datetime.now() + timedelta(minutes=1)).time()})")
        time.sleep(60)
        # The cached response was corrupt: retry exactly once, bypassing the cache
        # (with force_refresh=True a second failure raises instead of recursing).
        return zbmath_request_article(zblid, force_refresh=True)

    return parse_zbmath_article(payload["result"])

187 

188 

# Maps our id_type names to the zbMATH structured-search field names
# (already URL-encoded: "arXiv%20ID" is "arXiv ID").
id_type_mapping = {"doi": "DOI", "arXiv": "arXiv%20ID"}

190 

191 

def _zbmath_request_article_by_extid(extid: str, id_type: Literal["doi", "arXiv"]):
    """
    Look up a single zbMATH document by an external identifier (DOI or arXiv id).

    Returns an ArticleData, or None when the id is not found or matches
    more than one document.
    """
    query_search = id_type_mapping[id_type]
    session = get_session()
    # NOTE(review): `extid` is interpolated verbatim; an id containing '&', '#'
    # or '%' would corrupt the query string — consider urllib.parse.quote().
    search = f"_structured_search?results_per_page=1&{query_search}={extid}"
    response = session.get(
        ZBMATH_API_URL.format(query=search),
        headers={**session.headers, "Content-Type": "text/json"},
    )
    if response.status_code == 404:
        # CONSISTENCY: use the module logger (the original called the root
        # `logging.debug`, bypassing this module's logger configuration).
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return

    response.raise_for_status()
    payload = response.json()

    if payload["status"]["nr_total_results"] > 1:
        logger.error(f"ZBMATH API found multiple candidates for {id_type} {extid}")
        return
        # raise ValueError(f"ZBMATH API found multiple candidates for doi {doi}")
    if payload["status"]["nr_total_results"] == 0:
        logger.debug(f"ZBMATH API {id_type}={extid} not found")
        return
    return parse_zbmath_article(payload["result"][0])

215 

216 

def zbmath_request_article_by_doi(doi: str):
    """Fetch a zbMATH article by its DOI; returns an ArticleData or None."""
    return _zbmath_request_article_by_extid(doi, "doi")

219 

220 

def zbmath_request_article_by_arxivId(id: str):
    """Fetch a zbMATH article by its arXiv id; returns an ArticleData or None."""
    # NOTE(review): the parameter name `id` shadows the builtin, but it is part
    # of the public signature (keyword callers), so it is kept as-is.
    return _zbmath_request_article_by_extid(id, "arXiv")

223 

224 

def _zbmath_query_retry(params: dict, timeout: int):
    """
    Run a zbMATH matching query, retrying up to 3 times on ReadTimeout.

    params: query parameters (as built by get_article_params)
    timeout: per-request timeout in seconds
    Returns the response from zbmath_matching_query.
    Raises ReadTimeout when the final attempt also times out.
    """
    # BUGFIX: removed the dead `global last_zbmath_request` declaration — the
    # variable was never assigned in this function.
    RETRIES = 3
    for attempt in range(RETRIES):
        try:
            return zbmath_matching_query(params, timeout, get_session())
        except ReadTimeout:
            if attempt >= RETRIES - 1:
                raise  # bare re-raise preserves the original traceback
            _logger.warning("Encountered ReadTimeout while fetching Zbmath. retrying in 60 secs")
            time.sleep(60)

    # Unreachable: the last iteration either returns or re-raises.
    # (Was `assert False`, which is stripped under `python -O`.)
    raise AssertionError("unreachable")

239 

240 

def match_zbl_article(article: "Article"):
    """Finds article using matching score"""
    # Build the matching query from the article metadata and send it to zbMATH.
    query_params = get_article_params(article)
    zbl_response = _zbmath_query_retry(query_params, timeout=30)
    payload = zbl_response.json()

    # The best candidate (if any) sits at results[0][0]; missing keys or empty
    # result lists simply mean "no match".
    try:
        return payload["results"][0][0]
    except (KeyError, IndexError):
        _logger.debug(f"Couldn't get any matching results for resource {article.pid}. Skipping")
        return

254 

255 

def zbmath_get_pdfurl(zbmathid: str):
    """Return the direct PDF download URL on zbmath.org for the given zbMATH id."""
    url = ZBMATH_PDF_URL.format(zbmathid=zbmathid)
    return url