Coverage for src/crawler/zbmath.py: 0%

56 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

1# import shutil 

2# import tempfile 

3# import urllib 

4# import xml.etree.ElementTree as ET 

5 

6import requests 

7from bs4 import BeautifulSoup 

8from ptf import model_data 

9from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

10from ptf.cmds.xml.xml_utils import get_contrib_xml 

11 

# Base URL of the zbMATH public web site; a record page is fetched with
# the query parameter ?q=an:<Zbl id>.
12ZBMATH_URL = "https://zbmath.org" 

# zbMATH REST API endpoint; the zbMATH *internal* document id (not the Zbl
# number) is appended to this URL to retrieve JSON metadata for one document.
13ZBMATH_API_URL = "https://api.zbmath.org/v1/document/" 

14 

15 

def zbmath_request_article(zblid):
    """Fetch article metadata from zbMATH for the given Zbl identifier.

    Scrapes the zbMATH HTML record page for authors, title and abstract,
    then queries the zbMATH REST API to detect whether the page "abstract"
    is actually a signed review — in that case the abstract is discarded.

    :param zblid: Zbl number of the article, e.g. "1234.56789"
    :return: a ``ptf.model_data`` article data object, or ``None`` when the
        zbMATH page could not be fetched (non-200 response)
    """
    params = {"q": "an:" + zblid}
    # NOTE(review): "Content-Type" on a bodiless GET is meaningless; an
    # "Accept: text/html" header is presumably what was intended — confirm.
    headers = {"Content-Type": "text/html"}
    response = requests.get(ZBMATH_URL, params=params, headers=headers, timeout=2.0)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # TODO:
    # 1. call cited_by.get_zbmath_bibtex(params)
    # 2. extract the zbMATH number
    # 3. call the zbMATH OAI to get the article XML
    # 4. parse the XML to create the article_data
    # The XML does not always have metadata so the following is the fallback method

    article_data = model_data.create_articledata()

    # Authors: each <div class="author"> wraps one or more <a> links,
    # one per contributor.
    for div_author in soup.find_all("div", {"class": "author"}):
        for link in div_author.find_all("a"):
            author = model_data.create_contributor()
            author["role"] = "author"
            author["string_name"] = link.get_text()
            author["contrib_xml"] = get_contrib_xml(author)
            article_data.contributors.append(author)

    # Title: <h2 class="title"><strong>…</strong></h2>
    title_h2 = soup.find("h2", {"class": "title"})
    if title_h2:
        title_h2 = title_h2.find("strong")
        if title_h2:
            # Keep the inner markup but strip the surrounding tags:
            # "<strong>" is 8 chars, "</strong>" is 9.
            value_tex = str(title_h2)[8:-9]
            # Remove a trailing "." — endswith() is safe on an empty
            # string, unlike the previous value_tex[-1] (IndexError).
            if value_tex.endswith("."):
                value_tex = value_tex[:-1]
            value_html, value_xml = get_html_and_xml_from_text_with_formulas(
                value_tex, delimiter_inline="\\(", delimiter_disp="\\["
            )

            article_data.title_tex = value_tex
            article_data.title_html = value_html
            article_data.title_xml = (
                f"<title-group><article-title>{value_xml}</article-title></title-group>"
            )

    # Abstract: <div class="abstract">; site-relative links are made absolute.
    abstract_elt = soup.find("div", {"class": "abstract"})
    if abstract_elt:
        value_tex = str(abstract_elt)
        value_tex = value_tex.replace('a href="/', 'a href="https://zbmath.org/')
        value_html, value_xml = get_html_and_xml_from_text_with_formulas(
            value_tex, delimiter_inline="\\(", delimiter_disp="\\["
        )

        value_xml = f'<abstract xml:lang="en">{value_xml}</abstract>'

        article_data.abstracts.append(
            model_data.create_abstract(
                lang="en",
                value_xml=value_xml,
                value_html=value_html,
                value_tex=value_tex,
            )
        )

    # Use api.zbmath.org to find more info.
    # First find the zbMATH internal id: the OAI link (the "XML" button on
    # the page) ends with "…%3A<internal id>".
    oai_node = soup.find("a", {"class": "btn btn-default btn-xs xml"})
    if oai_node:
        oai_url = oai_node.get("href")
        if "%3A" in oai_url:
            internal_id = oai_url.split("%3A")[-1]
            url = ZBMATH_API_URL + internal_id

            # Fix: the MIME type was previously the malformed
            # "application.json" (dot instead of slash).
            headers = {"Content-Type": "application/json"}
            response = requests.get(url, headers=headers, timeout=2.0)
            try:
                # .json() is inside the try so a non-200 or non-JSON API
                # response degrades gracefully instead of raising.
                data = response.json()
                reviewer = data["result"]["editorial_contributions"][0]["reviewer"]["name"]
                if reviewer is not None:
                    # The abstract is in fact a review, remove it
                    article_data.abstracts = []
            except (ValueError, KeyError, IndexError, TypeError):
                # Best effort: missing or oddly-shaped API data simply
                # leaves the scraped abstract in place.
                pass

    return article_data