Coverage for src/crawler/zbmath.py: 0%

57 statements  

# import shutil
# import tempfile
# import urllib
# import xml.etree.ElementTree as ET

import requests
from bs4 import BeautifulSoup
from ptf import model_data
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.xml_utils import get_contrib_xml

ZBMATH_URL = "https://zbmath.org"
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/"


def zbmath_request_article(zblid):
    params = {"q": "an:" + zblid}
    headers = {"Accept": "text/html"}
    response = requests.get(ZBMATH_URL, params=params, headers=headers, timeout=2.0)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # TODO:
    # 1. call cited_by.get_zbmath_bibtex(params)
    # 2. extract the zbMATH number
    # 3. call the zbMATH OAI to get the article XML
    # 4. parse the XML to create the article_data
    # The XML does not always have metadata, so the following is the fallback method.
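    # A rough sketch of the intended OAI flow (untested; the oai.zbmath.org
    # endpoint and the oai_zb_preview metadata prefix are assumptions, and
    # extract_zbmath_id is a hypothetical helper):
    #
    # bibtex = cited_by.get_zbmath_bibtex(params)  # step 1
    # zbmath_id = extract_zbmath_id(bibtex)  # step 2
    # oai_url = (
    #     "https://oai.zbmath.org/v1/?verb=GetRecord"
    #     f"&identifier=oai%3Azbmath.org%3A{zbmath_id}"
    #     "&metadataPrefix=oai_zb_preview"
    # )
    # xml_body = requests.get(oai_url, timeout=2.0).text  # step 3
    # tree = ET.fromstring(xml_body)  # step 4: parse into article_data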

    article_data = model_data.create_articledata()

    for div_author in soup.find_all("div", {"class": "author"}):
        for link in div_author.find_all("a"):
            author = model_data.create_contributor()
            author["role"] = "author"
            author["string_name"] = link.get_text()
            author["contrib_xml"] = get_contrib_xml(author)
            article_data.contributors.append(author)
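    # The loop above assumes author markup roughly like (illustrative, not the
    # exact zbMATH page source):
    # <div class="author"><a href="/authors/...">Doe, Jane</a></div>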

    title_h2 = soup.find("h2", {"class": "title"})
    if title_h2:
        title_h2 = title_h2.find("strong")
        if title_h2:
            # Keep the zbMATH markup but strip the surrounding <strong>...</strong>
            # (8 == len("<strong>"), 9 == len("</strong>"))
            value_tex = str(title_h2)[8:-9]
            if value_tex and value_tex[-1] == ".":  # Remove a trailing "."
                value_tex = value_tex[:-1]
            value_html, value_xml = get_html_and_xml_from_text_with_formulas(
                value_tex, delimiter_inline="\\(", delimiter_disp="\\["
            )

            article_data.title_tex = value_tex
            article_data.title_html = value_html
            article_data.title_xml = (
                f"<title-group><article-title>{value_xml}</article-title></title-group>"
            )

    abstract_elt = soup.find("div", {"class": "abstract"})
    if abstract_elt:
        value_tex = str(abstract_elt)
        # Turn relative zbMATH links into absolute ones
        value_tex = value_tex.replace('a href="/', 'a href="https://zbmath.org/')
        value_html, value_xml = get_html_and_xml_from_text_with_formulas(
            value_tex, delimiter_inline="\\(", delimiter_disp="\\["
        )

        value_xml = f'<abstract xml:lang="en">{value_xml}</abstract>'

        abstract_data = {
            "tag": "abstract",
            "lang": "en",
            "value_xml": value_xml,
            "value_html": value_html,
            "value_tex": value_tex,
        }

        article_data.abstracts.append(abstract_data)

    # Use api.zbmath.org to find more info

    # First find the zbMATH internal id.
    oai_node = soup.find("a", {"class": "btn btn-default btn-xs xml"})
    if oai_node:
        oai_url = oai_node.get("href")
        if "%3A" in oai_url:
            # "%3A" is the URL-encoded ":"; the OAI identifier ends with the
            # zbMATH internal id, so keep the last segment.
            url = oai_url.split("%3A")[-1]
            url = ZBMATH_API_URL + url

            headers = {"Accept": "application/json"}
            response = requests.get(url, headers=headers, timeout=2.0)
            try:
                data = response.json()
                reviewer = data["result"]["editorial_contributions"][0]["reviewer"]["name"]
                if reviewer is not None:
                    # The abstract is in fact a review, remove it
                    article_data.abstracts = []
            except Exception:
                pass
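    # The lookup above assumes a JSON document shaped roughly like this
    # (field names inferred from the access path, not an official schema):
    #
    # {
    #     "result": {
    #         "editorial_contributions": [{"reviewer": {"name": "..."}}]
    #     }
    # }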

    # OLD CODE to request and parse the zbmath OAI
    # with urllib.request.urlopen(OAI_URL) as response:
    #     with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as tmp_file:
    #         shutil.copyfileobj(response, tmp_file)
    #     with open(tmp_file.name) as article_oai:
    #         body = f"""{article_oai.read()}"""
    #
    # tree = ET.fromstring(body)
    # records = tree[2]
    # for record in records:
    #     # tag = record.tag
    #     metadata = record[1]
    #     """for node in metadata[0]:
    #         text = node.text
    #     """
    #
    #     xarticle = jats_parser.JatsArticle(tree=metadata[0])

    return article_data
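

# Minimal usage sketch; the Zbl number below is a placeholder, not a real
# record, and the call needs network access to zbmath.org.
if __name__ == "__main__":
    article = zbmath_request_article("1234.56789")
    if article is None:
        print("zbMATH request failed")
    else:
        print(article.title_tex)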