Coverage for src/crawler/zbmath.py: 0%

57 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1# import shutil 

2# import tempfile 

3# import urllib 

4# import xml.etree.ElementTree as ET 

5 

6import requests 

7from bs4 import BeautifulSoup 

8 

9from ptf import model_data 

10from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas 

11from ptf.cmds.xml.xml_utils import get_contrib_xml 

12 

# Base URL of the zbMATH website; the article page is fetched from here with an "an:<id>" query.
ZBMATH_URL = "https://zbmath.org"
# Base URL of the zbMATH API document endpoint; the internal document id is appended to it.
ZBMATH_API_URL = "https://api.zbmath.org/v1/document/"

15 

16 

def zbmath_request_article(zblid):
    """Scrape the zbMATH page of a document and build article data from it.

    The page is requested from ``ZBMATH_URL`` with the query ``an:<zblid>``
    and parsed with BeautifulSoup. Authors, title and abstract are read from
    the HTML. The zbMATH API is then queried to detect whether the "abstract"
    is actually a review, in which case it is dropped.

    Args:
        zblid: zbMATH identifier as a string; it is concatenated to ``"an:"``
            to build the search query.

    Returns:
        The object returned by ``model_data.create_articledata()`` filled with
        whatever could be extracted, or ``None`` when the page request does
        not answer with HTTP 200.

    Raises:
        requests.RequestException: if the request for the HTML page itself
            fails (e.g. timeout). Failures of the secondary API request are
            not propagated.
    """
    params = {"q": "an:" + zblid}
    headers = {"Content-Type": "text/html"}
    response = requests.get(ZBMATH_URL, params=params, headers=headers, timeout=2.0)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # TODO:
    # 1. call cited_by.get_zbmath_bibtex(params)
    # 2. extract the zbMATH number
    # 3. call the zbMATH OAI to get the article XML
    # 4. parse the XML to create the article_data
    # The XML does not always have metadata so the following is the fallback method

    article_data = model_data.create_articledata()
    _zbmath_add_authors(soup, article_data)
    _zbmath_add_title(soup, article_data)
    _zbmath_add_abstract(soup, article_data)

    # Use api.zbmath.org to find more info. The only effect of that lookup is
    # to discard the abstract, so it is skipped when there is no abstract.
    if article_data.abstracts and _zbmath_abstract_is_review(soup):
        # The abstract is in fact a review, remove it
        article_data.abstracts = []

    return article_data


def _zbmath_add_authors(soup, article_data):
    """Append one "author" contributor per link found in the ``div.author`` blocks."""
    for div_author in soup.find_all("div", {"class": "author"}):
        for link in div_author.find_all("a"):
            author = model_data.create_contributor()
            author["role"] = "author"
            author["string_name"] = link.get_text()
            author["contrib_xml"] = get_contrib_xml(author)
            article_data.contributors.append(author)


def _zbmath_add_title(soup, article_data):
    """Set the tex/html/xml title from the ``<strong>`` inside ``h2.title``, if present."""
    title_h2 = soup.find("h2", {"class": "title"})
    if not title_h2:
        return
    title_strong = title_h2.find("strong")
    if not title_strong:
        return

    # Get the zbmath text keeping the inner tags, except the surrounding
    # <strong>. decode_contents() does not depend on the exact length of the
    # opening tag, unlike slicing str(tag).
    value_tex = title_strong.decode_contents()
    if not value_tex:
        # Empty <strong>: there is no title to record.
        return
    if value_tex.endswith("."):  # Remove a trailing "."
        value_tex = value_tex[:-1]

    value_html, value_xml = get_html_and_xml_from_text_with_formulas(
        value_tex, delimiter_inline="\\(", delimiter_disp="\\["
    )

    article_data.title_tex = value_tex
    article_data.title_html = value_html
    article_data.title_xml = (
        f"<title-group><article-title>{value_xml}</article-title></title-group>"
    )


def _zbmath_add_abstract(soup, article_data):
    """Append an English abstract built from the ``div.abstract`` element, if present."""
    abstract_elt = soup.find("div", {"class": "abstract"})
    if not abstract_elt:
        return

    # Site-relative links are made absolute so they still work outside zbmath.org.
    value_tex = str(abstract_elt).replace('a href="/', f'a href="{ZBMATH_URL}/')
    value_html, value_xml = get_html_and_xml_from_text_with_formulas(
        value_tex, delimiter_inline="\\(", delimiter_disp="\\["
    )

    article_data.abstracts.append(
        {
            "tag": "abstract",
            "lang": "en",
            "value_xml": f'<abstract xml:lang="en">{value_xml}</abstract>',
            "value_html": value_html,
            "value_tex": value_tex,
        }
    )


def _zbmath_abstract_is_review(soup):
    """Return True when the zbMATH API reports a reviewer name for the document.

    The zbMATH internal id is taken from the href of the page's "xml" button
    (the part after the last ``%3A``) and appended to ``ZBMATH_API_URL``.
    This lookup is best effort: a missing link, a network error, a non-JSON
    answer or an unexpected JSON layout all yield False.
    """
    # First find the zbMATH internal id.
    oai_node = soup.find("a", {"class": "btn btn-default btn-xs xml"})
    if not oai_node:
        return False
    oai_url = oai_node.get("href") or ""  # the <a> may have no href attribute
    if "%3A" not in oai_url:
        return False

    url = ZBMATH_API_URL + oai_url.split("%3A")[-1]
    headers = {"Content-Type": "application/json"}
    try:
        response = requests.get(url, headers=headers, timeout=2.0)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Timeout/connection failure, or a body that is not valid JSON.
        return False

    try:
        reviewer = data["result"]["editorial_contributions"][0]["reviewer"]["name"]
    except (KeyError, IndexError, TypeError):
        # No editorial contribution / reviewer entry in the answer.
        return False
    return reviewer is not None