Coverage for src/crawler/augment.py: 0%

64 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

1import requests 

2 

3from matching import crossref 

4from ptf import model_data_converter 

5from ptf.cmds import xml_cmds 

6 

7from .zbmath import zbmath_request_article 

8 

9 

def augment_article(article, source, what="all", doi_to_fetch=None):
    """
    Enrich an article with metadata pulled from external databases.

    what names the database to query: "crossref", "zbmath" or "all".
    An explicit value is used as-is, even for a source that would
    otherwise be restricted to a subset of the databases.

    doi_to_fetch lets you specify an alternate DOI to use for the Crossref
    lookup (instead of article.doi), for example in the case of a translation.
    """

    # An article in Mathnet.ru may have a DOI and a DOI for the translation.
    # Metadata in Crossref for the DOI are in Russian: use the translated version.

    if what != "all":
        targets = [what]
    elif source == "EUDML":
        # Ignore some databases of some sources
        # TODO: Move the check in a concrete crawler ?
        targets = ["zbmath"]
    else:
        targets = ["crossref", "zbmath"]

    for database in targets:
        fetched = fetch_article_data(article, database, doi_to_fetch)
        if fetched is None:
            # Nothing came back from this database: leave the article alone.
            continue
        update_article(article, source, fetched, database)

36 

37 

def update_article(article, source, article_data, what):
    """
    Merge externally fetched metadata (article_data) into the stored article.

    article_data is first passed through
    model_data_converter.update_data_for_jats. If it then carries no
    title_xml, it is considered invalid and nothing else happens.
    Otherwise the identifiers, links, streams and any missing fields are
    copied over from the article as it currently exists, and the merged
    result is handed to xml_cmds.addArticleXmlCmd.

    what is the name of the database article_data came from; it only
    matters for the title, where existing titles win over Crossref ones.
    source is accepted for symmetry with augment_article but is not read here.
    """
    collection = article.get_collection()
    current = model_data_converter.db_to_article_data(article)

    model_data_converter.update_data_for_jats(article_data)

    # Protection to make sure the database returned something valid.
    if not article_data.title_xml:
        return

    # Preserve existing values not set by the Crossref/zbMATH API
    article_data.pid = article.pid
    article_data.doi = article.doi
    for name in ("seq", "ids", "extids", "ext_links"):
        setattr(article_data, name, getattr(current, name))
    # Page numbers are only filled in when the external source lacks them.
    for name in ("fpage", "lpage"):
        if not getattr(article_data, name):
            setattr(article_data, name, getattr(current, name))
    article_data.streams = current.streams

    # Replace with metadata from the database (if it is found)

    # TITLE
    # TODO: handle Crossref titles with formulas.
    # The quality depends on the publisher so we might not be able to know
    # in general if we have to replace
    crossref_with_known_title = what == "crossref" and current.title_html
    if not article_data.title_html or crossref_with_known_title:
        article_data.title_html = current.title_html
        article_data.title_tex = current.title_tex
        article_data.title_xml = current.title_xml

    # AUTHOR
    if not article_data.contributors:
        article_data.contributors = current.contributors

    # ABSTRACTS
    if not article_data.abstracts:
        article_data.abstracts = current.abstracts

    cmd = xml_cmds.addArticleXmlCmd(
        {
            "xarticle": article_data,
            "use_body": False,
            "issue": article.my_container,
            "standalone": True,
        }
    )
    cmd.set_collection(collection)
    saved_article = cmd.do()
    if not article_data.title_html:
        print(f" Warning: {saved_article.pid} has no title_html after XmlCmd")

93 

94 

def fetch_article_data(article, what, doi_to_fetch=None):
    """
    Query one external database for the metadata of an article.

    what is "crossref" or "zbmath"; any other value yields None.

    For Crossref the lookup uses doi_to_fetch when given, article.doi
    otherwise. For zbMATH the lookup uses the article's first
    "zbl-item-id" extid; None is returned when there is no such extid or
    when its value contains "|" or "%7".

    The zbMATH request is attempted up to 3 times on connection errors or
    read timeouts; if every attempt fails, None is returned.
    """
    if what == "crossref":
        doi = article.doi if doi_to_fetch is None else doi_to_fetch

        # qs = article.extid_set.filter(id_type="doi-translation")
        # if qs:
        #     doi_translation = qs.first().id_value
        #     print(f" DOI Translation {doi_translation}")

        return crossref.fetch_article(doi)

    if what != "zbmath":
        return None

    zbl_extids = article.extid_set.filter(id_type="zbl-item-id")
    if not zbl_extids:
        return None

    zblid = zbl_extids.first().id_value
    if "|" in zblid or "%7" in zblid:
        return None

    retryable = (
        requests.exceptions.ConnectionError,
        requests.exceptions.ReadTimeout,
    )
    for _ in range(3):
        try:
            return zbmath_request_article(zblid)
        except retryable:
            continue

    return None