Coverage for src/crawler/augment.py: 0%

66 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-04 14:17 +0000

1import requests 

2from matching import crossref 

3from ptf import model_data_converter 

4from ptf.cmds import xml_cmds 

5from ptf.model_data import ArticleData 

6from ptf.models import Article 

7 

8from .zbmath import zbmath_request_article 

9 

10 

def augment_article(article: Article, source, what="all", doi_to_fetch=None):
    """
    Augment the metadata of an article with data fetched from external databases.

    article: the article to augment
    source: the source the article comes from ("EUDML" restricts the default lookup to zbmath)
    what: the database to query: crossref/zbmath/all
    doi_to_fetch: lets you specify an alternate DOI to use, for example in the case of a translation
    """

    # An article in Mathnet.ru may have a DOI and a DOI for the translation
    # Metadata in Crossref for the DOI are in Russian: use the translated version.

    if what != "all":
        # An explicit database always wins over the per-source defaults
        selected = [what]
    elif source == "EUDML":
        # Ignore some databases of some sources
        # TODO: Move the check in a concrete crawler ?
        selected = ["zbmath"]
    else:
        selected = ["crossref", "zbmath"]

    for database in selected:
        fetched = fetch_article_data(article, database, doi_to_fetch)
        if fetched is None:
            # Nothing came back from this database: leave the article untouched
            continue
        update_article(article, fetched, database)

37 

38 

def update_article(article: Article, article_data: ArticleData, what):
    """
    Merge external metadata (article_data) into the article and store the result.

    article: the article currently stored in the database
    article_data: metadata fetched from the external database; it is modified in place
    what: the database the metadata comes from (crossref/zbmath)

    Nothing is stored if article_data has no title_xml.
    """
    collection = article.get_collection()
    existing = model_data_converter.db_to_article_data(article)

    model_data_converter.update_data_for_jats(article_data)

    # Protection to make sure the database returned something valid.
    if not article_data.title_xml:
        return

    # Preserve existing values not set by the Crossref/zbMATH API
    article_data.pid = article.pid
    article_data.doi = article.doi
    for name in ("seq", "ids", "extids", "ext_links", "streams"):
        setattr(article_data, name, getattr(existing, name))

    # Page numbers: only fall back to the stored values when the external ones are missing
    for name in ("fpage", "lpage"):
        if not getattr(article_data, name):
            setattr(article_data, name, getattr(existing, name))

    # Everything else is replaced with metadata from the database (if it is found)

    # TITLE
    # TODO: handle Crossref titles with formulas.
    # The quality depends on the publisher so we might not be able to know in general if we have to replace
    keep_existing_title = not article_data.title_html or (
        what == "crossref" and existing.title_html
    )
    if keep_existing_title:
        for name in ("title_html", "title_tex", "title_xml"):
            setattr(article_data, name, getattr(existing, name))

    # AUTHOR
    if not article_data.contributors:
        article_data.contributors = existing.contributors

    # ABSTRACTS
    if not article_data.abstracts:
        article_data.abstracts = existing.abstracts

    cmd = xml_cmds.addArticleXmlCmd(
        {
            "xarticle": article_data,
            "use_body": False,
            "issue": article.my_container,
            "standalone": True,
        }
    )
    cmd.set_collection(collection)
    stored_article = cmd.do()

    if not article_data.title_html:
        print(f" Warning: {stored_article.pid} has no title_html after XmlCmd")

94 

95 

def fetch_article_data(article, what, doi_to_fetch=None):
    """
    Fetch the metadata of an article from a single external database.

    article: the article to look up; its doi (crossref) or its "zbl-item-id" extid (zbmath) is used
    what: the database to query: crossref/zbmath. Any other value fetches nothing.
    doi_to_fetch: alternate DOI to use for the Crossref query instead of article.doi

    Returns whatever the queried database returned, or None when nothing was fetched:
    no DOI / no zbl-item-id to query with, unknown database, or zbMATH still failing
    with a connection error or read timeout after 3 attempts.
    """
    max_attempts = 3

    if what == "crossref":
        doi = doi_to_fetch if doi_to_fetch is not None else article.doi
        if not doi:
            # No DOI at all: there is nothing to ask Crossref for
            return None
        return crossref.fetch_article(doi)

    if what == "zbmath":
        qs = article.extid_set.filter(id_type="zbl-item-id")
        if not qs:
            return None

        zblid = qs.first().id_value
        # Ids containing "|" (or "%7", presumably its URL-encoded form) are not queried
        if "|" in zblid or "%7" in zblid:
            return None

        # Network errors are retried a few times before giving up
        for _ in range(max_attempts):
            try:
                return zbmath_request_article(zblid)
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                continue

        print(f" Warning: zbMATH request for {zblid} failed after {max_attempts} attempts")

    return None