Coverage for src/crawler/augment.py: 0%
66 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-04 14:17 +0000
1import requests
2from matching import crossref
3from ptf import model_data_converter
4from ptf.cmds import xml_cmds
5from ptf.model_data import ArticleData
6from ptf.models import Article
8from .zbmath import zbmath_request_article
def augment_article(article: Article, source, what="all", doi_to_fetch=None):
    """
    Fetch metadata from external sources and augment an article's metadata.

    :param article: the Article to augment (updated via update_article)
    :param source: the source id (e.g. "EUDML"); some sources skip databases
    :param what: the database to query: "crossref", "zbmath" or "all"
    :param doi_to_fetch: optional alternate DOI to use, for example in the
        case of a translation

    An article in Mathnet.ru may have a DOI and a DOI for the translation.
    Metadata in Crossref for the DOI are in Russian: use the translated version.
    """
    databases = ["crossref", "zbmath"]

    if source == "EUDML":
        # Ignore some databases of some sources
        # TODO: Move the check in a concrete crawler ?
        databases = ["zbmath"]

    if what != "all":
        databases = [what]

    # Use a dedicated loop variable instead of shadowing the `what` parameter,
    # which made the function's argument unreadable after the first iteration.
    for database in databases:
        article_data = fetch_article_data(article, database, doi_to_fetch)
        if article_data is not None:
            update_article(article, article_data, database)
def update_article(article: Article, article_data: ArticleData, what):
    """
    Update the article with external metadata stored in article_data.

    :param article: the existing Article (database object) to update
    :param article_data: metadata fetched from an external database
    :param what: which database produced article_data ("crossref" or "zbmath")
    """
    collection = article.get_collection()
    # Snapshot of the article as currently stored, used below to preserve
    # values the external API does not provide.
    existing_article_data = model_data_converter.db_to_article_data(article)
    model_data_converter.update_data_for_jats(article_data)

    # Protection to make sure the database returned something valid.
    if article_data.title_xml:
        # Preserve existing values not set by the Crossref/zbMATH API
        article_data.pid = article.pid
        article_data.doi = article.doi
        article_data.seq = existing_article_data.seq
        article_data.ids = existing_article_data.ids
        article_data.extids = existing_article_data.extids
        article_data.ext_links = existing_article_data.ext_links
        # Only fall back to the stored page numbers when the API gave none.
        if not article_data.fpage:
            article_data.fpage = existing_article_data.fpage
        if not article_data.lpage:
            article_data.lpage = existing_article_data.lpage
        article_data.streams = existing_article_data.streams

        # Replace with metadata from the database (if it is found)

        # TITLE
        # Keep the existing title when the API returned none, or when the
        # data comes from Crossref and we already have a title (Crossref
        # title quality varies by publisher).
        if not article_data.title_html or (
            what == "crossref" and existing_article_data.title_html
        ):
            # TODO: handle Crossref titles with formulas.
            # The quality depends on the publisher so we might not be able to know in general if we have to replace
            article_data.title_html = existing_article_data.title_html
            article_data.title_tex = existing_article_data.title_tex
            article_data.title_xml = existing_article_data.title_xml

        # AUTHOR
        if not article_data.contributors:
            article_data.contributors = existing_article_data.contributors

        # ABSTRACTS
        if not article_data.abstracts:
            article_data.abstracts = existing_article_data.abstracts

        # Re-import the merged metadata; addArticleXmlCmd rewrites the
        # article in the database from article_data.
        params = {
            "xarticle": article_data,
            "use_body": False,
            "issue": article.my_container,
            "standalone": True,
        }
        cmd = xml_cmds.addArticleXmlCmd(params)
        cmd.set_collection(collection)
        article = cmd.do()
        if not article_data.title_html:
            print(f"    Warning: {article.pid} has no title_html after XmlCmd")
def fetch_article_data(article, what, doi_to_fetch=None):
    """
    Query one external database for an article's metadata.

    :param article: the Article whose metadata is fetched
    :param what: the database to query: "crossref" or "zbmath"
    :param doi_to_fetch: optional alternate DOI to query Crossref with
    :return: an ArticleData on success, None otherwise
    """
    if what == "crossref":
        doi = article.doi if doi_to_fetch is None else doi_to_fetch
        return crossref.fetch_article(doi)

    if what == "zbmath":
        qs = article.extid_set.filter(id_type="zbl-item-id")
        if not qs:
            return None
        zblid = qs.first().id_value
        # Skip malformed zbMATH ids (raw or URL-encoded pipe characters).
        if "|" in zblid or "%7" in zblid:
            return None
        # Up to 3 attempts, retrying on transient network failures.
        for _ in range(3):
            try:
                return zbmath_request_article(zblid)
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                continue

    return None