Coverage for src/crawler/augment.py: 0%
65 statements
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
1import requests
2from matching import crossref
3from ptf import model_data_converter
4from ptf.cmds import xml_cmds
5from ptf.models import Article
7from .zbmath import zbmath_request_article
def augment_article(article: Article, source, what="all", doi_to_fetch=None):
    """
    Augment an article with metadata fetched from external databases.

    what selects the database to query: crossref/zbmath/all.
    doi_to_fetch lets you specify an alternate DOI to use, for example in the case of a translation:
    an article in Mathnet.ru may have a DOI and a DOI for the translation, and the metadata
    in Crossref for the DOI are in Russian, so the translated version is the one to use.
    """
    if what != "all":
        # An explicit request takes precedence over the per-source selection below.
        targets = [what]
    elif source == "EUDML":
        # Ignore some databases of some sources
        # TODO: Move the check in a concrete crawler ?
        targets = ["zbmath"]
    else:
        targets = ["crossref", "zbmath"]

    for database in targets:
        fetched = fetch_article_data(article, database, doi_to_fetch)
        if fetched is None:
            # Nothing came back from this database: leave the article as it is.
            continue
        update_article(article, source, fetched, database)
def update_article(article: Article, source, article_data, what):
    """
    Merge external metadata (article_data) into the stored article and save the result.

    Values that the external database does not provide are copied from the existing
    article before saving. what is the name of the database article_data comes from.
    Nothing is saved when article_data has no title_xml.
    """
    collection = article.get_collection()
    current = model_data_converter.db_to_article_data(article)

    model_data_converter.update_data_for_jats(article_data)

    # Protection to make sure the database returned something valid.
    if not article_data.title_xml:
        return

    # Preserve existing values not set by the Crossref/zbMATH API
    article_data.pid = article.pid
    article_data.doi = article.doi
    for name in ("seq", "ids", "extids", "ext_links"):
        setattr(article_data, name, getattr(current, name))
    for name in ("fpage", "lpage"):
        # Page numbers are only taken from the existing article when the database gave none.
        if not getattr(article_data, name):
            setattr(article_data, name, getattr(current, name))
    article_data.streams = current.streams

    # Replace with metadata from the database (if it is found)

    # TITLE
    # Without a fetched title_html the existing title is kept; for Crossref the
    # existing title is kept whenever there is one.
    keep_current_title = not article_data.title_html or (
        what == "crossref" and current.title_html
    )
    if keep_current_title:
        # TODO: handle Crossref titles with formulas.
        # The quality depends on the publisher so we might not be able to know in general if we have to replace
        for name in ("title_html", "title_tex", "title_xml"):
            setattr(article_data, name, getattr(current, name))

    # AUTHOR
    if not article_data.contributors:
        article_data.contributors = current.contributors

    # ABSTRACTS
    if not article_data.abstracts:
        article_data.abstracts = current.abstracts

    cmd = xml_cmds.addArticleXmlCmd(
        {
            "xarticle": article_data,
            "use_body": False,
            "issue": article.my_container,
            "standalone": True,
        }
    )
    cmd.set_collection(collection)
    article = cmd.do()
    if not article_data.title_html:
        print(f" Warning: {article.pid} has no title_html after XmlCmd")
def fetch_article_data(article, what, doi_to_fetch=None):
    """
    Fetch the metadata of an article from one external database.

    what is the database to query: crossref/zbmath. Any other value returns None.
    doi_to_fetch is only used for crossref: when it is not None it is used instead of article.doi.

    Returns whatever the database helper returns, or None when nothing was fetched
    (unknown database, no usable zbl-item-id, or zbMATH unreachable after all attempts).
    """
    if what == "crossref":
        doi = doi_to_fetch if doi_to_fetch is not None else article.doi
        return crossref.fetch_article(doi)

    if what != "zbmath":
        return None

    # One query only: first() gives None when the article has no zbl-item-id.
    extid = article.extid_set.filter(id_type="zbl-item-id").first()
    if extid is None:
        return None

    zblid = extid.id_value
    # Ids containing "|" or "%7" are not queried
    # (presumably "%7" catches a URL-encoded separator such as %7C - TODO confirm).
    if "|" in zblid or "%7" in zblid:
        return None

    max_attempts = 3
    for _ in range(max_attempts):
        try:
            return zbmath_request_article(zblid)
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.ReadTimeout,
        ):
            # Transient network failure: try again until the attempts are used up.
            continue

    # Best effort: keep returning None as before, but do not fail silently.
    print(f" Warning: zbMATH request for {zblid} failed after {max_attempts} attempts")
    return None