Coverage for src/crawler/augment.py: 0%
66 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-04 14:17 +0000
1import requests
2from matching import crossref
3from ptf import model_data_converter
4from ptf.cmds import xml_cmds
5from ptf.model_data import ArticleData
6from ptf.models import Article
8from .zbmath import zbmath_request_article
def augment_article(article: Article, source, what="all", doi_to_fetch=None):
    """
    Fetch metadata from external sources and augment an article's metadata.

    :param article: the Article to augment (updated via update_article)
    :param source: the source id (e.g. "EUDML"); some sources skip databases
    :param what: the database to query: "crossref", "zbmath" or "all"
    :param doi_to_fetch: optional alternate DOI to use, for example in the
        case of a translation

    An article in Mathnet.ru may have a DOI and a DOI for the translation.
    Metadata in Crossref for the DOI are in Russian: use the translated version.
    """
    databases = ["crossref", "zbmath"]

    if source == "EUDML":
        # Ignore some databases of some sources
        # TODO: Move the check in a concrete crawler ?
        databases = ["zbmath"]

    if what != "all":
        databases = [what]

    # Use a dedicated loop variable instead of shadowing the `what` parameter,
    # which made the function's argument unreadable after the first iteration.
    for database in databases:
        article_data = fetch_article_data(article, database, doi_to_fetch)
        if article_data is not None:
            update_article(article, article_data, database)
def update_article(article: Article, article_data: ArticleData, what):
    """
    Update the article with external metadata stored in article_data.

    :param article: the existing Article (database object) to update
    :param article_data: metadata fetched from an external database
    :param what: which database produced article_data ("crossref" or "zbmath")
    """
    collection = article.get_collection()
    # Snapshot of the article as currently stored, used below to preserve
    # values the external API does not provide.
    existing_article_data = model_data_converter.db_to_article_data(article)
    model_data_converter.update_data_for_jats(article_data)

    # Protection to make sure the database returned something valid.
    if article_data.title_xml:
        # Preserve existing values not set by the Crossref/zbMATH API
        article_data.pid = article.pid
        article_data.doi = article.doi
        article_data.seq = existing_article_data.seq
        article_data.ids = existing_article_data.ids
        article_data.extids = existing_article_data.extids
        article_data.ext_links = existing_article_data.ext_links
        # Only fall back to the stored page numbers when the API gave none.
        if not article_data.fpage:
            article_data.fpage = existing_article_data.fpage
        if not article_data.lpage:
            article_data.lpage = existing_article_data.lpage
        article_data.streams = existing_article_data.streams

        # Replace with metadata from the database (if it is found)

        # TITLE
        # Keep the existing title when the API returned none, or when the
        # data comes from Crossref and we already have a title (Crossref
        # title quality varies by publisher).
        if not article_data.title_html or (
            what == "crossref" and existing_article_data.title_html
        ):
            # TODO: handle Crossref titles with formulas.
            # The quality depends on the publisher so we might not be able to know in general if we have to replace
            article_data.title_html = existing_article_data.title_html
            article_data.title_tex = existing_article_data.title_tex
            article_data.title_xml = existing_article_data.title_xml

        # AUTHOR
        if not article_data.contributors:
            article_data.contributors = existing_article_data.contributors

        # ABSTRACTS
        if not article_data.abstracts:
            article_data.abstracts = existing_article_data.abstracts

        # Re-import the merged metadata; addArticleXmlCmd rewrites the
        # article in the database from article_data.
        params = {
            "xarticle": article_data,
            "use_body": False,
            "issue": article.my_container,
            "standalone": True,
        }
        cmd = xml_cmds.addArticleXmlCmd(params)
        cmd.set_collection(collection)
        article = cmd.do()
        if not article_data.title_html:
            print(f"    Warning: {article.pid} has no title_html after XmlCmd")
def fetch_article_data(article, what, doi_to_fetch=None):
    """
    Query one external database for an article's metadata.

    :param article: the Article whose metadata is fetched
    :param what: the database to query: "crossref" or "zbmath"
    :param doi_to_fetch: optional alternate DOI to query Crossref with
    :return: an ArticleData on success, None otherwise
    """
    if what == "crossref":
        doi = article.doi if doi_to_fetch is None else doi_to_fetch
        return crossref.fetch_article(doi)

    if what == "zbmath":
        qs = article.extid_set.filter(id_type="zbl-item-id")
        if not qs:
            return None
        zblid = qs.first().id_value
        # Skip malformed zbMATH ids (raw or URL-encoded pipe characters).
        if "|" in zblid or "%7" in zblid:
            return None
        # Up to 3 attempts, retrying on transient network failures.
        for _ in range(3):
            try:
                return zbmath_request_article(zblid)
            except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout,
            ):
                continue

    return None