Coverage for src/crawler/by_source/lofpl_crawler.py: 85%
79 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import json

import regex
from langcodes import standardize_tag
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class LofplCrawler(BaseCollectionCrawler):
    source_name = "Library of Science"
    source_domain = "LOFPL"
    source_website = "https://bibliotekanauki.pl"

    periode_begin = 0
    periode_end = 9999

    doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
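
    # Illustrative only: a bibliography string such as
    #   "J. Kowalski, Some result, Ann. Pol. Math. 12 (2020) 1-10, DOI: https://doi.org/10.0000/example"
    # would match with doi_url == "https://doi.org/10.0000/example" (the regex captures
    # everything after "DOI: " up to the next whitespace). The reference and DOI above
    # are invented for the example.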

    def parse_collection_content(self, content):
        """
        Parse the JSON list of issues returned by the Library of Science API and
        return a list of xissue.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        issues = []
        data = json.loads(content)
        for entry in data:
            link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
            year = entry["year"]
            volume = entry["volume"]
            number = entry["number"]
            issue = self.create_xissue(link, year, volume, number)
            issues.append(issue)
        return issues
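
    # Sketch of the collection payload parse_collection_content assumes (field names
    # come from the loop above; the values are invented for illustration):
    #   [
    #       {"id": 12345, "year": "2020", "volume": "53", "number": "1"},
    #       ...
    #   ]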

    def parse_issue_content(self, content: str, xissue: IssueData):
        data = json.loads(content)
        for index, entry in enumerate(data):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
            xissue.articles.append(xarticle)
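
    # The issue payload is assumed to be a JSON list of article stubs, e.g.
    # [{"id": 2201019, ...}, ...]; only the "id" field is used here.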

    def parse_article_content(
        self,
        content: str,
        xissue: IssueData,
        xarticle: ArticleData,
        url: str,
        pid: str,
    ):
        xarticle.pid = pid

        data = json.loads(content)

        # Titles
        xarticle.title_tex = data["mainTitle"]["text"]
        xarticle.lang = standardize_tag(data["language"])

        if data["mainTitleTranslations"]:
            # PID known to have translations: PMF
            print(f"[{pid}] title translations found")
            # raise NotImplementedError("title translations found")

        # Abstracts
        # coverage: condition was always true in the recorded run
        if data["mainAbstract"]["text"] != "":
            xabstract = create_abstract(
                tag="abstract",
                value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
                lang=standardize_tag(data["mainAbstract"]["language"]),
            )

            xarticle.abstracts.append(xabstract)

        # coverage: condition was never true in the recorded run
        if data["abstractTranslations"]:
            # PID known to have translations: PMF
            print(f"[{pid}] abstract translations found")

        if data["pageRange"]:
            self.set_pages(xarticle, data["pageRange"])

        # Keywords
        for keyword in data["keywords"]:
            xarticle.kwds.append(
                {
                    "type": "",
                    "lang": standardize_tag(keyword["language"]),
                    "value": keyword["text"],
                }
            )

        # Authors
        for a in data["contributors"]:
            # coverage: condition was never true in the recorded run
            if a["role"] != "AUTHOR":
                raise NotImplementedError("Author role not implemented")
            author = create_contributor()
            author["role"] = "author"
            # Workaround for https://bibliotekanauki.pl/api/articles/2201019
            # coverage: condition was always true in the recorded run
            if a["firstName"] and a["lastName"]:
                author["first_name"] = a["firstName"]
                author["last_name"] = a["lastName"]
            else:
                author["string_name"] = a["lastName"] or a["firstName"]

            author["corresponding"] = a["corresponding"]
            author["orcid"] = a["orcid"]
            xarticle.contributors.append(author)

        if len(data["bibEntries"]) > 0:
            xarticle.abstracts.append(
                self.create_bibliography(
                    [self.parse_bibitem(bib_entry) for bib_entry in data["bibEntries"]]
                )
            )

        add_pdf_link_to_xarticle(
            xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
        )
        return xarticle
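
    # Sketch of the article payload fields parse_article_content consumes (field names
    # come from the code above; the concrete values are invented for illustration and
    # are not the documented API shape):
    #   {
    #       "id": 2201019,
    #       "language": "en",
    #       "mainTitle": {"text": "..."},
    #       "mainTitleTranslations": [],
    #       "mainAbstract": {"text": "...", "language": "en"},
    #       "abstractTranslations": [],
    #       "pageRange": "1-10",
    #       "keywords": [{"text": "...", "language": "en"}],
    #       "contributors": [
    #           {"role": "AUTHOR", "firstName": "...", "lastName": "...",
    #            "corresponding": False, "orcid": None}
    #       ],
    #       "bibEntries": ["..."],
    #   }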

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
        # Fix: set correct article url
        article = super().crawl_article(xarticle, xissue)
        ext_link = next(link for link in article.ext_links if link["rel"] == "source")
        ext_link["location"] = ext_link["location"].replace("/api", "")
        return article
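
    # Example of the rewrite above: the crawled source link
    # "https://bibliotekanauki.pl/api/articles/2201019" becomes the public page
    # "https://bibliotekanauki.pl/articles/2201019" (article id reused from the
    # workaround comment in parse_article_content).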

    def parse_bibitem(self, text: str):
        doi_re = list(regex.finditer(self.doi_regex, text))
        # coverage: condition was always true in the recorded run (no DOI matched in any bib entry)
        if len(doi_re) == 0:
            return self.create_crawled_bibitem(text)
        text = regex.sub(self.doi_regex, "", text)
        text = text.removesuffix(", ")
        for doi_entry in doi_re:
            href = doi_entry.group(1)
            text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")

        return self.create_crawled_bibitem(text)
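
    # Behaviour sketch for parse_bibitem (the reference string is invented for
    # illustration):
    #   "A. Author, A result, Ann. Pol. Math. 1 (2000) 1-2, DOI: https://doi.org/10.0000/x"
    #   -> the "DOI: ..." fragment is removed from the plain text and re-emitted through
    #      get_ext_link_xml as an ext-link of type "doi", so the crawled bibitem keeps
    #      the citation text plus a structured DOI link.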