Coverage for src/crawler/by_source/lofpl_crawler.py: 84%
82 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1 import json
3 import regex
4 from langcodes import standardize_tag
5 from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
6 from ptf.model_data import (
7     ArticleData,
8     IssueData,
9     create_abstract,
10     create_articledata,
11     create_contributor,
12 )
14 from crawler.base_crawler import BaseCollectionCrawler
15 from crawler.utils import add_pdf_link_to_xarticle
18 class LofplCrawler(BaseCollectionCrawler):
19     source_name = "Library of Science"
20     source_domain = "LOFPL"
21     source_website = "https://bibliotekanauki.pl"
23     doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
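    # Note (inferred from the pattern itself, not from API docs): bib entries are expected to
    # end with a literal "DOI: https://doi.org/..." suffix; the named group "doi_url" captures
    # the full URL, which parse_bibitem below re-emits as an ext-link element.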
25     def parse_collection_content(self, content):
26         """
27         Parse the JSON list of issues returned by the Library of Science API and return a list of xissue.
28         Each xissue has its pid/volume/number/year metadata + its url
29         """
30         issues = []
31         data = json.loads(content)
32         for entry in data:
33             link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
34             year = entry["year"]
35             volume = entry["volume"]
36             if entry["number"]:
   [36 ↛ 39]  line 36 didn't jump to line 39 because the condition on line 36 was always true
37                 number = entry["number"].replace("/", "-")
38             else:
39                 number = None
40             issue = self.create_xissue(link, year, volume, number)
41             issues.append(issue)
42         return issues
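    # Assumed shape of the collection JSON (inferred from the loop above, values illustrative):
    # a list of issue objects roughly like {"id": 12345, "year": "2021", "volume": "61", "number": "1/2"},
    # where "number" may contain "/" (normalized to "-") or be empty.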
44     def parse_issue_content(self, content: str, xissue: IssueData):
45         data = json.loads(content)
46         for index, entry in enumerate(data):
47             xarticle = create_articledata()
48             xarticle.pid = "a" + str(index)
49             xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
50             xissue.articles.append(xarticle)
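    # Assumed shape of the issue JSON (inferred from the loop above): an array of article
    # objects carrying at least an "id"; articles get sequential pids "a0", "a1", ... and an
    # /api/articles/<id> URL for the detail request.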
52     def parse_article_content(
53         self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
54     ):
55         data = json.loads(content)
57         # used only for prints
58         pid = f"{xissue.pid}_{xarticle.pid}"
59         # Titles
60         xarticle.title_tex = data["mainTitle"]["text"]
61         xarticle.lang = standardize_tag(data["language"])
63         if data["mainTitleTranslations"]:
64             # PID known to have translations: PMF
65             print(f"[{pid}] title translations found")
66             # raise NotImplementedError("title translations found")
68         # Abstracts
69         if data["mainAbstract"]["text"] != "":
   [69 ↛ 78]  line 69 didn't jump to line 78 because the condition on line 69 was always true
70             xabstract = create_abstract(
71                 tag="abstract",
72                 value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
73                 lang=standardize_tag(data["mainAbstract"]["language"]),
74             )
76             xarticle.abstracts.append(xabstract)
78         if data["abstractTranslations"]:
   [78 ↛ 80]  line 78 didn't jump to line 80 because the condition on line 78 was never true
79             # PID known to have translations: PMF
80             print(f"[{pid}] abstract translations found")
82         if data["pageRange"]:
83             self.set_pages(xarticle, data["pageRange"])
85         # Keywords
86         for keyword in data["keywords"]:
87             xarticle.kwds.append(
88                 {
89                     "type": "",
90                     "lang": standardize_tag(keyword["language"]),
91                     "value": keyword["text"],
92                 }
93             )
95         # Authors
96         for a in data["contributors"]:
97             if a["role"] != "AUTHOR":
   [97 ↛ 98]  line 97 didn't jump to line 98 because the condition on line 97 was never true
98                 raise NotImplementedError("Author role not implemented")
99             author = create_contributor()
100             author["role"] = "author"
101             # Workaround for https://bibliotekanauki.pl/api/articles/2201019
102             if a["firstName"] and a["lastName"]:
   [102 ↛ 106]  line 102 didn't jump to line 106 because the condition on line 102 was always true
103                 author["first_name"] = a["firstName"]
104                 author["last_name"] = a["lastName"]
105             else:
106                 author["string_name"] = a["lastName"] or a["firstName"]
108             author["corresponding"] = a["corresponding"]
109             author["orcid"] = a["orcid"]
110             xarticle.contributors.append(author)
112         if len(data["bibEntries"]) > 0:
113             bibitems = []
115             for index, bib_entry in enumerate(data["bibEntries"]):
116                 bibitems.append(self.parse_bibitem(bib_entry, index + 1))
118             xarticle.abstracts.append(
119                 self.create_bibliography(
120                     bibitems
121                 )
122             )
124         add_pdf_link_to_xarticle(
125             xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
126         )
127         return xarticle
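    # Assumed article JSON fields (inferred from the accesses above): "mainTitle", "language",
    # "mainTitleTranslations", "mainAbstract", "abstractTranslations", "pageRange", "keywords",
    # "contributors", "bibEntries" and "id". Title/abstract translations are only reported via
    # print, and contributor roles other than AUTHOR raise NotImplementedError.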
129     def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
130         # Fix: set correct article url
131         article = super().crawl_article(xarticle, xissue)
132         ext_link = next(link for link in article.ext_links if link["rel"] == "source")
133         ext_link["location"] = ext_link["location"].replace("/api", "")
134         return article
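    # Stripping "/api" rewrites the crawled endpoint into the public article page, e.g.
    # https://bibliotekanauki.pl/api/articles/2201019 -> https://bibliotekanauki.pl/articles/2201019
    # (id taken from the workaround comment in parse_article_content).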
136     def parse_bibitem(self, text: str, index=None):
137         doi_re = list(regex.finditer(self.doi_regex, text))
138         if len(doi_re) == 0:
   [138 ↛ 140]  line 138 didn't jump to line 140 because the condition on line 138 was always true
139             return self.create_crawled_bibitem(text, index)
140         text = regex.sub(self.doi_regex, "", text)
141         text = text.removesuffix(", ")
142         for doi_entry in doi_re:
143             href = doi_entry.group(1)
144             text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
146         return self.create_crawled_bibitem(text, index)
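    # Illustrative example (hypothetical bib entry): for
    #     "A. Author, Some title, J. Example 1 (2020), 1-10, DOI: https://doi.org/10.0000/example"
    # the "DOI: ..." suffix is removed, the trailing ", " is stripped, and the DOI is appended
    # back as an ext-link via get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
    # before the bibitem is created with create_crawled_bibitem.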