Coverage for src/crawler/by_source/lofpl_crawler.py: 81%
82 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
  1  import json
  3  import regex
  4  from langcodes import standardize_tag
  5  from ptf.cmds.xml.jats.builder.references import get_ext_link_xml
  6  from ptf.cmds.xml.jats.jats_parser import JatsBase
  7  from ptf.model_data import (
  8      ArticleData,
  9      IssueData,
 10      create_abstract,
 11      create_articledata,
 12      create_contributor,
 13  )
 15  from crawler.base_crawler import BaseCollectionCrawler
 16  from crawler.crawler_utils import set_pages
 17  from crawler.utils import add_pdf_link_to_xarticle
 20  class LofplCrawler(BaseCollectionCrawler):
 21      source_name = "Library of Science"
 22      source_domain = "LOFPL"
 23      source_website = "https://bibliotekanauki.pl"
 25      doi_regex = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
 27      def parse_collection_content(self, content):
 28          """
 29          Parse the JSON list of issues of the collection and return a list of xissues.
 30          Each xissue has its pid/volume/number/year metadata + its url.
 31          """
 32          issues = []
 33          data = json.loads(content)
 34          for entry in data:
 35              link = self.source_website + "/api/issues/" + str(entry["id"]) + "/articles"
 36              year = entry["year"]
 37              volume = entry["volume"]
 38              if entry["number"]:  # 38 ↛ 41: line 38 didn't jump to line 41 because the condition on line 38 was always true
 39                  number = entry["number"].replace("/", "-")
 40              else:
 41                  number = None
 42              issue = self.create_xissue(link, year, volume, number)
 43              issues.append(issue)
 44          return issues
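         # For illustration only (not in the source file): judging from the keys read above
         # (id, year, volume, number), the issues JSON handled by parse_collection_content
         # is expected to look roughly like the sketch below; the values are invented.
         #
         #   [
         #       {"id": 123, "year": "2020", "volume": "61", "number": "1/2"},
         #       ...
         #   ]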
 46      def parse_issue_content(self, content: str, xissue: IssueData):
 47          data = json.loads(content)
 48          for index, entry in enumerate(data):
 49              xarticle = create_articledata()
 50              xarticle.pid = "a" + str(index)
 51              xarticle.url = self.source_website + "/api/articles/" + str(entry["id"])
 52              xissue.articles.append(xarticle)
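         # For illustration only (not in the source file): the issue content parsed above is
         # a JSON list of article stubs, roughly [{"id": 123}, ...]; only "id" is read here.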
 54      def parse_article_content(
 55          self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
 56      ):
 57          data = json.loads(content)
 59          # used only for logging
 60          pid = f"{xissue.pid}_{xarticle.pid}"
 61          # Titles
 62          xarticle.title_tex = data["mainTitle"]["text"]
 63          xarticle.lang = standardize_tag(data["language"])
 65          if data["mainTitleTranslations"]:  # 65 ↛ 67: line 65 didn't jump to line 67 because the condition on line 65 was never true
 66              # PID known to have translations: PMF
 67              self.logger.debug("Title translations found", extra={"pid": pid})
 68              # raise NotImplementedError("title translations found")
 70          # Abstracts
 71          if data["mainAbstract"]["text"] != "":  # 71 ↛ 79: line 71 didn't jump to line 79 because the condition on line 71 was always true
 72              xabstract = create_abstract(
 73                  value_tex=self.latext_parser.latex_to_text(data["mainAbstract"]["text"]),
 74                  lang=standardize_tag(data["mainAbstract"]["language"]),
 75              )
 77              xarticle.abstracts.append(xabstract)
 79          if data["abstractTranslations"]:  # 79 ↛ 81: line 79 didn't jump to line 81 because the condition on line 79 was never true
 80              # PID known to have translations: PMF
 81              self.logger.debug("Abstract translations found", extra={"pid": pid})
 83          if data["pageRange"]:
 84              set_pages(xarticle, data["pageRange"])
 86          # Keywords
 87          for keyword in data["keywords"]:
 88              xarticle.kwds.append(
 89                  {
 90                      "type": "",
 91                      "lang": standardize_tag(keyword["language"]),
 92                      "value": keyword["text"],
 93                  }
 94              )
 96          # Authors
 97          for a in data["contributors"]:
 98              if a["role"] != "AUTHOR":  # 98 ↛ 99: line 98 didn't jump to line 99 because the condition on line 98 was never true
 99                  raise NotImplementedError(f"Contributor role not implemented: {a['role']}")
100              author = create_contributor()
101              author["role"] = "author"
102              # Workaround for https://bibliotekanauki.pl/api/articles/2201019
103              if a["firstName"] and a["lastName"]:  # 103 ↛ 107: line 103 didn't jump to line 107 because the condition on line 103 was always true
104                  author["first_name"] = a["firstName"]
105                  author["last_name"] = a["lastName"]
106              else:
107                  author["string_name"] = a["lastName"] or a["firstName"]
109              author["corresponding"] = a["corresponding"]
110              author["orcid"] = a["orcid"]
111              xarticle.contributors.append(author)
113          if len(data["bibEntries"]) > 0:  # 113 ↛ 117: line 113 didn't jump to line 117 because the condition on line 113 was always true
114              for index, bib_entry in enumerate(data["bibEntries"]):
115                  xarticle.bibitems.append(self.parse_bibitem(bib_entry, index + 1))
117          add_pdf_link_to_xarticle(
118              xarticle, self.source_website + "/articles/" + str(data["id"]) + ".pdf"
119          )
120          return xarticle
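         # For illustration only (not in the source file): parse_article_content above reads
         # the following fields from the article JSON; the example values are invented.
         #
         #   {
         #       "id": 123,
         #       "language": "en",
         #       "mainTitle": {"text": "Some title"},
         #       "mainTitleTranslations": [],
         #       "mainAbstract": {"text": "...", "language": "en"},
         #       "abstractTranslations": [],
         #       "pageRange": "1-10",
         #       "keywords": [{"language": "en", "text": "some keyword"}],
         #       "contributors": [{"role": "AUTHOR", "firstName": "Jane", "lastName": "Doe",
         #                         "corresponding": True, "orcid": None}],
         #       "bibEntries": ["Reference text, DOI: https://doi.org/..."],
         #   }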
122      def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
123          # Fix: set correct article url
124          article = super().crawl_article(xarticle, xissue)
125          ext_link = next(link for link in article.ext_links if link["rel"] == "source")
126          ext_link["location"] = ext_link["location"].replace("/api", "")
127          return article
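         # For illustration only (not in the source file): parse_issue_content stores the API
         # url (e.g. "https://bibliotekanauki.pl/api/articles/123"), so the replace above turns
         # the "source" ext-link into the public page "https://bibliotekanauki.pl/articles/123".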
129      def parse_bibitem(self, text: str, index=None):
130          doi_re = list(regex.finditer(self.doi_regex, text))
131          if len(doi_re) == 0:  # 131 ↛ 133: line 131 didn't jump to line 133 because the condition on line 131 was always true
132              return JatsBase.bake_ref(text, str(index))
133          text = regex.sub(self.doi_regex, "", text)
134          text = text.removesuffix(", ")
135          for doi_entry in doi_re:
136              href = doi_entry.group("doi_url")
137              text += get_ext_link_xml(href, href.removeprefix("https://doi.org/"), "doi")
139          return JatsBase.bake_ref(text, str(index))
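
# A standalone sketch, not part of the crawled source file: it shows what the DOI handling
# in parse_bibitem is meant to do, using only the regex defined above. The reference string
# and DOI are invented; in the crawler, the resulting citation text and DOI feed
# get_ext_link_xml and JatsBase.bake_ref.
import regex

DOI_REGEX = r"DOI: (?P<doi_url>https:\/\/doi.org[^ \n\r]+)"
ref = "J. Doe, Some paper, J. Math. 1 (2020), 1-10, DOI: https://doi.org/10.1234/abcd"

match = regex.search(DOI_REGEX, ref)
if match:
    href = match.group("doi_url")                # "https://doi.org/10.1234/abcd"
    doi = href.removeprefix("https://doi.org/")  # "10.1234/abcd"
    # Strip the "DOI: ..." part from the citation text, as parse_bibitem does.
    citation = regex.sub(DOI_REGEX, "", ref).removesuffix(", ")
    print(citation, doi)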