Coverage for src/crawler/by_source/isrp_crawler.py: 84% (141 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
import os

import langcodes
import langcodes.tag_parser
import regex
from bs4 import BeautifulSoup, Tag
from lxml import etree
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsArticle
from ptf.model_data import ArticleData, IssueData, create_articledata, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class IsrpCrawler(BaseCollectionCrawler):
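    """Crawler for journals published by International Scientific Research Publications (isr-publications.com)."""
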
    source_name = "International Scientific Research Publications"
    source_domain = "ISRP"
    source_website = "https://www.isr-publications.com"

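    # TeX delimiters this source uses for inline and display formulas.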
    delimiter_inline_formula = "\\("
    delimiter_disp_formula = "\\["

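    # Matched against issue headings such as "Volume 12, Issue 3 pp. 215-230 (2019)" (illustrative example).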
    issue_regex = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) pp. (?P<pages>[\d,\-(?:In Progress)]+) (?:(?P<special>\w+)? )?\((?P<year>\d+).*\)"

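    # Maps recognised bibliography-link prefixes to a callable extracting an
    # (identifier, ext-link type) pair from the full href.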
    biblinks_to_keep = {
        "https://doi.org/": lambda link: (link.removeprefix("https://doi.org/"), "doi"),
        "https://zbmath.org/?q=an:": lambda link: (
            link.removeprefix("https://zbmath.org/?q=an:"),
            "zbl-item-id",
        ),
        "https://zbmath.org/": lambda link: (
            link.removeprefix("https://zbmath.org/"),
            "zbl-item-id",
        ),
        # http://archive.numdam.org/article/AFST_1907_2_9__203_0.pdf
        "http://archive.numdam.org/": lambda link: (
            regex.search(r".+numdam.org\/.+\/(.+)\.pdf", link).group(1),
            "numdam-id",
        ),
        # https://eudml.org/serve/127518/accessibleLayeredPdf/0
        "https://eudml.org": lambda link: (
            regex.search(r".*:\/\/eudml.org\/[\.\w]+\/(?:[a-zA-Z:]+)?(\d+)", link).group(1),
            "eudml-item-id",
        ),
    }
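    # Illustrative example (hypothetical DOI): biblinks_to_keep["https://doi.org/"]("https://doi.org/10.1000/xyz")
    # returns ("10.1000/xyz", "doi"), which parse_bibitems wraps in a JATS <ext-link>.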

    def parse_collection_content(self, content):
        """
        Parse the HTML page of the ISRP collection and return a list of xissue.
        Each xissue has its pid/volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html.parser")
        issue_tags = soup.select("ul.issues > li")
        issues = []

        for issue_tag in issue_tags:
            a_tag = issue_tag.select_one("a")
            if not a_tag:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {self.collection_id} Cannot parse issue")
            issue_href = a_tag.get("href")
            if not isinstance(issue_href, str):  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} Cannot parse issue link"
                )
            text = cleanup_str(issue_tag.text)
            issue_rx = regex.search(self.issue_regex, text)
            if not issue_rx:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} Cannot parse issue information"
                )
            issue_data = issue_rx.groupdict()
            if "number" not in issue_data:  # coverage: branch never taken
                pass
            issues.append(
                self.create_xissue(
                    issue_href, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return issues

    def parse_issue_content(self, content, xissue):
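        """Collect the article links from an issue page and append stub ArticleData entries (url + pid) to the xissue."""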
        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("ul.articles-list > li.article-title")
        for index, article_tag in enumerate(articles_tags):
            xarticle = create_articledata()

            a_tag = article_tag.select_one("a")
            if not a_tag:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot find article link")

            url = a_tag.get("href")
            if not isinstance(url, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse article link")

            xarticle.url = url
            xarticle.pid = f"a{index}"

            xissue.articles.append(xarticle)

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ):
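        """Parse the article XML (after an XSLT fix-up), normalise language codes,
        scrape the HTML page for the reference list and add the PDF link."""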
        parser = etree.XMLParser(
            huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
        )
        dom = etree.fromstring(content.encode("utf-8"), parser)
        # Fix for invalid self-uri tag
        xslt = etree.parse(
            os.path.dirname(os.path.realpath(__file__)) + "/isrp-article.xsl", parser
        )
        transform = etree.XSLT(xslt)
        tree = transform(dom)

        parsed_xarticle = JatsArticle(tree=tree.getroot())
        parsed_xarticle.ext_links = xarticle.ext_links
        parsed_xarticle.url = url

        # Sometimes the source DOI has white spaces at the end
        if parsed_xarticle.doi:  # coverage: always true
            parsed_xarticle.doi = parsed_xarticle.doi.strip()
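        # Normalise language tags to bare language subtags (e.g. "en-US" -> "en") via langcodes.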
        lang = langcodes.Language.get(parsed_xarticle.lang).language
        if lang:  # coverage: always true
            parsed_xarticle.lang = lang
        for abstract in parsed_xarticle.abstracts:
            abstract_lang = langcodes.Language.get(abstract["lang"]).language
            if abstract_lang:  # coverage: always true
                abstract["lang"] = abstract_lang
        for kwd in parsed_xarticle.kwds:
            kwd_lang = langcodes.Language.get(kwd["lang"]).language
            if kwd_lang:  # coverage: always true
                kwd["lang"] = kwd_lang

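        # Re-fetch the HTML version of the article (without the "/xml" suffix) to scrape the reference list.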
        content = self.download_file(url.removesuffix("/xml"))
        soup = BeautifulSoup(content, "html.parser")

        main = soup.select_one(".simple-content")
        if not main:  # coverage: branch never taken
            raise ValueError("Cannot parse HTML page")

        references_flag = False
        references_tag = None
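        # Locate the reference list: the first <ul> that follows an <h3> "References" heading.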
        for c in main.findChildren(recursive=False):  # coverage: loop always exits via break
            if c.name != "h3" and c.name != "ul":
                continue
            if c.name == "h3" and c.text == "References":
                references_flag = True
                continue
            if references_flag and c.name == "ul":
                references_tag = c
                break
            references_flag = False

        if not references_tag:  # coverage: branch never taken
            print("Couldn't find References tag")
        else:
            self.parse_bibitems(parsed_xarticle, references_tag)

        # And now let's hope the html page is consistent with the XML

        pdf_url = tree.find("/front/article-meta/uri[@specific-use='for-review']").text
        add_pdf_link_to_xarticle(parsed_xarticle, pdf_url)

        for contrib in parsed_xarticle.contributors:
            if (  # coverage: branch never taken
                len(contrib["string_name"]) > 200
                or len(contrib["first_name"] + contrib["last_name"]) > 200
            ):
                pass
        return parsed_xarticle

    def parse_bibitems(self, xarticle, references_tag: Tag):
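        """Rebuild each bibitem's citation_xml with <ext-link> entries for recognised
        DOI/zbMATH/Numdam/EuDML links, then register the assembled bibliography on the article."""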
        for index, c in enumerate(references_tag.findChildren("li", recursive=False)):
            links: set[str] = set()

            # Sometimes, the same link gets referenced multiple times in an entry
            for a_tag in c.select("a"):
                ref_link = a_tag.get("href")
                if not ref_link:  # coverage: branch never taken
                    continue
                links.add(ref_link)

            # We populate ext_links_xmls with PTF <ext-links> xmls
            ext_links_xmls = []
            for link in links:
                for url in self.biblinks_to_keep:
                    if link.startswith(url):
                        url, link_type = self.biblinks_to_keep[url](link)
                        ext_links_xmls.append(
                            get_ext_link_xml(
                                url,
                                url,
                                link_type,
                            )
                        )
                        break

            # We recreate bibitems while adding ext-links inside <element-citation>
            soup = BeautifulSoup(
                "<ref>" + xarticle.bibitems[index].citation_xml + "</ref>", "lxml-xml"
            )
            soup_ref = soup.select_one("ref")

            if not soup_ref:  # coverage: branch never taken
                raise ValueError("Cannot find ref in xml")

            # add ext-links inside <element-citation>
            element_citation = soup_ref.select_one("element-citation")
            if not element_citation:  # coverage: branch never taken
                raise ValueError("Cannot find element citation in xml")
            element_citation.extend(BeautifulSoup(c, "lxml-xml") for c in ext_links_xmls)
            del element_citation

            xarticle.bibitems[index].citation_xml = "".join(str(c) for c in soup_ref.children)

            # Here we already have an <element-citation> tag, so we want to skip the <mixed-citation> creation
            xarticle.bibitems[index] = self.create_crawled_bibitem(xarticle.bibitems[index])

        if len(xarticle.bibitems) > 0:  # coverage: always true
            xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        xarticle.bibitems = []
        xarticle.bibitem = []

    def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
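        """Record the original article URL as a "source" ext-link, then crawl the "/xml" variant of the page."""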
        # We crawl {article.url}/xml instead of article url

        if not xarticle.url:  # coverage: branch never taken
            raise ValueError(f"[{self.source_domain}] {xissue.pid} Cannot parse article url")

        ext_link = create_extlink()
        ext_link["rel"] = "source"
        ext_link["location"] = str(xarticle.url)
        ext_link["metadata"] = self.source_domain
        xarticle.ext_links.append(ext_link)

        xarticle.url = xarticle.url + "/xml"
        return super().crawl_article(xarticle, xissue)