Coverage for src/crawler/by_source/ 84%
142 statements
« prev ^ index » next v7.6.4, created at 2025-02-14 14:36 +0000
« prev ^ index » next v7.6.4, created at 2025-02-14 14:36 +0000
1import os
3import langcodes
4import langcodes.tag_parser
5import regex
6from bs4 import BeautifulSoup, Tag
7from lxml import etree
8from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
9from ptf.cmds.xml.jats.jats_parser import JatsArticle
10from ptf.model_data import ArticleData, IssueData, create_articledata, create_extlink
12from crawler.base_crawler import BaseCollectionCrawler
13from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
16class IsrpCrawler(BaseCollectionCrawler):
17 source_name = "International Scientific Research Publications"
18 source_domain = "ISRP"
19 source_website = ""
21 delimiter_inline_formula = "\\("
22 delimiter_disp_formula = "\\["
24 issue_regex = r"Volume (?P<volume>\d+), Issue (?P<number>\d+) pp. (?P<pages>[\d,\-(?:In Progress)]+) (?:(?P<special>\w+)? )?\((?P<year>\d+).*\)"
26 biblinks_to_keep = {
27 "": lambda link: (link.removeprefix(""), "doi"),
28 "": lambda link: (
29 link.removeprefix(""),
30 "zbl-item-id",
31 ),
32 "": lambda link: (
33 link.removeprefix(""),
34 "zbl-item-id",
35 ),
36 #
37 "": lambda link: (
38"\/.+\/(.+)\.pdf", link).group(1),
39 "numdam-id",
40 ),
41 #
42 "": lambda link: (
43".*:\/\/\/[\.\w]+\/(?:[a-zA-Z:]+)?(\d+)", link).group(1),
44 "eudml-item-id",
45 ),
46 }
48 def parse_collection_content(self, content):
49 """
50 Parse the HTML page of Annals of Math and returns a list of xissue.
51 Each xissue has its pid/volume/number/year metadata + its url
53 self.periode is set at the end based on the xissue years of the HTML page
54 """
55 soup = BeautifulSoup(content, "html.parser")
56 issue_tags ="ul.issues > li")
57 issues = []
59 for issue_tag in issue_tags:
60 a_tag = issue_tag.select_one("a")
61 if not a_tag: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true
62 raise ValueError(f"{self.source_domain}] {self.collection_id} Cannot parse issue")
63 issue_href = a_tag.get("href")
64 if not isinstance(issue_href, str): 64 ↛ 65line 64 didn't jump to line 65 because the condition on line 64 was never true
65 raise ValueError(
66 f"{self.source_domain}] {self.collection_id} Cannot parse issue link"
67 )
68 text = cleanup_str(issue_tag.text)
69 issue_rx =, text)
70 if not issue_rx: 70 ↛ 71line 70 didn't jump to line 71 because the condition on line 70 was never true
71 raise ValueError(
72 f"{self.source_domain}] {self.collection_id} Cannot parse issue information"
73 )
74 issue_data = issue_rx.groupdict()
75 if "number" not in issue_data: 75 ↛ 76line 75 didn't jump to line 76 because the condition on line 75 was never true
76 pass
77 issues.append(
78 self.create_xissue(
79 issue_href, issue_data["year"], issue_data["volume"], issue_data["number"]
80 )
81 )
82 return issues
84 def parse_issue_content(self, content, xissue):
85 soup = BeautifulSoup(content, "html.parser")
86 articles_tags ="ul.articles-list > li.article-title")
87 for index, article_tag in enumerate(articles_tags):
88 xarticle = create_articledata()
90 a_tag = article_tag.select_one("a")
91 if not a_tag: 91 ↛ 92line 91 didn't jump to line 92 because the condition on line 91 was never true
92 raise ValueError(f"{self.source_domain}] {} Cannot find article link")
94 url = a_tag.get("href")
95 if not isinstance(url, str): 95 ↛ 96line 95 didn't jump to line 96 because the condition on line 95 was never true
96 raise ValueError(f"{self.source_domain}] {} Cannot parse article link")
98 xarticle.url = url
99 = f"a{index}"
101 xissue.articles.append(xarticle)
103 def parse_article_content(
104 self,
105 content: str,
106 xissue: IssueData,
107 xarticle: ArticleData,
108 url: str,
109 pid: str,
110 ):
111 parser = etree.XMLParser(
112 huge_tree=True, recover=True, remove_blank_text=False, remove_comments=True
113 )
114 dom = etree.fromstring(content.encode("utf-8"), parser)
115 # Fix for invalid self-uri tag
116 xslt = etree.parse(
117 os.path.dirname(os.path.realpath(__file__)) + "/isrp-article.xsl", parser
118 )
119 transform = etree.XSLT(xslt)
120 tree = transform(dom)
122 parsed_xarticle = JatsArticle(tree=tree.getroot())
123 parsed_xarticle.ext_links = xarticle.ext_links
124 parsed_xarticle.url = url
125 = cleanup_str(pid)
127 # Sometimes the source DOI has white spaces at the end
128 if parsed_xarticle.doi: 128 ↛ 130line 128 didn't jump to line 130 because the condition on line 128 was always true
129 parsed_xarticle.doi = parsed_xarticle.doi.strip()
130 lang = langcodes.Language.get(parsed_xarticle.lang).language
131 if lang: 131 ↛ 133line 131 didn't jump to line 133 because the condition on line 131 was always true
132 parsed_xarticle.lang = lang
133 for abstract in parsed_xarticle.abstracts:
134 abstract_lang = langcodes.Language.get(abstract["lang"]).language
135 if abstract_lang: 135 ↛ 133line 135 didn't jump to line 133 because the condition on line 135 was always true
136 abstract["lang"] = abstract_lang
137 for kwd in parsed_xarticle.kwds:
138 kwd_lang = langcodes.Language.get(kwd["lang"]).language
139 if kwd_lang: 139 ↛ 137line 139 didn't jump to line 137 because the condition on line 139 was always true
140 kwd["lang"] = kwd_lang
142 content = self.download_file(url.removesuffix("/xml"))
143 soup = BeautifulSoup(content, "html.parser")
145 main = soup.select_one(".simple-content")
146 if not main: 146 ↛ 147line 146 didn't jump to line 147 because the condition on line 146 was never true
147 raise ValueError("Cannot parse HTML page")
149 references_flag = False
150 references_tag = None
151 for c in main.findChildren(recursive=False): 151 ↛ 162line 151 didn't jump to line 162 because the loop on line 151 didn't complete
152 if != "h3" and != "ul":
153 continue
154 if == "h3" and c.text == "References":
155 references_flag = True
156 continue
157 if references_flag and == "ul":
158 references_tag = c
159 break
160 references_flag = False
162 if not references_tag: 162 ↛ 163line 162 didn't jump to line 163 because the condition on line 162 was never true
163 print("Couldn't find References tag")
164 else:
165 self.parse_bibitems(parsed_xarticle, references_tag)
167 # And now let's hope the html page is consistent with the XML
169 pdf_url = tree.find("/front/article-meta/uri[@specific-use='for-review']").text
170 add_pdf_link_to_xarticle(parsed_xarticle, pdf_url)
172 for contrib in parsed_xarticle.contributors:
173 if ( 173 ↛ 177line 173 didn't jump to line 177
174 len(contrib["string_name"]) > 200
175 or len(contrib["first_name"] + contrib["last_name"]) > 200
176 ):
177 pass
178 return parsed_xarticle
180 def parse_bibitems(self, xarticle, references_tag: Tag):
181 for index, c in enumerate(references_tag.findChildren("li", recursive=False)):
182 links: set[str] = set()
184 # Sometimes, the same link gets referenced multiple times in an entry
185 for a_tag in"a"):
186 ref_link = a_tag.get("href")
187 if not ref_link: 187 ↛ 188line 187 didn't jump to line 188 because the condition on line 187 was never true
188 continue
189 links.add(ref_link)
191 # We populate ext_links_xmls with PTF <ext-links> xmls
192 ext_links_xmls = []
193 for link in links:
194 for url in self.biblinks_to_keep:
195 if link.startswith(url):
196 url, link_type = self.biblinks_to_keep[url](link)
197 ext_links_xmls.append(
198 get_ext_link_xml(
199 url,
200 url,
201 link_type,
202 )
203 )
204 break
206 # We recreate bibitems while adding ext-links inside <element-citation>
207 soup = BeautifulSoup(
208 "<ref>" + xarticle.bibitems[index].citation_xml + "</ref>", "lxml-xml"
209 )
210 soup_ref = soup.select_one("ref")
212 if not soup_ref: 212 ↛ 213line 212 didn't jump to line 213 because the condition on line 212 was never true
213 raise ValueError("Cannot find ref in xml")
215 # add ext-links inside <element-citation>
216 element_citation = soup_ref.select_one("element-citation")
217 if not element_citation: 217 ↛ 218line 217 didn't jump to line 218 because the condition on line 217 was never true
218 raise ValueError("Cannot find element citation in xml")
219 element_citation.extend(BeautifulSoup(c, "lxml-xml") for c in ext_links_xmls)
220 del element_citation
222 xarticle.bibitems[index].citation_xml = str("".join(str(c) for c in soup_ref.children))
224 # Here we already have an <element-citation> tag, so we want to skip the <mixed-citation> creation
225 xarticle.bibitems[index] = self.create_crawled_bibitem(xarticle.bibitems[index])
227 if len(xarticle.bibitems) > 0: 227 ↛ 230line 227 didn't jump to line 230 because the condition on line 227 was always true
228 xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))
230 xarticle.bibitems = []
231 xarticle.bibitem = []
233 def crawl_article(self, xarticle: ArticleData, xissue: IssueData):
234 # We crawl {article.url}/xml instead of article url
236 if not xarticle.url: 236 ↛ 237line 236 didn't jump to line 237 because the condition on line 236 was never true
237 raise ValueError(f"{self.source_domain}] {} Cannot parse article url")
239 ext_link = create_extlink()
240 ext_link["rel"] = "source"
241 ext_link["location"] = str(xarticle.url)
242 ext_link["metadata"] = self.source_domain
243 xarticle.ext_links.append(ext_link)
245 xarticle.url = xarticle.url + "/xml"
246 return super().crawl_article(xarticle, xissue)