Coverage for src/crawler/by_source/msp_crawler.py: 11%
134 statements
coverage.py v7.12.0, created at 2026-02-02 15:55 +0000
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.jats.builder.issue import get_abstract_xml
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict
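

# Crawler for journals published by Mathematical Sciences Publishers (msp.org).
# It walks a collection page for issue links, each issue's table of contents
# for article links, and each article page (plus its bibliography page) for
# metadata and references.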
class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
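
    # parse_collection_content: collect the issue links from the journal's
    # "issues" page. issue_re pulls year, volume and number out of the issue
    # path; as an assumed example, "/agt/2015/15-3" would give year="2015",
    # volume="15", number="3".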
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")

            issue_dict = regex_to_dict(
                self.issue_re, issue_href, error_msg="Couldn't parse issue data"
            )

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues
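
    # parse_issue_content: read an issue's table of contents. Issues whose
    # "incomplete" banner does not say publication is complete are skipped
    # (S2O, i.e. Subscribe to Open, issues that are not openly available yet);
    # otherwise the issue DOI and one article stub per TOC entry are recorded.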
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:
            raise ValueError("Cannot parse article: issue url is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":
                self.logger.debug(
                    "Ignoring: issue is not available due to S2O policy",
                    extra={"pid": xissue.pid},
                )
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)
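
    # parse_article_content: fill in the article metadata. Title, authors,
    # pages, PDF link and publisher come from the citation_* meta tags; the
    # DOI is re-read from the page body because the meta DOI can be wrong.
    # The rest of the page is a series of "article" tables whose first row
    # carries a section heading (Abstract, Keywords, ...) used as a key into
    # article_data.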
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the meta DOI is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        if not self.is_article_openaccess(xarticle):
            return

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:
                self.logger.debug(f"Skipping {tabs[0].name} section at {xarticle.pid}")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:
                article_data[section_title] = section_tag
        del article_sections
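
        # article_data now maps section headings such as "Abstract", "Keywords"
        # and "Mathematical Subject Classification 2010" to the <td> holding
        # their content; the blocks below consume the headings they recognise.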
        if "Keywords" in article_data and article_data["Keywords"] != "":
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"] != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))

        if "Abstract" in article_data and article_data["Abstract"] != "":
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            test = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )

            abstract = create_abstract(
                lang="en",
                value_xml=get_abstract_xml(test.value_xml, lang="en"),
                value_tex=test.value_tex,
                value_html=test.value_html,
            )

            xarticle.abstracts.append(abstract)

        self.parse_msp_references(xarticle)
        return xarticle
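
    # is_article_openaccess: an article is treated as open access when the
    # full-text PDF stream gathered from the citation meta tags points to a
    # link that passes check_pdf_link_validity.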
    def is_article_openaccess(self, xarticle: ArticleData):
        stream = next(stream for stream in xarticle.streams if stream["rel"] == "full-text")
        pdf_url = stream["location"]

        isok, *_ = self.check_pdf_link_validity(url=pdf_url, verify=True)

        return isok
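
    # parse_msp_references: MSP serves the bibliography on a sibling page whose
    # filename swaps "p" for "b" (an assumed example: ".../p01.xhtml" becomes
    # ".../b01.xhtml"); every table row of that page becomes one bibitem.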
    def parse_msp_references(self, xarticle: ArticleData):
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            xarticle.bibitems.append(JatsBase.bake_ref(value_xml, cleanup_str(td[0].text)))
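
    # parse_single_ref: heuristic JATS conversion of one reference cell. Bold
    # runs before the first italic are author names; the first italic closes
    # the author group and becomes the article title, and a doi.org link inside
    # it is emitted as an ext-link; remaining text is kept as-is.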
    def parse_single_ref(self, tag: Tag):
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                                ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))
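

# Illustrative sketch, not part of the crawler: how issue_re decomposes an
# issue path. The path "/agt/2015/15-3" is only an assumed example of MSP's
# URL layout, and plain `re` is used here instead of crawler.utils.regex_to_dict.
if __name__ == "__main__":
    import re

    example_path = "/agt/2015/15-3"
    match = re.search(MspCrawler.issue_re, example_path)
    if match:
        # Expected for the example path: {"year": "2015", "volume": "15", "number": "3"}
        print(match.groupdict())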