Coverage for src/crawler/by_source/msp_crawler.py: 88%
146 statements
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
import os
from urllib.parse import urljoin, urlparse

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml as get_citation_article_title_xml,
)
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ResourceData,
    create_abstract,
    create_articledata,
    create_subj,
)
from ptf.model_data_converter import update_data_for_jats

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
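    # Matches issue hrefs of the form "/<journal>/<year>/<volume>-<number>",
    # e.g. "/agt/2015/15-1" (illustrative path; actual journal slugs vary).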

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")

            issue_search = regex.search(self.issue_re, issue_href)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")

            issue_dict = issue_search.groupdict()

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:
            raise ValueError("Cannot parse article: issue url is None")
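
        # The page exposes a status banner in an ".incomplete" element; when its
        # text differs from the completion notice checked below, the issue is
        # assumed to be gated by MSP's Subscribe to Open (S2O) policy.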
        incomplete = soup.select_one(".incomplete")
        if incomplete:
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":
                print(f"Ignoring {xissue.pid}: issue is not available due to the S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the DOI in the citation meta tags is sometimes incorrect,
        # so it is read from the page body below instead.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:
                print(f"{xarticle.pid}: skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:
                article_data[section_title] = section_tag
        del article_sections
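
        # article_data now maps section headings ("Abstract", "Keywords", ...)
        # to their content cells; the sections of interest are read below.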
        if "Keywords" in article_data and article_data["Keywords"].text != "":
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"].text != ""
        ):
            # MSC codes are listed as "Primary: ... Secondary: ..."; strip the
            # labels so every code is handled uniformly.
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))

        if "Abstract" in article_data and article_data["Abstract"].text != "":
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            parser = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )
            # QUESTION: is value_xml valid here, or should this not be wrapped
            # inside an abstract tag?
            abstract = create_abstract(
                lang="en",
                tag="abstract",
                value_xml=f'<abstract xml:lang="en">{parser.value_xml}</abstract>',
                value_tex=parser.value_tex,
                value_html=parser.value_html,
            )

            xarticle.abstracts.append(abstract)

        self.parse_msp_references(xarticle)
        return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
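        # The bibliography page is assumed to sit next to the article page, with
        # "p" swapped for "b" in the filename (e.g. "p1.xhtml" -> "b1.xhtml").
        # Note: str.replace swaps every "p" in the filename, so this relies on
        # names of the form "p<N>.xhtml".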
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the text is inserted.
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = JatsBase.bake_ref(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
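        # The compiled reference list is appended to xarticle.abstracts: the ptf
        # data model appears to carry the bibliography as an abstract-like
        # section (see JatsBase.compile_refs).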
        if len(bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

    def parse_single_ref(self, tag: Tag):
        xml_list = []
        ext_links = []
        authors_closed = False
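
        # Heuristic: leading <b> elements are author names; the first <i> element
        # is taken as the title, closing the author group and wrapping everything
        # accumulated so far in <person-group>.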
        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_citation_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                                ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    # Standalone links (outside the title element) are ignored.
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))

    def process_resource_metadata(self, xresource: ResourceData):
        html, xml = get_html_and_xml_from_text_with_formulas(
            xresource.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_issue_title_xml(xml, with_tex_values=False)
        xresource.title_html = html
        xresource.title_xml = xml

        if isinstance(xresource, ArticleData):
            update_data_for_jats(xresource)

        return xresource