Coverage for src/crawler/by_source/msp_crawler.py: 89%
144 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import os
from urllib.parse import urljoin, urlparse

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml as get_citation_article_title_xml,
)
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
from ptf.model_data import ArticleData, create_abstract, create_articledata, create_subj
from ptf.model_data_converter import update_data_for_jats

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
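    # Illustrative example (assumed URL shape): a path such as "/agt/2020/20-1"
    # would yield year="2020", volume="20" and number="1".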

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")

            issue_search = regex.search(self.issue_re, issue_href)
            if not issue_search:  # coverage: condition never true
                raise ValueError("Couldn't parse issue data")

            issue_dict = issue_search.groupdict()

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:  # coverage: condition never true
            raise ValueError("Cannot parse article: issue url is None")
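
        # The page marks completed issues with an ".incomplete" notice reading
        # "Publication of this issue is now complete."; any other text is assumed to mean
        # the issue is still gated by the Subscribe to Open (S2O) policy, so it is skipped.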
        incomplete = soup.select_one(".incomplete")
        if incomplete:  # coverage: condition always true
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # coverage: condition never true
                print(f"Ignoring {xissue.pid}: issue is not available due to S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:  # coverage: condition always true
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
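        """Parse an article page into xarticle.

        Title, authors, pages, PDF link and publisher come from the citation_* meta tags;
        the DOI, keywords, MSC 2010 codes and abstract are then read from the page body,
        and the bibliography page is fetched and parsed last.
        """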
        soup = BeautifulSoup(content, "html.parser")
        xarticle.pid = pid

        # Warning: the DOI from the citation meta tags is sometimes incorrect;
        # it is overridden below with the value displayed on the page.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:  # coverage: condition always true
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:  # coverage: condition never true
                print(f"{xarticle.pid}: skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:  # coverage: condition always true
                article_data[section_title] = section_tag
        del article_sections
113 if "Keywords" in article_data and article_data["Keywords"] != "": 113 ↛ 117line 113 didn't jump to line 117 because the condition on line 113 was always true
114 for kwd in article_data["Keywords"].text.split(", "):
115 xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))
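
        # The MSC section text is assumed to follow the pattern
        # "Primary: <codes> Secondary: <codes>", with individual codes separated by ", ".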
        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"].text != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))
129 if "Abstract" in article_data and article_data["Abstract"] != "": 129 ↛ 146line 129 didn't jump to line 146 because the condition on line 129 was always true
130 abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
131 test = CkeditorParser(
132 html_value=abstract_str,
133 mml_formulas="",
134 )
135 # QUESTION : is value_xml here valid, or should we not wrap this inside an abstract tag
136 abstract = create_abstract(
137 lang="en",
138 tag="abstract",
139 value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
140 value_tex=test.value_tex,
141 value_html=test.value_html,
142 )
144 xarticle.abstracts.append(abstract)
146 self.parse_msp_references(xarticle)
147 return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
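        """Fetch and parse the bibliography page tied to this article.

        The bibliography URL is derived from the article URL by replacing "p" with "b"
        in the filename (assumed MSP convention, e.g. a hypothetical "p1.xhtml" -> "b1.xhtml").
        """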
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = self.create_crawled_bibitem(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
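
        # In this data model the structured bibliography is stored alongside the abstracts.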
        xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_single_ref(self, tag: Tag):
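        """Build a JATS-flavoured XML string for one reference cell.

        Heuristic: <b> elements seen before the first <i> are treated as author names,
        the first <i> closes the author group and becomes the article title, and a
        doi.org link inside that <i> is emitted as an ext-link. Plain text nodes are
        kept verbatim; other tags are ignored.
        """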
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):  # coverage: condition always true
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_citation_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):  # coverage: condition always true
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                                ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))

    def process_article_metadata(self, xarticle: ArticleData):
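        """Build the HTML and JATS XML titles from the TeX title, then normalise for JATS."""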
        html, xml = get_html_and_xml_from_text_with_formulas(
            xarticle.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_issue_title_xml(xml, with_tex_values=False)
        xarticle.title_html = html
        xarticle.title_xml = xml

        update_data_for_jats(xarticle)

        return xarticle