Coverage for src/crawler/by_source/msp_crawler.py: 88%
139 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict

class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
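    # Illustration (hypothetical href, not taken from the report): an issue link such as
    # "/agt/2020/20-1" would be parsed by issue_re into year="2020", volume="20", number="1".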

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")

            issue_dict = regex_to_dict(
                self.issue_re, issue_href, error_msg="Couldn't parse issue data"
            )

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:  # coverage: condition never true
            raise ValueError("Cannot parse article : issue url is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:  # coverage: condition always true
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # coverage: condition never true
                self.logger.debug(
                    "Ignoring : Issue is not available due to S2O policy",
                    extra={"pid": xissue.pid},
                )
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:  # coverage: condition always true
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warn : meta doi is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        if not self.is_article_openaccess(xarticle):  # coverage: condition never true
            return

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:  # coverage: condition always true
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:  # coverage: condition never true
                self.logger.debug("Skipping section", extra={"pid": xarticle.pid})
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:  # coverage: condition always true
                article_data[section_title] = section_tag
        del article_sections

        if "Keywords" in article_data and article_data["Keywords"] != "":  # coverage: condition always true
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"] != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))
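        # For illustration (hypothetical values, not from the report): an MSC cell reading
        # "Primary: 57M25 Secondary: 57M27" becomes "57M25, 57M27" after the replacements
        # above, so each classification code ends up as its own "msc" subject.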
131 if "Abstract" in article_data and article_data["Abstract"] != "": 131 ↛ 148line 131 didn't jump to line 148 because the condition on line 131 was always true
132 abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
133 test = CkeditorParser(
134 html_value=abstract_str,
135 mml_formulas="",
136 )
137 # QUESTION : is value_xml here valid, or should we not wrap this inside an abstract tag
138 abstract = create_abstract(
139 lang="en",
140 tag="abstract",
141 value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
142 value_tex=test.value_tex,
143 value_html=test.value_html,
144 )
146 xarticle.abstracts.append(abstract)
148 self.parse_msp_references(xarticle)
149 return xarticle

    def is_article_openaccess(self, xarticle: ArticleData):
        stream = next(stream for stream in xarticle.streams if stream["rel"] == "full-text")
        pdf_url = stream["location"]
        pdf_response = self.session.head(pdf_url)

        if pdf_response.headers["Content-Type"] == "application/pdf":  # coverage: condition always true
            return True
        return False
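    # Note (inferred from the code above, not stated in the report): for issues restricted by
    # MSP's Subscribe-to-Open policy the full-text URL presumably does not return a PDF, so
    # checking the Content-Type of a HEAD request doubles as an open-access test.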

    def parse_msp_references(self, xarticle: ArticleData):
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))
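        # Sketch of the rewrite above (hypothetical URL, not from the report): an article page at
        # "https://msp.org/agt/2020/20-1/p01.xhtml" would be mapped to the bibliography page
        # "https://msp.org/agt/2020/20-1/b01.xhtml"; note that replace() swaps every "p" in the
        # filename, not just the first one.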
        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO : extensive parsing (authors, title etc...)
        # Currently, only the text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = JatsBase.bake_ref(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
        if len(bibitems) > 0:  # coverage: condition always true
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

    def parse_single_ref(self, tag: Tag):
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):  # coverage: condition always true
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):  # coverage: condition always true
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                                ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))
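
# Illustrative sketch of parse_single_ref (hypothetical markup, not taken from the report):
# a bibliography cell such as
#   <td><b>A. Author</b>, <i><a href="https://doi.org/10.0000/example">Some title</a></i>, J. Examples 1 (2020), 1-10</td>
# would yield roughly
#   <person-group person-group-type="author"><string-name>A. Author</string-name></person-group>,
#   <article-title>Some title</article-title>, J. Examples 1 (2020), 1-10
#   <ext-link ext-link-type="doi" ...>10.0000/example</ext-link>
# The exact element names depend on get_article_title_xml and get_ext_link_xml, so the
# <article-title> and <ext-link> forms shown here are assumptions.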