Coverage for src/crawler/by_source/msp_crawler.py: 88%
145 statements

import os
from urllib.parse import urljoin, urlparse

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml as get_citation_article_title_xml,
)
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
from ptf.model_data import (
    ArticleData,
    ResourceData,
    create_abstract,
    create_articledata,
    create_subj,
)
from ptf.model_data_converter import update_data_for_jats

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"
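
    # Issue hrefs look like "/<journal>/<year>/<volume>-<number>"; the named
    # groups capture the year, volume and issue number.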
    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"

    def parse_collection_content(self, content):
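        """Parse the collection page and return the list of issues.

        Every issue link matching `issue_re` yields an xissue carrying the
        issue's URL, year, volume and number.
        """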
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")
            issue_search = regex.search(self.issue_re, issue_href)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
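        """Parse an issue's table of contents and register its articles.

        Issues whose publication is not yet complete (Subscribe-to-Open
        policy) are skipped.
        """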
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:
            raise ValueError("Cannot parse article: issue URL is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":
                print(f"Ignoring {xissue.pid}: issue is not available due to S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
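        """Parse an article page: citation metadata, DOI, keywords, MSC
        codes and abstract, then fetch the article's bibliography."""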
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the DOI in the citation meta tags is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:
            xarticle.doi = doi_tag.text
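
        # Collect the metadata sections of the page ("Abstract", "Keywords",
        # "Mathematical Subject Classification 2010", ...) into a dict keyed
        # by section title.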
        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:
                print(f"{xarticle.pid}: skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:
                article_data[section_title] = section_tag
        del article_sections
118 if "Keywords" in article_data and article_data["Keywords"] != "": 118 ↛ 122line 118 didn't jump to line 122 because the condition on line 118 was always true
119 for kwd in article_data["Keywords"].text.split(", "):
120 xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"].text != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))
134 if "Abstract" in article_data and article_data["Abstract"] != "": 134 ↛ 151line 134 didn't jump to line 151 because the condition on line 134 was always true
135 abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
136 test = CkeditorParser(
137 html_value=abstract_str,
138 mml_formulas="",
139 )
140 # QUESTION : is value_xml here valid, or should we not wrap this inside an abstract tag
141 abstract = create_abstract(
142 lang="en",
143 tag="abstract",
144 value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
145 value_tex=test.value_tex,
146 value_html=test.value_html,
147 )
149 xarticle.abstracts.append(abstract)
151 self.parse_msp_references(xarticle)
152 return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
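        """Download and parse the bibliography page attached to an article."""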
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
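        # The bibliography appears to live next to the article page, with the
        # "p" of the article filename swapped for "b" (e.g. p01.xhtml ->
        # b01.xhtml). Note that str.replace swaps every "p" in the filename,
        # not just the first one.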
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO: extensive parsing (authors, title, etc.).
        # Currently, only the text is inserted.
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = self.create_crawled_bibitem(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
        if len(bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_single_ref(self, tag: Tag):
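        """Convert one bibliography entry into citation XML.

        <b> tags seen before the title are treated as author names and the
        first <i> tag as the article title; a DOI link found inside the
        title is emitted as an ext-link.
        """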
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_citation_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                                ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))

    def process_resource_metadata(self, xresource: ResourceData):
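        """Build title_html and title_xml from title_tex, converting the
        formulas between the crawler's configured delimiters."""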
        html, xml = get_html_and_xml_from_text_with_formulas(
            xresource.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_issue_title_xml(xml, with_tex_values=False)
        xresource.title_html = html
        xresource.title_xml = xml

        if isinstance(xresource, ArticleData):
            update_data_for_jats(xresource)

        return xresource