Coverage for src/crawler/by_source/msp_crawler.py: 88%

146 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import os
from urllib.parse import urljoin, urlparse

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml as get_citation_article_title_xml,
)
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    ResourceData,
    create_abstract,
    create_articledata,
    create_subj,
)
from ptf.model_data_converter import update_data_for_jats

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
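    # Illustrative only: assuming an issue href shaped like "/agt/2015/15-1" (the journal
    # slug "agt" is a made-up example), issue_re would yield year="2015", volume="15", number="1".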

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")

            issue_search = regex.search(self.issue_re, issue_href)
            if not issue_search:  # coverage: condition never true
                raise ValueError("Couldn't parse issue data")

            issue_dict = issue_search.groupdict()

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:  # coverage: condition never true
            raise ValueError("Cannot parse article : issue url is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:  # coverage: condition always true
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # coverage: condition never true
                print(f"Ignoring {xissue.pid} : Issue is not available due to S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:  # coverage: condition always true
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the meta DOI is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:  # coverage: condition always true
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:  # coverage: condition never true
                print(f"{xarticle.pid} : Skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:  # coverage: condition always true
                article_data[section_title] = section_tag
        del article_sections

        if "Keywords" in article_data and article_data["Keywords"] != "":  # coverage: condition always true
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"] != ""
        ):
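            # Illustrative only (the cell formatting and codes here are assumed, not taken
            # from this file): a value like "Primary: 14J60 Secondary: 14F05, 32L10" is reduced
            # to "14J60, 14F05, 32L10" by the replacements below, then split into MSC codes.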

            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))

        if "Abstract" in article_data and article_data["Abstract"] != "":  # coverage: condition always true
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            test = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )
            # QUESTION: is value_xml valid here, or should we not wrap this inside an abstract tag?
            abstract = create_abstract(
                lang="en",
                tag="abstract",
                value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
                value_tex=test.value_tex,
                value_html=test.value_html,
            )

            xarticle.abstracts.append(abstract)

        self.parse_msp_references(xarticle)
        return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))
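        # Illustrative only (assumed MSP URL layout): an article path such as
        # "/gt/2015/19-1/p01.xhtml" would be rewritten to "/gt/2015/19-1/b01.xhtml",
        # i.e. the bibliography page, by the "p" -> "b" substitution above.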

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the reference text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = JatsBase.bake_ref(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
        if len(bibitems) > 0:  # coverage: condition always true
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

    def parse_single_ref(self, tag: Tag):
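        # Assumed reference markup, for illustration only (not verified against every MSP page):
        #   <b>A. Author</b>, <b>B. Author</b>, <i>Some title <a href="https://doi.org/10.1000/xyz">doi</a></i>, Journal 1 (2020), 1-10
        # Bold runs become <string-name> elements, the first italic run closes the author group
        # and supplies the article title, and a link inside it is emitted as an ext-link
        # (any "https://doi.org/" prefix is stripped first).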

        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):  # coverage: condition always true
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_citation_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):  # coverage: condition always true
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                            ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))

    def process_resource_metadata(self, xresource: ResourceData):
        html, xml = get_html_and_xml_from_text_with_formulas(
            xresource.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_issue_title_xml(xml, with_tex_values=False)
        xresource.title_html = html
        xresource.title_xml = xml

        if isinstance(xresource, ArticleData):  # coverage: condition never true
            update_data_for_jats(xresource)

        return xresource