Coverage for src/crawler/by_source/msp_crawler.py: 88%

145 statements  

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

  1  import os
  2  from urllib.parse import urljoin, urlparse
  3
  4  import regex
  5  from bs4 import BeautifulSoup, Tag
  6  from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
  7  from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
  8  from ptf.cmds.xml.jats.builder.citation import (
  9      get_article_title_xml as get_citation_article_title_xml,
 10  )
 11  from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
 12  from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
 13  from ptf.model_data import (
 14      ArticleData,
 15      ResourceData,
 16      create_abstract,
 17      create_articledata,
 18      create_subj,
 19  )
 20  from ptf.model_data_converter import update_data_for_jats
 21
 22  from crawler.base_crawler import BaseCollectionCrawler
 23  from crawler.utils import cleanup_str
 24
 25

 26  class MspCrawler(BaseCollectionCrawler):
 27      source_name = "Mathematical Sciences Publishers"
 28      source_domain = "MSP"
 29      source_website = "https://msp.org/"
 30
 31      issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
 32
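As an aside (not part of the crawled module): a minimal sketch of what issue_re captures from an issue href of the shape the crawler expects; the sample path below is hypothetical.

import regex

issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
match = regex.search(issue_re, "/agt/2019/19-3")  # hypothetical MSP issue path
assert match is not None
print(match.groupdict())  # {'year': '2019', 'volume': '19', 'number': '3'}
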

 33      def parse_collection_content(self, content):
 34          xissues = []
 35          soup = BeautifulSoup(content, "html.parser")
 36          issues = soup.select("td.issues-area a.about[href]")
 37          for issue in issues:
 38              issue_href = issue.get("href")
 39              if not isinstance(issue_href, str):  # 39 ↛ 40: condition was never true
 40                  raise ValueError("Couldn't parse issue href")
 41
 42              issue_search = regex.search(self.issue_re, issue_href)
 43              if not issue_search:  # 43 ↛ 44: condition was never true
 44                  raise ValueError("Couldn't parse issue data")
 45
 46              issue_dict = issue_search.groupdict()
 47
 48              xissues.append(
 49                  self.create_xissue(
 50                      urljoin(self.source_website, issue_href),
 51                      issue_dict["year"],
 52                      issue_dict["volume"],
 53                      issue_dict["number"],
 54                  )
 55              )
 56          return xissues
 57

 58      def parse_issue_content(self, content, xissue):
 59          soup = BeautifulSoup(content, "html.parser")
 60          if xissue.url is None:  # 60 ↛ 61: condition was never true
 61              raise ValueError("Cannot parse article : issue url is None")
 62
 63          incomplete = soup.select_one(".incomplete")
 64          if incomplete:  # 64 ↛ 69: condition was always true
 65              if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # 65 ↛ 66: condition was never true
 66                  print(f"Ignoring {xissue.pid} : Issue is not available due to S2O policy")
 67                  return
 68
 69          issue_doi_tag = soup.select_one("div.issue-doi a")
 70          if issue_doi_tag:  # 70 ↛ 73: condition was always true
 71              xissue.doi = cleanup_str(issue_doi_tag.text)
 72
 73          articles = soup.select("#toc-area .title")
 74          for index, article_tag in enumerate(articles):
 75              xarticle = create_articledata()
 76              article_href = article_tag.get("href")
 77              if not isinstance(article_href, str):  # 77 ↛ 78: condition was never true
 78                  raise ValueError("Couldn't parse article url")
 79              xarticle.url = urljoin(xissue.url, article_href)
 80              xarticle.pid = "a" + str(index)
 81              xissue.articles.append(xarticle)
 82

 83      def parse_article_content(self, content, xissue, xarticle, url):
 84          soup = BeautifulSoup(content, "html.parser")
 85
 86          # Warning: the meta DOI is sometimes incorrect
 87          self.get_metadata_using_citation_meta(
 88              xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
 89          )
 90
 91          doi_tag = soup.select_one(".paper-doi > a")
 92          if doi_tag:  # 92 ↛ 95: condition was always true
 93              xarticle.doi = doi_tag.text
 94
 95          article_data: dict[str, Tag] = {}
 96          article_sections = soup.select("#content-area > .article")
 97          for section in article_sections:
 98              if section.select_one(".copyright-license"):
 99                  continue
100
101              tabs = section.select("tr")
102              section_title_tag = tabs[0].select_one("h5")
103              if not section_title_tag:  # 103 ↛ 104: condition was never true
104                  print(f"{xarticle.pid} : Skipping section")
105                  continue
106              section_title = section_title_tag.text
107              section_title_tag.decompose()
108              del section_title_tag
109
110              section_content = tabs[0]
111              if len(tabs) > 1:
112                  section_content = tabs[1]
113              section_tag = section_content.select_one("tr > td.article-area")
114              if section_tag:  # 114 ↛ 97: condition was always true
115                  article_data[section_title] = section_tag
116          del article_sections
117
118          if "Keywords" in article_data and article_data["Keywords"] != "":  # 118 ↛ 122: condition was always true
119              for kwd in article_data["Keywords"].text.split(", "):
120                  xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))
121
122          if (
123              "Mathematical Subject Classification 2010" in article_data
124              and article_data["Mathematical Subject Classification 2010"] != ""
125          ):
126              msc_long_text = (
127                  cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
128                  .replace("Primary: ", "")
129                  .replace(" Secondary: ", ", ")
130              )
131              for kwd in msc_long_text.split(", "):
132                  xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))
133
134          if "Abstract" in article_data and article_data["Abstract"] != "":  # 134 ↛ 151: condition was always true
135              abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
136              test = CkeditorParser(
137                  html_value=abstract_str,
138                  mml_formulas="",
139              )
140              # QUESTION: is value_xml valid here, or should we not wrap this inside an abstract tag?
141              abstract = create_abstract(
142                  lang="en",
143                  tag="abstract",
144                  value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
145                  value_tex=test.value_tex,
146                  value_html=test.value_html,
147              )
148
149              xarticle.abstracts.append(abstract)
150
151          self.parse_msp_references(xarticle)
152          return xarticle
153
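As an aside (not part of the crawled module): a sketch of how the MSC section text above is reduced to individual codes by the two replace calls; the sample string is hypothetical.

msc_long_text = (
    "Primary: 55P62, 55U35 Secondary: 18G55"  # hypothetical section text
    .replace("Primary: ", "")
    .replace(" Secondary: ", ", ")
)
print(msc_long_text.split(", "))  # ['55P62', '55U35', '18G55']
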

154      def parse_msp_references(self, xarticle: ArticleData):
155          url = urlparse(xarticle.url)
156          dirname = os.path.dirname(url.path)
157          filename = os.path.basename(url.path)
158          url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))
159
160          content = self.download_file(str(url.geturl()))
161          soup = BeautifulSoup(content, "html.parser")
162          references = soup.select("#content-area table.article:last-of-type tr")
163
164          bibitems = []
165          # TODO: extensive parsing (authors, title, etc.)
166          # Currently, only the text is inserted
167          for ref in references:
168              td = ref.select("td")
169              value_xml = self.parse_single_ref(td[1])
170              bibitem = self.create_crawled_bibitem(value_xml, cleanup_str(td[0].text))
171              bibitems.append(bibitem)
172          if len(bibitems) > 0:  # 172 ↛ exit: condition was always true
173              xarticle.abstracts.append(self.create_bibliography(bibitems))
174
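As an aside (not part of the crawled module): a sketch of the URL rewrite performed above, which derives the reference-list page from the article page by swapping "p" for "b" in the file name; the sample URL is hypothetical.

import os
from urllib.parse import urljoin, urlparse

article_url = "https://msp.org/agt/2019/19-3/p01.xhtml"  # hypothetical article URL
url = urlparse(article_url)
dirname = os.path.dirname(url.path)    # "/agt/2019/19-3"
filename = os.path.basename(url.path)  # "p01.xhtml"
url = url._replace(path=urljoin(dirname + "/", filename.replace("p", "b")))
print(url.geturl())  # https://msp.org/agt/2019/19-3/b01.xhtml
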

175      def parse_single_ref(self, tag: Tag):
176          xml_list = []
177          ext_links = []
178          authors_closed = False
179
180          for element in tag.contents:
181              if isinstance(element, str):
182                  xml_list.append(element)
183                  continue
184              if isinstance(element, Tag):  # 184 ↛ 180: condition was always true
185                  if element.name == "b" and not authors_closed:
186                      xml_list.append(f"<string-name>{element.text}</string-name>")
187                  elif element.name == "i" and not authors_closed:
188                      temp_element = xml_list.pop()
189                      xml_list = [
190                          f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
191                          temp_element,
192                      ]
193                      xml_list.append(get_citation_article_title_xml(element.text))
194                      del temp_element
195
196                      link = element.select_one("a")
197                      if link:
198                          link_href = link.get("href")
199                          if isinstance(link_href, str):  # 199 ↛ 204: condition was always true
200                              if link_href.startswith("https://doi.org/"):
201                                  link_href = link_href.removeprefix("https://doi.org/")
202                              ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))
203
204                      authors_closed = True
205                  elif element.name == "a":
206                      pass
207                  continue
208
209          return cleanup_str("".join(xml_list) + "".join(ext_links))
210
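As an aside (not part of the crawled module): a sketch of how xml_list is reshaped when the italic title element is reached, wrapping the author fragments collected so far in a person-group element; the strings are simplified stand-ins.

xml_list = ["<string-name>A. Author</string-name>", ", "]
temp_element = xml_list.pop()  # trailing separator, kept outside the wrapper
xml_list = [
    f'<person-group person-group-type="author">{"".join(xml_list)}</person-group>',
    temp_element,
]
print(xml_list[0])
# <person-group person-group-type="author"><string-name>A. Author</string-name></person-group>
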

211      def process_resource_metadata(self, xresource: ResourceData):
212          html, xml = get_html_and_xml_from_text_with_formulas(
213              xresource.title_tex,
214              delimiter_inline=self.delimiter_inline_formula,
215              delimiter_disp=self.delimiter_disp_formula,
216          )
217          xml = get_issue_title_xml(xml, with_tex_values=False)
218          xresource.title_html = html
219          xresource.title_xml = xml
220
221          if isinstance(xresource, ArticleData):  # 221 ↛ 222: condition was never true
222              update_data_for_jats(xresource)
223
224          return xresource