Coverage for src/crawler/by_source/msp_crawler.py: 88%

145 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import os
from urllib.parse import urljoin, urlparse

import regex
from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.ckeditor.utils import get_html_and_xml_from_text_with_formulas
from ptf.cmds.xml.jats.builder.citation import (
    get_article_title_xml as get_citation_article_title_xml,
)
from ptf.cmds.xml.jats.builder.citation import get_ext_link_xml
from ptf.cmds.xml.jats.builder.issue import get_title_xml as get_issue_title_xml
from ptf.model_data import (
    ArticleData,
    ResourceData,
    create_abstract,
    create_articledata,
    create_subj,
)
from ptf.model_data_converter import update_data_for_jats

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
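    # Illustrative example (assumed, not taken from the report): a relative issue link
    # such as "/gt/2024/28-3" would match issue_re with year="2024", volume="28",
    # number="3".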

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true in this run
                raise ValueError("Couldn't parse issue href")

            issue_search = regex.search(self.issue_re, issue_href)
            if not issue_search:  # coverage: condition never true in this run
                raise ValueError("Couldn't parse issue data")

            issue_dict = issue_search.groupdict()

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:  # coverage: condition never true in this run
            raise ValueError("Cannot parse article : issue url is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:  # coverage: condition always true in this run
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # coverage: condition never true in this run
                print(f"Ignoring {xissue.pid} : Issue is not available due to S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:  # coverage: condition always true in this run
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true in this run
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warn : meta doi is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:  # coverage: condition always true in this run
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:  # coverage: condition never true in this run
                print(f"{xarticle.pid} : Skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:  # coverage: condition always true in this run
                article_data[section_title] = section_tag
        del article_sections

        if "Keywords" in article_data and article_data["Keywords"] != "":  # coverage: condition always true in this run
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"] != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))
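        # Illustrative example (assumed input, not taken from the report): a section text of
        # "Primary: 57M25 Secondary: 57M27, 57N10" would be normalised above to
        # "57M25, 57M27, 57N10" and stored as three "msc" subjects.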

        if "Abstract" in article_data and article_data["Abstract"] != "":  # coverage: condition always true in this run
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            test = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )
            # QUESTION : is value_xml here valid, or should we not wrap this inside an abstract tag
            abstract = create_abstract(
                lang="en",
                tag="abstract",
                value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
                value_tex=test.value_tex,
                value_html=test.value_html,
            )

            xarticle.abstracts.append(abstract)
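            # Assumed shape (illustrative only): value_xml ends up roughly as
            # '<abstract xml:lang="en">…parser output for the concatenated <p> elements…</abstract>',
            # i.e. whatever CkeditorParser produces, wrapped in a single <abstract> element.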

        self.parse_msp_references(xarticle)
        return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))
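        # The bibliography page URL is derived from the article page by swapping "p" for "b"
        # in the filename; e.g. an assumed article path ending in "p001.xhtml" would become
        # "b001.xhtml". Note that str.replace substitutes every "p" in the filename, not only
        # the leading one.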

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO : extensive parsing (authors, title etc...)
        # Currently, only the text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = self.create_crawled_bibitem(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
        if len(bibitems) > 0:  # coverage: condition always true in this run
            xarticle.abstracts.append(self.create_bibliography(bibitems))

    def parse_single_ref(self, tag: Tag):
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):  # coverage: condition always true in this run
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_citation_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):  # coverage: condition always true in this run
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                            ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))
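    # Illustrative trace (assumed markup, not taken from the report): for a reference cell like
    # <b>A. Author</b>, <i><a href="https://doi.org/10.0000/example">Some title</a></i>, J. Examples 1 (2020),
    # parse_single_ref would return the string-name wrapped in a person-group, the separator text,
    # the article-title XML produced by get_citation_article_title_xml, the trailing plain text,
    # and an ext-link of type "doi" for the stripped value "10.0000/example".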

    def process_resource_metadata(self, xresource: ResourceData):
        html, xml = get_html_and_xml_from_text_with_formulas(
            xresource.title_tex,
            delimiter_inline=self.delimiter_inline_formula,
            delimiter_disp=self.delimiter_disp_formula,
        )
        xml = get_issue_title_xml(xml, with_tex_values=False)
        xresource.title_html = html
        xresource.title_xml = xml

        if isinstance(xresource, ArticleData):  # coverage: condition never true in this run
            update_data_for_jats(xresource)

        return xresource
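

A minimal standalone sketch (not part of msp_crawler.py) showing how the issue_re pattern is expected to behave; the sample path "/gt/2024/28-3" is made up for demonstration only:

# demo_issue_re.py -- hypothetical file, illustrative only
import regex

issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"

# Search an assumed MSP-style relative issue path.
match = regex.search(issue_re, "/gt/2024/28-3")
assert match is not None
print(match.groupdict())  # {'year': '2024', 'volume': '28', 'number': '3'}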