Coverage for src/crawler/by_source/msp_crawler.py: 88%

136 statements  

coverage.py v7.9.0, created at 2025-08-29 13:43 +0000

import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.jats.builder.issue import get_abstract_xml
from ptf.cmds.xml.jats.builder.references import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict
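
# Crawler for collections hosted by Mathematical Sciences Publishers (msp.org):
# it walks the collection page for issues, each issue's table of contents for
# articles, and each article page plus its companion bibliography page.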

class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"/\w+/(?P<year>\d+)/(?P<volume>\d+)-(?P<number>\d+)"
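    # A hypothetical issue href such as "/agt/2015/15-1" yields year="2015",
    # volume="15", number="1".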


    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue href")

            issue_dict = regex_to_dict(
                self.issue_re, issue_href, error_msg="Couldn't parse issue data"
            )

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:
            raise ValueError("Cannot parse article: issue url is None")

        # An ".incomplete" banner that does not announce completed publication
        # means the issue is still closed under MSP's Subscribe to Open (S2O) policy.
        incomplete = soup.select_one(".incomplete")
        if incomplete:
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":
                self.logger.debug(
                    "Ignoring: issue is not available due to S2O policy",
                    extra={"pid": xissue.pid},
                )
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            # Article pids are positional within the issue: "a0", "a1", ...
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the citation_doi meta tag is sometimes incorrect, so "doi" is
        # deliberately omitted here; the DOI is read from the page body below.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )


        if not self.is_article_openaccess(xarticle):
            return

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:
            xarticle.doi = doi_tag.text


        # Each "#content-area > .article" table holds one titled section
        # (Abstract, Keywords, ...); map section titles to their content cells.
        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            rows = section.select("tr")
            section_title_tag = rows[0].select_one("h5")
            if not section_title_tag:
                self.logger.debug("Skipping section", extra={"pid": xarticle.pid})
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            # The section body is usually in the second row; fall back to the
            # first when the title and content share a row.
            section_content = rows[0]
            if len(rows) > 1:
                section_content = rows[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:
                article_data[section_title] = section_tag
        del article_sections


        if "Keywords" in article_data and article_data["Keywords"].text != "":
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))


        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"].text != ""
        ):
            # Flatten "Primary: A, B Secondary: C, D" into one comma-separated list.
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))


        if "Abstract" in article_data and article_data["Abstract"].text != "":
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            parser = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )

            abstract = create_abstract(
                lang="en",
                value_xml=get_abstract_xml(parser.value_xml, lang="en"),
                value_tex=parser.value_tex,
                value_html=parser.value_html,
            )

            xarticle.abstracts.append(abstract)

        self.parse_msp_references(xarticle)
        return xarticle


    def is_article_openaccess(self, xarticle: ArticleData):
        # Probe the full-text URL with a HEAD request: open articles serve the
        # PDF directly, while closed ones presumably return an HTML page instead.
        stream = next(stream for stream in xarticle.streams if stream["rel"] == "full-text")
        pdf_url = stream["location"]
        pdf_response = self.session.head(pdf_url)

        return pdf_response.headers.get("Content-Type") == "application/pdf"


    def parse_msp_references(self, xarticle: ArticleData):
        # The bibliography lives on a companion page whose filename swaps the
        # article page's "p" for a "b".
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(dirname + "/", filename.replace("p", "b")))
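        # Hypothetical example: an article page path ".../2015/15-1/p1.xhtml"
        # is rewritten to the bibliography page ".../2015/15-1/b1.xhtml".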


        content = self.download_file(url.geturl())
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the reference text is inserted.
        for ref in references:
            td = ref.select("td")
            # td[0] holds the reference label, td[1] the reference text itself.
            value_xml = self.parse_single_ref(td[1])
            xarticle.bibitems.append(JatsBase.bake_ref(value_xml, cleanup_str(td[0].text)))


    def parse_single_ref(self, tag: Tag):
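        """Convert one bibliography cell into JATS reference XML.

        Assumed MSP markup (inferred from the parsing below): author names in
        <b> tags, the title in an <i> tag that may wrap a DOI link, and plain
        separator text in between.
        """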

        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    # The title marks the end of the author list: wrap everything
                    # collected so far (except the trailing separator) in a
                    # <person-group>, then append the title.
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                            ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    # Bare links outside the title are currently ignored.
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))
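
# Minimal usage sketch (hypothetical driver code; the real entry points are
# provided by BaseCollectionCrawler and may differ):
#
#     crawler = MspCrawler()
#     content = crawler.download_file(collection_url)  # collection landing page
#     for xissue in crawler.parse_collection_content(content):
#         ...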