Coverage for src/crawler/by_source/msp_crawler.py: 89%

130 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

import os
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser
from ptf.cmds.xml.jats.builder.citation import get_article_title_xml, get_ext_link_xml
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str, regex_to_dict

class MspCrawler(BaseCollectionCrawler):
    source_name = "Mathematical Sciences Publishers"
    source_domain = "MSP"
    source_website = "https://msp.org/"

    issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("td.issues-area a.about[href]")
        for issue in issues:
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse issue href")

            issue_dict = regex_to_dict(
                self.issue_re, issue_href, error_msg="Couldn't parse issue data"
            )

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict["number"],
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        if xissue.url is None:  # coverage: condition never true
            raise ValueError("Cannot parse article : issue url is None")

        incomplete = soup.select_one(".incomplete")
        if incomplete:  # coverage: condition always true
            if cleanup_str(incomplete.text) != "Publication of this issue is now complete.":  # coverage: condition never true
                print(f"Ignoring {xissue.pid} : Issue is not available due to S2O policy")
                return

        issue_doi_tag = soup.select_one("div.issue-doi a")
        if issue_doi_tag:  # coverage: condition always true
            xissue.doi = cleanup_str(issue_doi_tag.text)

        articles = soup.select("#toc-area .title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: condition never true
                raise ValueError("Couldn't parse article url")
            xarticle.url = urljoin(xissue.url, article_href)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")

        # Warning: the meta doi is sometimes incorrect
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["title", "author", "page", "pdf", "publisher"]
        )

        doi_tag = soup.select_one(".paper-doi > a")
        if doi_tag:  # coverage: condition always true
            xarticle.doi = doi_tag.text

        article_data: dict[str, Tag] = {}
        article_sections = soup.select("#content-area > .article")
        for section in article_sections:
            if section.select_one(".copyright-license"):
                continue

            tabs = section.select("tr")
            section_title_tag = tabs[0].select_one("h5")
            if not section_title_tag:  # coverage: condition never true
                print(f"{xarticle.pid} : Skipping section")
                continue
            section_title = section_title_tag.text
            section_title_tag.decompose()
            del section_title_tag

            section_content = tabs[0]
            if len(tabs) > 1:
                section_content = tabs[1]
            section_tag = section_content.select_one("tr > td.article-area")
            if section_tag:  # coverage: condition always true
                article_data[section_title] = section_tag
        del article_sections

        if "Keywords" in article_data and article_data["Keywords"] != "":  # coverage: condition always true
            for kwd in article_data["Keywords"].text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="kwd", value=kwd))

        if (
            "Mathematical Subject Classification 2010" in article_data
            and article_data["Mathematical Subject Classification 2010"] != ""
        ):
            msc_long_text = (
                cleanup_str(article_data["Mathematical Subject Classification 2010"].text)
                .replace("Primary: ", "")
                .replace(" Secondary: ", ", ")
            )
            for kwd in msc_long_text.split(", "):
                xarticle.kwds.append(create_subj(lang="en", type="msc", value=kwd))

        if "Abstract" in article_data and article_data["Abstract"] != "":  # coverage: condition always true
            abstract_str = "".join(str(e) for e in article_data["Abstract"].select("p"))
            test = CkeditorParser(
                html_value=abstract_str,
                mml_formulas="",
            )
            # QUESTION: is value_xml valid here, or should we not wrap this inside an abstract tag?
            abstract = create_abstract(
                lang="en",
                tag="abstract",
                value_xml=f'<abstract xml:lang="en">{test.value_xml}</abstract>',
                value_tex=test.value_tex,
                value_html=test.value_html,
            )

            xarticle.abstracts.append(abstract)

        self.parse_msp_references(xarticle)
        return xarticle

    def parse_msp_references(self, xarticle: ArticleData):
        # The reference list is served on a sibling page: replace "p" with "b"
        # in the article filename.
        url = urlparse(xarticle.url)
        dirname = os.path.dirname(url.path)
        filename = os.path.basename(url.path)
        url = url._replace(path=urljoin(str(dirname) + "/", str(filename).replace("p", "b")))

        content = self.download_file(str(url.geturl()))
        soup = BeautifulSoup(content, "html.parser")
        references = soup.select("#content-area table.article:last-of-type tr")

        bibitems = []
        # TODO : extensive parsing (authors, title etc...)
        # Currently, only the text is inserted
        for ref in references:
            td = ref.select("td")
            value_xml = self.parse_single_ref(td[1])
            bibitem = JatsBase.bake_ref(value_xml, cleanup_str(td[0].text))
            bibitems.append(bibitem)
        if len(bibitems) > 0:  # coverage: condition always true
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))

    def parse_single_ref(self, tag: Tag):
        # Heuristic: <b> elements hold author names, the first <i> element is the
        # article title, and a link inside it (if any) is treated as a DOI.
        xml_list = []
        ext_links = []
        authors_closed = False

        for element in tag.contents:
            if isinstance(element, str):
                xml_list.append(element)
                continue
            if isinstance(element, Tag):  # coverage: condition always true
                if element.name == "b" and not authors_closed:
                    xml_list.append(f"<string-name>{element.text}</string-name>")
                elif element.name == "i" and not authors_closed:
                    temp_element = xml_list.pop()
                    xml_list = [
                        f'<person-group person-group-type="author">{cleanup_str("".join(xml_list))}</person-group>',
                        temp_element,
                    ]
                    xml_list.append(get_article_title_xml(element.text))
                    del temp_element

                    link = element.select_one("a")
                    if link:
                        link_href = link.get("href")
                        if isinstance(link_href, str):  # coverage: condition always true
                            if link_href.startswith("https://doi.org/"):
                                link_href = link_href.removeprefix("https://doi.org/")
                            ext_links.append(get_ext_link_xml(link_href, link_href, "doi"))

                    authors_closed = True
                elif element.name == "a":
                    pass
                continue

        return cleanup_str("".join(xml_list) + "".join(ext_links))
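
For illustration only (not part of the crawler source): the issue_re pattern above can be exercised with the standard-library re module. The path "/agt/2023/23-1" below is a hypothetical MSP issue href chosen to match the pattern.

import re

issue_re = r"\/\w+\/(?P<year>\d+)\/(?P<volume>\d+)\-(?P<number>\d+)"
# Hypothetical issue href, used only to show the named groups:
match = re.search(issue_re, "/agt/2023/23-1")
if match:
    print(match.groupdict())  # {'year': '2023', 'volume': '23', 'number': '1'}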