Coverage for src / crawler / by_source / ams_crawler.py: 15%

138 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1import html 

2import json 

3import os 

4from urllib.parse import urljoin 

5from uuid import uuid4 

6 

7from bs4 import BeautifulSoup, Tag 

8from opentelemetry import trace 

9from ptf.cmds.xml.ckeditor.ckeditor_parser import CkeditorParser 

10from ptf.cmds.xml.ckeditor.utils import get_abstract_xml 

11from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj 

12from ptf.utils import execute_cmd 

13 

14from crawler.abstract_crawlers.threaded_crawler import ThreadedCrawler 

15from crawler.cmds.mixed_citation import ExtLinkXml, MixedCitation 

16from crawler.tests.data_generation.decorators import skip_generation 

17from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

18 

19 

class AmsCrawler(ThreadedCrawler):
    """Crawler for the journals of the American Mathematical Society.

    The collection page embeds the list of issues in an inline JavaScript
    snippet; the content of each issue is then fetched from a JSON API.
    For this source, ``xissue.url`` does not hold a real URL but the AMS
    issue ID used to query that API.
    """

    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"
    tracer = trace.get_tracer(__name__)

    @classmethod
    def get_view_id(cls):
        """Return the identifier of this source ("AMS")."""
        return "AMS"

    @skip_generation
    def parse_collection_content(self, content):
        """Build the list of issues advertised on a collection page.

        Also stores ``self.group_by_year`` and ``self.ams_code`` (read from
        the embedded issue data) for the later per-issue API calls.

        Args:
            content: HTML of the collection page.

        Returns:
            The list of xissues created through ``self.create_xissue``.

        Raises:
            ValueError: if the inline script holding the issue data is
                missing, or if it cannot be converted to JSON.
        """
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one(
            ".container main[role='main'] script[type='text/javascript']:not([src])"
        )
        if issues_data_tag is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError("Couldn't find the issues script in collection content")

        data = json.loads(self.get_col_issues(issues_data_tag.text))
        self.group_by_year = data["group_by_year"] == "Y"
        self.ams_code = data["ams_code"].lower()

        xissues = []
        for issue in data["issues"]:
            number = issue.get("IssueNumber")
            if self.group_by_year:
                number = None
            elif number:
                number = str(number)
            # For AMS, xissue.url is NOT a real URL, but the AMS issue ID
            # Issue data is fetched from an API and thus every issue url is the same
            xissues.append(
                self.create_xissue(
                    str(issue["IssueId"]),
                    str(issue["Year"]),
                    str(issue["Volume"]),
                    number,
                )
            )

        if self.group_by_year:
            # We take only the first issue advertised by the website for each year.
            # All ignored issues will be present inside the API on the next step anyways
            first_by_year = {}
            for xissue in xissues:
                first_by_year.setdefault(xissue.year, xissue)
            xissues = list(first_by_year.values())
        return xissues

    def get_col_issues(self, input: str):
        """Convert the inline JavaScript listing the issues into JSON text.

        AMS issues are listed inside an inline js script. We have to spawn a
        nodejs subprocess (``ams_crawler_col.js``, located next to this file)
        to convert that javascript into json. The script is tried up to
        three times.

        Args:
            input: source of the inline script.

        Returns:
            The text written by the script to its output file.

        Raises:
            ValueError: if no output could be obtained after three attempts.
        """
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"

        content = None
        try:
            with open(filename, "w") as file:
                file.write(input)

            attempt = 0
            while not content and attempt < 3:
                attempt += 1
                execute_cmd(cmd)

                if os.path.isfile(filename_out):
                    with open(filename_out) as file_:
                        content = file_.read()
        finally:
            # Always clean the temporary files. The output file does not exist
            # when the script failed, which must not hide the ValueError below.
            for path in (filename, filename_out):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def download_issue_summary(self, issue_id):
        """Fetch the raw API response describing an issue.

        Args:
            issue_id: AMS issue ID (the value stored in ``xissue.url``).

        Returns:
            The body of the response, as text.
        """
        response = self.session.post(
            "https://pubs.ams.org/product/GetJournalIssueDetail",
            data={"productCode": self.ams_code, "issueId": issue_id},
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:149.0) Gecko/20100101 Firefox/149.0"
            },
        )
        return response.text

    def start_process_issue(self, xissue):
        """Download an issue from the API and parse its articles into ``xissue``.

        Raises:
            ValueError: if the xissue has no url (AMS issue ID), or if the
                API response does not contain the requested issue.
        """
        issue_url = xissue.url
        if not issue_url:
            raise ValueError("Issue does not have an URL")
        content = self.download_issue_summary(issue_url)
        # API response is somehow a list of issues
        # Currently CAMS somehow puts every article inside a different issue in the list...
        issues = json.loads(content)

        if self.group_by_year:
            articles = []
            for issue in issues:
                articles.extend(issue["Articles"])
        else:
            issue_json = next((i for i in issues if str(i["IssueId"]) == issue_url), None)
            if issue_json is None:
                # A bare next() would leak an uninformative StopIteration here.
                raise ValueError(f"Issue {issue_url} not found in the API response")
            articles = issue_json["Articles"]

        with self.tracer.start_as_current_span("parse_issue_content"):
            self.parse_ams_issue_content(articles, xissue)

    def parse_ams_issue_content(self, articles: list[dict], xissue):
        """Create one article per API entry and append it to ``xissue.articles``.

        Args:
            articles: article dictionaries taken from the API response.
            xissue: issue receiving the created articles.
        """
        for index, article_dict in enumerate(articles):
            xarticle = create_articledata()
            xarticle.title_tex = article_dict["Title"]
            xarticle.doi = article_dict["DOI"]
            xarticle.pid = f"a_{index}"
            xarticle.fpage = str(article_dict["StartPage"])
            xarticle.lpage = str(article_dict["EndPage"])
            xarticle.date_published = article_dict["PostDate"]

            if article_dict["DocumentType"] == "BOOKREV":
                # Untitled book reviews are named after the first reviewed book
                if xarticle.title_tex == "":
                    book_title = article_dict["BookReviews"][0]["Title"]
                    xarticle.title_tex = "Book review: " + book_title
            if article_dict["PrimaryMsc"] is not None:
                for msc in article_dict["PrimaryMsc"].split(", "):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))

            ckeditor_data = CkeditorParser(
                html_value=article_dict["Abstract"],
                mml_formulas="",
            )
            abstract = create_abstract(
                lang="en",
                value_xml=get_abstract_xml(ckeditor_data.value_xml, lang="en"),
                value_tex=ckeditor_data.value_tex,
                value_html=ckeditor_data.value_html,
            )
            xarticle.abstracts.append(abstract)

            # TODO : EnhancedReferences
            # TODO : UnenhancedReferences
            # TODO : BibliographicInfo

            add_pdf_link_to_xarticle(
                xarticle,
                urljoin("https://www.ams.org/journals/", self.ams_code + article_dict["PdfUrl"]),
            )
            xarticle.url = urljoin(
                self.collection_url,
                self.ams_code + "/" + article_dict["IssueDirectory"] + "/" + article_dict["PII"],
            )
            if article_dict["MRNumber"]:
                xarticle.extids.append(("mr-item-id", article_dict["MRNumber"]))

            for author in article_dict["Authors"]:
                # TODO : AMS Provides Firstname/MiddleName/LastName but we do not have Middlename fields
                # How should we proceed about that ?
                xarticle.contributors.append(
                    create_contributor(
                        role="author",
                        string_name=html.unescape(author["FullName"]),
                        email=html.unescape(author["Email"] or ""),
                        addresses=[html.unescape(author["Affiliation"] or "")],
                    )
                )

            # Other fields of this payload can be None; treat missing
            # references as an empty document instead of crashing the parser.
            soup = BeautifulSoup(article_dict["EnhancedReferences"] or "", "html5lib")
            for ref in soup.select("ul > li"):
                xarticle.bibitems.append(self.parse_ref(ref))

            xissue.articles.append(xarticle)

    def parse_ref(self, ref: "Tag"):
        """Convert one reference (an HTML ``<li>``) into a JATS reference.

        Text nodes are kept verbatim, except for the standalone "DOI" labels.
        Links whose text starts with "10." become doi.org external links,
        MathSciNet links are kept as external links, and any other tag is
        reduced to its text. Links without a string ``href`` are dropped.

        Args:
            ref: the tag holding the reference.

        Returns:
            The value of ``MixedCitation.get_jats_ref()`` for this reference.
        """
        citation_builder = MixedCitation()
        # Iterate over a snapshot and leave the tree untouched: ``ref.children``
        # walks the live ``contents`` list, so removing the current node while
        # iterating makes the loop skip the following sibling.
        for el in list(ref.children):
            if isinstance(el, str):
                if el in (", DOI ", " DOI ", "DOI"):
                    continue
                citation_builder.elements.append(el)
                continue

            if isinstance(el, Tag) and el.name == "a":
                if el.text.startswith("10."):
                    citation_builder.elements.append(
                        ExtLinkXml(urljoin("https://doi.org/", el.text))
                    )
                    continue

                href = el.get("href")
                if not isinstance(href, str):
                    continue
                if href.startswith("https://mathscinet.ams.org/mathscinet-getitem"):
                    citation_builder.elements.append(ExtLinkXml(href))
                    continue

            citation_builder.elements.append(el.get_text())
        return citation_builder.get_jats_ref()

218 

219 # def parse_ref(self, ref: "Tag"): 

220 # citation_builder = MixedCitation() 

221 # # Everything behind the title should be authors 

222 # title_element = ref.select_one("em") 

223 # if title_element: 

224 # authors = list(title_element.previous_siblings) 

225 # # if len(authors) != 1: 

226 # # self.logger.error("Could not correctly parse reference. Fallback to text") 

227 # # citation_builder.elements.append(ref.get_text()) 

228 # # return citation_builder.get_jats_ref() 

229 # # Temporary fix : structured bibitems parsing is sometimes incorrect. 

230 # # Better have no data than incorrect data (?) 

231 # for el in authors: 

232 # citation_builder.elements.append(el.get_text()) 

233 # for el in authors: 

234 # el.extract() 

235 # # authors_el = GenericRefElement() 

236 # # authors_el.name = "person-group" 

237 # # citation_builder.elements.append(authors_el) 

238 # # authors_text = authors[0].text 

239 # # if authors_text.endswith(", "): 

240 # # authors_text = authors_text.removesuffix(", ") 

241 # # authors_el.elements.append(authors_text) 

242 # # citation_builder.elements.append(", ") 

243 # # else: 

244 # # authors_el.elements.append(authors_text) 

245 

246 # article_title = MixedCitation() 

247 # article_title.name = "article-title" 

248 # citation_builder.elements.append(article_title) 

249 # article_title.elements.append(title_element.text) 

250 # title_element.decompose() 

251 

252 # # everything before a tag is text 

253 # first_link = ref.select_one("a") 

254 # if first_link: 

255 # texts = list(first_link.previous_siblings) 

256 # if len(texts) == 0: 

257 # raise ValueError("first_link previous_siblings is empty") 

258 # for el in reversed(texts): 

259 # citation_builder.elements.append(el.get_text().removesuffix(", Preprint, arXiv:")) 

260 # el.extract() 

261 

262 # for link in ref.select("a"): 

263 # url = link.get("href") 

264 # if not isinstance(url, str): 

265 # raise ValueError("Citation extlink does not have a valid url") 

266 # reflink = ExtLinkXml(url) 

267 # citation_builder.elements.append(reflink) 

268 # else: 

269 # citation_builder.elements.append(ref.get_text()) 

270 

271 # return citation_builder.get_jats_ref()