Coverage for src/crawler/by_source/ams_crawler.py: 22%

71 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import json
import os
import subprocess
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_abstract, create_articledata
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import skip_generation


class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"
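    # parse_collection_content: the collection page builds its issue list with an
    # inline <script> block; get_col_issues() evaluates that script and returns
    # JSON with each issue's url, year, volume and optional number.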

    @skip_generation
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues
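    # get_col_issues: the inline script is written to a temporary file under
    # /tmp/crawler/puppeteer/ and passed to the external ams_crawler_col.js helper
    # (up to 3 attempts), which writes the issue data to "<filename>-out".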

    def get_col_issues(self, input: str):
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
                print(cmd)
                execute_cmd(cmd)

                if os.path.isfile(filename_out):
                    with open(filename_out) as file_:
                        content = file_.read()

                    os.remove(filename)
                    os.remove(filename_out)

            except subprocess.CalledProcessError:
                pass
        if not content:
            raise ValueError("Couldn't parse collection content")
        return content
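    # parse_issue_content: an issue's table of contents links each article through
    # an "Abstract, references and article information" anchor.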

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(
            "dd > a:-soup-contains-own('Abstract, references and article information')"
        )
        for index, a in enumerate(articles):
            article_url = a.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, article_url)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)
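    # parse_article_content: metadata is read from the citation_* <meta> tags,
    # references are compiled with JatsBase.compile_refs, and the abstract text is
    # taken from the parent of the "a#Abstract" anchor.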

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))

        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(
                    create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
                )

        return xarticle