Coverage for src/crawler/by_source/ams/ams_base_crawler.py: 25%

61 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

import json
import os
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import skip_generation


class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"

    @classmethod
    def get_view_id(cls):
        return "AMS"

    @skip_generation
    def parse_collection_content(self, content):
        # The list of issues is embedded in an inline <script> block of the collection
        # page; it is extracted via get_col_issues and decoded as JSON.
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues

    def get_col_issues(self, input: str):
        # Write the inline script to a temporary file and hand it to the puppeteer
        # helper ams_crawler_col.js, which writes the issue data to an output file.
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        # Retry the helper up to three times if no output file was produced.
        while not content and attempt < 3:
            attempt += 1
            cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
            execute_cmd(cmd)

            if os.path.isfile(filename_out):
                with open(filename_out) as file_:
                    content = file_.read()

        os.remove(filename)
        os.remove(filename_out)

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )

        # Abstract layout 1: an <a id="Abstract"> anchor whose parent element holds the text.
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                abstract_subtag.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))
        else:
            # Abstract layout 2: a <section id="Abstract"> block with an <h2> heading to strip.
            abstract_tag = soup.select_one("section#Abstract")
            if abstract_tag:
                abstract_header = abstract_tag.select_one("h2")
                if abstract_header:
                    abstract_header.decompose()
                xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract_tag.text))

        return xarticle
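
Note: the issues payload consumed by parse_collection_content is not shown in this report. Judging from the keys it reads, each entry carries a relative url, a year, a volume and an optional number. A minimal sketch of that structure; the sample keys and values are purely illustrative, not taken from an actual AMS page:

    # Hypothetical shape of json.loads(self.get_col_issues(...)) as read by
    # parse_collection_content; keys and sample values are illustrative only.
    issues = {
        "0": {
            "url": "journals/xyz/2024-001-01/",  # resolved against self.collection_url via urljoin
            "year": "2024",
            "volume": "1",
            "number": "1",  # optional: read with .get("number", None)
        },
    }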
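
parse_article_content handles two abstract markups: an <a id="Abstract"> anchor whose parent element contains the abstract text, and a standalone <section id="Abstract"> with an <h2> heading. A small self-contained sketch of what the two selectors match; the HTML snippets are invented for illustration:

    from bs4 import BeautifulSoup

    # Layout 1: anchor inside the abstract paragraph; the anchor itself is removed,
    # and the parent's remaining text becomes the abstract.
    html_anchor = '<p><a id="Abstract">Abstract.</a> We prove a hypothetical theorem.</p>'
    anchor = BeautifulSoup(html_anchor, "html.parser").select_one("a#Abstract")
    parent = anchor.parent
    anchor.decompose()
    print(parent.text.strip())  # -> We prove a hypothetical theorem.

    # Layout 2: dedicated section; the <h2> heading is removed before reading the text.
    html_section = '<section id="Abstract"><h2>Abstract</h2>We prove a hypothetical theorem.</section>'
    section = BeautifulSoup(html_section, "html.parser").select_one("section#Abstract")
    heading = section.select_one("h2")
    heading.decompose()
    print(section.text.strip())  # -> We prove a hypothetical theorem.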