Coverage for src/crawler/by_source/ams_crawler.py: 49%

67 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

 1  import json
 2  import os
 3  import subprocess
 4  from urllib.parse import urljoin
 5  from uuid import uuid4
 6
 7  from bs4 import BeautifulSoup
 8  from ptf.model_data import create_abstract, create_articledata
 9  from ptf.utils import execute_cmd
10
11  from crawler.base_crawler import BaseCollectionCrawler
12
13
14  class AmsCrawler(BaseCollectionCrawler):
15      source_name = "American Mathematical Society"
16      source_domain = "AMS"
17      source_website = "https://www.ams.org/"
18
19      def parse_collection_content(self, content):
20          xissues = []
21          soup = BeautifulSoup(content, "html.parser")
22          issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
23          issues = json.loads(self.get_col_issues(issues_data_tag.text))
24          for i in issues:
25              xissues.append(
26                  self.create_xissue(
27                      urljoin(self.collection_url, issues[i]["url"]),
28                      issues[i]["year"],
29                      issues[i]["volume"],
30                      issues[i].get("number", None),
31                  )
32              )
33          return xissues
34
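For reference, parse_collection_content assumes that the JSON string returned by get_col_issues maps each issue key to an object carrying "url", "year", "volume" and, optionally, "number". A minimal sketch of that shape, with purely illustrative keys and values (the real AMS data may differ):

    issues = {
        "2024.377.1": {                        # key format is an assumption
            "url": "journals/example-issue/",  # resolved against collection_url via urljoin
            "year": "2024",
            "volume": "377",
            "number": "1",                     # optional, hence issues[i].get("number", None)
        },
    }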

35      def get_col_issues(self, input: str):
36          filename = "/tmp/crawler/puppeteer/" + str(uuid4())
37          filename_out = filename + "-out"
38          os.makedirs(os.path.dirname(filename), exist_ok=True)
39          with open(filename, "w") as file:
40              file.write(input)
41
42          content = None
43          attempt = 0
44          while not content and attempt < 3:
45              attempt += 1
46              try:
47                  cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
48                  print(cmd)
49                  execute_cmd(cmd)
50
51                  if os.path.isfile(filename_out):
52                      with open(filename_out) as file_:
53                          content = file_.read()
54
55                  os.remove(filename)
56                  os.remove(filename_out)
57
58              except subprocess.CalledProcessError:
59                  pass
60          if not content:
61              raise ValueError("Couldn't parse collection content")
62          return content
63
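The command assembled on line 47 points at the ams_crawler_col.js helper that sits next to this module and passes it the temporary input and output files. With illustrative paths (the install directory and the random UUID differ on every run) it looks roughly like:

    /app/src/crawler/by_source/ams_crawler_col.js -f /tmp/crawler/puppeteer/<uuid> -o /tmp/crawler/puppeteer/<uuid>-out

The output file only exists if the helper succeeds, which is why the loop retries up to three times and stops once content has been read back.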

64      def parse_issue_content(self, content, xissue):
65          soup = BeautifulSoup(content, "html.parser")
66          articles = soup.select(
67              "dd > a:-soup-contains-own('Abstract, references and article information')"
68          )
69          for index, a in enumerate(articles):
70              article_url = a.get("href")
71              if not isinstance(article_url, str):  # coverage: 71 ↛ 72, the condition was never true
72                  raise ValueError("Couldn't parse article url")
73              xarticle = create_articledata()
74              xarticle.url = urljoin(self.collection_url, article_url)
75              xarticle.pid = "a" + str(index)
76              xissue.articles.append(xarticle)
77
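The selector on lines 66-68 uses soupsieve's :-soup-contains-own() pseudo-class, so it only picks up anchors whose own text contains the 'Abstract, references and article information' label that parse_issue_content looks for on an issue's table of contents. A small self-contained sketch with an illustrative HTML fragment:

    from bs4 import BeautifulSoup

    html = """
    <dl>
      <dt>An example article title</dt>
      <dd><a href="/journals/example-article/">Abstract, references and article information</a></dd>
      <dd><a href="/journals/example-article.pdf">Full-text PDF</a></dd>
    </dl>
    """
    soup = BeautifulSoup(html, "html.parser")
    links = soup.select(
        "dd > a:-soup-contains-own('Abstract, references and article information')"
    )
    print([a.get("href") for a in links])  # ['/journals/example-article/']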

78      def parse_article_content(self, content, xissue, xarticle, url, pid):
79          soup = BeautifulSoup(content, "html.parser")
80          self.get_metadata_using_citation_meta(
81              xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
82          )
83          xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))
84
85          abstract_subtag = soup.select_one("a#Abstract")
86          if abstract_subtag:  # coverage: 86 ↛ 94, the condition was always true
87              abstract_tag = abstract_subtag.parent
88              if abstract_tag:  # coverage: 88 ↛ 94, the condition was always true
89                  abstract_subtag.decompose()
90                  xarticle.abstracts.append(
91                      create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
92                  )
93
94          return super().parse_article_content(content, xissue, xarticle, url, pid)
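The abstract handling on lines 85-92 expects the article page to keep the abstract text inside the element that also contains the <a id="Abstract"> anchor; removing that anchor with decompose() leaves only the abstract text, which then becomes the value_tex of the added abstract. A minimal sketch of the same pattern on an illustrative fragment:

    from bs4 import BeautifulSoup

    html = '<section><a id="Abstract">Abstract:</a> We prove an example theorem.</section>'
    soup = BeautifulSoup(html, "html.parser")
    anchor = soup.select_one("a#Abstract")
    container = anchor.parent      # the element holding the abstract text
    anchor.decompose()             # drop the "Abstract:" label link
    print(container.text.strip())  # 'We prove an example theorem.'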