Coverage for src/crawler/by_source/ams_crawler.py: 52%
70 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import json
2import os
3import subprocess
4from urllib.parse import urljoin
5from uuid import uuid4
7from bs4 import BeautifulSoup
8from ptf.model_data import create_abstract, create_articledata
9from ptf.utils import execute_cmd
11from crawler.base_crawler import BaseCollectionCrawler
12from crawler.tests.data_generation.decorators import skip_generation
class AmsCrawler(BaseCollectionCrawler):
    """Crawler for collections hosted by the American Mathematical Society.

    The AMS collection page embeds its issue index in an inline (src-less)
    JavaScript tag.  A companion Node script (``ams_crawler_col.js``) is
    used to evaluate that script and emit the issue data as JSON.
    """

    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"

    @skip_generation
    def parse_collection_content(self, content):
        """Parse the collection page HTML and return the list of issues.

        content -- HTML of the collection page.

        Returns a list of xissue objects, one per entry in the inline
        JavaScript issue index.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        # The issue index lives in an inline JavaScript tag (no src attribute).
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    # "number" is optional in the issue index.
                    issues[i].get("number", None),
                )
            )
        return xissues

    def get_col_issues(self, input: str):
        """Run the Node helper on the inline issue script and return its JSON.

        input -- JavaScript source extracted from the collection page.

        The script text is written to a temporary file and handed to
        ``ams_crawler_col.js``, which writes the parsed issue data to a
        second temporary file.  Up to three attempts are made before
        giving up.

        Raises ValueError if no output could be produced.
        """
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        try:
            while not content and attempt < 3:
                attempt += 1
                try:
                    # BUGFIX: pass the temp file we just wrote via -f
                    # (previously a placeholder, so the Node script never
                    # received the collection script).
                    cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
                    print(cmd)
                    execute_cmd(cmd)

                    if os.path.isfile(filename_out):
                        with open(filename_out) as file_:
                            content = file_.read()
                        # BUGFIX: only remove the output file when it exists;
                        # unconditional removal raised an uncaught
                        # FileNotFoundError when the Node script produced
                        # nothing.
                        os.remove(filename_out)
                except subprocess.CalledProcessError:
                    # The Node script can fail transiently; retry.
                    pass
        finally:
            # Always clean up the input temp file, even on failure
            # (previously leaked when all attempts failed).
            if os.path.isfile(filename):
                os.remove(filename)

        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def parse_issue_content(self, content, xissue):
        """Parse an issue page and append one xarticle per article link.

        content -- HTML of the issue page.
        xissue  -- issue object whose ``articles`` list is filled in place.

        Raises ValueError if an article anchor has no usable href.
        """
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(
            "dd > a:-soup-contains-own('Abstract, references and article information')"
        )
        for index, a in enumerate(articles):
            article_url = a.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, article_url)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page: citation metadata, bibliography, abstract.

        content  -- HTML of the article page.
        xissue   -- the enclosing issue object.
        xarticle -- article object updated in place and returned.
        url      -- article URL (unused here; kept for the crawler interface).
        """
        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )
        # Project convention: the bibliography is stored as an extra abstract.
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:
            abstract_tag = abstract_subtag.parent
            if abstract_tag:
                # Drop the anchor itself so only the abstract text remains.
                abstract_subtag.decompose()
                xarticle.abstracts.append(
                    create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
                )
        return xarticle