Coverage for src/crawler/by_source/ams_crawler.py: 49% (67 statements)
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
import json
import os
import subprocess
from urllib.parse import urljoin
from uuid import uuid4

from bs4 import BeautifulSoup
from ptf.model_data import create_abstract, create_articledata
from ptf.utils import execute_cmd

from crawler.base_crawler import BaseCollectionCrawler


class AmsCrawler(BaseCollectionCrawler):
    source_name = "American Mathematical Society"
    source_domain = "AMS"
    source_website = "https://www.ams.org/"

    def parse_collection_content(self, content):
        # The AMS collection page embeds its issue index in an inline
        # <script> tag; get_col_issues() turns that script into JSON.
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_data_tag = soup.select_one("script[language='JavaScript']:not([src])")
        issues = json.loads(self.get_col_issues(issues_data_tag.text))
        for i in issues:
            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issues[i]["url"]),
                    issues[i]["year"],
                    issues[i]["volume"],
                    issues[i].get("number", None),
                )
            )
        return xissues

    def get_col_issues(self, input: str):
        # Write the scraped script to a temporary file, run the bundled
        # puppeteer helper (ams_crawler_col.js) on it, and read back the
        # output it produces. Retry up to three times before giving up.
        filename = "/tmp/crawler/puppeteer/" + str(uuid4())
        filename_out = filename + "-out"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as file:
            file.write(input)

        content = None
        attempt = 0
        while not content and attempt < 3:
            attempt += 1
            try:
                cmd = f"{os.path.dirname(os.path.realpath(__file__))}/ams_crawler_col.js -f {filename} -o {filename_out}"
                print(cmd)
                execute_cmd(cmd)

                if os.path.isfile(filename_out):
                    with open(filename_out) as file_:
                        content = file_.read()

                os.remove(filename)
                os.remove(filename_out)

            except subprocess.CalledProcessError:
                pass
        if not content:
            raise ValueError("Couldn't parse collection content")
        return content

    def parse_issue_content(self, content, xissue):
        # Each article on an issue page is reached through an
        # "Abstract, references and article information" link.
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select(
            "dd > a:-soup-contains-own('Abstract, references and article information')"
        )
        for index, a in enumerate(articles):
            article_url = a.get("href")
            if not isinstance(article_url, str):  # coverage: condition never true in the measured run
                raise ValueError("Couldn't parse article url")
            xarticle = create_articledata()
            xarticle.url = urljoin(self.collection_url, article_url)
            xarticle.pid = "a" + str(index)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        soup = BeautifulSoup(content, "html.parser")
        # Standard metadata comes from the citation_* meta tags; the
        # bibliography is appended as an extra "abstract" entry.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["author", "page", "doi", "references", "pdf", "title"]
        )
        xarticle.abstracts.append(self.create_bibliography(xarticle.bibitems))

        # The abstract text lives in the parent of the <a id="Abstract">
        # anchor; drop the anchor itself before reading the text.
        abstract_subtag = soup.select_one("a#Abstract")
        if abstract_subtag:  # coverage: condition always true in the measured run
            abstract_tag = abstract_subtag.parent
            if abstract_tag:  # coverage: condition always true in the measured run
                abstract_subtag.decompose()
                xarticle.abstracts.append(
                    create_abstract(lang="en", tag="abstract", value_tex=abstract_tag.text)
                )

        return super().parse_article_content(content, xissue, xarticle, url, pid)
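
The snippet below is a minimal usage sketch and is not part of the measured file. It assumes that BaseCollectionCrawler (not shown here) supplies the constructor, collection_url, create_xissue and the page-download machinery; the constructor arguments and the *_html variables are hypothetical placeholders.

    # Hypothetical driver code; the real orchestration lives in BaseCollectionCrawler.
    crawler = AmsCrawler(collection_id="AMS-JOURNAL", collection_url="https://www.ams.org/journals/")

    # collection_html and issue_html stand for pages fetched by the base class.
    xissues = crawler.parse_collection_content(collection_html)
    for xissue in xissues:
        crawler.parse_issue_content(issue_html, xissue)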