Coverage for src/crawler/by_source/cambridge_crawler.py: 19% (60 statements)
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dateutil import parser
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str

class CambridgeCrawler(BaseCollectionCrawler):
    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    delimiter_disp_formula = "$$"
    def parse_collection_content(self, content):
        # Each entry on the journal's "all issues" page corresponds to one volume.
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        items = soup.select(".journal-all-issues .item")
        for item in items:
            href = item.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue")
            href = urljoin(self.collection_url, href)

            volume_tag = item.select_one(".issue")
            if not volume_tag:
                raise ValueError("Couldn't parse issue number")
            volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ")

            year_tag = item.select_one(".date")
            if not year_tag:
                raise ValueError("Couldn't parse issue year")
            year = parser.parse(year_tag.text).year

            xissue = self.create_xissue(
                href, volume_number=volume_number, year=str(year), issue_number=None
            )
            xissues.append(xissue)

        return xissues
    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("Issue must have an url")
        soup = BeautifulSoup(content, "html.parser")
        # Each ".part-link" node on the issue page points to one article.
        article_tags = soup.select(".journal-reader .part-link")

        for index_article, article_node in enumerate(article_tags):
            url = article_node.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't find article href")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = urljoin(xissue.url, url)

            xissue.articles.append(xarticle)
    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")
        # Article metadata is read from the page's citation_* <meta> tags.
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["references", "pdf", "author", "title", "doi"]
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))

        if xarticle.title_tex == "":
            xarticle.title_tex = cleanup_str(
                soup.select_one("#maincontent").select_one("hgroup").text
            )

        # The abstract is the text of the section headed "Abstract".
        abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_parent = abstract_header.parent
            abstract_header.decompose()
            abstract = create_abstract(
                lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text)
            )
            xarticle.abstracts.append(abstract)
        return xarticle
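
For reference, a minimal standalone sketch of the markup that parse_collection_content expects, using the same BeautifulSoup calls outside the crawler class. The HTML fragment is hypothetical and built only from the selectors used above (.journal-all-issues .item, .issue, .date, href); the real Cambridge University Press pages may differ, and the framework helpers (create_xissue, cleanup_str, collection_url) are left out.

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dateutil import parser

# Hypothetical fragment mirroring the selectors used in parse_collection_content.
sample_html = """
<div class="journal-all-issues">
  <a class="item" href="/core/journals/example/issue/abc123">
    <span class="issue">Volume 12</span>
    <span class="date">March 2024</span>
  </a>
</div>
"""

soup = BeautifulSoup(sample_html, "html.parser")
for item in soup.select(".journal-all-issues .item"):
    href = urljoin("https://www.cambridge.org/", item.get("href"))
    volume_number = item.select_one(".issue").text.strip().removeprefix("Volume ")
    year = parser.parse(item.select_one(".date").text).year
    print(href, volume_number, year)
# https://www.cambridge.org/core/journals/example/issue/abc123 12 2024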