Coverage for src/crawler/by_source/cambridge_crawler.py: 11%
117 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-08-29 13:43 +0000
1from urllib.parse import urljoin
3from bs4 import BeautifulSoup, Tag
4from dateutil import parser
5from ptf.cmds.xml.xml_utils import escape
6from ptf.model_data import create_abstract, create_articledata
8from crawler.base_crawler import BaseCollectionCrawler
9from crawler.cmds.mixed_citation import (
10 ExtLinkXml,
11 GenericRefElement,
12 MixedCitation,
13)
14from crawler.utils import cleanup_str
class CambridgeCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted by Cambridge University Press.

    Parses a collection's "all issues" listing, each issue's table of
    contents (following pagination), and individual article pages,
    populating the ptf model objects expected by BaseCollectionCrawler.
    """

    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    # Display formulas on Cambridge pages are delimited by $$ ... $$
    delimiter_disp_formula = "$$"

    def parse_collection_content(self, content):
        """Parse the collection page and return the list of issues found.

        Raises ValueError when an issue entry is missing its link, its
        volume label or its date.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for item in soup.select(".journal-all-issues .item"):
            href = item.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue")
            href = urljoin(self.collection_url, href)

            volume_tag = item.select_one(".issue")
            if not volume_tag:
                raise ValueError("Couldn't parse issue number")
            volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ")

            year_tag = item.select_one(".date")
            if not year_tag:
                raise ValueError("Couldn't parse issue year")
            year = parser.parse(year_tag.text).year

            xissue = self.create_xissue(
                href, volume_number=volume_number, year=str(year), issue_number=None
            )
            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        """Append the issue's articles to ``xissue``, following pagination.

        Recurses into the next page when a "Next »" pagination link is
        present. Raises ValueError if the issue has no url or an article
        link is malformed.
        """
        if not xissue.url:
            raise ValueError("Issue must have an url")
        soup = BeautifulSoup(content, "html.parser")

        for article_node in soup.select(".journal-reader .part-link"):
            url = article_node.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't find article href")
            xarticle = create_articledata()
            # Derive the pid from the number of articles already collected so
            # that pids stay unique across paginated issue pages: enumerating
            # each page separately would restart at 0 on the recursive call
            # below and produce duplicate pids within the same issue.
            xarticle.pid = "a" + str(len(xissue.articles))
            xarticle.url = urljoin(xissue.url, url)

            xissue.articles.append(xarticle)

        has_pagination = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
        if has_pagination:
            pagination_link = has_pagination.get("href")
            if isinstance(pagination_link, str):
                page_url = urljoin(xissue.url, pagination_link)
                content = self.download_file(page_url)

                self.parse_issue_content(content, xissue)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with Beautifulsoup and returns an ArticleData
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["pdf", "author", "doi"])

        # Guard both lookups: fail with a clear ValueError (consistent with
        # the rest of this class) instead of an AttributeError when the
        # expected title markup is missing.
        main_content = soup.select_one("#maincontent")
        title_group = main_content.select_one("hgroup") if main_content else None
        if not title_group:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_group.text)

        abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')")
        if abstract_header:
            # Keep a handle on the container, then drop the "Abstract"
            # heading so only the abstract body text remains.
            abstract_parent = abstract_header.parent
            abstract_header.decompose()

            no_content = abstract_parent.select_one(".no-content")
            if no_content:
                no_content.decompose()

            xarticle.abstracts.append(
                create_abstract(
                    lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text)
                )
            )
        references_list = soup.select_one("#references-list")
        if references_list:
            xarticle.bibitems = self.parse_cambridge_references(references_list)
        return xarticle

    def parse_cambridge_references(self, soup: Tag):
        """Convert the references list element into a list of JATS refs."""
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()
            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))
            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all string-name elements into one person-group object.
            persongroup_builder = GenericRefElement()
            persongroup_builder.name = "person-group"
            # Indexes of the string-name elements within the citation.
            name_indexes = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if name_indexes:
                # The span runs from the first to the last string-name
                # inclusive, so any elements sitting between two names are
                # folded into the person-group as well (matches previous
                # behaviour).
                first, last = name_indexes[0], name_indexes[-1] + 1
                persongroup_builder.elements = citation_builder.elements[first:last]
                del citation_builder.elements[first:last]
                citation_builder.elements.insert(first, persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems

    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        """Recursively translate the children of ``current_tag`` into
        mixed-citation elements appended to ``current_builder``."""
        for element in current_tag.children:
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if isinstance(element, Tag):
                # bs4 returns multi-valued attributes (class) as a list:
                # keep the first class only.
                tag_class = element.get("class")
                if isinstance(tag_class, list):
                    tag_class = tag_class[0] if tag_class else None

                # NOTE(review): tags without a class attribute are skipped
                # entirely here, including their text — confirm intentional.
                if not tag_class:
                    continue
                if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):
                    continue
                if element.name == "a":
                    href = element.get("href")
                    if isinstance(href, str):
                        current_builder.elements.append(
                            ExtLinkXml(escape(href), escape(element.text))
                        )
                        continue

                if tag_class in [
                    "surname",
                    "given-names",
                    "string-name",
                    "person-group",
                    "publisher-name",
                    "source",
                    "volume",
                    "year",
                    "fpage",
                    "lpage",
                    "article-title",
                    "issue",
                    "chapter-title",
                    "inline-formula",
                    "collab",
                    "alternatives",
                    "italic",
                    "publisher-loc",
                    "roman",
                    "edition",
                    "suffix",
                ]:
                    refnode_builder = GenericRefElement()
                    refnode_builder.name = tag_class
                    current_builder.elements.append(refnode_builder)
                    self.parse_cambridge_ref_nodes(element, refnode_builder)
                    continue

                self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")
                current_builder.elements.append(escape(element.text))