Coverage for src/crawler/by_source/cambridge_crawler.py: 10%
120 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
1from urllib.parse import urljoin
3from bs4 import BeautifulSoup, Tag
4from dateutil import parser
5from ptf.cmds.xml.xml_utils import escape
6from ptf.model_data import create_abstract, create_articledata
8from crawler.base_crawler import BaseCollectionCrawler
9from crawler.cmds.mixed_citation import (
10 ExtLinkXml,
11 GenericRefElement,
12 MixedCitation,
13)
14from crawler.utils import cleanup_str
class CambridgeCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on Cambridge University Press.

    Parses the journal's "all issues" listing, each issue's table of
    contents (following "Next »" pagination), individual article pages,
    and their reference lists (converted to JATS via MixedCitation).
    """

    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    # Display formulas on Cambridge pages are delimited with $$.
    delimiter_disp_formula = "$$"

    # CSS classes that map one-to-one onto JATS reference sub-elements.
    # Hoisted to a class constant so it is not rebuilt for every node
    # visited by parse_cambridge_ref_nodes.
    _REF_CLASSES = frozenset(
        [
            "surname",
            "given-names",
            "string-name",
            "person-group",
            "publisher-name",
            "source",
            "volume",
            "year",
            "fpage",
            "lpage",
            "article-title",
            "issue",
            "chapter-title",
            "inline-formula",
            "collab",
            "alternatives",
            "italic",
            "publisher-loc",
            "roman",
            "edition",
            "suffix",
        ]
    )

    def parse_collection_content(self, content):
        """Parse the "all issues" page and return a list of xissues.

        Raises:
            ValueError: when an issue entry is missing its link, its
                "Volume N" label, or its date.
        """
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        for item in soup.select(".journal-all-issues .item"):
            href = item.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue")
            href = urljoin(self.collection_url, href)

            volume_tag = item.select_one(".issue")
            if not volume_tag:
                raise ValueError("Couldn't parse issue number")
            volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ")

            year_tag = item.select_one(".date")
            if not year_tag:
                raise ValueError("Couldn't parse issue year")
            # dateutil handles the free-form date text; only the year is kept.
            year = parser.parse(year_tag.text).year

            xissues.append(
                self.create_xissue(
                    href, volume_number=volume_number, year=str(year), issue_number=None
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Collect article stubs from an issue page, following pagination.

        Appends one articledata (pid "a<index>", absolute URL) per
        ".part-link" entry to ``xissue.articles``, then recurses into the
        next page if a "Next »" pagination link is present.

        Raises:
            ValueError: when the issue has no URL or an article link has
                no href.
        """
        if not xissue.url:
            raise ValueError("Issue must have an url")
        soup = BeautifulSoup(content, "html.parser")
        for index_article, article_node in enumerate(soup.select(".journal-reader .part-link")):
            url = article_node.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't find article href")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = urljoin(xissue.url, url)
            xissue.articles.append(xarticle)

        # Follow the "Next »" link, if any, and parse the next page of the
        # same issue.  NOTE(review): on later pages the enumerate() above
        # restarts at 0, so pids repeat across pages — confirm whether
        # downstream deduplicates or whether pids should be offset.
        next_link = soup.select_one("ul.pagination a:-soup-contains-own('Next »')")
        if next_link:
            pagination_href = next_link.get("href")
            if isinstance(pagination_href, str):
                page_url = urljoin(xissue.url, pagination_href)
                content = self.download_file(page_url)
                self.parse_issue_content(content, xissue)

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page with BeautifulSoup and return the ArticleData.

        Fills language, citation metadata (pdf/author/doi), title, the
        English abstract (if present) and the bibliography.

        Raises:
            ValueError: when the title element cannot be located.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["pdf", "author", "doi"])
        title_tag = soup.select_one("#maincontent hgroup")
        if not title_tag:
            raise ValueError("Couldn't find title tag")
        xarticle.title_tex = cleanup_str(title_tag.text)

        abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_parent = abstract_header.parent
            abstract_header.decompose()
            # FIX: Tag.parent can be None; guard before dereferencing
            # instead of raising AttributeError on malformed pages.
            if abstract_parent is not None:
                # Drop the "no content" placeholder so it doesn't leak
                # into the abstract text.
                no_content = abstract_parent.select_one(".no-content")
                if no_content:
                    no_content.decompose()

                xarticle.abstracts.append(
                    create_abstract(
                        lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text)
                    )
                )
        references_list = soup.select_one("#references-list")
        if references_list:
            xarticle.bibitems = self.parse_cambridge_references(references_list)
        return xarticle

    def parse_cambridge_references(self, soup: Tag):
        """Convert the "#references-list" element into JATS bibitems.

        Each ".circle-list__item" becomes one MixedCitation; consecutive
        string-name elements are regrouped under a single person-group.
        """
        bibitems = []
        for item in soup.select(".circle-list__item"):
            citation_builder = MixedCitation()
            label_tag = item.select_one(".circle-list__item__number")
            if label_tag:
                citation_builder.label = escape(cleanup_str(label_tag.text))
            citation_content = item.select_one(".circle-list__item__grouped__content")
            if citation_content:
                self.parse_cambridge_ref_nodes(citation_content, citation_builder)

            # Group all string-name elements into one person-group object.
            persongroup_builder = GenericRefElement()
            persongroup_builder.name = "person-group"
            # Indices of the string-name elements within the citation.
            name_indices = [
                index
                for index, element in enumerate(citation_builder.elements)
                if isinstance(element, GenericRefElement) and element.name == "string-name"
            ]
            if name_indices:
                first, last = name_indices[0], name_indices[-1]
                # NOTE(review): everything between the first and last
                # string-name (separators included) is moved into the
                # person-group — same as the original slice semantics.
                persongroup_builder.elements = citation_builder.elements[first : last + 1]
                del citation_builder.elements[first : last + 1]
                citation_builder.elements.insert(first, persongroup_builder)

            bibitems.append(citation_builder.get_jats_ref())
        return bibitems

    def parse_cambridge_ref_nodes(
        self,
        current_tag: Tag,
        current_builder: GenericRefElement,
    ):
        """Recursively translate a reference's HTML children into builder elements.

        Plain text nodes are escaped and appended verbatim; tags whose CSS
        class is a known JATS element are recursed into as nested
        GenericRefElements; anchors become ExtLinkXml entries; anything
        else falls back to its escaped text, with a warning.
        """
        for element in current_tag.children:
            if isinstance(element, str):
                current_builder.elements.append(escape(element))
                continue
            if isinstance(element, Tag):
                tag_class = element.get("class")
                # bs4 returns the class attribute as a list; keep only the
                # first class, or None when the list is empty.
                if isinstance(tag_class, list):
                    tag_class = tag_class[0] if tag_class else None

                # NOTE(review): tags without a class are dropped entirely,
                # including class-less <a> links — confirm this is intended.
                if not tag_class:
                    continue
                # MathJax wrappers and lazy-loaded images carry no citation text.
                if tag_class in ("mathjax-tex-wrapper", "aop-lazy-load-image"):
                    continue
                if element.name == "a":
                    href = element.get("href")
                    if isinstance(href, str):
                        current_builder.elements.append(
                            ExtLinkXml(escape(href), escape(element.text))
                        )
                    continue

                if tag_class in self._REF_CLASSES:
                    refnode_builder = GenericRefElement()
                    refnode_builder.name = tag_class
                    current_builder.elements.append(refnode_builder)
                    self.parse_cambridge_ref_nodes(element, refnode_builder)
                    continue

                self.logger.warning(f"Couldn't insert tag into mixed citation : {tag_class}")
                current_builder.elements.append(escape(element.text))