Coverage for src/crawler/by_source/cambridge_crawler.py: 19%

60 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dateutil import parser
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import create_abstract, create_articledata

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import cleanup_str


class CambridgeCrawler(BaseCollectionCrawler):
    source_name = "Cambridge University Press"
    source_domain = "CAMBRIDGE"
    source_website = "https://www.cambridge.org/"

    delimiter_disp_formula = "$$"

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        items = soup.select(".journal-all-issues .item")
        for item in items:
            href = item.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue")
            href = urljoin(self.collection_url, href)

            volume_tag = item.select_one(".issue")
            if not volume_tag:
                raise ValueError("Couldn't parse issue number")
            volume_number = cleanup_str(volume_tag.text).removeprefix("Volume ")

            year_tag = item.select_one(".date")
            if not year_tag:
                raise ValueError("Couldn't parse issue year")
            year = parser.parse(year_tag.text).year

            xissue = self.create_xissue(
                href, volume_number=volume_number, year=str(year), issue_number=None
            )
            xissues.append(xissue)

        return xissues

    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("Issue must have an url")
        soup = BeautifulSoup(content, "html.parser")
        article_tag = soup.select(".journal-reader .part-link")

        for index_article, article_node in enumerate(article_tag):
            url = article_node.get("href")
            if not isinstance(url, str):
                raise ValueError("Couldn't find article href")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index_article)
            xarticle.url = urljoin(xissue.url, url)

            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """

        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html.parser")
        self.get_metadata_using_citation_meta(
            xarticle, xissue, soup, ["references", "pdf", "author", "title", "doi"]
        )
        if len(xarticle.bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(xarticle.bibitems))

        if xarticle.title_tex == "":
            xarticle.title_tex = cleanup_str(
                soup.select_one("#maincontent").select_one("hgroup").text
            )

        abstract_header = soup.select_one("h2:-soup-contains-own('Abstract')")
        if abstract_header:
            abstract_parent = abstract_header.parent
            abstract_header.decompose()
            abstract = create_abstract(
                lang="en", tag="abstract", value_tex=cleanup_str(abstract_parent.text)
            )
            xarticle.abstracts.append(abstract)
        return xarticle
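
A minimal usage sketch, assuming the crawler is driven by feeding it previously downloaded HTML: the no-argument constructor, the collection_url assignment, and the file names below are assumptions, not taken from this module; only the parse_collection_content and parse_issue_content calls correspond to methods shown above.

# Hypothetical driver for CambridgeCrawler; constructor arguments and the way
# HTML is obtained are assumptions, only the parse_* calls are from this file.
crawler = CambridgeCrawler()
crawler.collection_url = "https://www.cambridge.org/core/journals/<journal>"  # assumed attribute

with open("all_issues.html") as f:        # saved journal "all issues" page
    xissues = crawler.parse_collection_content(f.read())

for xissue in xissues:
    with open("issue.html") as f:         # saved table-of-contents page for one issue
        crawler.parse_issue_content(f.read(), xissue)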