Coverage for src/crawler/by_source/emis_am_crawler.py: 11% (85 statements)

coverage.py v7.8.0, created at 2025-04-24 10:35 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):

    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

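    # Matches issue titles of the form "Annals of MathematicsVol. 7, No. 1, 2001"
    # (example values; note there is no space before "Vol." in the scraped markup).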

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )


    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        issue_links = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not issue_links:
            raise ValueError("Couldn't parse page")
        for a_tag in issue_links:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # Authors sit in a single <h3>, separated by commas and "and"
        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        author_names = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in author_names:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
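
A quick sanity check of issue_re against a title in the expected format (the volume, number, and year values here are made up for illustration):

    m = Emis_amCrawler.issue_re.search("Annals of MathematicsVol. 7, No. 1, 2001")
    assert m is not None
    assert m.groupdict() == {"volume": "7", "number": "1", "year": "2001"}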
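
The 11% figure above corresponds to only the module-level imports and definitions executing; none of the three parse_* methods is ever hit. Below is a minimal sketch of an offline test for parse_collection_content, assuming the object can be built without running __init__ (the BaseCollectionCrawler constructor signature is not shown here) and that collection_url is a plain instance attribute; the HTML fixture and collection URL are hypothetical:

    def test_parse_collection_content():
        html = """
        <html><body>
        <table><tr><td>masthead</td></tr></table>
        <table><tr><td><b>
          <a href="vol7_1.html">No. 1</a>
          <a href="vol7_2.html">No. 2</a>
        </b></td></tr></table>
        </body></html>
        """
        crawler = Emis_amCrawler.__new__(Emis_amCrawler)  # bypass __init__; its signature is unknown here
        crawler.collection_url = "https://www.emis.de/journals/AM/"  # hypothetical collection URL
        xissues = crawler.parse_collection_content(html)
        assert [x.pid for x in xissues] == ["vol7_1.html", "vol7_2.html"]
        assert xissues[0].url == "https://www.emis.de/journals/AM/vol7_1.html"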