Coverage for src/crawler/by_source/emis_am_crawler.py: 14%

90 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.crawler_utils import get_issue_pid
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No. (?P<number>\d+), (?P<year>\d{4})"
    )

    requests_verify = False
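    # Added commentary: issue_re above expects the cleaned-up issue <h2> text,
    # e.g. "Annals of MathematicsVol. 9, No. 2, 2003" (illustrative string, not
    # taken from a real page); the missing space is presumably what cleanup_str
    # produces when collapsing the heading. requests_verify = False is presumably
    # forwarded by BaseCollectionCrawler to requests' verify= argument, i.e. TLS
    # certificate verification is skipped for this source (assumption).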

    @classmethod
    def get_view_id(cls):
        return "EMIS"

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        b_tag = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not b_tag:
            raise ValueError("Couldn't parse page")
        for a_tag in b_tag:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            # The href serves as a temporary pid; parse_issue_content later
            # replaces it with the canonical pid from get_issue_pid.
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        authors_str = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in authors_str:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
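A minimal sketch of driving this crawler by hand, assuming BaseCollectionCrawler accepts collection_id/collection_url in its constructor and exposes a download_file helper; neither is shown in this file, and the collection URL below is a placeholder, so all three are assumptions:

    if __name__ == "__main__":
        # Hypothetical manual run; constructor arguments, download_file and the
        # collection URL are assumptions about BaseCollectionCrawler, not facts
        # taken from this file.
        crawler = Emis_amCrawler(
            collection_id="AM",
            collection_url="https://www.emis.de/journals/AM/",
        )
        html = crawler.download_file(crawler.collection_url)
        issues = crawler.parse_collection_content(html)
        print(f"Found {len(issues)} issues")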