Coverage for src/crawler/by_source/emis_am_crawler.py: 13%

89 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str



class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

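    # Matches issue titles of the form
    # "Annals of MathematicsVol. <volume>, No. <number>, <year>"; the missing
    # space before "Vol." appears to mirror how cleanup_str flattens the page
    # header (an inference from the pattern, not verified against the live page).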

    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )

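    # TLS verification is disabled for this source; the EMIS host presumably
    # serves a certificate that fails validation (an assumption, not confirmed
    # by this module).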

    requests_verify = False

    @classmethod
    def get_view_id(cls):
        return "EMIS"


    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        # Issue links sit in the first cell of the page's second table, as
        # anchors whose own text contains "No. ".
        b_tag = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not b_tag:
            raise ValueError("Couldn't parse page")
        for a_tag in b_tag:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)


    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # Author list reads like "A, B and C"; normalise " and " to a comma
        # before splitting.
        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        authors_str = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in authors_str:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for kwd in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=kwd, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
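
A minimal smoke-test sketch for the collection parser (not part of the covered module). It bypasses BaseCollectionCrawler's constructor, whose signature is not shown here; the module path and collection URL are assumptions:

import requests

from crawler.by_source.emis_am_crawler import Emis_amCrawler

# Skip the unknown __init__ (assumption: no required setup beyond the
# attributes used by parse_collection_content).
crawler = Emis_amCrawler.__new__(Emis_amCrawler)
crawler.collection_url = "https://www.emis.de/journals/AM/"  # hypothetical URL
# verify=False mirrors the class's requests_verify flag.
html = requests.get(crawler.collection_url, verify=False).text
for xissue in crawler.parse_collection_content(html):
    print(xissue.pid, xissue.url)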