Coverage for src/crawler/by_source/emis_am_crawler.py: 12%

86 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup
from ptf.model_data import create_articledata, create_contributor, create_issuedata, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class Emis_amCrawler(BaseCollectionCrawler):
    source_name = "European Mathematical Information Service"
    source_domain = "EMIS_AM"
    source_website = "https://www.emis.de"

    # Issue titles, after cleanup_str, look like
    # "Annals of MathematicsVol. 4, No. 2, 1998" (no space before "Vol.").
    issue_re = regex.compile(
        r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
    )

    # Skip TLS certificate verification when fetching from this source.
    requests_verify = False
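
    # Crawl flow: parse_collection_content lists the issue pages,
    # parse_issue_content extracts issue metadata and per-article links, and
    # parse_article_content fills in each article's metadata and PDF link.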

    def parse_collection_content(self, content):
        xissues = []

        soup = BeautifulSoup(content, "html5lib")
        # Issue links sit in the first column of the second table and carry
        # "No. " in their own link text.
        issue_links = soup.select(
            "body table:nth-of-type(2) td:nth-of-type(1) b a[href]:-soup-contains-own('No. ')"
        )
        if not issue_links:
            raise ValueError("Couldn't parse page")
        for a_tag in issue_links:
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue link")
            xissue = create_issuedata()
            xissue.pid = href
            xissue.url = urljoin(self.collection_url, href)
            xissues.append(xissue)
        return xissues
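
    # Note on the selector above: ':-soup-contains-own("No. ")' is a soupsieve
    # pseudo-class that matches only when the element's own text (not a
    # descendant's) contains the string, so nested markup can't cause false hits.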


    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        if not xissue.url:
            raise ValueError("Issue URL must be set")

        # Parse issue year, volume and number
        title_tag = soup.select_one("table:nth-of-type(1) tr > td:nth-of-type(3) h2")
        if not title_tag:
            raise ValueError("Couldn't find issue title")
        title_str = cleanup_str(title_tag.text)
        issue_search = self.issue_re.search(title_str)
        if not issue_search:
            raise ValueError("Couldn't parse issue title")
        issue_data = issue_search.groupdict()
        xissue.volume = issue_data["volume"]
        xissue.number = issue_data["number"]
        xissue.year = issue_data["year"]
        xissue.pid = self.get_issue_pid(
            self.collection_id,
            issue_data["year"],
            issue_data["volume"],
            issue_data["number"],
        )

        # Parse issue article list
        article_tags = soup.select("table:nth-of-type(2) tr > td:nth-of-type(3) > p > a[href]")
        for index, a_tag in enumerate(article_tags):
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse article link")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, href)
            xissue.articles.append(xarticle)
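
    # Article pids are positional ("a0", "a1", ...), so an article's identifier
    # reflects its order in the issue's table of contents.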

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("Article must have a URL")

        soup = BeautifulSoup(content, "html5lib")
        article_content = soup.select_one("table:nth-of-type(2) tr > td:nth-of-type(3)")
        if not article_content:
            raise ValueError("Couldn't parse article")
        title_tag = article_content.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't find title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        # Authors are listed in an <h3>, separated by ", " and/or " and ".
        authors_tag = article_content.select_one("h3")
        if not authors_tag:
            raise ValueError("Couldn't find authors")
        author_names = cleanup_str(authors_tag.text).replace(" and ", ", ").split(", ")
        for author in author_names:
            xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        keyword_tag = article_content.select_one("p:-soup-contains('Keywords:')")
        if keyword_tag:
            keywords_str = cleanup_str(keyword_tag.text).removeprefix("Keywords: ")
            for kwd in keywords_str.split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        msc_tag = article_content.select_one("p:-soup-contains('Classification (MSC2000):')")
        if msc_tag:
            msc_str = cleanup_str(msc_tag.text).removeprefix("Classification (MSC2000): ")
            for code in msc_str.split(" "):
                xarticle.kwds.append(create_subj(value=code, type="msc"))

        pdf_tag = article_content.select_one("a:-soup-contains('PDF file')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if not isinstance(pdf_href, str):
                raise ValueError("Couldn't parse pdf href")
            add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))
        return xarticle
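
For reference, a minimal standalone sketch of the title-parsing step. The sample title string below is hypothetical, and the standard library's re stands in for the regex module, since the pattern uses no regex-specific features:

import re

# Same pattern as Emis_amCrawler.issue_re: note the missing space before
# "Vol.", matching how the cleaned <h2> text runs the words together.
issue_re = re.compile(
    r"Annals of MathematicsVol\. (?P<volume>\d+), No\. (?P<number>\d+), (?P<year>\d{4})"
)

# Hypothetical cleaned title, shaped like the issue pages this crawler targets.
title = "Annals of MathematicsVol. 4, No. 2, 1998"
match = issue_re.search(title)
assert match is not None
print(match.groupdict())  # {'volume': '4', 'number': '2', 'year': '1998'}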