Coverage for src/crawler/by_source/ami_crawler.py: 84%

77 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class AmiCrawler(BaseCollectionCrawler):
    source_name = "Annales Mathematicae et Informaticae website"
    source_domain = "AMI"
    source_website = "https://ami.uni-eszterhazy.hu/"

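    # Patterns for the volume label ("Vol. <n> (<year>)") and the article
    # page range ("Pages: <first>–<last>", with an en dash) parsed below.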
    issue_re = r"Vol\. \d+ \((?P<year>\d+)\)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
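        """List the journal's volumes from the collection page.

        Each <option> of the volume selector yields one xissue whose URL is
        the collection URL plus a "?vol=<n>" query string.
        """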
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#realtart select[name='vol'] option")
        for issue in issues:
            vol_number = issue.get("value")
            if not isinstance(vol_number, str) or not vol_number.isdigit():
                continue
            year_re = regex.search(self.issue_re, issue.text)
            if not year_re:  # coverage: branch never taken
                raise ValueError("Couldn't parse volume year")
            issue_dict = year_re.groupdict()
            xissues.append(
                self.create_xissue(
                    self.collection_url + "?vol=" + vol_number,
                    issue_dict["year"],
                    vol_number,
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
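        """Parse one issue page and attach its articles to xissue.

        Every <p class="cikk"> paragraph ("cikk" is Hungarian for
        "article") describes one article of the issue.
        """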
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("#realtart p.cikk")
        for index, article_tag in enumerate(articles):
            xissue.articles.append(self.parse_ami_article(article_tag, xissue, index))

    def parse_ami_article(self, article_tag: Tag, xissue: IssueData, index: int):
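        """Extract one article from its <p class="cikk"> paragraph.

        Pulls the title and PDF link, an optional DOI, the page range and
        the author list, and returns the populated article data.
        """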
        if not xissue.pid:  # coverage: branch never taken
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
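        # Article pids are derived from the issue pid plus the article's
        # position on the page, e.g. "<issue-pid>_a0" for the first article.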
        xarticle.pid = xissue.pid + "_a" + str(index)

        ext_link = create_extlink(rel="source", location=xissue.url, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

        # Title
        title_tag = article_tag.select_one("a[href^='./uploads']")
        if not title_tag:  # coverage: branch never taken
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = title_tag.text

        # PDF
        pdf_url = title_tag.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken
            raise ValueError("Couldn't parse article href")
        pdf_url = urljoin(self.source_website, pdf_url)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

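        # Remove the title link so that only the remaining metadata (DOI,
        # page range, author text) is left in the paragraph.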
        title_tag.decompose()
        # DOI
        doi_tag = article_tag.select_one("a[href^='https://doi.org']")
        if doi_tag:
            xarticle.doi = doi_tag.text
            doi_tag.decompose()

        # Pages
        pages_tag = article_tag.select_one("font.oldal")
        if not pages_tag:  # coverage: branch never taken
            raise ValueError("Couldn't find pages")
        pages_search = regex.search(self.pages_re, pages_tag.text)
        if not pages_search:  # coverage: branch never taken
            raise ValueError("Couldn't parse pages")
        pages_group = pages_search.groupdict()
        xarticle.fpage = pages_group["fpage"]
        xarticle.lpage = pages_group["lpage"]

        # Authors
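        # The author names form a bare text node of the paragraph, written
        # as "by <name>, <name>, ...".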
        authors = None
        for child in article_tag.children:  # coverage: loop always exits via break
            if not isinstance(child, str):
                continue
            if child.startswith("by"):
                authors = child.removeprefix("by ")
                break
        if not authors:  # coverage: branch never taken
            raise ValueError("Couldn't find authors")

        authors = authors.split(", ")
        for a in authors:
            xarticle.contributors.append(create_contributor(string_name=a, role="author"))

        return xarticle
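
The statements still uncovered are largely the defensive raise branches
marked "coverage: ... never taken" above. As a minimal sketch, a test along
the following lines would exercise one of them, assuming pytest is used and
that AmiCrawler can be instantiated without constructor arguments (both are
assumptions, not confirmed by this report):

import pytest

def test_parse_collection_content_rejects_bad_volume_label():
    crawler = AmiCrawler()  # assumption: no required constructor arguments
    # An <option> whose text does not match issue_re should hit the
    # "Couldn't parse volume year" branch.
    html = (
        "<div id='realtart'><select name='vol'>"
        "<option value='1'>not a volume label</option>"
        "</select></div>"
    )
    with pytest.raises(ValueError, match="Couldn't parse volume year"):
        crawler.parse_collection_content(html)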