Coverage for src/crawler/by_source/ami_crawler.py: 87%

70 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


class AmiCrawler(BaseCollectionCrawler):
    source_name = "Annales Mathematica et Informaticae website"
    source_domain = "AMI"
    source_website = "https://ami.uni-eszterhazy.hu/"

    issue_re = r"Vol. \d+ \((?P<year>\d+)\)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
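    # Illustrative only (hypothetical samples, not copied from the site): issue_re is
    # meant to match volume labels such as "Vol. 52 (2020)", and pages_re page lines
    # such as "Pages: 3–17" (note the en dash between page numbers).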

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#realtart select[name='vol'] option")
        for issue in issues:
            vol_number = issue.get("value")
            if not isinstance(vol_number, str) or not vol_number.isdigit():
                continue
            issue_dict = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse volume year"
            )
            xissues.append(
                self.create_xissue(
                    self.collection_url + "?vol=" + vol_number,
                    issue_dict["year"],
                    vol_number,
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("#realtart p.cikk")
        for index, article_tag in enumerate(articles):
            xissue.articles.append(self.parse_ami_article(article_tag, xissue, index))

    def parse_ami_article(self, article_tag: Tag, xissue: IssueData, index: int):
        if not xissue.pid:  # coverage: branch never taken
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)

        ext_link = create_extlink(rel="source", location=xissue.url, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

        # Title
        title_tag = article_tag.select_one("a[href^='./uploads']")
        if not title_tag:  # coverage: branch never taken
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = title_tag.text

        # PDF
        pdf_url = title_tag.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken
            raise ValueError("Couldn't parse article href")
        pdf_url = urljoin(self.source_website, pdf_url)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        # Remove the title link so the remaining children can be scanned for author text
        title_tag.decompose()
        # DOI
        doi_tag = article_tag.select_one("a[href^='https://doi.org']")
        if doi_tag:
            xarticle.doi = doi_tag.text
            doi_tag.decompose()

        # Pages
        pages_tag = article_tag.select_one("font.oldal")
        if not pages_tag:  # coverage: branch never taken
            raise ValueError("Couldn't find pages")
        pages_group = regex_to_dict(
            self.pages_re, pages_tag.text, error_msg="Couldn't parse pages"
        )
        xarticle.fpage = pages_group["fpage"]
        xarticle.lpage = pages_group["lpage"]

        # Authors: the names live in a bare text node that starts with "by "
        authors = None
        for child in article_tag.children:  # coverage: loop never ran to completion
            if not isinstance(child, str):
                continue
            if child.startswith("by"):
                authors = child.removeprefix("by ")
                break
        if not authors:  # coverage: branch never taken
            raise ValueError("Couldn't find authors")

        authors = authors.split(", ")
        for a in authors:
            xarticle.contributors.append(create_contributor(string_name=a, role="author"))

        return xarticle
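
For orientation, the sketch below mimics the article markup this parser targets and shows how the title/PDF link and the page range would be picked out with plain bs4 and re. The HTML snippet, title, author names, DOI, and file path are hypothetical placeholders; the crawler package helpers (create_xissue, regex_to_dict, add_pdf_link_to_xarticle) are not reproduced here.

from bs4 import BeautifulSoup
import re

# Hypothetical sample of a "p.cikk" paragraph, shaped the way parse_ami_article
# expects: a title link into ./uploads (also the PDF href), an optional doi.org
# link, a <font class="oldal"> with the page range, and a text node starting
# with "by " that lists the authors.
html = (
    '<p class="cikk">'
    '<a href="./uploads/papers/ami_example.pdf">An example title</a>'
    "by Jane Doe, John Smith "
    '<a href="https://doi.org/10.0000/example">10.0000/example</a>'
    '<font class="oldal">Pages: 3–17</font>'
    "</p>"
)
article_tag = BeautifulSoup(html, "html.parser").select_one("p.cikk")

title_tag = article_tag.select_one("a[href^='./uploads']")
print(title_tag.text)         # An example title
print(title_tag.get("href"))  # ./uploads/papers/ami_example.pdf

pages = article_tag.select_one("font.oldal").text
match = re.search(r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)", pages)
print(match.group("fpage"), match.group("lpage"))  # 3 17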