Coverage for src/crawler/by_source/ami_crawler.py: 84%

77 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class AmiCrawler(BaseCollectionCrawler):
    source_name = "Annales Mathematicae et Informaticae website"
    source_domain = "AMI"
    source_website = "https://ami.uni-eszterhazy.hu/"

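    # Patterns for the volume label ("Vol. <n> (<year>)") and the article
    # page range ("Pages: <first>–<last>", with an en dash) parsed below.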
    issue_re = r"Vol\. \d+ \((?P<year>\d+)\)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"

    def parse_collection_content(self, content):
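        """List the journal's volumes from the collection page.

        Each <option> of the volume selector yields one xissue whose URL is
        the collection URL plus a "?vol=<n>" query string.
        """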
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#realtart select[name='vol'] option")
        for issue in issues:
            vol_number = issue.get("value")
            if not isinstance(vol_number, str) or not vol_number.isdigit():
                continue
            year_re = regex.search(self.issue_re, issue.text)
            if not year_re:  # coverage: branch never taken
                raise ValueError("Couldn't parse volume year")
            issue_dict = year_re.groupdict()
            xissues.append(
                self.create_xissue(
                    self.collection_url + "?vol=" + vol_number,
                    issue_dict["year"],
                    vol_number,
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
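        """Parse one issue page and attach its articles to xissue.

        Every <p class="cikk"> paragraph ("cikk" is Hungarian for
        "article") describes one article of the issue.
        """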
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("#realtart p.cikk")
        for index, article_tag in enumerate(articles):
            xissue.articles.append(self.parse_ami_article(article_tag, xissue, index))

    def parse_ami_article(self, article_tag: Tag, xissue: IssueData, index: int):
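        """Extract one article from its <p class="cikk"> paragraph.

        Pulls the title and PDF link, an optional DOI, the page range and
        the author list, and returns the populated article data.
        """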
        if not xissue.pid:  # coverage: branch never taken
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
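        # Article pids are derived from the issue pid plus the article's
        # position on the page, e.g. "<issue-pid>_a0" for the first article.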
        xarticle.pid = xissue.pid + "_a" + str(index)

        ext_link = create_extlink(rel="source", location=xissue.url, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

        # Title
        title_tag = article_tag.select_one("a[href^='./uploads']")
        if not title_tag:  # coverage: branch never taken
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = title_tag.text

        # PDF
        pdf_url = title_tag.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken
            raise ValueError("Couldn't parse article href")
        pdf_url = urljoin(self.source_website, pdf_url)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

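        # Remove the title link so that only the remaining metadata (DOI,
        # page range, author text) is left in the paragraph.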
        title_tag.decompose()
        # DOI
        doi_tag = article_tag.select_one("a[href^='https://doi.org']")
        if doi_tag:
            xarticle.doi = doi_tag.text
            doi_tag.decompose()

        # Pages
        pages_tag = article_tag.select_one("font.oldal")
        if not pages_tag:  # coverage: branch never taken
            raise ValueError("Couldn't find pages")
        pages_search = regex.search(self.pages_re, pages_tag.text)
        if not pages_search:  # coverage: branch never taken
            raise ValueError("Couldn't parse pages")
        pages_group = pages_search.groupdict()
        xarticle.fpage = pages_group["fpage"]
        xarticle.lpage = pages_group["lpage"]

        # Authors
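        # The author names form a bare text node of the paragraph, written
        # as "by <name>, <name>, ...".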
        authors = None
        for child in article_tag.children:  # coverage: loop always exits via break
            if not isinstance(child, str):
                continue
            if child.startswith("by"):
                authors = child.removeprefix("by ")
                break
        if not authors:  # coverage: branch never taken
            raise ValueError("Couldn't find authors")

        authors = authors.split(", ")
        for a in authors:
            xarticle.contributors.append(create_contributor(string_name=a, role="author"))

        return xarticle
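
The statements still uncovered are largely the defensive raise branches
marked "coverage: ... never taken" above. As a minimal sketch, a test along
the following lines would exercise one of them, assuming pytest is used and
that AmiCrawler can be instantiated without constructor arguments (both are
assumptions, not confirmed by this report):

import pytest

def test_parse_collection_content_rejects_bad_volume_label():
    crawler = AmiCrawler()  # assumption: no required constructor arguments
    # An <option> whose text does not match issue_re should hit the
    # "Couldn't parse volume year" branch.
    html = (
        "<div id='realtart'><select name='vol'>"
        "<option value='1'>not a volume label</option>"
        "</select></div>"
    )
    with pytest.raises(ValueError, match="Couldn't parse volume year"):
        crawler.parse_collection_content(html)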