Coverage for src/crawler/by_source/ami_crawler.py: 87%
70 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict

class AmiCrawler(BaseCollectionCrawler):
    source_name = "Annales Mathematicae et Informaticae website"
    source_domain = "AMI"
    source_website = "https://ami.uni-eszterhazy.hu/"

    issue_re = r"Vol. \d+ \((?P<year>\d+)\)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
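
    # Collection page: each <option> of the volume <select> is one issue; the option
    # value is the volume number and the option label carries the year.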
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#realtart select[name='vol'] option")
        for issue in issues:
            vol_number = issue.get("value")
            if not isinstance(vol_number, str) or not vol_number.isdigit():
                continue
            issue_dict = regex_to_dict(
                self.issue_re, issue.text, error_msg="Couldn't parse volume year"
            )
            xissues.append(
                self.create_xissue(
                    self.collection_url + "?vol=" + vol_number,
                    issue_dict["year"],
                    vol_number,
                    None,
                )
            )
        return xissues
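
    # Issue page: each <p class="cikk"> ("cikk" is Hungarian for "article") is one article.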
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("#realtart p.cikk")
        for index, article_tag in enumerate(articles):
            xissue.articles.append(self.parse_ami_article(article_tag, xissue, index))
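
    # Extract the metadata of a single article block: title and PDF link, optional DOI,
    # page range, and the trailing "by ..." author line.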
    def parse_ami_article(self, article_tag: Tag, xissue: IssueData, index: int):
        if not xissue.pid:  # coverage: branch never taken
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)

        ext_link = create_extlink(rel="source", location=xissue.url, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

        # Title
        title_tag = article_tag.select_one("a[href^='./uploads']")
        if not title_tag:  # coverage: branch never taken
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = title_tag.text

        # PDF
        pdf_url = title_tag.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken
            raise ValueError("Couldn't parse article href")
        pdf_url = urljoin(self.source_website, pdf_url)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        title_tag.decompose()
        # DOI
        doi_tag = article_tag.select_one("a[href^='https://doi.org']")
        if doi_tag:
            xarticle.doi = doi_tag.text
            doi_tag.decompose()

        # Pages
        pages_tag = article_tag.select_one("font.oldal")
        if not pages_tag:  # coverage: branch never taken
            raise ValueError("Couldn't find pages")
        pages_group = regex_to_dict(
            self.pages_re, pages_tag.text, error_msg="Couldn't parse pages"
        )
        xarticle.fpage = pages_group["fpage"]
        xarticle.lpage = pages_group["lpage"]

        # Authors
        authors = None
        for child in article_tag.children:  # coverage: loop always exits via break
            if not isinstance(child, str):
                continue
            if child.startswith("by"):
                authors = child.removeprefix("by ")
                break
        if not authors:  # coverage: branch never taken
            raise ValueError("Couldn't find authors")

        authors = authors.split(", ")
        for a in authors:
            xarticle.contributors.append(create_contributor(string_name=a, role="author"))

        return xarticle
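
The partially covered branches flagged above are all defensive ValueError paths that never fire against the test fixtures. As a quick illustration of the two class-level regexes, here is a minimal sketch, assuming the crawler package is importable; the sample strings "Vol. 58 (2023)" and "Pages: 1–15" are made-up examples in the format the site is assumed to use, not data taken from ami.uni-eszterhazy.hu:

import re

from crawler.by_source.ami_crawler import AmiCrawler

# Hypothetical sample strings in the assumed "Vol. N (year)" / "Pages: f–l" format.
issue_match = re.search(AmiCrawler.issue_re, "Vol. 58 (2023)")
assert issue_match is not None and issue_match.group("year") == "2023"

pages_match = re.search(AmiCrawler.pages_re, "Pages: 1–15")
assert pages_match is not None
assert (pages_match.group("fpage"), pages_match.group("lpage")) == ("1", "15")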