Coverage for src/crawler/by_source/ami_crawler.py: 84%
77 statements
coverage.py v7.6.4, created at 2025-02-14 14:36 +0000
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_extlink

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle

class AmiCrawler(BaseCollectionCrawler):
    source_name = "Annales Mathematicae et Informaticae website"
    source_domain = "AMI"
    source_website = "https://ami.uni-eszterhazy.hu/"

    issue_re = r"Vol. \d+ \((?P<year>\d+)\)"
    pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"
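
    # Collection page: the volume <select> lists one <option> per issue;
    # numeric option values are volume numbers and the option text carries the year.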
    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("#realtart select[name='vol'] option")
        for issue in issues:
            vol_number = issue.get("value")
            if not isinstance(vol_number, str) or not vol_number.isdigit():
                continue
            year_re = regex.search(self.issue_re, issue.text)
            if not year_re:  # coverage: branch never taken in tests
                raise ValueError("Couldn't parse volume year")
            issue_dict = year_re.groupdict()
            xissues.append(
                self.create_xissue(
                    self.collection_url + "?vol=" + vol_number,
                    issue_dict["year"],
                    vol_number,
                    None,
                )
            )
        return xissues
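
    # Issue page: each <p class="cikk"> ("cikk" is Hungarian for "article")
    # is one article entry.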
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("#realtart p.cikk")
        for index, article_tag in enumerate(articles):
            xissue.articles.append(self.parse_ami_article(article_tag, xissue, index))
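
    # Article entry: the title link doubles as the PDF href; a DOI link is
    # optional; the page range sits in <font class="oldal"> ("oldal" is
    # Hungarian for "page"); authors appear in a bare "by ..." text node.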
    def parse_ami_article(self, article_tag: Tag, xissue: IssueData, index: int):
        if not xissue.pid:  # coverage: branch never taken in tests
            raise ValueError("You must set xissue.pid before parsing an article")
        if not xissue.url:  # coverage: branch never taken in tests
            raise ValueError("You must set xissue.url before parsing an article")

        xarticle = create_articledata()
        xarticle.lang = "en"
        xarticle.pid = xissue.pid + "_a" + str(index)

        ext_link = create_extlink(rel="source", location=xissue.url, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

        # Title
        title_tag = article_tag.select_one("a[href^='./uploads']")
        if not title_tag:  # coverage: branch never taken in tests
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = title_tag.text

        # PDF
        pdf_url = title_tag.get("href")
        if not isinstance(pdf_url, str):  # coverage: branch never taken in tests
            raise ValueError("Couldn't parse article href")
        pdf_url = urljoin(self.source_website, pdf_url)
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        title_tag.decompose()
        # DOI
        doi_tag = article_tag.select_one("a[href^='https://doi.org']")
        if doi_tag:
            xarticle.doi = doi_tag.text
            doi_tag.decompose()

        # Pages
        pages_tag = article_tag.select_one("font.oldal")
        if not pages_tag:  # coverage: branch never taken in tests
            raise ValueError("Couldn't find pages")
        pages_search = regex.search(self.pages_re, pages_tag.text)
        if not pages_search:  # coverage: branch never taken in tests
            raise ValueError("Couldn't parse pages")
        pages_group = pages_search.groupdict()
        xarticle.fpage = pages_group["fpage"]
        xarticle.lpage = pages_group["lpage"]

        # Authors
        authors = None
        for child in article_tag.children:  # coverage: loop never ran to completion in tests
            if not isinstance(child, str):
                continue
            if child.startswith("by"):
                authors = child.removeprefix("by ")
                break
        if not authors:  # coverage: branch never taken in tests
            raise ValueError("Couldn't find authors")

        authors = authors.split(", ")
        for a in authors:
            xarticle.contributors.append(create_contributor(string_name=a, role="author"))

        return xarticle
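
The two class regexes carry the metadata extraction: issue_re pulls the year out of a volume label, and pages_re splits a page range on an en dash (–), not an ASCII hyphen. A minimal sketch of both, assuming sample strings shaped like the patterns (the labels below are illustrative, not captured site output):

import regex

issue_re = r"Vol. \d+ \((?P<year>\d+)\)"
pages_re = r"Pages: (?P<fpage>\d+)–(?P<lpage>\d+)"  # en dash, not "-"

# Hypothetical volume label in the shape the crawler expects
match = regex.search(issue_re, "Vol. 52 (2020)")
assert match and match.groupdict() == {"year": "2020"}

# Hypothetical page line; "Pages: 3-19" with an ASCII hyphen would not match
match = regex.search(pages_re, "Pages: 3–19")
assert match and match.groupdict() == {"fpage": "3", "lpage": "19"}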