Coverage for src/crawler/by_source/mathbas_crawler.py: 84%
92 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1import regex
2from bs4 import BeautifulSoup, Tag
3from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
4from requests import Response
6from crawler.base_crawler import BaseCollectionCrawler
7from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
10class MathbasCrawler(BaseCollectionCrawler):
11 source_name = "Mathematica Balkanica website"
12 source_domain = "MATHBAS"
13 source_website = "http://www.math.bas.bg/infres/MathBalk/"
15 volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
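As a quick illustration of what volume_regex captures, here is a minimal sketch; the sample heading string is invented but follows the Vol. <n> (<year>), Fasc. <n> shape the pattern targets:

import regex

volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
sample = "Vol. 25 (2011), Fasc. 1-2"  # hypothetical heading text
match = regex.search(volume_regex, sample)
if match:
    print(match.groupdict())  # {'volume': '25', 'year': '2011', 'number': '1-2'}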
17 def parse_collection_content(self, content):
18 # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
19 soup = BeautifulSoup(content, "html.parser")
20 xissues = []
22 issues_tags = soup.select("#table4 td a")
23 for tag in issues_tags:
24 href = tag.get("href")
25 if not isinstance(href, str):  # 25 ↛ 26: line 25 didn't jump to line 26 because the condition on line 25 was never true
26 raise ValueError(
27 f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
28 )
29 text = cleanup_str(tag.text)
30 if not text.startswith("Volume"):
31 continue
33 fake_issue = create_issuedata()
34 fake_issue.url = self.source_website + href
35 volume_content = self.download_file(self.source_website + href)
36 xissues.extend(
37 self.parse_mathbas_volume(
38 volume_content, fake_issue, skip_articles=True, only_pid=False
39 )
40 )
42 return xissues
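For context, a minimal sketch of how the "#table4 td a" selector and the "Volume" prefix check behave; the HTML fragment is a hypothetical reduction of the collection page, not the site's real markup:

from bs4 import BeautifulSoup

html = """
<table id="table4"><tr>
  <td><a href="vol25.html">Volume 25 (2011)</a></td>
  <td><a href="guide.html">Instructions for authors</a></td>
</tr></table>
"""
soup = BeautifulSoup(html, "html.parser")
for tag in soup.select("#table4 td a"):
    # Only links whose text starts with "Volume" are treated as volume pages
    if tag.text.strip().startswith("Volume"):
        print(tag.get("href"))  # vol25.html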
44 def parse_issue_content(self, content, xissue):
45 target_issue = self.parse_mathbas_volume(
46 content, xissue, skip_articles=False, only_pid=True
47 )
48 if not isinstance(target_issue, IssueData):  # 48 ↛ 49: line 48 didn't jump to line 49 because the condition on line 48 was never true
49 raise ValueError("Couldn't filter issue by PID")
51 def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
52 """Must handle parsing the issues titles/number, not issue contents/articles
54 only_pid tries to fill the input issue based on pid instead of creating a new one.
55 """
56 soup = BeautifulSoup(content, "html.parser")
57 table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
58 xissues: list[IssueData] = []
59 if not table:  # 59 ↛ 60: line 59 didn't jump to line 60 because the condition on line 59 was never true
60 raise ValueError(
61 f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
62 )
63 current_issue = None
64 for child in table.findChildren(recursive=False):
65 if "heading" in (child.get("class", [])):
66 text = cleanup_str(child.text)
67 if not text.startswith("Vol"):  # 67 ↛ 68: line 67 didn't jump to line 68 because the condition on line 67 was never true
68 continue
69 volume_re = regex.search(self.volume_regex, text)
70 if not volume_re:  # 70 ↛ 71: line 70 didn't jump to line 71 because the condition on line 70 was never true
71 raise ValueError(
72 f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume"
73 )
74 volume_dict = volume_re.groupdict()
76 if only_pid and current_issue and current_issue.pid == xissue.pid:  # 76 ↛ 77: line 76 didn't jump to line 77 because the condition on line 76 was never true
77 return current_issue
79 if (
80 self.get_issue_pid(
81 self.collection_id,
82 volume_dict["year"],
83 volume_dict["volume"],
84 volume_dict["number"],
85 )
86 == xissue.pid
87 ):
88 current_issue = xissue
89 else:
90 current_issue = self.create_xissue(
91 xissue.url,
92 volume_dict["year"],
93 volume_dict["volume"],
94 volume_dict["number"],
95 )
96 xissues.append(current_issue)
98 elif child.get("id") == "table5" and not skip_articles:
99 if not current_issue:  # 99 ↛ 100: line 99 didn't jump to line 100 because the condition on line 99 was never true
100 raise ValueError(
101 f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
102 )
103 self.parse_mathbas_issue(child, current_issue)
104 else:
105 continue
107 if only_pid:
108 return xissues[-1]
109 return xissues
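A minimal sketch of the volume-page structure this loop walks; the fragment is hypothetical and reduced to the selectors used above (a highlighted cell whose direct children are either a "heading" element or the articles table with id="table5"):

from bs4 import BeautifulSoup

html = """
<table id="table3"><tr><td bgcolor="#F9FCC5">
  <p class="heading">Vol. 25 (2011), Fasc. 1-2</p>
  <table id="table5"><tr><td>article rows go here</td></tr></table>
</td></tr></table>
"""
soup = BeautifulSoup(html, "html.parser")
cell = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
for child in cell.findChildren(recursive=False):
    print(child.get("class"), child.get("id"))
# ['heading'] None  -> treated as a new issue heading
# None table5       -> treated as that issue's list of articles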
111 def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
112 lines = tag.select("tr")
113 # Parse one article per table row
114 for index, line in enumerate(lines):
115 names = line.select_one(".names")
116 if not names:  # 116 ↛ 117: line 116 didn't jump to line 117 because the condition on line 116 was never true
117 raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
118 title = line.select_one(".title")
119 if not title:  # 119 ↛ 120: line 119 didn't jump to line 120 because the condition on line 119 was never true
120 raise ValueError(
121 f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
122 )
123 pages = line.select_one(".pages")
124 if not pages:  # 124 ↛ 125: line 124 didn't jump to line 125 because the condition on line 124 was never true
125 raise ValueError(
126 f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
127 )
128 pdf_url = title.get("href")
129 if not isinstance(pdf_url, str):  # 129 ↛ 130: line 129 didn't jump to line 130 because the condition on line 129 was never true
130 raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")
132 xarticle = create_articledata()
133 authors = cleanup_str(names.text)
134 if authors.endswith("."):
135 authors = authors[:-1]
137 if authors != "":
138 authors = authors.split(", ")
139 for a in authors:
140 xarticle.contributors.append(create_contributor(string_name=a, role="author"))
142 xarticle.title_tex = cleanup_str(title.text)
143 xarticle.fpage = cleanup_str(pages.text)
144 xarticle.url = xissue.url
145 add_pdf_link_to_xarticle(pdf_url=self.source_website + pdf_url, xarticle=xarticle)
146 xarticle.pid = f"a{index}"
147 xissue.articles.append(xarticle)
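The .names cell holds all authors as one comma-separated string; the snippet below sketches the same trailing-period strip and split used above (the names are invented):

authors = "A. Ivanov, B. Petrova."  # hypothetical content of a .names cell
if authors.endswith("."):
    authors = authors[:-1]
names = authors.split(", ") if authors != "" else []
print(names)  # ['A. Ivanov', 'B. Petrova']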
149 def decode_response(self, response: Response, encoding: str = "utf-8"):
150 return super().decode_response(response, "windows-1252")
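decode_response ignores the caller-supplied encoding and always decodes the response as windows-1252, presumably because the site serves windows-1252 content. A small sketch of why a UTF-8 decode would not work on such bytes (the byte string is an invented example):

raw = b"Vol. 25 \x96 Fasc. 1"  # hypothetical page bytes; 0x96 is an en dash in windows-1252
print(raw.decode("windows-1252"))  # Vol. 25 – Fasc. 1
try:
    raw.decode("utf-8")
except UnicodeDecodeError as err:
    print(err)  # 0x96 is not a valid UTF-8 byte sequence here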