Coverage for src/crawler/by_source/mathbas_crawler.py: 85%
88 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class MathbasCrawler(BaseCollectionCrawler):
    source_name = "Mathematica Balkanica website"
    source_domain = "MATHBAS"
    source_website = "http://www.math.bas.bg/infres/MathBalk/"

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
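    # Illustrative heading this regex is meant to match (sample text, not taken
    # verbatim from the site): "Vol. 23 (2009), Fasc. 1-2"
    # -> volume="23", year="2009", number="1-2"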

    def parse_collection_content(self, content):
        # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issues_tags = soup.select("#table4 td a")
        for tag in issues_tags:
            href = tag.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
                )
            text = cleanup_str(tag.text)
            if not text.startswith("Volume"):
                continue

            fake_issue = create_issuedata()
            fake_issue.url = self.source_website + href
            volume_content = self.download_file(self.source_website + href)
            xissues.extend(
                self.parse_mathbas_volume(
                    volume_content, fake_issue, skip_articles=True, only_pid=False
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
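        # The volume page lists every issue of the volume; parse it in only_pid mode so
        # that parse_mathbas_volume fills the requested xissue (matched by pid) in place.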
        target_issue = self.parse_mathbas_volume(
            content, xissue, skip_articles=False, only_pid=True
        )
        if not isinstance(target_issue, IssueData):
            raise ValueError("Couldn't filter issue by PID")

    def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
        """Parse a volume page into its issues (titles and numbers).

        skip_articles skips parsing the articles of each issue.
        only_pid tries to fill the input xissue (matched by pid) instead of creating a new one.
        """
        soup = BeautifulSoup(content, "html.parser")
        table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
        xissues: list[IssueData] = []
        if not table:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
            )
        current_issue = None
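        # Each direct child of the table cell is either a ".heading" block, which starts
        # a new volume/issue, or a "#table5" block, which lists the current issue's
        # articles; anything else is skipped.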
        for child in table.findChildren(recursive=False):
            if "heading" in (child.get("class", [])):
                text = cleanup_str(child.text)
                if not text.startswith("Vol"):
                    continue

                volume_dict = regex_to_dict(
                    self.volume_regex,
                    text,
                    error_msg=f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume",
                )
                if only_pid and current_issue and current_issue.pid == xissue.pid:
                    return current_issue

                if (
                    self.get_issue_pid(
                        self.collection_id,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                    == xissue.pid
                ):
                    current_issue = xissue
                else:
                    current_issue = self.create_xissue(
                        xissue.url,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                xissues.append(current_issue)

            elif child.get("id") == "table5" and not skip_articles:
                if not current_issue:
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
                    )
                self.parse_mathbas_issue(child, current_issue)
            else:
                continue

        if only_pid:
            return xissues[-1]
        return xissues

    def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
        lines = tag.select("tr")
        # Each <tr> describes one article: authors (.names), a title linking to the PDF
        # (.title) and the pages (.pages).
        for index, line in enumerate(lines):
            names = line.select_one(".names")
            if not names:
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
            title = line.select_one(".title")
            if not title:
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
                )
            pages = line.select_one(".pages")
            if not pages:
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
                )
            pdf_url = title.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")

            xarticle = create_articledata()
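            # Author names come as one comma-separated string, possibly ending with a
            # period (illustrative form: "A. Author, B. Author.").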
            authors = cleanup_str(names.text)
            if authors.endswith("."):
                authors = authors[:-1]

            if authors != "":
                authors = authors.split(", ")
                for a in authors:
                    xarticle.contributors.append(create_contributor(string_name=a, role="author"))

            xarticle.title_tex = cleanup_str(title.text)
            xarticle.fpage = cleanup_str(pages.text)
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(pdf_url=self.source_website + pdf_url, xarticle=xarticle)
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
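        # Override the default encoding: pages on this site are decoded as windows-1252
        # rather than UTF-8.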
        return super().decode_response(response, "windows-1252")