Coverage for src/crawler/by_source/mathbas_crawler.py: 84%
94 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathbasCrawler(BaseCollectionCrawler):
    source_name = "Mathematica Balkanica website"
    source_domain = "MATHBAS"
    source_website = "http://www.math.bas.bg/infres/MathBalk/"
    periode_begin = 0
    periode_end = 0

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
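    # Illustrative (assumed) example of a heading this pattern is meant to match,
    # not text taken from the site:
    #   "Vol. 25 (2011), Fasc. 1-2"  ->  volume="25", year="2011", number="1-2"
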
    def parse_collection_content(self, content):
        # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issues_tags = soup.select("#table4 td a")
        for tag in issues_tags:
            href = tag.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
                )
            text = cleanup_str(tag.text)
            if not text.startswith("Volume"):
                continue

            fake_issue = create_issuedata()
            fake_issue.url = self.source_website + href
            volume_content = self.download_file(self.source_website + href)
            xissues.extend(
                self.parse_mathbas_volume(
                    volume_content, fake_issue, skip_articles=True, only_pid=False
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        target_issue = self.parse_mathbas_volume(
            content, xissue, skip_articles=False, only_pid=True
        )
        if not isinstance(target_issue, IssueData):  # coverage: branch never taken
            raise ValueError("Couldn't filter issue by PID")

    def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
        """Parse the issue titles/numbers of a volume page, not the issue contents/articles.

        only_pid tries to fill the input issue based on its pid instead of creating a new one.
        """
        soup = BeautifulSoup(content, "html.parser")
        table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
        xissues: list[IssueData] = []
        if not table:  # coverage: branch never taken
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
            )
        current_issue = None
        for child in table.findChildren(recursive=False):
            if "heading" in (child.get("class", [])):
                text = cleanup_str(child.text)
                if not text.startswith("Vol"):  # coverage: branch never taken
                    continue
                volume_re = regex.search(self.volume_regex, text)
                if not volume_re:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume"
                    )
                volume_dict = volume_re.groupdict()

                if only_pid and current_issue and current_issue.pid == xissue.pid:  # coverage: branch never taken
                    return current_issue

                if (
                    self.get_issue_pid(
                        self.collection_id,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                    == xissue.pid
                ):
                    current_issue = xissue
                else:
                    current_issue = self.create_xissue(
                        xissue.url,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                xissues.append(current_issue)

            elif child.get("id") == "table5" and not skip_articles:
                if not current_issue:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
                    )
                self.parse_mathbas_issue(child, current_issue)
            else:
                continue

        if only_pid:
            return xissues[-1]
        return xissues

    def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
        lines = tag.select("tr")
        # Parse article
        for index, line in enumerate(lines):
            names = line.select_one(".names")
            if not names:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
            title = line.select_one(".title")
            if not title:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
                )
            pages = line.select_one(".pages")
            if not pages:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
                )
            pdf_url = title.get("href")
            if not isinstance(pdf_url, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")

            xarticle = create_articledata()
            authors = cleanup_str(names.text)
            if authors.endswith("."):
                authors = authors[:-1]

            if authors != "":
                authors = authors.split(", ")
                for a in authors:
                    xarticle.contributors.append(create_contributor(string_name=a, role="author"))

            xarticle.title_tex = cleanup_str(title.text)
            xarticle.fpage = cleanup_str(pages.text)
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(pdf_url=pdf_url, xarticle=xarticle)
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        # The MathBalk pages appear to be served as windows-1252, so the default encoding is overridden.
        return super().decode_response(response, "windows-1252")
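
For reference, a minimal standalone sketch of how the volume_regex pattern above extracts the volume, year and fascicle number from a heading. The heading string used here is an assumed example of the expected format, not text taken from the MathBalk site.

import regex

volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
sample_heading = "Vol. 25 (2011), Fasc. 1-2"  # assumed sample, for illustration only

match = regex.search(volume_regex, sample_heading)
if match:
    fields = match.groupdict()
    # fields -> {"volume": "25", "year": "2011", "number": "1-2"}
    print(fields["volume"], fields["year"], fields["number"])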