Coverage for src/crawler/by_source/mathbas_crawler.py: 85%

88 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class MathbasCrawler(BaseCollectionCrawler):
    source_name = "Mathematica Balkanica website"
    source_domain = "MATHBAS"
    source_website = "http://www.math.bas.bg/infres/MathBalk/"

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
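    # Sketch (not part of the source): volume_regex is written to match volume headings
    # shaped like the hypothetical "Vol. 23 (2009), Fasc. 1-2"; a minimal check with the
    # standard re module would be:
    #
    #   import re
    #   m = re.match(MathbasCrawler.volume_regex, "Vol. 23 (2009), Fasc. 1-2")
    #   assert m is not None
    #   assert m.groupdict() == {"volume": "23", "year": "2009", "number": "1-2"}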

    def parse_collection_content(self, content):
        # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issues_tags = soup.select("#table4 td a")
        for tag in issues_tags:
            href = tag.get("href")
            if not isinstance(href, str):
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
                )
            text = cleanup_str(tag.text)
            if not text.startswith("Volume"):
                continue

            fake_issue = create_issuedata()
            fake_issue.url = self.source_website + href
            volume_content = self.download_file(self.source_website + href)
            xissues.extend(
                self.parse_mathbas_volume(
                    volume_content, fake_issue, skip_articles=True, only_pid=False
                )
            )

        return xissues
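    # Sketch (assumption, the real entry point lives in BaseCollectionCrawler): a typical
    # driver downloads the collection page and feeds the HTML to parse_collection_content,
    # which yields one IssueData per "Vol. ... Fasc. ..." heading found on the volume pages:
    #
    #   crawler = MathbasCrawler(...)  # constructor arguments depend on BaseCollectionCrawler
    #   collection_html = crawler.download_file(crawler.source_website)
    #   issues = crawler.parse_collection_content(collection_html)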

    def parse_issue_content(self, content, xissue):
        target_issue = self.parse_mathbas_volume(
            content, xissue, skip_articles=False, only_pid=True
        )
        if not isinstance(target_issue, IssueData):
            raise ValueError("Couldn't filter issue by PID")

    def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
        """Must handle parsing the issue titles/numbers, not issue contents/articles

        only_pid tries to fill the input issue based on pid instead of creating a new one.
        """
        soup = BeautifulSoup(content, "html.parser")
        table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
        xissues: list[IssueData] = []
        if not table:
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
            )
        current_issue = None
        for child in table.findChildren(recursive=False):
            if "heading" in (child.get("class", [])):
                text = cleanup_str(child.text)
                if not text.startswith("Vol"):
                    continue

                volume_dict = regex_to_dict(
                    self.volume_regex,
                    text,
                    error_msg=f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume",
                )
                if only_pid and current_issue and current_issue.pid == xissue.pid:
                    return current_issue

                if (
                    self.get_issue_pid(
                        self.collection_id,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                    == xissue.pid
                ):
                    current_issue = xissue
                else:
                    current_issue = self.create_xissue(
                        xissue.url,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                xissues.append(current_issue)

            elif child.get("id") == "table5" and not skip_articles:
                if not current_issue:
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
                    )
                self.parse_mathbas_issue(child, current_issue)
            else:
                continue

        if only_pid:
            return xissues[-1]
        return xissues
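    # Sketch (assumption, not in the source): the two calling modes used above, on an
    # already-downloaded volume page:
    #
    #   all_issues = crawler.parse_mathbas_volume(volume_html, fake_issue, skip_articles=True)
    #   # -> list of IssueData, articles skipped (collection pass)
    #   one_issue = crawler.parse_mathbas_volume(volume_html, xissue, only_pid=True)
    #   # -> the single IssueData whose pid matches xissue.pid, with articles parsed (issue pass)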

    def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
        lines = tag.select("tr")
        # Parse article
        for index, line in enumerate(lines):
            names = line.select_one(".names")
            if not names:
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
            title = line.select_one(".title")
            if not title:
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
                )
            pages = line.select_one(".pages")
            if not pages:
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
                )
            pdf_url = title.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")

            xarticle = create_articledata()
            authors = cleanup_str(names.text)
            if authors.endswith("."):
                authors = authors[:-1]

            if authors != "":
                authors = authors.split(", ")
                for a in authors:
                    xarticle.contributors.append(create_contributor(string_name=a, role="author"))

            xarticle.title_tex = cleanup_str(title.text)
            xarticle.fpage = cleanup_str(pages.text)
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(pdf_url=self.source_website + pdf_url, xarticle=xarticle)
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)
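    # Sketch (assumption): the selectors in parse_mathbas_issue above expect #table5 rows
    # roughly shaped like
    #
    #   <tr>
    #     <td class="names">A. Author, B. Author.</td>
    #     <td><a class="title" href="docs/vol-23/a1.pdf">On a hypothetical topic</a></td>
    #     <td class="pages">1-10</td>
    #   </tr>
    #
    # The href is relative, hence the self.source_website + pdf_url concatenation above.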

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        return super().decode_response(response, "windows-1252")
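
# Sketch (assumption): the MathBalk pages are served as windows-1252 rather than utf-8,
# so decoding them with the base class's utf-8 default would mangle accented characters,
# for example in author names; overriding decode_response forces the correct codec:
#
#   b"Ivanov \xe9".decode("windows-1252")       # -> "Ivanov é"
#   b"Ivanov \xe9".decode("utf-8", "replace")   # -> "Ivanov �"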