Coverage for src/crawler/by_source/mathbas_crawler.py: 84%

92 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1  import regex
  2  from bs4 import BeautifulSoup, Tag
  3  from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
  4  from requests import Response
  5
  6  from crawler.base_crawler import BaseCollectionCrawler
  7  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
  8
  9
 10  class MathbasCrawler(BaseCollectionCrawler):
 11      source_name = "Mathematica Balkanica website"
 12      source_domain = "MATHBAS"
 13      source_website = "http://www.math.bas.bg/infres/MathBalk/"
 14
 15      volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
 16
 17      def parse_collection_content(self, content):
 18          # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
 19          soup = BeautifulSoup(content, "html.parser")
 20          xissues = []
 21
 22          issues_tags = soup.select("#table4 td a")
 23          for tag in issues_tags:
 24              href = tag.get("href")
 25              if not isinstance(href, str):  [25 ↛ 26: condition was never true]
 26                  raise ValueError(
 27                      f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
 28                  )
 29              text = cleanup_str(tag.text)
 30              if not text.startswith("Volume"):
 31                  continue
 32
 33              fake_issue = create_issuedata()
 34              fake_issue.url = self.source_website + href
 35              volume_content = self.download_file(self.source_website + href)
 36              xissues.extend(
 37                  self.parse_mathbas_volume(
 38                      volume_content, fake_issue, skip_articles=True, only_pid=False
 39                  )
 40              )
 41
 42          return xissues
 43
 44      def parse_issue_content(self, content, xissue):
 45          target_issue = self.parse_mathbas_volume(
 46              content, xissue, skip_articles=False, only_pid=True
 47          )
 48          if not isinstance(target_issue, IssueData):  [48 ↛ 49: condition was never true]
 49              raise ValueError("Couldn't filter issue by PID")
 50
 51      def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
 52          """Must handle parsing the issue titles/numbers, not issue contents/articles
 53
 54          only_pid tries to fill the input issue based on pid instead of creating a new one.
 55          """
 56          soup = BeautifulSoup(content, "html.parser")
 57          table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
 58          xissues: list[IssueData] = []
 59          if not table:  [59 ↛ 60: condition was never true]
 60              raise ValueError(
 61                  f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
 62              )
 63          current_issue = None
 64          for child in table.findChildren(recursive=False):
 65              if "heading" in (child.get("class", [])):
 66                  text = cleanup_str(child.text)
 67                  if not text.startswith("Vol"):  [67 ↛ 68: condition was never true]
 68                      continue
 69                  volume_re = regex.search(self.volume_regex, text)
 70                  if not volume_re:  [70 ↛ 71: condition was never true]
 71                      raise ValueError(
 72                          f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume"
 73                      )
 74                  volume_dict = volume_re.groupdict()
 75
 76                  if only_pid and current_issue and current_issue.pid == xissue.pid:  [76 ↛ 77: condition was never true]
 77                      return current_issue
 78
 79                  if (
 80                      self.get_issue_pid(
 81                          self.collection_id,
 82                          volume_dict["year"],
 83                          volume_dict["volume"],
 84                          volume_dict["number"],
 85                      )
 86                      == xissue.pid
 87                  ):
 88                      current_issue = xissue
 89                  else:
 90                      current_issue = self.create_xissue(
 91                          xissue.url,
 92                          volume_dict["year"],
 93                          volume_dict["volume"],
 94                          volume_dict["number"],
 95                      )
 96                  xissues.append(current_issue)
 97
 98              elif child.get("id") == "table5" and not skip_articles:
 99                  if not current_issue:  [99 ↛ 100: condition was never true]
100                      raise ValueError(
101                          f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
102                      )
103                  self.parse_mathbas_issue(child, current_issue)
104              else:
105                  continue
106
107          if only_pid:
108              return xissues[-1]
109          return xissues
110
111      def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
112          lines = tag.select("tr")
113          # Parse article
114          for index, line in enumerate(lines):
115              names = line.select_one(".names")
116              if not names:  [116 ↛ 117: condition was never true]
117                  raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
118              title = line.select_one(".title")
119              if not title:  [119 ↛ 120: condition was never true]
120                  raise ValueError(
121                      f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
122                  )
123              pages = line.select_one(".pages")
124              if not pages:  [124 ↛ 125: condition was never true]
125                  raise ValueError(
126                      f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
127                  )
128              pdf_url = title.get("href")
129              if not isinstance(pdf_url, str):  [129 ↛ 130: condition was never true]
130                  raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")
131
132              xarticle = create_articledata()
133              authors = cleanup_str(names.text)
134              if authors.endswith("."):
135                  authors = authors[:-1]
136
137              if authors != "":
138                  authors = authors.split(", ")
139                  for a in authors:
140                      xarticle.contributors.append(create_contributor(string_name=a, role="author"))
141
142              xarticle.title_tex = cleanup_str(title.text)
143              xarticle.fpage = cleanup_str(pages.text)
144              xarticle.url = xissue.url
145              add_pdf_link_to_xarticle(pdf_url=self.source_website + pdf_url, xarticle=xarticle)
146              xarticle.pid = f"a{index}"
147              xissue.articles.append(xarticle)
148
149      def decode_response(self, response: Response, encoding: str = "utf-8"):
150          return super().decode_response(response, "windows-1252")
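For reference, a minimal, self-contained sketch of what the volume_regex defined on line 15 extracts from an issue heading of the expected shape. The heading string below is a made-up example (not taken from the MathBalk site); the third-party regex module is used the same way the crawler uses it, and the standard-library re module would behave identically for this pattern.

    import regex

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"

    # Hypothetical heading text in the shape parse_mathbas_volume looks for.
    heading = "Vol. 25 (2011), Fasc. 1-2"

    match = regex.search(volume_regex, heading)
    if match:
        print(match.groupdict())
        # -> {'volume': '25', 'year': '2011', 'number': '1-2'}

The named groups feed get_issue_pid and create_xissue, so a heading that deviates from this shape triggers the "Couldn't parse volume" error raised at line 71.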