Coverage for src/crawler/by_source/mathbas_crawler.py: 84%

94 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathbasCrawler(BaseCollectionCrawler):
    source_name = "Mathematica Balkanica website"
    source_domain = "MATHBAS"
    source_website = "http://www.math.bas.bg/infres/MathBalk/"
    periode_begin = 0
    periode_end = 0

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
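    # Illustrative example (values are hypothetical, not from the source): volume_regex
    # matches headings such as "Vol. 12 (1998), Fasc. 3-4", yielding the named groups
    # volume="12", year="1998", number="3-4".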

    def parse_collection_content(self, content):
        # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issues_tags = soup.select("#table4 td a")
        for tag in issues_tags:
            href = tag.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
                )
            text = cleanup_str(tag.text)
            if not text.startswith("Volume"):
                continue

            fake_issue = create_issuedata()
            fake_issue.url = self.source_website + href
            volume_content = self.download_file(self.source_website + href)
            xissues.extend(
                self.parse_mathbas_volume(
                    volume_content, fake_issue, skip_articles=True, only_pid=False
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        target_issue = self.parse_mathbas_volume(
            content, xissue, skip_articles=False, only_pid=True
        )
        if not isinstance(target_issue, IssueData):  # coverage: branch never taken
            raise ValueError("Couldn't filter issue by PID")

    def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
        """Handle parsing the issue titles/numbers, not the issue contents/articles.

        only_pid tries to fill the input issue based on pid instead of creating a new one.
        """

        soup = BeautifulSoup(content, "html.parser")
        table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
        xissues: list[IssueData] = []
        if not table:  # coverage: branch never taken
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
            )
        current_issue = None
        for child in table.findChildren(recursive=False):
            if "heading" in child.get("class", []):
                text = cleanup_str(child.text)
                if not text.startswith("Vol"):  # coverage: branch never taken
                    continue
                volume_re = regex.search(self.volume_regex, text)
                if not volume_re:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume"
                    )
                volume_dict = volume_re.groupdict()

                if only_pid and current_issue and current_issue.pid == xissue.pid:  # coverage: branch never taken
                    return current_issue

                if (
                    self.get_issue_pid(
                        self.collection_id,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                    == xissue.pid
                ):
                    current_issue = xissue
                else:
                    current_issue = self.create_xissue(
                        xissue.url,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                xissues.append(current_issue)

            elif child.get("id") == "table5" and not skip_articles:
                if not current_issue:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
                    )
                self.parse_mathbas_issue(child, current_issue)
            else:
                continue

        if only_pid:
            return xissues[-1]
        return xissues

    def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
        lines = tag.select("tr")
        # Parse article
        for index, line in enumerate(lines):
            names = line.select_one(".names")
            if not names:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
            title = line.select_one(".title")
            if not title:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
                )
            pages = line.select_one(".pages")
            if not pages:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
                )
            pdf_url = title.get("href")
            if not isinstance(pdf_url, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")

            xarticle = create_articledata()
            authors = cleanup_str(names.text)
            if authors.endswith("."):
                authors = authors[:-1]

            if authors != "":
                authors = authors.split(", ")
                for a in authors:
                    xarticle.contributors.append(create_contributor(string_name=a, role="author"))

            xarticle.title_tex = cleanup_str(title.text)
            xarticle.fpage = cleanup_str(pages.text)
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(pdf_url=pdf_url, xarticle=xarticle)
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        return super().decode_response(response, "windows-1252")
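    # Note: decode_response is overridden to force windows-1252; presumably the MathBalk
    # pages are served in that legacy encoding rather than the UTF-8 default.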