Coverage for src/crawler/by_source/mathbas_crawler.py: 84%

94 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import IssueData, create_articledata, create_contributor, create_issuedata
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class MathbasCrawler(BaseCollectionCrawler):
    source_name = "Mathematica Balkanica website"
    source_domain = "MATHBAS"
    source_website = "http://www.math.bas.bg/infres/MathBalk/"
    periode_begin = 0
    periode_end = 0

    volume_regex = r"Vol\. (?P<volume>\d+) \((?P<year>\d+)\), Fasc\. (?P<number>[\d\-]+)"
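    # Illustrative example (values are hypothetical, not from the source): volume_regex
    # matches headings such as "Vol. 12 (1998), Fasc. 3-4", yielding the named groups
    # volume="12", year="1998", number="3-4".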

    def parse_collection_content(self, content):
        # We are forced to fetch all volume pages first, because some volumes declare multiple issues.
        soup = BeautifulSoup(content, "html.parser")
        xissues = []

        issues_tags = soup.select("#table4 td a")
        for tag in issues_tags:
            href = tag.get("href")
            if not isinstance(href, str):  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {self.collection_id} : Invalid volume href"
                )
            text = cleanup_str(tag.text)
            if not text.startswith("Volume"):
                continue

            fake_issue = create_issuedata()
            fake_issue.url = self.source_website + href
            volume_content = self.download_file(self.source_website + href)
            xissues.extend(
                self.parse_mathbas_volume(
                    volume_content, fake_issue, skip_articles=True, only_pid=False
                )
            )

        return xissues

    def parse_issue_content(self, content, xissue):
        target_issue = self.parse_mathbas_volume(
            content, xissue, skip_articles=False, only_pid=True
        )
        if not isinstance(target_issue, IssueData):  # coverage: branch never taken
            raise ValueError("Couldn't filter issue by PID")

    def parse_mathbas_volume(self, content, xissue, skip_articles=False, only_pid=False):
        """Handle parsing the issue titles/numbers, not the issue contents/articles.

        only_pid tries to fill the input issue based on pid instead of creating a new one.
        """

        soup = BeautifulSoup(content, "html.parser")
        table = soup.select_one("#table3 td[bgcolor='#F9FCC5']")
        xissues: list[IssueData] = []
        if not table:  # coverage: branch never taken
            raise ValueError(
                f"[{self.source_domain}] {self.collection_id} {xissue.url} : Volume cannot be parsed"
            )
        current_issue = None
        for child in table.findChildren(recursive=False):
            if "heading" in child.get("class", []):
                text = cleanup_str(child.text)
                if not text.startswith("Vol"):  # coverage: branch never taken
                    continue
                volume_re = regex.search(self.volume_regex, text)
                if not volume_re:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} : Couldn't parse volume"
                    )
                volume_dict = volume_re.groupdict()

                if only_pid and current_issue and current_issue.pid == xissue.pid:  # coverage: branch never taken
                    return current_issue

                if (
                    self.get_issue_pid(
                        self.collection_id,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                    == xissue.pid
                ):
                    current_issue = xissue
                else:
                    current_issue = self.create_xissue(
                        xissue.url,
                        volume_dict["year"],
                        volume_dict["volume"],
                        volume_dict["number"],
                    )
                xissues.append(current_issue)

            elif child.get("id") == "table5" and not skip_articles:
                if not current_issue:  # coverage: branch never taken
                    raise ValueError(
                        f"[{self.source_domain}] {self.collection_id} {xissue.url} : Couldn't parse volume page : article declared before issue"
                    )
                self.parse_mathbas_issue(child, current_issue)
            else:
                continue

        if only_pid:
            return xissues[-1]
        return xissues

    def parse_mathbas_issue(self, tag: Tag, xissue: IssueData):
        lines = tag.select("tr")
        # Parse article
        for index, line in enumerate(lines):
            names = line.select_one(".names")
            if not names:  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't parse authors")
            title = line.select_one(".title")
            if not title:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article title"
                )
            pages = line.select_one(".pages")
            if not pages:  # coverage: branch never taken
                raise ValueError(
                    f"[{self.source_domain}] {xissue.pid} : Couldn't parse article pages"
                )
            pdf_url = title.get("href")
            if not isinstance(pdf_url, str):  # coverage: branch never taken
                raise ValueError(f"[{self.source_domain}] {xissue.pid} : Couldn't extract pdf url")

            xarticle = create_articledata()
            authors = cleanup_str(names.text)
            if authors.endswith("."):
                authors = authors[:-1]

            if authors != "":
                authors = authors.split(", ")
                for a in authors:
                    xarticle.contributors.append(create_contributor(string_name=a, role="author"))

            xarticle.title_tex = cleanup_str(title.text)
            xarticle.fpage = cleanup_str(pages.text)
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(pdf_url=pdf_url, xarticle=xarticle)
            xarticle.pid = f"a{index}"
            xissue.articles.append(xarticle)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        return super().decode_response(response, "windows-1252")
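    # Note: decode_response is overridden to force windows-1252; presumably the MathBalk
    # pages are served in that legacy encoding rather than the UTF-8 default.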