Coverage for src/crawler/by_source/bmms_crawler.py: 65%

129 statements  

coverage.py v7.9.0, created at 2025-11-21 14:41 +0000

  1  import re
  2  from urllib.parse import urljoin

  4  from bs4 import BeautifulSoup
  5  from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

  7  from crawler.base_crawler import BaseCollectionCrawler
  8  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


 11  class BmmsCrawler(BaseCollectionCrawler):
 12      source_name = "BMMS"
 13      source_domain = "BMMS"
 14      source_website = "https://math.usm.my/bulletin/html/research.htm"

 16      issue_re = r"Volume (?P<volume>\d+)\s*•\s*Number (?P<number>\d+)\s*•\s*(?P<year>\d{4})"
 17      issue_re_bis = r"V(?P<volume>\d+)\s*•\s*N(?P<number>\d+[A-Z]?)\s*•\s*(?P<year>\d{4})"
 18      abstract_re = r"(?<=Abstract\.)(.*?)(?=[\d]{4}.*Mathematics Subject Classification)"
 19      abstract_no_msc_re = r"(?<=Abstract\.)(.*?)(?=Full.*text.*PDF)"
 20      msc_re = r"(?<=Mathematics Subject Classification:)(.*?)(?=Full)"
 21      delimiter_disp_formula = {"\\begin{align*}", "$$"}
 22      delimiter_inline_formula = {"$", "\\(", "\\["}
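         # Illustrative notes (the sample strings are made up, not taken from the BMMS site):
         #   issue_re matches long-form headings such as "Volume 45 • Number 2 • 2022";
         #   issue_re_bis matches the short form, e.g. "V30 • N1A • 2007".
         # abstract_re captures the text between "Abstract." and a four-digit year preceding
         # "Mathematics Subject Classification"; abstract_no_msc_re is the fallback when no MSC
         # line is present and stops at the "Full text PDF" link instead.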

 24      def get_authors(self, parent_tag, title_tag, title_text):
 25          """parent_tag is a soup tag that contains the authors.
 26          Returns the authors as a single string (or None if no author block is found).
 27          """

28 if "Linear Transformations of N-connections in OSC" in cleanup_str(title_text): 28 ↛ 29line 28 didn't jump to line 29 because the condition on line 28 was never true

29 return "Irena Čomić and Monica Purcaru" 

30 if ( 30 ↛ 34line 30 didn't jump to line 34 because the condition on line 30 was never true

31 "Mapping fromTopological Space to Endomorphisms Algebra of Banach Space and its Applications" 

32 in cleanup_str(title_text) 

33 ): 

34 return "Misir B. Ragimov" 

35 authors_tag = parent_tag.select_one("span.justify2") 

36 if authors_tag: 36 ↛ 64line 36 didn't jump to line 64 because the condition on line 36 was always true

37 authors_string = cleanup_str(authors_tag.get_text()) 

38 if cleanup_str(title_text) == authors_string: 38 ↛ 39line 38 didn't jump to line 39 because the condition on line 38 was never true

39 authors_tags = parent_tag.select("span.justify2") 

40 for span in authors_tags: 

41 if not span.find("i"): 

42 authors_string = span.string 

43 break 

44 

45 if "Abstract" in authors_string: 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true

46 return authors_string.split("Abstract")[0] 

47 elif title_text in authors_string: 47 ↛ 48line 47 didn't jump to line 48 because the condition on line 47 was never true

48 res = authors_string.split(title_text)[1] 

49 return res 

50 else: 

51 if authors_string != "": 51 ↛ 53line 51 didn't jump to line 53 because the condition on line 51 was always true

52 return authors_string 

53 authors_tag = parent_tag.select_one("span.justify2 span.justify2 span.justify2") 

54 if authors_tag: 

55 authors_string = authors_tag.get_text() 

56 if authors_string != "": 

57 return authors_string 

58 authors_tag = parent_tag.select_one("span.justify2 span.justify2") 

59 if authors_tag: 

60 authors_string = authors_tag.get_text() 

61 if authors_string != "": 

62 return authors_string 

63 

64 authors_tag = parent_tag.select_one("span.text2") 

65 if authors_tag: 

66 return authors_tag.get_text() 

67 else: 

68 br_tag = title_tag.find_next("br") 

69 if br_tag and br_tag.next_sibling: 

70 return br_tag.next_sibling.strip() 

 72      def parse_collection_content(self, content):
 73          xissues = []
 74          soup = BeautifulSoup(content, "html.parser")
 75          issues = soup.select("a.title")
 76          for issue in issues:
 77              issue_search = re.search(self.issue_re, issue.text)
 78              if not issue_search:
 79                  issue_search = re.search(self.issue_re_bis, issue.text)
 80              if not issue_search:  # 80 ↛ 81: condition was never true
 81                  raise ValueError("Couldn't parse issue data")
 82              issue_dict = issue_search.groupdict()
 83              issue_href = issue.get("href")
 84              if not isinstance(issue_href, str):  # 84 ↛ 85: condition was never true
 85                  raise ValueError("Couldn't parse issue url")

 87              xissues.append(
 88                  self.create_xissue(
 89                      urljoin(self.source_website, issue_href),
 90                      issue_dict["year"],
 91                      issue_dict["volume"],
 92                      issue_dict.get("number"),
 93                  )
 94              )

 96          return xissues
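         # The selectors above and below assume the pages list their entries as anchors of the
         # form <a class="title" href="...">...</a> (an assumption read off the "a.title"
         # selectors, not verified here). On the collection page the anchor text carries the
         # "Volume • Number • Year" heading parsed by issue_re; on an issue page only the href
         # (the article URL) is used.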

 98      def parse_issue_content(self, content, xissue):
 99          soup = BeautifulSoup(content, "html.parser")
100          articles = soup.select("a.title")
101          for index, article_tag in enumerate(articles):
102              xarticle = create_articledata()
103              xarticle.pid = "a" + str(index)
104              article_href = article_tag.get("href")
105              if not isinstance(article_href, str):  # 105 ↛ 106: condition was never true
106                  raise ValueError("Couldn't parse article href")
107              xarticle.url = urljoin(self.source_website, article_href)
108              xissue.articles.append(xarticle)

110      def parse_article_content(self, content, xissue, xarticle, url):
111          f = open("/tmp/" + xissue.pid, "a")  # debug trace file, one per issue, appended under /tmp
112          # print("Article Id: ", xarticle.pid)
113          f.write(xarticle.pid + "\n")

114          soup = BeautifulSoup(content, "html5lib")
115          # Title
116          title_tags = soup.select("i.title, span.title")
117          title_tag = title_tags[0]
118          title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
119          if title_text == "":
120              title_tag = title_tags[1]
121              title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
122          # print("title: ", cleanup_str(title_text))
123          if isinstance(title_text, str):  # 123 ↛ 126: condition was always true
124              xarticle.title_tex = title_text
125          # PDF
126          pdf_link_tag = soup.select("a[href$='.pdf']:-soup-contains-own('PDF')")
127          if len(pdf_link_tag) != 1:  # 127 ↛ 128: condition was never true

128              raise ValueError("Error while trying to parse pdf url: expected exactly one candidate")

129          pdf_link = pdf_link_tag[0].get("href")
130          if not isinstance(pdf_link, str):  # 130 ↛ 131: condition was never true
131              raise ValueError("Couldn't parse article pdf")
132          pdf_url = urljoin(self.source_website, pdf_link)
133          f.write(pdf_url + "\n")
134          add_pdf_link_to_xarticle(xarticle, pdf_url)

136          # abstract with authors and msc
137          # fetch the span with "Abstract."
138          abstract_span = soup.find("span", string=lambda s: s and "Abstract." in s)

139          # fetch the parent that holds all the text

140          if abstract_span:  # 140 ↛ 169: condition was always true
141              parent = abstract_span.find_parent("table")
142              authors_string = self.get_authors(parent, title_tag, cleanup_str(title_text)).replace(
143                  " and ", ", "
144              )
145              authors = authors_string.split(", ")
146              # print("AUTHORS: ", authors)
147              for author in authors:
148                  xarticle.contributors.append(
149                      create_contributor(string_name=cleanup_str(author), role="author")
150                  )
151              # ABSTRACT
152              full_text = parent.get_text(separator=" ", strip=True).replace("\n", " ")
153              abstract_search = re.search(self.abstract_re, full_text)
154              if abstract_search:
155                  abstract = abstract_search.group()
156                  # MSC
157                  mcs_search = re.search(self.msc_re, full_text)

158                  mcs_string = mcs_search.group()  # assumes the MSC line is present whenever abstract_re matched

159                  msc_list = mcs_string.split(", ")
160                  for msc in msc_list:
161                      xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
162              else:
163                  # No msc, we have to define the abstract tag without the msc string
164                  abstract_search = re.search(self.abstract_no_msc_re, full_text)
165                  abstract = abstract_search.group()
166              # print("ABSTRACT:", abstract)
167              xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract))
168          else:
169              print("Abstract not found.")
170              return None

172          return xarticle

174      def decode_response(self, response, encoding="ISO-8859-1"):
175          """Force encoding"""
176          try:
177              return super().decode_response(response, encoding)
178          except UnicodeDecodeError:
179              self.logger.debug(
180                  f"Cannot parse resource using {encoding}. Attempting ISO-8859-1",
181                  extra={"url": response.url},
182              )
                 # fall back to ISO-8859-1 when the requested encoding cannot decode the page
                 return super().decode_response(response, "ISO-8859-1")