Coverage for src/crawler/by_source/bmms_crawler.py: 65%

129 statements  

coverage.py v7.9.0, created at 2025-11-21 14:41 +0000

  1  import re
  2  from urllib.parse import urljoin

  4  from bs4 import BeautifulSoup
  5  from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

  7  from crawler.base_crawler import BaseCollectionCrawler
  8  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


 11  class BmmsCrawler(BaseCollectionCrawler):
 12      source_name = "BMMS"
 13      source_domain = "BMMS"
 14      source_website = "https://math.usm.my/bulletin/html/research.htm"

 16      issue_re = r"Volume (?P<volume>\d+)\s*•\s*Number (?P<number>\d+)\s*•\s*(?P<year>\d{4})"
 17      issue_re_bis = r"V(?P<volume>\d+)\s*•\s*N(?P<number>\d+[A-Z]?)\s*•\s*(?P<year>\d{4})"
 18      abstract_re = r"(?<=Abstract\.)(.*?)(?=[\d]{4}.*Mathematics Subject Classification)"
 19      abstract_no_msc_re = r"(?<=Abstract\.)(.*?)(?=Full.*text.*PDF)"
 20      msc_re = r"(?<=Mathematics Subject Classification:)(.*?)(?=Full)"
 21      delimiter_disp_formula = {"\\begin{align*}", "$$"}
 22      delimiter_inline_formula = {"$", "\\(", "\\["}
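         # Illustrative notes (the sample strings are made up, not taken from the BMMS site):
         #   issue_re matches long-form headings such as "Volume 45 • Number 2 • 2022";
         #   issue_re_bis matches the short form, e.g. "V30 • N1A • 2007".
         # abstract_re captures the text between "Abstract." and a four-digit year preceding
         # "Mathematics Subject Classification"; abstract_no_msc_re is the fallback when no MSC
         # line is present and stops at the "Full text PDF" link instead.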

 24      def get_authors(self, parent_tag, title_tag, title_text):
 25          """parent_tag is a soup tag that contains the authors.
 26          Returns the authors as a single string (or None if no author block is found).
 27          """

28 if "Linear Transformations of N-connections in OSC" in cleanup_str(title_text): 28 ↛ 29line 28 didn't jump to line 29 because the condition on line 28 was never true

29 return "Irena Čomić and Monica Purcaru" 

30 if ( 30 ↛ 34line 30 didn't jump to line 34 because the condition on line 30 was never true

31 "Mapping fromTopological Space to Endomorphisms Algebra of Banach Space and its Applications" 

32 in cleanup_str(title_text) 

33 ): 

34 return "Misir B. Ragimov" 

35 authors_tag = parent_tag.select_one("span.justify2") 

36 if authors_tag: 36 ↛ 64line 36 didn't jump to line 64 because the condition on line 36 was always true

37 authors_string = cleanup_str(authors_tag.get_text()) 

38 if cleanup_str(title_text) == authors_string: 38 ↛ 39line 38 didn't jump to line 39 because the condition on line 38 was never true

39 authors_tags = parent_tag.select("span.justify2") 

40 for span in authors_tags: 

41 if not span.find("i"): 

42 authors_string = span.string 

43 break 

44 

45 if "Abstract" in authors_string: 45 ↛ 46line 45 didn't jump to line 46 because the condition on line 45 was never true

46 return authors_string.split("Abstract")[0] 

47 elif title_text in authors_string: 47 ↛ 48line 47 didn't jump to line 48 because the condition on line 47 was never true

48 res = authors_string.split(title_text)[1] 

49 return res 

50 else: 

51 if authors_string != "": 51 ↛ 53line 51 didn't jump to line 53 because the condition on line 51 was always true

52 return authors_string 

53 authors_tag = parent_tag.select_one("span.justify2 span.justify2 span.justify2") 

54 if authors_tag: 

55 authors_string = authors_tag.get_text() 

56 if authors_string != "": 

57 return authors_string 

58 authors_tag = parent_tag.select_one("span.justify2 span.justify2") 

59 if authors_tag: 

60 authors_string = authors_tag.get_text() 

61 if authors_string != "": 

62 return authors_string 

63 

64 authors_tag = parent_tag.select_one("span.text2") 

65 if authors_tag: 

66 return authors_tag.get_text() 

67 else: 

68 br_tag = title_tag.find_next("br") 

69 if br_tag and br_tag.next_sibling: 

70 return br_tag.next_sibling.strip() 

 72      def parse_collection_content(self, content):
 73          xissues = []
 74          soup = BeautifulSoup(content, "html.parser")
 75          issues = soup.select("a.title")
 76          for issue in issues:
 77              issue_search = re.search(self.issue_re, issue.text)
 78              if not issue_search:
 79                  issue_search = re.search(self.issue_re_bis, issue.text)
 80              if not issue_search:  # 80 ↛ 81: condition was never true
 81                  raise ValueError("Couldn't parse issue data")
 82              issue_dict = issue_search.groupdict()
 83              issue_href = issue.get("href")
 84              if not isinstance(issue_href, str):  # 84 ↛ 85: condition was never true
 85                  raise ValueError("Couldn't parse issue url")

 87              xissues.append(
 88                  self.create_xissue(
 89                      urljoin(self.source_website, issue_href),
 90                      issue_dict["year"],
 91                      issue_dict["volume"],
 92                      issue_dict.get("number"),
 93                  )
 94              )

 96          return xissues
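         # The selectors above and below assume the pages list their entries as anchors of the
         # form <a class="title" href="...">...</a> (an assumption read off the "a.title"
         # selectors, not verified here). On the collection page the anchor text carries the
         # "Volume • Number • Year" heading parsed by issue_re; on an issue page only the href
         # (the article URL) is used.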

 98      def parse_issue_content(self, content, xissue):
 99          soup = BeautifulSoup(content, "html.parser")
100          articles = soup.select("a.title")
101          for index, article_tag in enumerate(articles):
102              xarticle = create_articledata()
103              xarticle.pid = "a" + str(index)
104              article_href = article_tag.get("href")
105              if not isinstance(article_href, str):  # 105 ↛ 106: condition was never true
106                  raise ValueError("Couldn't parse article href")
107              xarticle.url = urljoin(self.source_website, article_href)
108              xissue.articles.append(xarticle)

110      def parse_article_content(self, content, xissue, xarticle, url):
111          f = open("/tmp/" + xissue.pid, "a")  # debug trace file, one per issue, appended under /tmp
112          # print("Article Id: ", xarticle.pid)
113          f.write(xarticle.pid + "\n")

114          soup = BeautifulSoup(content, "html5lib")
115          # Title
116          title_tags = soup.select("i.title, span.title")
117          title_tag = title_tags[0]
118          title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
119          if title_text == "":
120              title_tag = title_tags[1]
121              title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
122          # print("title: ", cleanup_str(title_text))
123          if isinstance(title_text, str):  # 123 ↛ 126: condition was always true
124              xarticle.title_tex = title_text
125          # PDF
126          pdf_link_tag = soup.select("a[href$='.pdf']:-soup-contains-own('PDF')")
127          if len(pdf_link_tag) != 1:  # 127 ↛ 128: condition was never true

128              raise ValueError("Error while trying to parse pdf url: expected exactly one candidate")

129          pdf_link = pdf_link_tag[0].get("href")
130          if not isinstance(pdf_link, str):  # 130 ↛ 131: condition was never true
131              raise ValueError("Couldn't parse article pdf")
132          pdf_url = urljoin(self.source_website, pdf_link)
133          f.write(pdf_url + "\n")
134          add_pdf_link_to_xarticle(xarticle, pdf_url)

136          # abstract with authors and msc
137          # fetch the span with "Abstract."
138          abstract_span = soup.find("span", string=lambda s: s and "Abstract." in s)

139          # fetch the parent that holds all the text

140          if abstract_span:  # 140 ↛ 169: condition was always true
141              parent = abstract_span.find_parent("table")
142              authors_string = self.get_authors(parent, title_tag, cleanup_str(title_text)).replace(
143                  " and ", ", "
144              )
145              authors = authors_string.split(", ")
146              # print("AUTHORS: ", authors)
147              for author in authors:
148                  xarticle.contributors.append(
149                      create_contributor(string_name=cleanup_str(author), role="author")
150                  )
151              # ABSTRACT
152              full_text = parent.get_text(separator=" ", strip=True).replace("\n", " ")
153              abstract_search = re.search(self.abstract_re, full_text)
154              if abstract_search:
155                  abstract = abstract_search.group()
156                  # MSC
157                  mcs_search = re.search(self.msc_re, full_text)

158                  mcs_string = mcs_search.group()  # assumes the MSC line is present whenever abstract_re matched

159                  msc_list = mcs_string.split(", ")
160                  for msc in msc_list:
161                      xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
162              else:
163                  # No msc, we have to define the abstract tag without the msc string
164                  abstract_search = re.search(self.abstract_no_msc_re, full_text)
165                  abstract = abstract_search.group()
166              # print("ABSTRACT:", abstract)
167              xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract))
168          else:
169              print("Abstract not found.")
170              return None

172          return xarticle

174      def decode_response(self, response, encoding="ISO-8859-1"):
175          """Force encoding"""
176          try:
177              return super().decode_response(response, encoding)
178          except UnicodeDecodeError:
179              self.logger.debug(
180                  f"Cannot parse resource using {encoding}. Attempting ISO-8859-1",
181                  extra={"url": response.url},
182              )
                 # fall back to ISO-8859-1 when the requested encoding cannot decode the page
                 return super().decode_response(response, "ISO-8859-1")