Coverage for src/crawler/by_source/bmms_crawler.py: 63%

139 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
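# The ptf.model_data create_* helpers build the metadata records (article data, abstracts,
# contributors, subject classifications) that the parsing methods below fill in;
# add_pdf_link_to_xarticle and cleanup_str are shared crawler utilities.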

class BmmsCrawler(BaseCollectionCrawler):
    source_name = "BMMS"
    source_domain = "BMMS"
    source_website = "https://math.usm.my/bulletin/html/research.htm"

    issue_re = r"Volume (?P<volume>\d+)\s*•\s*Number (?P<number>\d+)\s*•\s*(?P<year>\d{4})"
    issue_re_bis = r"V(?P<volume>\d+)\s*•\s*N(?P<number>\d+[A-Z]?)\s*•\s*(?P<year>\d{4})"
    abstract_re = r"(?<=Abstract\.)(.*?)(?=[\d]{4}.*Mathematics Subject Classification)"
    abstract_no_msc_re = r"(?<=Abstract\.)(.*?)(?=Full.*text.*PDF)"
    msc_re = r"(?<=Mathematics Subject Classification:)(.*?)(?=Full)"
    delimiter_disp_formula = {"\\begin{align*}", "$$"}
    delimiter_inline_formula = {"$", "\\(", "\\["}
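    # Illustrative check of the issue regexes (the example string is assumed, not taken
    # from the site):
    #   >>> re.search(BmmsCrawler.issue_re, "Volume 46 • Number 2 • 2023").groupdict()
    #   {'volume': '46', 'number': '2', 'year': '2023'}
    # issue_re_bis covers the compact "V46 • N2A • 2023" form. abstract_re captures the
    # text between "Abstract." and the "<four-digit year> Mathematics Subject Classification"
    # heading, abstract_no_msc_re falls back to the "Full text PDF" marker, and msc_re takes
    # what follows "Mathematics Subject Classification:".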

    def get_authors(self, parent_tag, title_tag, title_text):
        """parent_tag is a BeautifulSoup tag that contains the authors.
        Returns the authors as a single string (or None if no author element is found).
        """
        if "Linear Transformations of N-connections in OSC" in cleanup_str(title_text):  # coverage: never true
            return "Irena Čomić and Monica Purcaru"
        if (  # coverage: never true
            "Mapping fromTopological Space to Endomorphisms Algebra of Banach Space and its Applications"
            in cleanup_str(title_text)
        ):
            return "Misir B. Ragimov"
        authors_tag = parent_tag.select_one("span.justify2")
        if authors_tag:  # coverage: always true
            authors_string = cleanup_str(authors_tag.get_text())
            if cleanup_str(title_text) == authors_string:  # coverage: never true
                authors_tags = parent_tag.select("span.justify2")
                for span in authors_tags:
                    if not span.find("i"):
                        authors_string = span.string
                        break

            if "Abstract" in authors_string:  # coverage: never true
                return authors_string.split("Abstract")[0]
            elif title_text in authors_string:  # coverage: never true
                res = authors_string.split(title_text)[1]
                return res
            else:
                if authors_string != "":  # coverage: always true
                    return authors_string
            authors_tag = parent_tag.select_one("span.justify2 span.justify2 span.justify2")
            if authors_tag:
                authors_string = authors_tag.get_text()
                if authors_string != "":
                    return authors_string
            authors_tag = parent_tag.select_one("span.justify2 span.justify2")
            if authors_tag:
                authors_string = authors_tag.get_text()
                if authors_string != "":
                    return authors_string

        authors_tag = parent_tag.select_one("span.text2")
        if authors_tag:
            return authors_tag.get_text()
        else:
            br_tag = title_tag.find_next("br")
            if br_tag and br_tag.next_sibling:
                return br_tag.next_sibling.strip()
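    # The collection index lists one <a class="title"> element per issue; its text is
    # matched against issue_re / issue_re_bis and its href points to the issue page.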

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues = soup.select("a.title")
        for issue in issues:
            issue_search = re.search(self.issue_re, issue.text)
            if not issue_search:
                issue_search = re.search(self.issue_re_bis, issue.text)
            if not issue_search:  # coverage: never true
                raise ValueError("Couldn't parse issue data")
            issue_dict = issue_search.groupdict()
            issue_href = issue.get("href")
            if not isinstance(issue_href, str):  # coverage: never true
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.source_website, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    issue_dict.get("number"),
                )
            )

        return xissues
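    # Issue pages use the same <a class="title"> markup for individual articles; each
    # article gets a positional pid ("a0", "a1", ...) and an absolute URL.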

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        articles = soup.select("a.title")
        for index, article_tag in enumerate(articles):
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            article_href = article_tag.get("href")
            if not isinstance(article_href, str):  # coverage: never true
                raise ValueError("Couldn't parse article href")
            xarticle.url = urljoin(self.source_website, article_href)
            xissue.articles.append(xarticle)
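    # An article page is parsed for its title (i.title / span.title), the "PDF" link,
    # and the table that holds the authors, the abstract and the optional MSC codes.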

    def parse_article_content(self, content, xissue, xarticle, url):
        # debug trace: append the article pid and pdf url to a per-issue file in /tmp
        f = open("/tmp/" + xissue.pid, "a")
        # print("Article Id: ", xarticle.pid)
        f.write(xarticle.pid + "\n")
        soup = BeautifulSoup(content, "html5lib")
        # Title
        title_tags = soup.select("i.title, span.title")
        title_tag = title_tags[0]
        title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        if title_text == "":
            title_tag = title_tags[1]
            title_text = title_tag.get_text(strip=True).lstrip("• ").strip()
        # print("title: ", cleanup_str(title_text))
        if isinstance(title_text, str):  # coverage: always true
            xarticle.title_tex = title_text
        # PDF
        pdf_link_tag = soup.select("a[href$='.pdf']:-soup-contains-own('PDF')")
        if len(pdf_link_tag) != 1:  # coverage: never true
            raise ValueError("Error while trying to parse pdf url: expected exactly one candidate")
        pdf_link = pdf_link_tag[0].get("href")
        if not isinstance(pdf_link, str):  # coverage: never true
            raise ValueError("Couldn't parse article pdf")
        pdf_url = urljoin(self.source_website, pdf_link)
        f.write(pdf_url + "\n")
        add_pdf_link_to_xarticle(xarticle, pdf_url)

        # Abstract, authors and MSC codes
        # fetch the span containing "Abstract."
        abstract_span = soup.find("span", string=lambda s: s and "Abstract." in s)
        # then fetch the parent table that holds the full article text
        if abstract_span:  # coverage: always true
            parent = abstract_span.find_parent("table")
            authors_string = self.get_authors(parent, title_tag, cleanup_str(title_text)).replace(
                " and ", ", "
            )
            authors = authors_string.split(", ")
            # print("AUTHORS: ", authors)
            for author in authors:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author), role="author")
                )
            # ABSTRACT
            full_text = parent.get_text(separator=" ", strip=True).replace("\n", " ")
            abstract_search = re.search(self.abstract_re, full_text)
            if abstract_search:
                abstract = abstract_search.group()
                # MSC
                mcs_search = re.search(self.msc_re, full_text)
                mcs_string = mcs_search.group()
                msc_list = mcs_string.split(", ")
                for msc in msc_list:
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc)))
            else:
                # No MSC codes: extract the abstract without the MSC delimiter
                abstract_search = re.search(self.abstract_no_msc_re, full_text)
                abstract = abstract_search.group()
            # print("ABSTRACT:", abstract)
            xarticle.abstracts.append(create_abstract(lang="en", value_tex=abstract))
        else:
            print("Abstract not found.")
            return None

        return xarticle
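    # crawl_article: a 404 on an article page is logged and the article is skipped
    # instead of aborting the whole crawl; other HTTP errors are re-raised.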

    def crawl_article(self, xarticle, xissue):
        try:
            return super().crawl_article(xarticle, xissue)
        except requests.exceptions.HTTPError as e:
            status_code = e.response.status_code
            if status_code == 404:
                self.logger.warning(e)
                return None
            raise e
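    # Responses are decoded with a forced ISO-8859-1 default (presumably because the
    # journal pages are not served as UTF-8); on a UnicodeDecodeError the method falls
    # back to ISO-8859-1 explicitly.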

    def decode_response(self, response, encoding="ISO-8859-1"):
        """Force encoding"""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            self.logger.debug(
                f"Cannot parse resource using {encoding}. Attempting ISO-8859-1",
                extra={"url": response.url},
            )
            # fall back to ISO-8859-1, as announced in the log message above
            return super().decode_response(response, "ISO-8859-1")
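
# Minimal usage sketch (hypothetical: the BaseCollectionCrawler constructor and crawl
# entry point are defined in crawler.base_crawler and are not shown in this file):
#
#     crawler = BmmsCrawler(...)  # constructed however the base crawler expects
#     html = requests.get(BmmsCrawler.source_website).text
#     xissues = crawler.parse_collection_content(html)  # one xissue per issue link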