Coverage for src/crawler/by_source/tac_crawler.py: 95% (122 statements)
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import re

from bs4 import BeautifulSoup
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
from crawler.utils import regex_to_dict


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"

    issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"
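    # Hypothetical examples of headings the pattern above is meant to match (volume numbers are illustrative):
    #   "Volume 43 - 2025"                -> volume="43", year="2025"
    #   "Volume 41 - Hofstra Festschrift" -> volume="41", year="Hofstra Festschrift"
    #                                        (later resolved to a year via issue_years below)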

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")

        for issue_node in issue_nodes:
            xissue = self.create_tac_xissue(issue_node)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We want to add only one article per page, so we keep track of the URLs already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> elements have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]
                parts = url.split("/")
                volume = parts[0]

                if len(volume) == 4:
                    # The first volumes do not have a url in /volumes/@vid/.
                    # The url is /volumes/@year/@article_number/@volume-*.html
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
                    volume = ""
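                # Hypothetical examples of how an href decomposes (paths are illustrative, not taken from the site):
                #   "volumes/43/12/43-12.html"     -> parts == ["43", "12", "43-12.html"], volume == "43"
                #   "volumes/1995/n1/1-01abs.html" -> first segment is a 4-digit year, so the volume ("1")
                #                                     is taken from the "1-01abs.html" filename instead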

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues

    # Years are available inside the "abstract" page of an article.
    # TODO: implement fetching the year from an article instead of a hardcoded dict
    issue_years = {
        "Lawvere Festschrift": "2025",
        "Hofstra Festschrift": "2024",
        "Bunge Festschrift": "2024",
        "The Rosebrugh Festschrift": "2021",
        "CT2011": "2012",
        "The Bourn Festschrift": "2010",
        "The Tholen Festschrift": "2008",
        "CT2006": "2007",
        "Chu spaces: theory and applications": "2006",
        "CT2004": "2005",
        "The Carboni Festschrift": "2004",
        "CT2000": "2001",
        "The Lambek Festschrift": "1999",
    }

    def create_tac_xissue(self, issue_node):
        text = issue_node.get_text().strip()

        issue_dict = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue")

        volume = issue_dict["volume"]
        year = issue_dict["year"]
        title = ""

        if re.search("[a-zA-Z]", year):
            title = year
            if title not in self.issue_years:
                raise ValueError(
                    "Couldn't parse issue (year not found). Have we encountered a new issue ?"
                )
            year = self.issue_years[title]

        xissue = self.create_xissue(self.source_website + f"#vol{volume}", year, volume, None)

        if title:
            xissue.title_tex = title

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
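            # Hypothetical example of the split above (names are illustrative):
            #   "Ada Author, Bob Builder and Carol Coder" -> ["Ada Author", "Bob Builder", "Carol Coder"]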

            if xissue.pid == "TAC_2018_33" and xarticle.pid == "a30":
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor(
                    role="author", string_name=text_author.replace("\n", " ")
                )

                xarticle.contributors.append(author)

            # The first paragraphs (there can be many) before other metadata are part of the abstract.
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )

                    # PAGES
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]
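                        # Hypothetical example of the page extraction above (volume and page numbers are illustrative):
                        #   "Theory and Applications of Categories, Vol. 40, 2024, No. 3, pp 80-97."
                        #   -> pages == "80-97", fpage == "80", lpage == "97"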

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str
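                        # Hypothetical example: a paragraph "Published 2024-07-15." yields date_str == "2024-07-15"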

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract = create_abstract(
                                tag="abstract",
                                value_tex=abstract,
                                lang=xarticle.lang,
                            )
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False
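    # A minimal illustration of insert_kwd (input text is hypothetical):
    #   insert_kwd(xarticle, "msc", "2020 MSC: 18A05, 18B10", "2020 MSC:")
    #   appends two "msc" subjects ("18A05" and "18B10") to xarticle.kwds and returns True;
    #   it returns False (and appends nothing) when the text does not start with the prefix.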