Coverage for src/crawler/by_source/tac_crawler.py: 94%

126 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

  1  import re
  2  from urllib.parse import urljoin
  3
  4  import regex
  5  from bs4 import BeautifulSoup, Tag
  6  from ptf.model_data import (
  7      create_abstract,
  8      create_articledata,
  9      create_contributor,
 10      create_subj,
 11  )
 12
 13  from crawler.base_crawler import BaseCollectionCrawler
 14  from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
 15
 16
 17  class TacCrawler(BaseCollectionCrawler):
 18      source_name = "Theory and Applications of Categories website"
 19      source_domain = "TAC"
 20      source_website = "http://www.tac.mta.ca/tac"
 21
 22      issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"
 23

 24      def parse_collection_content(self, content):
 25          """
 26          Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
 27          Each xissue has its volume/number/year metadata + its url.
 28
 29          """
 30          soup = BeautifulSoup(content, "html5lib")
 31          xissues = []
 32
 33          issue_nodes = soup.find_all("h3")
 34
 35          for issue_node in issue_nodes:
 36              xissue = self.create_tac_xissue(issue_node)
 37              xissues.append(xissue)
 38
 39          # TAC has multiple links towards the same page (i.e. title + abstract).
 40          # We want to add only one article, so we keep track of the URLs already handled.
 41          urls = []
 42
 43          # The TAC web page is badly formatted: <dt> tags have no closing </dt>.
 44          # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
 45          link_nodes = soup.find_all("a")
 46          for link_node in link_nodes:
 47              url = link_node.get("href")
 48              if not isinstance(url, str):
 49                  continue
 50              if not (url.startswith("volumes/") and url.endswith(".html") and url not in urls):
 51                  continue
 52
 53              urls.append(url)
 54
 55              article_url = self.source_website + "/" + url
 56              url = url[8:]
 57              parts = url.split("/")
 58              volume = parts[0]
 59
 60              if len(volume) == 4:
 61                  # The first volumes don't have a url in /volumes/@vid/.
 62                  # The url is /volumes/@year/@article_number/@volume-*.html
 63                  parts = parts[2].split("-")
 64                  if len(parts) > 1:
 65                      volume = parts[0]
 66                  else:
 67                      volume = ""
 68              elif len(parts) != 3:
 69                  # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
 70                  volume = ""
 71
 72              if volume:
 73                  xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
 74                  article_index = len(xissue.articles)
 75
 76                  xarticle = create_articledata()
 77                  xarticle.pid = "a" + str(article_index)
 78                  xarticle.url = article_url
 79                  xissue.articles.append(xarticle)
 80
 81                  attachment_node = link_node.parent.parent.find_next("dd")
 82                  if isinstance(attachment_node, Tag):    82 ↛ 46  line 82 didn't jump to line 46 because the condition on line 82 was always true
 83                      a_node = attachment_node.select_one("a[href$='.pdf']")
 84                      if a_node:    84 ↛ 46  line 84 didn't jump to line 46 because the condition on line 84 was always true
 85                          href = a_node.get("href")
 86                          if isinstance(href, str):    86 ↛ 46  line 86 didn't jump to line 46 because the condition on line 86 was always true
 87                              add_pdf_link_to_xarticle(
 88                                  xarticle, urljoin(self.collection_url + "/", href)
 89                              )
 90
 91          return xissues
 92
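For reference, a minimal trace of the volume extraction above; the 38/31 URL comes from the example quoted later in parse_article_content, while the year-based path is an assumed illustration of the /volumes/@year/@article_number/@volume-*.html shape:

    # Regular layout: /volumes/@volume/@article_number/@file.html
    url = "volumes/38/31/38-31abs.html"
    parts = url[8:].split("/")          # ['38', '31', '38-31abs.html']
    volume = parts[0]                   # '38'

    # Early layout (hypothetical path): /volumes/@year/@article_number/@volume-*.html
    url = "volumes/1995/n1/1-01abs.html"
    parts = url[8:].split("/")          # ['1995', 'n1', '1-01abs.html']
    # parts[0] has length 4, so it is a year; the volume comes from the file name.
    volume = parts[2].split("-")[0]     # '1'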

 93      # Years are available inside the "abstract" page of an article.
 94      # TODO: implement fetching the year from an article instead of a hardcoded dict
 95      issue_years = {
 96          "Lawvere Festschrift": "2025",
 97          "Hofstra Festschrift": "2024",
 98          "Bunge Festschrift": "2024",
 99          "The Rosebrugh Festschrift": "2021",
100          "CT2011": "2012",
101          "The Bourn Festschrift": "2010",
102          "The Tholen Festschrift": "2008",
103          "CT2006": "2007",
104          "Chu spaces: theory and applications": "2006",
105          "CT2004": "2005",
106          "The Carboni Festschrift": "2004",
107          "CT2000": "2001",
108          "The Lambek Festschrift": "1999",
109      }
110

111      def create_tac_xissue(self, issue_node):
112          text = issue_node.get_text().strip()
113
114          issue_dict = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue")
115
116          volume = issue_dict["volume"]
117          year = issue_dict["year"]
118          title = ""
119
120          if re.search("[a-zA-Z]", year):
121              title = year
122              if title not in self.issue_years:    122 ↛ 123  line 122 didn't jump to line 123 because the condition on line 122 was never true
123                  raise ValueError(
124                      "Couldn't parse issue (year not found). Have we encountered a new issue?"
125                  )
126              year = self.issue_years[title]
127
128          xissue = self.create_xissue(self.source_website + f"#vol{volume}", year, volume, None)
129
130          if title:
131              xissue.title_tex = title
132
133          return xissue
134
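As a quick sanity check of issue_re against the two heading shapes handled here (both heading strings, including the volume numbers, are illustrative, not scraped from the site):

    import re

    issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"

    # Plain year: the "year" group is numeric and used directly.
    re.search(issue_re, "Volume 38 - 2022").groupdict()
    # {'volume': '38', 'year': '2022'}

    # Named issue: the "year" group contains letters, so it is treated as a
    # title and the real year is looked up in issue_years.
    re.search(issue_re, "Volume 36 - The Rosebrugh Festschrift").groupdict()
    # {'volume': '36', 'year': 'The Rosebrugh Festschrift'}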

135      def parse_article_content(self, content, xissue, xarticle, url):
136          """
137          Parse the content with BeautifulSoup and return an ArticleData.
138          """
139          xarticle.lang = "en"
140
141          soup = BeautifulSoup(content, "html5lib")
142
143          # TITLE
144          title_node = soup.find("h1")
145          if title_node is not None:    145 ↛ 149  line 145 didn't jump to line 149 because the condition on line 145 was always true
146              xarticle.title_tex = title_node.get_text().strip()
147
148          # AUTHORS
149          author_node = soup.find("h2")
150          if author_node is not None:    150 ↛ 231  line 150 didn't jump to line 231 because the condition on line 150 was always true
151              text = author_node.get_text().strip()
152              parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
153
154              if xarticle.pid == "TAC_2018_33_a30":    154 ↛ 155  line 154 didn't jump to line 155 because the condition on line 154 was never true
155                  parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]
156
157              for text_author in parts:
158                  author = create_contributor(
159                      role="author", string_name=text_author.replace("\n", " ")
160                  )
161
162                  xarticle.contributors.append(author)
163
164              # The first paragraphs (there can be many) before other metadata are part of the abstract.
165              parsed_p_besides_abstract = False
166
167              for node in author_node.find_next_siblings():
168                  if node.name == "p":
169                      text = node.get_text().strip()
170
171                      # KEYWORDS
172                      parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
173                      parsed_text = parsed_text or self.insert_kwd(
174                          xarticle, "msc", text, "2020 MSC:"
175                      )
176                      parsed_text = parsed_text or self.insert_kwd(
177                          xarticle, "msc", text, "2010 MSC:"
178                      )
179                      parsed_text = parsed_text or self.insert_kwd(
180                          xarticle, "msc", text, "2000 MSC:"
181                      )
182                      parsed_text = parsed_text or self.insert_kwd(
183                          xarticle, "msc", text, "1991 MSC:"
184                      )
185                      parsed_text = parsed_text or self.insert_kwd(
186                          xarticle, "msc", text, "AMS Classification (1991):"
187                      )
188
189                      # PAGES
190                      title = "Theory and Applications of Categories"
191                      if not parsed_text and text.startswith(title) and not xarticle.fpage:
192                          parsed_text = True
193                          pages = text[len(title) :].split("pp")[1][:-1].strip()
194                          pages = pages.replace("--", "-")
195                          parts = pages.split("-")
196                          xarticle.fpage = parts[0]
197                          xarticle.lpage = regex.split(r"(\.|\n)", parts[1], maxsplit=1)[0]
198
199                      # PUBLICATION DATE (note: revised dates are ignored)
200                      if not parsed_text and text.startswith("Published"):
201                          parsed_text = True
202                          date_str = text[10:].split(".")[0]
203                          xarticle.date_published_iso_8601_date_str = date_str
204
205                      parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract
206
207                      # ABSTRACT
208                      if not parsed_p_besides_abstract:
209                          abstract = str(node)
210                          if len(xarticle.abstracts) > 0:    210 ↛ 211  line 210 didn't jump to line 211 because the condition on line 210 was never true
211                              xarticle.abstracts[0]["value_tex"] += abstract
212                          else:
213                              xabstract = create_abstract(
214                                  value_tex=abstract,
215                                  lang=xarticle.lang,
216                              )
217                              xarticle.abstracts.append(xabstract)
218
219          # PDF
220          # We need to find the last PDF link because TAC can have a revised version of an article.
221          # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
222          # pdf_url = ""
223          # link_nodes = soup.find_all("a")
224          # for link_node in link_nodes:
225          #     url = link_node.get("href")
226          #     if url is not None and url.endswith(".pdf"):
227          #         pdf_url = url
228          # if pdf_url:
229          #     add_pdf_link_to_xarticle(xarticle, pdf_url)
230
231          return xarticle
232
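The page-range slicing on line 193 is terse; here is a hedged trace with a made-up citation paragraph in the form the code expects (the exact wording on TAC abstract pages may differ):

    import re

    title = "Theory and Applications of Categories"
    text = "Theory and Applications of Categories, Vol. 38, 2022, No. 31, pp 1118-1135."

    pages = text[len(title) :].split("pp")[1][:-1].strip()    # '1118-1135'
    pages = pages.replace("--", "-")
    parts = pages.split("-")
    fpage = parts[0]                                          # '1118'
    # the crawler uses the third-party regex module here; re behaves the same on this input
    lpage = re.split(r"(\.|\n)", parts[1], maxsplit=1)[0]     # '1135'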

233      def insert_kwd(self, xarticle, content_type, text, prefix):
234          if text.startswith(prefix):
235              text = text[len(prefix) + 1 :]
236              for kwd in re.split(",|;", text):
237                  subject = create_subj()
238                  subject["value"] = kwd.strip().replace("\n", " ")
239                  subject["type"] = content_type
240                  subject["lang"] = "en"
241                  xarticle.kwds.append(subject)
242              return True
243          return False
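
A usage sketch of the split performed by insert_kwd, on a hypothetical keywords paragraph (reproducing only the string handling, since ArticleData is not shown here):

    import re

    text = "Keywords: enriched category, monad; adjunction"
    prefix = "Keywords:"
    if text.startswith(prefix):
        for kwd in re.split(",|;", text[len(prefix) + 1 :]):
            print(kwd.strip().replace("\n", " "))
    # enriched category
    # monad
    # adjunction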