Coverage for src/crawler/by_source/tac_crawler.py: 96%

138 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1  import re
  2
  3  from bs4 import BeautifulSoup
  4  from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml
  5  from ptf.model_data import (
  6      create_abstract,
  7      create_articledata,
  8      create_contributor,
  9      create_issuedata,
 10      create_subj,
 11  )
 12
 13  from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
 14
 15
 16  class TacCrawler(BaseCollectionCrawler):
 17      source_name = "Theory and Applications of Categories website"
 18      source_domain = "TAC"
 19      source_website = "http://www.tac.mta.ca/tac"
 20
 21      def parse_collection_content(self, content):
 22          """
 23          Parse the HTML index page of Theory and Applications of Categories and return a list of xissue.
 24          Each xissue has its volume/number/year metadata + its url.
 25
 26          """

 27          soup = BeautifulSoup(content, "html5lib")
 28          xissues = []
 29
 30          issue_nodes = soup.find_all("h3")
 31          previous_year = 0
 32
 33          for issue_node in issue_nodes:
 34              xissue, previous_year = self.create_tac_xissue(issue_node, previous_year)
 35              xissues.append(xissue)
 36
 37          # TAC has multiple links towards the same page (i.e. title + abstract).
 38          # We want to add only 1 article, so we keep track of the urls already handled.
 39          urls = []
 40
 41          # The TAC web page is badly formatted: <dt> tags have no closing </dt>.
 42          # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
 43          link_nodes = soup.find_all("a")
 44          for link_node in link_nodes:
 45              url = link_node.get("href")
 46              if (
 47                  url is not None
 48                  and url.startswith("volumes/")
 49                  and url.endswith(".html")
 50                  and url not in urls
 51              ):
 52                  urls.append(url)
 53
 54                  article_url = self.source_website + "/" + url
 55                  url = url[8:]
 56                  parts = url.split("/")
 57                  volume = parts[0]
 58
 59                  if len(volume) == 4:
 60                      # The first volumes do not have a url in /volumes/@vid/:
 61                      # the url is /volumes/@year/@article_number/@volume-*.html
 62                      parts = parts[2].split("-")
 63                      if len(parts) > 1:
 64                          volume = parts[0]
 65                      else:
 66                          volume = ""
 67                  elif len(parts) != 3:
 68                      # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
 69                      volume = ""
 70
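                     # Editorial walkthrough (example values, not taken from the source):
                     # for the newer-style link "volumes/38/31/38-31abs.html" (the pattern
                     # cited in the PDF comment below), parts == ["38", "31", "38-31abs.html"],
                     # so the volume is "38". For a hypothetical early-style link such as
                     # "volumes/1995/n1/1-01abs.html", the 4-character first segment is a
                     # year, so the volume ("1") is read from the file name before the "-".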

 71                  if volume:
 72                      xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
 73                      article_index = len(xissue.articles)
 74
 75                      xarticle = create_articledata()
 76                      xarticle.pid = "a" + str(article_index)
 77                      xarticle.url = article_url
 78                      xissue.articles.append(xarticle)
 79
 80          return xissues
 81
 82      def create_tac_xissue(self, issue_node, previous_year):
 83          text = issue_node.get_text().strip()
 84          text = text[7:]  # Remove "Volume "
 85          parts = text.split(" - ")
 86          volume = parts[0]
 87          year = parts[1]
 88          title = ""
 89
 90          # TAC has some special issues: the title is specified instead of the year
 91          try:
 92              year_int = int(year)
 93              previous_year = year_int
 94          except Exception:
 95              if year[-1] == "*":
 96                  year = year[:-1]
 97
 98              title = year
 99              if "Festschrift" in title:
100                  if title == "Bunge Festschrift":
101                      year = "2024"
102                  else:
103                      year = str(previous_year - 1)
104              elif volume == "17":
105                  title = "Chu spaces"
106                  year = "2006"
107              else:
108                  year = title[2:]
109
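             # Editorial example (heading texts assumed from the parsing above, not
             # scraped): a regular heading such as "Volume 38 - 2022" yields volume "38"
             # and year "2022", while a special-issue heading such as
             # "Volume 44 - Bunge Festschrift*" fails the int() conversion, so the text
             # after " - " (minus the trailing "*") becomes the title and the year comes
             # from the hard-coded cases above.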

110          xissue = create_issuedata()
111          xissue.pid = self.collection_id + "_" + year + "__" + volume
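             # For instance (editorial note): year "2018" and volume "33" give the pid
             # "TAC_2018__33" that parse_article_content checks for below.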

112          xissue.year = year
113          xissue.volume = volume
114
115          xissue.title_tex = title
116          xissue.title_html = title
117          xissue.title_xml = get_issue_title_xml(title, "en")
118
119          return xissue, previous_year
120
121      def parse_article_content(self, content, xissue, xarticle, url):
122          """
123          Parse the content with BeautifulSoup and return an ArticleData
124          """
125          xarticle.lang = "en"
126
127          soup = BeautifulSoup(content, "html5lib")
128
129          # TITLE
130          title_node = soup.find("h1")
131          if title_node is not None:  # coverage: 131 ↛ 135, the condition on line 131 was always true
132              xarticle.title_tex = title_node.get_text().strip()
133
134          # AUTHORS
135          author_node = soup.find("h2")
136          if author_node is not None:  # coverage: 136 ↛ 209, the condition on line 136 was always true
137              text = author_node.get_text().strip()
138              parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
139
140              if xissue.pid == "TAC_2018__33" and xarticle.pid == "a30":  # coverage: 140 ↛ 141, the condition on line 140 was never true
141                  parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]
142
143              for text_author in parts:
144                  author = create_contributor()
145                  author["role"] = "author"
146                  author["string_name"] = text_author.replace("\n", " ")
147
148                  xarticle.contributors.append(author)
149
150              # The first paragraphs (there can be many) that appear before the other metadata make up the abstract
151              parsed_p_besides_abstract = False
152
153              for node in author_node.find_next_siblings():
154                  if node.name == "p":
155                      text = node.get_text().strip()
156
157                      # KEYWORDS
158                      parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
159                      parsed_text = parsed_text or self.insert_kwd(
160                          xarticle, "msc", text, "2020 MSC:"
161                      )
162                      parsed_text = parsed_text or self.insert_kwd(
163                          xarticle, "msc", text, "2010 MSC:"
164                      )
165                      parsed_text = parsed_text or self.insert_kwd(
166                          xarticle, "msc", text, "2000 MSC:"
167                      )
168                      parsed_text = parsed_text or self.insert_kwd(
169                          xarticle, "msc", text, "1991 MSC:"
170                      )
171                      parsed_text = parsed_text or self.insert_kwd(
172                          xarticle, "msc", text, "AMS Classification (1991):"
173                      )
174
175                      # PAGES
176                      title = "Theory and Applications of Categories"
177                      if not parsed_text and text.startswith(title) and not xarticle.fpage:
178                          parsed_text = True
179                          pages = text[len(title) :].split("pp")[1][:-1].strip()
180                          pages = pages.replace("--", "-")
181                          parts = pages.split("-")
182                          xarticle.fpage = parts[0]
183                          xarticle.lpage = parts[1].split(".")[0]
184
185                      # PUBLICATION DATE (note: revised dates are ignored)
186                      if not parsed_text and text.startswith("Published"):
187                          parsed_text = True
188                          date_str = text[10:].split(".")[0]
189                          xarticle.date_published_iso_8601_date_str = date_str
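                         # Editorial examples (paragraph formats assumed from the slicing
                         # above, not checked against the site): a footer such as
                         # "Theory and Applications of Categories, Vol. 38, No. 31, pp 1020-1044."
                         # would give fpage "1020" and lpage "1044", and a line such as
                         # "Published 2022-09-26." would give the ISO date "2022-09-26".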

190
191                  parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract
192
193                  # ABSTRACT
194                  if not parsed_p_besides_abstract:
195                      abstract = str(node)
196                      if len(xarticle.abstracts) > 0:  # coverage: 196 ↛ 197, the condition on line 196 was never true
197                          xarticle.abstracts[0]["value_tex"] += abstract
198                      else:
199                          xabstract = create_abstract(
200                              tag="abstract",
201                              value_tex=abstract,
202                              lang=xarticle.lang,
203                          )
204                          xarticle.abstracts.append(xabstract)
205
206          # PDF
207          # We need to find the last PDF link because TAC can have revised versions of an article.
208          # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
209          pdf_url = ""
210          link_nodes = soup.find_all("a")
211          for link_node in link_nodes:
212              url = link_node.get("href")
213              if url is not None and url.endswith(".pdf"):
214                  pdf_url = url
215          if pdf_url:  # coverage: 215 ↛ 218, the condition on line 215 was always true
216              add_pdf_link_to_xarticle(xarticle, pdf_url)
217
218          return xarticle
219
220      def insert_kwd(self, xarticle, content_type, text, prefix):
221          if text.startswith(prefix):
222              text = text[len(prefix) + 1 :]
223              for kwd in re.split(",|;", text):
224                  subject = create_subj()
225                  subject["value"] = kwd.strip().replace("\n", " ")
226                  subject["type"] = content_type
227                  subject["lang"] = "en"
228                  xarticle.kwds.append(subject)
229              return True
230          return False
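
Usage sketch (editorial note, not part of the covered file): insert_kwd strips a known
prefix and turns the remaining comma/semicolon-separated text into subjects. Assuming
create_subj returns a plain dict-like subject and crawler is a TacCrawler instance, a
call such as

    crawler.insert_kwd(xarticle, "msc", "2020 MSC: 18A05; 18B25", "2020 MSC:")

appends two subjects with values "18A05" and "18B25", type "msc" and lang "en" to
xarticle.kwds and returns True, while a paragraph that does not start with the given
prefix leaves xarticle.kwds untouched and returns False.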