Coverage for src/crawler/by_source/tac_crawler.py: 96%

142 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import re 

2 

3from bs4 import BeautifulSoup 

4from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml 

5from ptf.model_data import ( 

6 create_abstract, 

7 create_articledata, 

8 create_contributor, 

9 create_issuedata, 

10 create_subj, 

11) 

12 

13from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle 

14 

15 

16class TacCrawler(BaseCollectionCrawler): 

17 source_name = "Theory and Applications of Categories website" 

18 source_domain = "TAC" 

19 source_website = "http://www.tac.mta.ca/tac" 

20 periode_begin = 1995 

21 periode_end = 2024 

22 

23 def parse_collection_content(self, content): 

24 """ 

25 Parse the HTML page of Theory and Applications of Categories and return a list of xissues.

26 Each xissue has its volume/number/year metadata + its url 

27 

28 self.periode is set during parsing, based on the <meta name="citation_year"> of the HTML page

29 """ 

30 soup = BeautifulSoup(content, "html5lib") 

31 xissues = [] 

32 

33 issue_nodes = soup.find_all("h3") 

34 previous_year = 0 

35 

36 for issue_node in issue_nodes: 

37 xissue, previous_year = self.create_tac_xissue(issue_node, previous_year) 

38 xissues.append(xissue) 

39 

40 # TAC has multiple links towards the same page (i.e. title + abstract).

41 # We only want to add each article once, so we keep track of the URLs already handled.

42 urls = [] 

43 

44 # The TAC web page is badly formatted: <dt> tags have no closing </dt>.

45 # To get the list of articles, we rely on the <a> links and parse their URLs to find the corresponding volume.
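# Example: an href like "volumes/38/31/38-31abs.html" (as in the abstract URL cited below) maps to volume "38".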

46 link_nodes = soup.find_all("a") 

47 for link_node in link_nodes: 

48 url = link_node.get("href") 

49 if ( 

50 url is not None 

51 and url.startswith("volumes/") 

52 and url.endswith(".html") 

53 and url not in urls 

54 ): 

55 urls.append(url) 

56 

57 article_url = self.source_website + "/" + url 

58 url = url[8:] 
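# url[8:] drops the leading "volumes/" (8 characters) so the remaining path can be split on "/".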

59 parts = url.split("/") 

60 volume = parts[0] 

61 

62 if len(volume) == 4: 

63 # The first volumes do not have a url in /volumes/@vid/:
64 # their url is /volumes/@year/@article_number/@volume-*.html
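# Hypothetical example: "volumes/1995/2/1-2abs.html" gives parts[2] = "1-2abs.html", hence volume "1"; a filename without "-" yields no volume.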

65 parts = parts[2].split("-") 

66 if len(parts) > 1: 

67 volume = parts[0] 

68 else: 

69 volume = "" 

70 elif len(parts) != 3: 

71 # Ignore URLs that do not match the /volumes/@year/@article_number/@volume-*.html pattern

72 volume = "" 

73 

74 if volume: 

75 xissue = [xissue for xissue in xissues if xissue.volume == volume][0] 

76 article_index = len(xissue.articles) 

77 

78 xarticle = create_articledata() 

79 xarticle.pid = "a" + str(article_index) 

80 xarticle.url = article_url 

81 xissue.articles.append(xarticle) 

82 

83 return xissues 

84 

85 def create_tac_xissue(self, issue_node, previous_year): 

86 text = issue_node.get_text().strip() 

87 text = text[7:] # Remove "Volume " 

88 parts = text.split(" - ") 

89 volume = parts[0] 

90 year = parts[1] 
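# Hypothetical example: an <h3> reading "Volume 38 - 2022" gives volume "38" and year "2022".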

91 title = "" 

92 

93 # TAC has some special issues: the title is specified instead of the year 

94 try: 

95 year_int = int(year) 

96 previous_year = year_int 

97 except Exception: 

98 if year[-1] == "*": 

99 year = year[:-1] 

100 

101 title = year 

102 if "Festschrift" in title: 

103 if title == "Bunge Festschrift": 

104 year = "2024" 

105 else: 

106 year = str(previous_year - 1) 

107 elif volume == "17": 

108 title = "Chu spaces" 

109 year = "2006" 

110 else: 

111 year = title[2:] 

112 

113 xissue = create_issuedata() 

114 xissue.pid = self.collection_id + "_" + year + "__" + volume 
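# e.g. (assuming collection_id is "TAC") volume 33 of 2018 gets pid "TAC_2018__33"; article pids append "_aN" (cf. "TAC_2018__33_a30" below).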

115 xissue.year = year 

116 xissue.volume = volume 

117 

118 xissue.title_tex = title 

119 xissue.title_html = title 

120 xissue.title_xml = get_issue_title_xml(title, "en") 

121 

122 return xissue, previous_year 

123 

124 def parse_article_content(self, content, xissue, xarticle, url, pid): 

125 """ 

126 Parse the content with BeautifulSoup and return an ArticleData

127 """ 

128 xarticle = create_articledata() 

129 xarticle.pid = pid 

130 xarticle.lang = "en" 

131 

132 soup = BeautifulSoup(content, "html5lib") 

133 

134 # TITLE 

135 title_node = soup.find("h1") 

136 if title_node is not None: 136 ↛ 140: line 136 didn't jump to line 140 because the condition on line 136 was always true

137 xarticle.title_tex = title_node.get_text().strip() 

138 

139 # AUTHORS 

140 author_node = soup.find("h2") 

141 if author_node is not None: 141 ↛ 214: line 141 didn't jump to line 214 because the condition on line 141 was always true

142 text = author_node.get_text().strip() 

143 parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text) 
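# Hypothetical example: "A. Author, B. Author and C. Author" splits into ["A. Author", "B. Author", "C. Author"].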

144 

145 if pid == "TAC_2018__33_a30": 145 ↛ 146: line 145 didn't jump to line 146 because the condition on line 145 was never true

146 parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"] 

147 

148 for text_author in parts: 

149 author = create_contributor() 

150 author["role"] = "author" 

151 author["string_name"] = text_author.replace("\n", " ") 

152 

153 xarticle.contributors.append(author) 

154 

155 # The first paragraphs (there can be many) before the other metadata form the abstract

156 parsed_p_besides_abstract = False 

157 

158 for node in author_node.find_next_siblings(): 

159 if node.name == "p": 

160 text = node.get_text().strip() 

161 

162 # KEYWORDS 

163 parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:") 

164 parsed_text = parsed_text or self.insert_kwd( 

165 xarticle, "msc", text, "2020 MSC:" 

166 ) 

167 parsed_text = parsed_text or self.insert_kwd( 

168 xarticle, "msc", text, "2010 MSC:" 

169 ) 

170 parsed_text = parsed_text or self.insert_kwd( 

171 xarticle, "msc", text, "2000 MSC:" 

172 ) 

173 parsed_text = parsed_text or self.insert_kwd( 

174 xarticle, "msc", text, "1991 MSC:" 

175 ) 

176 parsed_text = parsed_text or self.insert_kwd( 

177 xarticle, "msc", text, "AMS Classification (1991):" 

178 ) 

179 

180 # PAGES 

181 title = "Theory and Applications of Categories" 

182 if not parsed_text and text.startswith(title) and not xarticle.fpage: 

183 parsed_text = True 

184 pages = text[len(title) :].split("pp")[1][:-1].strip() 

185 pages = pages.replace("--", "-") 

186 parts = pages.split("-") 

187 xarticle.fpage = parts[0] 

188 xarticle.lpage = parts[1].split(".")[0] 
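# Hypothetical example: "Theory and Applications of Categories, Vol. 38, 2022, No. 31, pp 968-1013." yields fpage "968" and lpage "1013".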

189 

190 # PUBLICATION DATE (note: revised dates are ignored) 

191 if not parsed_text and text.startswith("Published"): 

192 parsed_text = True 

193 date_str = text[10:].split(".")[0] 

194 xarticle.date_published_iso_8601_date_str = date_str 
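# Hypothetical example: "Published 2022-11-07." becomes "2022-11-07" (text[10:] drops "Published ", the split drops the trailing ".").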

195 

196 parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract 

197 

198 # ABSTRACT 

199 if not parsed_p_besides_abstract: 

200 abstract = str(node) 

201 if len(xarticle.abstracts) > 0: 201 ↛ 202: line 201 didn't jump to line 202 because the condition on line 201 was never true

202 xarticle.abstracts[0]["value_tex"] += abstract 

203 else: 

204 xabstract = create_abstract( 

205 tag="abstract", 

206 value_tex=abstract, 

207 lang=xarticle.lang, 

208 ) 

209 xarticle.abstracts.append(xabstract) 

210 

211 # PDF 

212 # We need to find the last PDF link because TAC can have revised versions of an article.

213 # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html 

214 pdf_url = "" 

215 link_nodes = soup.find_all("a") 

216 for link_node in link_nodes: 

217 url = link_node.get("href") 

218 if url is not None and url.endswith(".pdf"): 

219 pdf_url = url 

220 if pdf_url: 220 ↛ 223: line 220 didn't jump to line 223 because the condition on line 220 was always true

221 add_pdf_link_to_xarticle(xarticle, pdf_url) 

222 

223 return xarticle 

224 

225 def insert_kwd(self, xarticle, content_type, text, prefix): 
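# Hypothetical example: insert_kwd(xarticle, "", "Keywords: monads; adjunctions", "Keywords:") adds two keyword subjects and returns True.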

226 if text.startswith(prefix): 

227 text = text[len(prefix) + 1 :] 

228 for kwd in re.split(",|;", text): 

229 subject = create_subj() 

230 subject["value"] = kwd.strip().replace("\n", " ") 

231 subject["type"] = content_type 

232 subject["lang"] = "en" 

233 xarticle.kwds.append(subject) 

234 return True 

235 return False