Coverage for src/crawler/by_source/tac_crawler.py: 95%

148 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

import re

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml
from ptf.model_data import (
    AbstractDict,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"
    periode_begin = 1995
    periode_end = 2024

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the TAC volumes and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """

        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")
        previous_year = 0

        for issue_node in issue_nodes:
            xissue, previous_year = self.create_xissue(issue_node, previous_year)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We want to add only 1 article, so we keep track of the URLs already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> elements have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]  # Remove the "volumes/" prefix
                parts = url.split("/")
                volume = parts[0]

                if len(volume) == 4:
                    # The first volumes don't have a URL in /volumes/@vid/:
                    # their URL is /volumes/@year/@article_number/@volume-*.html
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
                    volume = ""
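
                # Worked examples for the branch above ("volumes/" already stripped):
                #   "38/31/38-31abs.html"  -> parts ["38", "31", "38-31abs.html"], volume "38"
                #       (from the Ex URL quoted in parse_article_content below)
                #   "1995/n1/1-01abs.html" -> "1995" looks like a year, so volume "1" is read
                #       from the "1-01abs.html" filename (early format; path illustrative)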

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues

    def create_xissue(self, issue_node, previous_year):
        text = issue_node.get_text().strip()
        text = text[7:]  # Remove "Volume "
        parts = text.split(" - ")
        volume = parts[0]
        year = parts[1]
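        # A sketch of the expected <h3> text (wording inferred from the slicing
        # above): "Volume 40 - 2024" gives volume "40" and year "2024".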

        title = ""

        # TAC has some special issues: the title is specified instead of the year
        try:
            year_int = int(year)
            previous_year = year_int
        except Exception:
            if year[-1] == "*":
                year = year[:-1]

            title = year
            if "Festschrift" in title:
                if title == "Bunge Festschrift":
                    year = "2024"
                else:
                    year = str(previous_year - 1)
            elif volume == "17":
                title = "Chu spaces"
                year = "2006"
            else:
                year = title[2:]

        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year + "__" + volume
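        # e.g. year "2014" and volume "29" give pid "TAC_2014__29", matching the
        # article pid prefixes used in parse_article_content (collection_id is
        # assumed to be "TAC").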

        xissue.year = year
        xissue.volume = volume

        xissue.title_tex = title
        xissue.title_html = title
        xissue.title_xml = get_issue_title_xml(title, "en")

        return xissue, previous_year

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:  # coverage: condition was always true in the test run
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:  # coverage: condition was always true in the test run
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
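            # The regex splits on ", and", " and " or ", ": a heading such as
            # "First Author, Second Author and Third Author" (names illustrative)
            # yields ["First Author", "Second Author", "Third Author"].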

            if pid == "TAC_2018__33_a30":  # coverage: condition was never true in the test run
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = text_author.replace("\n", " ")

                xarticle.contributors.append(author)

            # The first paragraphs (there can be many) before other metadata are part of the abstract
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )

                    # PAGES
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]
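                        # Illustrative: a footer line such as
                        # "Theory and Applications of Categories, Vol. 38, 2022, No. 31, pp 1013-1014."
                        # (exact wording assumed) yields fpage "1013" and lpage "1014".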

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]  # strip "Published " and the trailing period
                        xarticle.date_published_iso_8601_date_str = date_str
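                        # e.g. "Published 2022-09-20." (illustrative) gives "2022-09-20";
                        # the target field name suggests an ISO-8601 date.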

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:  # coverage: condition was never true in the test run
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract: AbstractDict = {
                                "tag": "abstract",
                                "value_html": "",
                                "value_tex": abstract,
                                "value_xml": "",
                                "lang": "en",
                            }
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:  # coverage: condition was always true in the test run
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]  # skip the prefix and the following space
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False
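
# A minimal usage sketch of insert_kwd, assuming a metadata paragraph from a
# typical TAC abstract page (keyword values illustrative):
#
#     xarticle = create_articledata()
#     crawler.insert_kwd(xarticle, "msc", "2020 MSC: 18C15, 18D20", "2020 MSC:")
#     # -> True; xarticle.kwds now holds two "msc" subjects: "18C15" and "18D20"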