Coverage for src/crawler/by_source/tac_crawler.py: 94%

126 statements  

coverage.py v7.9.0, created at 2025-09-16 12:41 +0000

  1  import re
  2  from urllib.parse import urljoin
  3
  4  import regex
  5  from bs4 import BeautifulSoup, Tag
  6  from ptf.model_data import (
  7      create_abstract,
  8      create_articledata,
  9      create_contributor,
 10      create_subj,
 11  )
 12
 13  from crawler.base_crawler import BaseCollectionCrawler
 14  from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict
 15
 16
 17  class TacCrawler(BaseCollectionCrawler):
 18      source_name = "Theory and Applications of Categories website"
 19      source_domain = "TAC"
 20      source_website = "http://www.tac.mta.ca/tac"
 21
 22      issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"
 23

 24      def parse_collection_content(self, content):
 25          """
 26          Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
 27          Each xissue has its volume/number/year metadata + its url.
 28
 29          """
 30          soup = BeautifulSoup(content, "html5lib")
 31          xissues = []
 32
 33          issue_nodes = soup.find_all("h3")
 34
 35          for issue_node in issue_nodes:
 36              xissue = self.create_tac_xissue(issue_node)
 37              xissues.append(xissue)
 38
 39          # TAC has multiple links towards the same page (i.e. title + abstract).
 40          # We want to add only one article, so we keep track of the URLs already handled.
 41          urls = []
 42
 43          # The TAC web page is badly formatted: <dt> tags have no closing </dt>.
 44          # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
 45          link_nodes = soup.find_all("a")
 46          for link_node in link_nodes:
 47              url = link_node.get("href")
 48              if not isinstance(url, str):
 49                  continue
 50              if not (url.startswith("volumes/") and url.endswith(".html") and url not in urls):
 51                  continue
 52
 53              urls.append(url)
 54
 55              article_url = self.source_website + "/" + url
 56              url = url[8:]
 57              parts = url.split("/")
 58              volume = parts[0]
 59
 60              if len(volume) == 4:
 61                  # The first volumes don't have a url in /volumes/@vid/.
 62                  # The url is /volumes/@year/@article_number/@volume-*.html
 63                  parts = parts[2].split("-")
 64                  if len(parts) > 1:
 65                      volume = parts[0]
 66                  else:
 67                      volume = ""
 68              elif len(parts) != 3:
 69                  # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
 70                  volume = ""
 71
 72              if volume:
 73                  xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
 74                  article_index = len(xissue.articles)
 75
 76                  xarticle = create_articledata()
 77                  xarticle.pid = "a" + str(article_index)
 78                  xarticle.url = article_url
 79                  xissue.articles.append(xarticle)
 80
 81                  attachment_node = link_node.parent.parent.find_next("dd")
 82                  if isinstance(attachment_node, Tag):    82 ↛ 46  line 82 didn't jump to line 46 because the condition on line 82 was always true
 83                      a_node = attachment_node.select_one("a[href$='.pdf']")
 84                      if a_node:    84 ↛ 46  line 84 didn't jump to line 46 because the condition on line 84 was always true
 85                          href = a_node.get("href")
 86                          if isinstance(href, str):    86 ↛ 46  line 86 didn't jump to line 46 because the condition on line 86 was always true
 87                              add_pdf_link_to_xarticle(
 88                                  xarticle, urljoin(self.collection_url + "/", href)
 89                              )
 90
 91          return xissues
 92
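For reference, a minimal trace of the volume extraction above; the 38/31 URL comes from the example quoted later in parse_article_content, while the year-based path is an assumed illustration of the /volumes/@year/@article_number/@volume-*.html shape:

    # Regular layout: /volumes/@volume/@article_number/@file.html
    url = "volumes/38/31/38-31abs.html"
    parts = url[8:].split("/")          # ['38', '31', '38-31abs.html']
    volume = parts[0]                   # '38'

    # Early layout (hypothetical path): /volumes/@year/@article_number/@volume-*.html
    url = "volumes/1995/n1/1-01abs.html"
    parts = url[8:].split("/")          # ['1995', 'n1', '1-01abs.html']
    # parts[0] has length 4, so it is a year; the volume comes from the file name.
    volume = parts[2].split("-")[0]     # '1'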

 93      # Years are available inside the "abstract" page of an article.
 94      # TODO: implement fetching the year from an article instead of a hardcoded dict
 95      issue_years = {
 96          "Lawvere Festschrift": "2025",
 97          "Hofstra Festschrift": "2024",
 98          "Bunge Festschrift": "2024",
 99          "The Rosebrugh Festschrift": "2021",
100          "CT2011": "2012",
101          "The Bourn Festschrift": "2010",
102          "The Tholen Festschrift": "2008",
103          "CT2006": "2007",
104          "Chu spaces: theory and applications": "2006",
105          "CT2004": "2005",
106          "The Carboni Festschrift": "2004",
107          "CT2000": "2001",
108          "The Lambek Festschrift": "1999",
109      }
110

111      def create_tac_xissue(self, issue_node):
112          text = issue_node.get_text().strip()
113
114          issue_dict = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue")
115
116          volume = issue_dict["volume"]
117          year = issue_dict["year"]
118          title = ""
119
120          if re.search("[a-zA-Z]", year):
121              title = year
122              if title not in self.issue_years:    122 ↛ 123  line 122 didn't jump to line 123 because the condition on line 122 was never true
123                  raise ValueError(
124                      "Couldn't parse issue (year not found). Have we encountered a new issue?"
125                  )
126              year = self.issue_years[title]
127
128          xissue = self.create_xissue(self.source_website + f"#vol{volume}", year, volume, None)
129
130          if title:
131              xissue.title_tex = title
132
133          return xissue
134
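As a quick sanity check of issue_re against the two heading shapes handled here (both heading strings, including the volume numbers, are illustrative, not scraped from the site):

    import re

    issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"

    # Plain year: the "year" group is numeric and used directly.
    re.search(issue_re, "Volume 38 - 2022").groupdict()
    # {'volume': '38', 'year': '2022'}

    # Named issue: the "year" group contains letters, so it is treated as a
    # title and the real year is looked up in issue_years.
    re.search(issue_re, "Volume 36 - The Rosebrugh Festschrift").groupdict()
    # {'volume': '36', 'year': 'The Rosebrugh Festschrift'}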

135      def parse_article_content(self, content, xissue, xarticle, url):
136          """
137          Parse the content with BeautifulSoup and return an ArticleData.
138          """
139          xarticle.lang = "en"
140
141          soup = BeautifulSoup(content, "html5lib")
142
143          # TITLE
144          title_node = soup.find("h1")
145          if title_node is not None:    145 ↛ 149  line 145 didn't jump to line 149 because the condition on line 145 was always true
146              xarticle.title_tex = title_node.get_text().strip()
147
148          # AUTHORS
149          author_node = soup.find("h2")
150          if author_node is not None:    150 ↛ 231  line 150 didn't jump to line 231 because the condition on line 150 was always true
151              text = author_node.get_text().strip()
152              parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
153
154              if xarticle.pid == "TAC_2018_33_a30":    154 ↛ 155  line 154 didn't jump to line 155 because the condition on line 154 was never true
155                  parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]
156
157              for text_author in parts:
158                  author = create_contributor(
159                      role="author", string_name=text_author.replace("\n", " ")
160                  )
161
162                  xarticle.contributors.append(author)
163
164              # The first paragraphs (there can be many) before other metadata are part of the abstract.
165              parsed_p_besides_abstract = False
166
167              for node in author_node.find_next_siblings():
168                  if node.name == "p":
169                      text = node.get_text().strip()
170
171                      # KEYWORDS
172                      parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
173                      parsed_text = parsed_text or self.insert_kwd(
174                          xarticle, "msc", text, "2020 MSC:"
175                      )
176                      parsed_text = parsed_text or self.insert_kwd(
177                          xarticle, "msc", text, "2010 MSC:"
178                      )
179                      parsed_text = parsed_text or self.insert_kwd(
180                          xarticle, "msc", text, "2000 MSC:"
181                      )
182                      parsed_text = parsed_text or self.insert_kwd(
183                          xarticle, "msc", text, "1991 MSC:"
184                      )
185                      parsed_text = parsed_text or self.insert_kwd(
186                          xarticle, "msc", text, "AMS Classification (1991):"
187                      )
188
189                      # PAGES
190                      title = "Theory and Applications of Categories"
191                      if not parsed_text and text.startswith(title) and not xarticle.fpage:
192                          parsed_text = True
193                          pages = text[len(title) :].split("pp")[1][:-1].strip()
194                          pages = pages.replace("--", "-")
195                          parts = pages.split("-")
196                          xarticle.fpage = parts[0]
197                          xarticle.lpage = regex.split(r"(\.|\n)", parts[1], maxsplit=1)[0]
198
199                      # PUBLICATION DATE (note: revised dates are ignored)
200                      if not parsed_text and text.startswith("Published"):
201                          parsed_text = True
202                          date_str = text[10:].split(".")[0]
203                          xarticle.date_published_iso_8601_date_str = date_str
204
205                      parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract
206
207                      # ABSTRACT
208                      if not parsed_p_besides_abstract:
209                          abstract = str(node)
210                          if len(xarticle.abstracts) > 0:    210 ↛ 211  line 210 didn't jump to line 211 because the condition on line 210 was never true
211                              xarticle.abstracts[0]["value_tex"] += abstract
212                          else:
213                              xabstract = create_abstract(
214                                  value_tex=abstract,
215                                  lang=xarticle.lang,
216                              )
217                              xarticle.abstracts.append(xabstract)
218
219          # PDF
220          # We need to find the last PDF link because TAC can have a revised version of an article.
221          # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
222          # pdf_url = ""
223          # link_nodes = soup.find_all("a")
224          # for link_node in link_nodes:
225          #     url = link_node.get("href")
226          #     if url is not None and url.endswith(".pdf"):
227          #         pdf_url = url
228          # if pdf_url:
229          #     add_pdf_link_to_xarticle(xarticle, pdf_url)
230
231          return xarticle
232
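The page-range slicing on line 193 is terse; here is a hedged trace with a made-up citation paragraph in the form the code expects (the exact wording on TAC abstract pages may differ):

    import re

    title = "Theory and Applications of Categories"
    text = "Theory and Applications of Categories, Vol. 38, 2022, No. 31, pp 1118-1135."

    pages = text[len(title) :].split("pp")[1][:-1].strip()    # '1118-1135'
    pages = pages.replace("--", "-")
    parts = pages.split("-")
    fpage = parts[0]                                          # '1118'
    # the crawler uses the third-party regex module here; re behaves the same on this input
    lpage = re.split(r"(\.|\n)", parts[1], maxsplit=1)[0]     # '1135'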

233      def insert_kwd(self, xarticle, content_type, text, prefix):
234          if text.startswith(prefix):
235              text = text[len(prefix) + 1 :]
236              for kwd in re.split(",|;", text):
237                  subject = create_subj()
238                  subject["value"] = kwd.strip().replace("\n", " ")
239                  subject["type"] = content_type
240                  subject["lang"] = "en"
241                  xarticle.kwds.append(subject)
242              return True
243          return False
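
A usage sketch of the split performed by insert_kwd, on a hypothetical keywords paragraph (reproducing only the string handling, since ArticleData is not shown here):

    import re

    text = "Keywords: enriched category, monad; adjunction"
    prefix = "Keywords:"
    if text.startswith(prefix):
        for kwd in re.split(",|;", text[len(prefix) + 1 :]):
            print(kwd.strip().replace("\n", " "))
    # enriched category
    # monad
    # adjunction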