Coverage for src/crawler/by_source/tac_crawler.py: 96%
142 statements
coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

import re

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"
    periode_begin = 1995
    periode_end = 2024
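    # periode_begin/periode_end bound the publication years covered by the crawler.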

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.

        self.periode is set during the parsing with the <meta name="citation_year"> of the HTML page.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")
        previous_year = 0

        for issue_node in issue_nodes:
            xissue, previous_year = self.create_tac_xissue(issue_node, previous_year)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We want to add only 1 article, so we keep track of the urls already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> elements have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse their URLs to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]
                parts = url.split("/")
                volume = parts[0]
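                # e.g. "volumes/38/31/38-31abs.html" -> parts == ["38", "31", "38-31abs.html"], volume == "38"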

                if len(volume) == 4:
                    # The first volumes have a URL under /volumes/@year/:
                    # the URL is /volumes/@year/@article_number/@volume-*.html,
                    # so the volume number is read from the "@volume-*" file name.
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
                    volume = ""

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues
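
    # Issue headings on the collection page look like "Volume N - YYYY"; for special
    # issues the year is replaced by a title (sometimes suffixed with "*"), which
    # create_tac_xissue resolves back to a year below.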
    def create_tac_xissue(self, issue_node, previous_year):
        text = issue_node.get_text().strip()
        text = text[7:]  # Remove "Volume "
        parts = text.split(" - ")
        volume = parts[0]
        year = parts[1]
        title = ""

        # TAC has some special issues: the title is specified instead of the year
        try:
            year_int = int(year)
            previous_year = year_int
        except Exception:
            if year[-1] == "*":
                year = year[:-1]

            title = year
            if "Festschrift" in title:
                if title == "Bunge Festschrift":
                    year = "2024"
                else:
                    year = str(previous_year - 1)
            elif volume == "17":
                title = "Chu spaces"
                year = "2006"
            else:
                year = title[2:]

        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year + "__" + volume
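        # Issue PID format: <collection_id>_<year>__<volume>, e.g. "TAC_2018__33"
        # (full article PIDs such as "TAC_2018__33_a30" appear in parse_article_content).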
        xissue.year = year
        xissue.volume = volume

        xissue.title_tex = title
        xissue.title_html = title
        xissue.title_xml = get_issue_title_xml(title, "en")

        return xissue, previous_year

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
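            # The regex splits author lists of the form "A, B, and C", "A and B" or "A, B".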

            if pid == "TAC_2018__33_a30":
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]
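                # Hard-coded override: the author heading of this article presumably
                # does not split cleanly with the regex above.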

            for text_author in parts:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = text_author.replace("\n", " ")

                xarticle.contributors.append(author)

            # The first paragraphs (there can be many) before other metadata are part of the abstracts
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )
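                    # insert_kwd returns True when the paragraph starts with the given
                    # prefix, so each paragraph is classified by the first matching prefix.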

                    # PAGES
                    title = "Theory and Applications of Categories"
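                    # The pages line is assumed to look like
                    # "Theory and Applications of Categories, Vol. NN, YYYY, No. M, pp 12-34."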
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str
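                        # Assumed input "Published YYYY-MM-DD. ..." -> keep the ISO 8601 date
                        # before the first "." (text[10:] skips the "Published " prefix).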

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract = create_abstract(
                                tag="abstract",
                                value_tex=abstract,
                                lang=xarticle.lang,
                            )
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False
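

# ---------------------------------------------------------------------------
# Usage sketch (not part of the crawler source): a minimal, hypothetical way to
# drive the parser directly on the collection page. The real entry point and the
# TacCrawler constructor arguments live in BaseCollectionCrawler and may differ;
# fetching the page with `requests` is only an assumption.
#
#   import requests
#
#   crawler = TacCrawler()  # hypothetical: constructor args come from BaseCollectionCrawler
#   html = requests.get(crawler.source_website).text
#   xissues = crawler.parse_collection_content(html)
#   for xissue in xissues:
#       print(xissue.pid, xissue.year, len(xissue.articles))
# ---------------------------------------------------------------------------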