Coverage for src/crawler/by_source/tac_crawler.py: 95%
122 statements
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
import re

from bs4 import BeautifulSoup
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
from crawler.utils import regex_to_dict


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"

    issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"
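    # The pattern above is expected to match volume headings like "Volume 41 - 2024"
    # (volume="41", year="2024"); for special issues the "year" group can instead carry
    # a title such as "The Bourn Festschrift", which create_tac_xissue below resolves
    # through issue_years. These example headings are illustrative, not scraped values.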

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
        Each xissue has its volume/number/year metadata and its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")

        for issue_node in issue_nodes:
            xissue = self.create_tac_xissue(issue_node)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We only want to add each article once, so we keep track of the URLs already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> elements have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse their URLs
        # to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]
                parts = url.split("/")
                volume = parts[0]

                if len(volume) == 4:
                    # Early volumes are not listed under /volumes/@vid/:
                    # their url follows /volumes/@year/@article_number/@volume-*.html,
                    # so the volume is taken from the file name prefix.
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not follow /volumes/@vid/@article_number/*.html
                    volume = ""
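                # For instance, "volumes/38/31/38-31abs.html" yields volume "38", while a
                # year-based path such as "volumes/1995/11/1-11abs.html" (illustrative, not
                # checked against the site) would yield volume "1" via the branch above.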

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues

    # Years are available inside the "abstract" page of an article.
    # TODO: fetch the year from the article page instead of using a hardcoded dict.
    issue_years = {
        "Lawvere Festschrift": "2025",
        "Hofstra Festschrift": "2024",
        "Bunge Festschrift": "2024",
        "The Rosebrugh Festschrift": "2021",
        "CT2011": "2012",
        "The Bourn Festschrift": "2010",
        "The Tholen Festschrift": "2008",
        "CT2006": "2007",
        "Chu spaces: theory and applications": "2006",
        "CT2004": "2005",
        "The Carboni Festschrift": "2004",
        "CT2000": "2001",
        "The Lambek Festschrift": "1999",
    }

    def create_tac_xissue(self, issue_node):
        text = issue_node.get_text().strip()

        issue_dict = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue")

        volume = issue_dict["volume"]
        year = issue_dict["year"]
        title = ""

        if re.search("[a-zA-Z]", year):
            title = year
            if title not in self.issue_years:
                raise ValueError(
                    "Couldn't parse issue (year not found). Have we encountered a new issue?"
                )
            year = self.issue_years[title]
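
        # The collection page seems to have no dedicated per-issue page, so a synthetic
        # "#vol<n>" fragment of the collection URL is used as the issue URL below
        # (an assumption inferred from the code, not from the TAC site itself).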
        xissue = self.create_xissue(self.source_website + f"#vol{volume}", year, volume, None)

        if title:
            xissue.title_tex = title

        return xissue

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)
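            # e.g. "Alice Smith, Bob Jones and Carol White" splits into
            # ["Alice Smith", "Bob Jones", "Carol White"] (names here are illustrative).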

            if xissue.pid == "TAC_2018_33" and xarticle.pid == "a30":
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor(
                    role="author", string_name=text_author.replace("\n", " ")
                )

                xarticle.contributors.append(author)

            # The first paragraphs (there can be many) before other metadata are part of the abstract.
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )

                    # PAGES
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]
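                        # Illustrative: a citation line such as "Theory and Applications of
                        # Categories, Vol. 38, 2022, No. 31, pp 1019-1045." would give
                        # fpage="1019" and lpage="1045" (exact wording of the line is assumed).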

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str
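                        # Illustrative: "Published 2022-10-31." would yield "2022-10-31"
                        # (the exact wording and date format of this line are assumed).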

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract = create_abstract(
                                tag="abstract",
                                value_tex=abstract,
                                lang=xarticle.lang,
                            )
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
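        # Illustrative: insert_kwd(xarticle, "", "Keywords: monads; adjunctions", "Keywords:")
        # appends the subjects "monads" and "adjunctions"; the keyword values are made up here.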
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False