Coverage for src/crawler/by_source/tac_crawler.py: 94%
126 statements
coverage.py v7.9.0, created at 2025-09-16 12:41 +0000
import re
from urllib.parse import urljoin

import regex
from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, regex_to_dict


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"

    issue_re = r"Volume (?P<volume>\d+) \- (?P<year>[\w :]+)"
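
    # Example of a volume heading this regex is meant to match (the heading text below is an
    # illustrative assumption, not copied from the live site):
    #   re.search(issue_re, "Volume 45 - 2025").groupdict()  ->  {"volume": "45", "year": "2025"}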

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")

        for issue_node in issue_nodes:
            xissue = self.create_tac_xissue(issue_node)
            xissues.append(xissue)

        # TAC has multiple links to the same page (i.e. the title and the abstract both point to it).
        # We only want to add one article per page, so we keep track of the URLs already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> elements have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse their URLs to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if not isinstance(url, str):
                continue
            if not (url.startswith("volumes/") and url.endswith(".html") and url not in urls):
                continue

            urls.append(url)

            article_url = self.source_website + "/" + url
            url = url[8:]
            parts = url.split("/")
            volume = parts[0]

            if len(volume) == 4:
                # The first volumes are filed by year rather than under /volumes/@vid/:
                # the url is /volumes/@year/@article_number/@volume-*.html
                parts = parts[2].split("-")
                if len(parts) > 1:
                    volume = parts[0]
                else:
                    volume = ""
            elif len(parts) != 3:
                # Ignore URLs that do not match /volumes/@year/@article_number/@volume-*.html
                volume = ""
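
            # Illustrative breakdown of the two URL shapes (the old-style path below is an assumed
            # example; the new-style one matches the abs URL cited further down):
            #   "volumes/38/31/38-31abs.html"   -> parts = ["38", "31", "38-31abs.html"], volume = "38"
            #   "volumes/1999/10/5-10abs.html"  -> first segment is a 4-digit year, so the volume is
            #                                      taken from the file name prefix: volume = "5"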

            if volume:
                xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                article_index = len(xissue.articles)

                xarticle = create_articledata()
                xarticle.pid = "a" + str(article_index)
                xarticle.url = article_url
                xissue.articles.append(xarticle)

                attachment_node = link_node.parent.parent.find_next("dd")
                if isinstance(attachment_node, Tag):  # coverage: condition was always true in the recorded run
                    a_node = attachment_node.select_one("a[href$='.pdf']")
                    if a_node:  # coverage: condition was always true in the recorded run
                        href = a_node.get("href")
                        if isinstance(href, str):  # coverage: condition was always true in the recorded run
                            add_pdf_link_to_xarticle(
                                xarticle, urljoin(self.collection_url + "/", href)
                            )
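
                            # Note on urljoin (a sketch: the exact value of self.collection_url comes
                            # from the base crawler and is an assumption here). The trailing "/" matters:
                            #   urljoin("http://www.tac.mta.ca/tac/", "volumes/38/31/38-31.pdf")
                            #     -> "http://www.tac.mta.ca/tac/volumes/38/31/38-31.pdf"
                            # without the trailing slash, the last path segment would be replaced.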

        return xissues

    # Years are available inside the "abstract" page of an article.
    # TODO: fetch the year from the article page instead of using a hardcoded dict.
    issue_years = {
        "Lawvere Festschrift": "2025",
        "Hofstra Festschrift": "2024",
        "Bunge Festschrift": "2024",
        "The Rosebrugh Festschrift": "2021",
        "CT2011": "2012",
        "The Bourn Festschrift": "2010",
        "The Tholen Festschrift": "2008",
        "CT2006": "2007",
        "Chu spaces: theory and applications": "2006",
        "CT2004": "2005",
        "The Carboni Festschrift": "2004",
        "CT2000": "2001",
        "The Lambek Festschrift": "1999",
    }

    def create_tac_xissue(self, issue_node):
        text = issue_node.get_text().strip()

        issue_dict = regex_to_dict(self.issue_re, text, error_msg="Couldn't parse issue")

        volume = issue_dict["volume"]
        year = issue_dict["year"]
        title = ""

        if re.search("[a-zA-Z]", year):
            title = year
            if title not in self.issue_years:  # coverage: condition was never true in the recorded run
                raise ValueError(
                    "Couldn't parse issue (year not found). Have we encountered a new issue?"
                )
            year = self.issue_years[title]

        xissue = self.create_xissue(self.source_website + f"#vol{volume}", year, volume, None)

        if title:
            xissue.title_tex = title

        return xissue
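
    # Sketch of the intended flow for a non-numeric "year" (the heading and volume number are
    # illustrative assumptions): "Volume 44 - Hofstra Festschrift" gives volume "44" and
    # title "Hofstra Festschrift"; the year is then looked up in issue_years, giving "2024".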

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:  # coverage: condition was always true in the recorded run
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:  # coverage: condition was always true in the recorded run
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)

            if xarticle.pid == "TAC_2018_33_a30":  # coverage: condition was never true in the recorded run
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor(
                    role="author", string_name=text_author.replace("\n", " ")
                )

                xarticle.contributors.append(author)
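
            # Illustrative split (the author line below is an assumed example):
            #   "A. Author, B. Author and C. Author" -> ["A. Author", "B. Author", "C. Author"],
            #   with one "author" contributor created per name.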

            # The first paragraphs (there can be many) before the other metadata are part of the abstract.
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )
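
                    # Illustrative behaviour of insert_kwd (the strings below are assumed examples,
                    # not copied from a real article page):
                    #   self.insert_kwd(xarticle, "", "Keywords: monads, adjunctions", "Keywords:")
                    #     -> adds kwds "monads" and "adjunctions" (type "", lang "en") and returns True
                    #   self.insert_kwd(xarticle, "msc", "2020 MSC: 18C15; 18A40", "2020 MSC:")
                    #     -> adds kwds "18C15" and "18A40" with type "msc" and returns True
                    #   a paragraph that does not start with the prefix is left alone and False is returned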

                    # PAGES
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = regex.split(r"(\.|\n)", parts[1], maxsplit=1)[0]
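
                    # Sketch of the paragraph this expects (the exact wording is an assumption):
                    #   "Theory and Applications of Categories, Vol. 38, No. 31, pp 1156-1208."
                    #   -> fpage "1156", lpage "1208"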

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str
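
                    # Sketch of the expected paragraph (the exact format is an assumption):
                    #   "Published 2022-10-31." -> date_str "2022-10-31"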

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:  # coverage: condition was never true in the recorded run
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract = create_abstract(
                                value_tex=abstract,
                                lang=xarticle.lang,
                            )
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        # pdf_url = ""
        # link_nodes = soup.find_all("a")
        # for link_node in link_nodes:
        #     url = link_node.get("href")
        #     if url is not None and url.endswith(".pdf"):
        #         pdf_url = url
        # if pdf_url:
        #     add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False