Coverage for src/crawler/by_source/tac_crawler.py: 96%
138 statements
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import re

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml
from ptf.model_data import (
    create_abstract,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"

    def parse_collection_content(self, content):
        """
        Parse the HTML page of Theory and Applications of Categories and return a list of xissues.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")
        previous_year = 0

        for issue_node in issue_nodes:
            xissue, previous_year = self.create_tac_xissue(issue_node, previous_year)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We want to add only 1 article, so we keep track of the urls already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> tags have no closing </dt>.
        # To get the list of articles, we rely on the links and parse the URLs to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]
                parts = url.split("/")
                volume = parts[0]

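                # Two URL layouts appear on the index page (an assumption inferred from the parsing below):
                #   - recent volumes: volumes/@vid/@article_number/@volume-*.html
                #   - early volumes:  volumes/@year/@article_number/@volume-*.html (first segment is a 4-digit year)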
                if len(volume) == 4:
                    # The first volumes do not have a url in /volumes/@vid/.
                    # The url is /volumes/@year/@article_number/@volume-*.html
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not respect /volumes/@year/@article_number/@volume-*.html
                    volume = ""

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues

    def create_tac_xissue(self, issue_node, previous_year):
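        # The <h3> heading text is expected to look like "Volume 39 - 2023", or
        # "Volume NN - <special issue title>" (sometimes suffixed with "*") for special
        # issues; this format is inferred from the slicing and splitting below.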
        text = issue_node.get_text().strip()
        text = text[7:]  # Remove "Volume "
        parts = text.split(" - ")
        volume = parts[0]
        year = parts[1]
        title = ""

        # TAC has some special issues: the title is specified instead of the year
        try:
            year_int = int(year)
            previous_year = year_int
        except Exception:
            if year[-1] == "*":
                year = year[:-1]

            title = year
            if "Festschrift" in title:
                if title == "Bunge Festschrift":
                    year = "2024"
                else:
                    year = str(previous_year - 1)
            elif volume == "17":
                title = "Chu spaces"
                year = "2006"
            else:
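                # Assumption: the remaining special-issue titles embed the year after a
                # two-character prefix (e.g. a title like "CT2006"), hence the slice below.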
                year = title[2:]

        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year + "__" + volume
        xissue.year = year
        xissue.volume = volume

        xissue.title_tex = title
        xissue.title_html = title
        xissue.title_xml = get_issue_title_xml(title, "en")

        return xissue, previous_year

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle.lang = "en"

        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:  # coverage: condition always true
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:  # coverage: condition always true
            text = author_node.get_text().strip()
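            # The <h2> holds the author names as plain text, e.g. "A. Author, B. Author and
            # C. Author" (a hypothetical example; the separators are inferred from the regex below).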
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)

            if xissue.pid == "TAC_2018__33" and xarticle.pid == "a30":  # coverage: condition never true
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = text_author.replace("\n", " ")

                xarticle.contributors.append(author)

            # The first paragraphs (there can be many) before other metadata are part of the abstract
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )

                    # PAGES
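                    # Hypothetical example of the citation line parsed here (format inferred from the code):
                    # "Theory and Applications of Categories, Vol. 35, 2020, No. 19, pp 665-699."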
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]

                    # PUBLICATION DATE (note: revised dates are ignored)
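                    # "Published" plus a space is 10 characters; the rest of the sentence up to the
                    # first "." is kept as the date, e.g. "Published 2020-11-23." (hypothetical example).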
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:  # coverage: condition never true
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract = create_abstract(
                                tag="abstract",
                                value_tex=abstract,
                                lang=xarticle.lang,
                            )
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have revised versions of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:  # coverage: condition always true
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle

    def insert_kwd(self, xarticle, content_type, text, prefix):
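        """
        If `text` starts with `prefix`, register each comma- or semicolon-separated keyword
        as a subject with type `content_type` and return True. Return False otherwise.
        """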
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False