Coverage for src/crawler/by_source/tac_crawler.py: 95%
148 statements
coverage.py v7.6.4, created at 2024-11-20 09:03 +0000
import re

from bs4 import BeautifulSoup
from ptf.cmds.xml.jats.builder.issue import get_issue_title_xml
from ptf.model_data import (
    AbstractDict,
    create_articledata,
    create_contributor,
    create_issuedata,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class TacCrawler(BaseCollectionCrawler):
    source_name = "Theory and Applications of Categories website"
    source_domain = "TAC"
    source_website = "http://www.tac.mta.ca/tac"
    periode_begin = 1995
    periode_end = 2024

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parse the HTML page listing the TAC volumes and return a list of xissue objects.
        Each xissue has its volume/number/year metadata + its url.
        """
        soup = BeautifulSoup(content, "html5lib")
        xissues = []

        issue_nodes = soup.find_all("h3")
        previous_year = 0

        for issue_node in issue_nodes:
            xissue, previous_year = self.create_xissue(issue_node, previous_year)
            xissues.append(xissue)

        # TAC has multiple links towards the same page (i.e. title + abstract).
        # We want to add only 1 article, so we keep track of the urls already handled.
        urls = []

        # The TAC web page is badly formatted: <dt> entries have no closing </dt> tags.
        # To get the list of articles, we rely on the links and parse their URLs
        # to find the corresponding volume.
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if (
                url is not None
                and url.startswith("volumes/")
                and url.endswith(".html")
                and url not in urls
            ):
                urls.append(url)

                article_url = self.source_website + "/" + url
                url = url[8:]
                parts = url.split("/")
                volume = parts[0]

                if len(volume) == 4:
                    # The first volumes do not have a url in /volumes/@vid/...:
                    # their url is /volumes/@year/@article_number/@volume-*.html
                    parts = parts[2].split("-")
                    if len(parts) > 1:
                        volume = parts[0]
                    else:
                        volume = ""
                elif len(parts) != 3:
                    # Ignore URLs that do not respect /volumes/@volume/@article_number/@volume-*.html
                    volume = ""

                if volume:
                    xissue = [xissue for xissue in xissues if xissue.volume == volume][0]
                    article_index = len(xissue.articles)

                    xarticle = create_articledata()
                    xarticle.pid = "a" + str(article_index)
                    xarticle.url = article_url
                    xissue.articles.append(xarticle)

        return xissues
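
    # Worked example (editor's sketch, based only on the parsing above): an href such as
    # "volumes/38/31/38-31abs.html" keeps volume "38" (three parts, first part is the volume),
    # while an early-issue href of the form "volumes/1995/n5/1-n5abs.html" (hypothetical file
    # name, following the /volumes/@year/@article_number/@volume-*.html pattern) reads the
    # volume, here "1", from the prefix of the file name.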

    def create_xissue(self, issue_node, previous_year):
        text = issue_node.get_text().strip()
        text = text[7:]  # Remove "Volume "
        parts = text.split(" - ")
        volume = parts[0]
        year = parts[1]
        title = ""

        # TAC has some special issues: the title is specified instead of the year
        try:
            year_int = int(year)
            previous_year = year_int
        except Exception:
            if year[-1] == "*":
                year = year[:-1]

            title = year
            if "Festschrift" in title:
                if title == "Bunge Festschrift":
                    year = "2024"
                else:
                    year = str(previous_year - 1)
            elif volume == "17":
                title = "Chu spaces"
                year = "2006"
            else:
                year = title[2:]

        xissue = create_issuedata()
        xissue.pid = self.collection_id + "_" + year + "__" + volume
        xissue.year = year
        xissue.volume = volume

        xissue.title_tex = title
        xissue.title_html = title
        xissue.title_xml = get_issue_title_xml(title, "en")

        return xissue, previous_year
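
    # Worked example (editor's sketch; the heading text is illustrative but follows the
    # "Volume @volume - @year" shape assumed by the slicing above): an <h3> reading
    # "Volume 38 - 2023" yields volume "38", year "2023" and, assuming collection_id is
    # "TAC" as the pids used elsewhere in this file suggest, pid "TAC_2023__38". A special
    # issue whose heading ends with a title instead of a year falls into the except branch
    # and keeps that title as the issue title.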

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """
        xarticle = create_articledata()
        xarticle.pid = pid
        xarticle.lang = "en"
        soup = BeautifulSoup(content, "html5lib")

        # TITLE
        title_node = soup.find("h1")
        if title_node is not None:  # coverage: partial branch, title_node was never None in the tests
            xarticle.title_tex = title_node.get_text().strip()

        # AUTHORS
        author_node = soup.find("h2")
        if author_node is not None:  # coverage: partial branch, author_node was never None in the tests
            text = author_node.get_text().strip()
            parts = re.split(r",\s+and\s+|\s+and\s+|,\s+", text)

            if pid == "TAC_2018__33_a30":  # coverage: partial branch, never true in the tests
                parts = ["J. Bhowmick", "S. Ghosh", "N. Rakshit", "M. Yamashita"]

            for text_author in parts:
                author = create_contributor()
                author["role"] = "author"
                author["string_name"] = text_author.replace("\n", " ")

                xarticle.contributors.append(author)
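
            # Editor's note (sketch): with the regex above, a heading such as
            # "A. Author, B. Author and C. Author" (hypothetical names) is split into
            # ["A. Author", "B. Author", "C. Author"]; both ", and" and a bare "and"
            # act as separators.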

            # The first paragraphs (there can be many) before other metadata are part of the abstracts
            parsed_p_besides_abstract = False

            for node in author_node.find_next_siblings():
                if node.name == "p":
                    text = node.get_text().strip()

                    # KEYWORDS
                    parsed_text = self.insert_kwd(xarticle, "", text, "Keywords:")
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2020 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2010 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "2000 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "1991 MSC:"
                    )
                    parsed_text = parsed_text or self.insert_kwd(
                        xarticle, "msc", text, "AMS Classification (1991):"
                    )

                    # PAGES
                    title = "Theory and Applications of Categories"
                    if not parsed_text and text.startswith(title) and not xarticle.fpage:
                        parsed_text = True
                        pages = text[len(title) :].split("pp")[1][:-1].strip()
                        pages = pages.replace("--", "-")
                        parts = pages.split("-")
                        xarticle.fpage = parts[0]
                        xarticle.lpage = parts[1].split(".")[0]

                    # PUBLICATION DATE (note: revised dates are ignored)
                    if not parsed_text and text.startswith("Published"):
                        parsed_text = True
                        date_str = text[10:].split(".")[0]
                        xarticle.date_published_iso_8601_date_str = date_str

                    parsed_p_besides_abstract = parsed_text or parsed_p_besides_abstract

                    # ABSTRACT
                    if not parsed_p_besides_abstract:
                        abstract = str(node)
                        if len(xarticle.abstracts) > 0:  # coverage: partial branch, never true in the tests
                            xarticle.abstracts[0]["value_tex"] += abstract
                        else:
                            xabstract: AbstractDict = {
                                "tag": "abstract",
                                "value_html": "",
                                "value_tex": abstract,
                                "value_xml": "",
                                "lang": "en",
                            }
                            xarticle.abstracts.append(xabstract)

        # PDF
        # We need to find the last PDF link because TAC can have a revised version of an article.
        # Ex: http://www.tac.mta.ca/tac/volumes/38/31/38-31abs.html
        pdf_url = ""
        link_nodes = soup.find_all("a")
        for link_node in link_nodes:
            url = link_node.get("href")
            if url is not None and url.endswith(".pdf"):
                pdf_url = url
        if pdf_url:  # coverage: partial branch, a PDF link was always found in the tests
            add_pdf_link_to_xarticle(xarticle, pdf_url)

        return xarticle
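
    # Worked example (editor's sketch; the paragraph text is hypothetical but matches the
    # prefixes tested above): a <p> reading
    # "Theory and Applications of Categories, Vol. 38, 2022, No. 31, pp 843-872."
    # would set fpage to "843" and lpage to "872", and a <p> reading "Published 2022-09-14."
    # would set date_published_iso_8601_date_str to "2022-09-14".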

    def insert_kwd(self, xarticle, content_type, text, prefix):
        if text.startswith(prefix):
            text = text[len(prefix) + 1 :]
            for kwd in re.split(",|;", text):
                subject = create_subj()
                subject["value"] = kwd.strip().replace("\n", " ")
                subject["type"] = content_type
                subject["lang"] = "en"
                xarticle.kwds.append(subject)
            return True
        return False
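

# Editor's sketch (not part of the crawler): a minimal, self-contained illustration of the
# keyword splitting performed by insert_kwd above, so the behaviour can be checked without
# the ptf/crawler dependencies. The helper name and the sample text are hypothetical; only
# the "Keywords:" prefix and the split/strip logic come from the method above.
def _split_kwd_text_example(text, prefix="Keywords:"):
    if not text.startswith(prefix):
        return []
    # insert_kwd drops the prefix plus one character (the space following the colon) ...
    body = text[len(prefix) + 1 :]
    # ... then splits on commas and semicolons and strips each keyword.
    return [kwd.strip().replace("\n", " ") for kwd in re.split(",|;", body)]


# _split_kwd_text_example("Keywords: monads, adjunctions; Kan extensions")
# -> ["monads", "adjunctions", "Kan extensions"]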