Coverage for src/crawler/by_source/slc_crawler.py: 8%
165 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

def is_relevant_tag(tag: PageElement):
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True

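# Crawler for the Séminaire Lotharingien de Combinatoire website
# (https://www.mat.univie.ac.at/~slc/): parse_collection_content reads the issue table on
# the collection page, parse_issue_content reads an issue's table of contents, and
# parse_article_content extracts metadata from the individual article pages.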
class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
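    # Illustrative examples (assumed from the pattern, not taken from the site):
    # year_regex should match headings such as "Vol. 42 (1999)" or "Vol. 43 (1999/2000)".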
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
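    # abstract_regex splits the remaining article HTML into an optional French "Résumé"
    # part and an optional (possibly "English ") "Abstract" part, each introduced by a
    # bold heading.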
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"
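    # dates_regex targets lines such as
    # "Received: March 1, 1999, Revised: April 2, 1999, Accepted: May 3, 1999," (an
    # assumed, illustrative example); only the "Accepted" date is mandatory.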

    title_corrections = {
        "SLC_1989_22_a15": "On the Growth Rate of Certain Combinatorial Functions",
    }

    def parse_collection_content(self, content):
        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

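            # Build the display year: "(1999)" gives "1999", "(1999/2000)" gives
            # "1999-2000", and a short end year such as "(1994/95)" is expanded with the
            # leading digits of the start year to "1994-1995" (illustrative values).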
            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if not xissue.url:
            raise ValueError("xissue must have a url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    self.logger.debug(
                        "Couldn't download file", extra={"url": preface_href, "pid": xissue.pid}
                    )

        # Articles
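        # A "Scanned copy" link is attached to the issue as a PDF instead of becoming an
        # article; every other link becomes an article stub whose page is presumably
        # fetched and parsed later via parse_article_content.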
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        if not xarticle.url:
            raise ValueError("Article must have a url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            self.logger.debug(
                "Couldn't find article pdf", extra={"pid": xarticle.pid, "url": xarticle.url}
            )
            return
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
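        # The first <h1> is expected to hold the author list; " and " is normalized to
        # ", " so that "A, B and C" splits into individual names.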
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)
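            # The substitutions above normalize punctuation and expand the abbreviated
            # month names "Oct" and "Sept" so the dates can be parsed below with
            # strptime's "%B %d %Y" format (commas are stripped just before parsing).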

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
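        # Images in the abstract (presumably rendered formulas) are replaced by their alt
        # text so the abstract survives as plain text.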
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["abstract"]), lang="en")
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["resume"]), lang="fr")
            )

        return xarticle

    def parse_slc_preface(self, content: str):
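        # Stub: preface pages are downloaded in parse_issue_content, but their content is
        # not used yet.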
        pass