Coverage for src/crawler/by_source/slc_crawler.py: 8%
165 statements
coverage.py v7.8.2, created at 2025-06-16 07:44 +0000
import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

def is_relevant_tag(tag: PageElement):
    # Treat whitespace-only strings and tags with no visible text as irrelevant.
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True

class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"
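
    # A hedged illustration of what the patterns above are meant to capture; the
    # sample strings below are hypothetical, not copied from the SLC index pages:
    #   year_regex:  "Vol. 42 (1999)"      -> volume="42",  year="1999"
    #                "Vol. B39 (1997/98)"  -> volume="B39", year="1997", year_end="98"
    #   dates_regex: "Received: March 1, 2020, Accepted: May 3, 2020,"
    #                -> received="March 1, 2020", accepted="May 3, 2020"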

    def parse_collection_content(self, content):
        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                year += "-"
                # Expand an abbreviated end year (e.g. "98") to a full one by
                # reusing the leading digits of the start year.
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    print("Couldn't download file: " + preface_href)

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = href
            xissue.articles.append(xarticle)

        # Also attach a 'Scanned copy' link found anywhere on the page to the issue.
        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            print("Couldn't find article pdf: " + xarticle.url)
            return None
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.title_tex == "":
            # Known article page without a title: use a blank placeholder.
            if xarticle.url == "https://www.mat.univie.ac.at/~slc/opapers/s22thumser.html":
                xarticle.title_tex = " "

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Decompose/extract every element already handled so that only the
        # abstract text remains in the body.
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            # Normalize punctuation and abbreviated month names before parsing.
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()
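
        # Sketch of the date handling above, assuming a hypothetical input line:
        #   "Received: Sept. 5, 2001, Accepted: Oct. 7, 2001," is normalized to
        #   "Received: September 5, 2001, Accepted: October 7, 2001,", and each
        #   captured date is parsed with strptime("%B %d %Y") after stripping
        #   commas, e.g. "September 5, 2001" -> "2001-09-05".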

        # Abstract
        # Replace images with their alt text before serializing the remaining body.
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            # Fall back to treating the whole remaining text as the abstract.
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["abstract"]), lang="en"
                )
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["resume"]), lang="fr"
                )
            )
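
        # Hypothetical illustration of the abstract split above: if the remaining
        # body serializes to "<b>Abstract.</b> Some English text.", abstract_regex
        # captures abstract=" Some English text." with no French resume, and the
        # text is stored as the English abstract.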

        return xarticle

    def parse_slc_preface(self, content: str):
        # Not implemented: the preface is downloaded but not parsed yet.
        pass