Coverage for src/crawler/by_source/slc_crawler.py: 8%
165 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


def is_relevant_tag(tag: PageElement):
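    """Return True if the node carries non-whitespace text, False otherwise.

    Handles both plain strings and Tag elements; whitespace-only nodes are
    considered irrelevant.
    """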
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True


class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"
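    # Illustrative strings these patterns would match (examples constructed
    # from the regexes themselves, not taken from the SLC site):
    #   year_regex  -> "Vol. B42 (1999)" or "Vol. B42 (1998/99)"
    #   dates_regex -> "Received: March 3, 1999, Accepted: June 15, 1999,"
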
    def parse_collection_content(self, content):
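        """Parse the collection (volume list) page into a list of IssueData.

        Each linked table cell yields one issue; the year and volume label
        are extracted with `year_regex`.
        """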
        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            year = issue_dict["year"]
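            # Expand an abbreviated end year, e.g. "1998" + "99" -> "1998-1999"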
            if issue_dict.get("year_end") is not None:
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
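        """Parse an issue (volume) page.

        Collects per-article links listed under <dl>, attaches a scanned-copy
        PDF to the issue when one is offered, and downloads the volume preface
        if present.
        """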
        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    self.logger.debug(
                        "Couldn't download file", extra={"url": preface_href, "pid": xissue.pid}
                    )

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
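        """Parse a single article page into `xarticle`.

        Extracts the PDF/TeX links, authors, title, submission dates and
        abstracts (English and, when present, French). Returns None when no
        full-text link can be found.
        """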
        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            self.logger.debug(
                "Couldn't find article pdf", extra={"pid": xarticle.pid, "url": xarticle.url}
            )
            return
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.title_tex == "":
            if xarticle.url == "https://www.mat.univie.ac.at/~slc/opapers/s22thumser.html":
                xarticle.title_tex = " "

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        body.select_one("body")
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["abstract"]), lang="en"
                )
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["resume"]), lang="fr"
                )
            )

        return xarticle

    def parse_slc_preface(self, content: str):
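        """Stub: preface pages are downloaded but not parsed yet."""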
        pass