Coverage for src/crawler/by_source/slc_crawler.py: 8%

165 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
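# Crawler for the "Séminaire Lotharingien de Combinatoire" website
# (https://www.mat.univie.ac.at/~slc/), built on top of BaseCollectionCrawler.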

import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# Helper: a node is "relevant" if it still contains visible text after cleanup.
def is_relevant_tag(tag: PageElement):
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True


class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    # Patterns for the "Vol. <n> (<year>)" text of the collection index, the
    # abstract block, and the "Received/Revised/Accepted/Final Version" dates
    # line of an article page.
    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"

    # Hand-maintained title overrides, keyed by article pid.
    title_corrections = {
        "SLC_1989_22_a15": "On the Growth Rate of Certain Combinatorial Functions",
    }

    def parse_collection_content(self, content):
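        # Build one issue per volume cell of the collection page: the linked URL,
        # plus the volume number and year(s) parsed from the cell text.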

        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            # Expand an abbreviated end year, e.g. "1994/95" -> "1994-1995".
            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
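        # Gather the article links listed on the issue page, plus the issue-level
        # "Scanned copy" PDF and the preface when present.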

        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    self.logger.debug(
                        "Couldn't download file", extra={"url": preface_href, "pid": xissue.pid}
                    )

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
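        # Extract the PDF/TeX links, authors, title, submission dates and abstracts
        # from a single article page.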

        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            self.logger.debug(
                "Couldn't find article pdf", extra={"pid": xarticle.pid, "url": xarticle.url}
            )
            return
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            # Normalise punctuation and abbreviated month names before parsing.
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
        # Replace inline <img> elements with their alt text before reading the abstract.
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            # No "Abstract"/"Résumé" marker found: treat the remaining text as the abstract.
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["abstract"]), lang="en")
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["resume"]), lang="fr")
            )

        return xarticle

    def parse_slc_preface(self, content: str):
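        # Stub: parse_issue_content downloads the preface page, but its content is
        # currently ignored.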

        pass