Coverage for src/crawler/by_source/slc_crawler.py: 8%

165 statements  


import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

def is_relevant_tag(tag: PageElement):
    """Return False for tags or strings whose cleaned-up text is empty."""
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True

class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
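    # year_regex captures the volume and year span from an issue heading.
    # Illustrative example (made up, not taken from the site): a heading of the
    # form "Vol. 42 (1999/00)" yields volume="42", year="1999", year_end="00".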

    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"
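    # dates_regex targets the "Received/Revised/Accepted/Final Version" line of an
    # article page, after the normalisation done in parse_article_content.
    # An illustrative (made-up) input it would match:
    #   "Received: March 1, 2004, Revised: May 10, 2004, Accepted: June 2, 2004,"
    # giving received="March 1, 2004", revised="May 10, 2004", accepted="June 2, 2004".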

    def parse_collection_content(self, content):
        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                # Expand a shortened end year ("1999/00") into a full span ("1999-2000")
                # by borrowing the leading digits of the start year.
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    self.logger.debug(
                        "Couldn't download file", extra={"url": preface_href, "pid": xissue.pid}
                    )

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        # Issue-wide scanned copy, if any
        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            self.logger.debug(
                "Couldn't find article pdf", extra={"pid": xarticle.pid, "url": xarticle.url}
            )
            return None
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.title_tex == "":
            # Special case: this particular article page has an empty title,
            # so store a blank placeholder.
            if xarticle.url == "https://www.mat.univie.ac.at/~slc/opapers/s22thumser.html":
                xarticle.title_tex = " "

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        h5_tag = body.select_one("h5")
        if h5_tag:
            h5_tag.decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        # Iterate over a copy: extracting while looping over body.children can skip nodes.
        for child in list(body.children):
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["abstract"]), lang="en"
                )
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["resume"]), lang="fr"
                )
            )

        return xarticle

    def parse_slc_preface(self, content: str):
        # Placeholder: preface pages are downloaded but not parsed yet.
        pass
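
# A minimal sketch of how this crawler's parsing entry points fit together
# (hypothetical driver code; the real orchestration is provided by
# BaseCollectionCrawler and is not part of this file):
#
#   crawler = Slc_Crawler(...)  # constructor arguments elided
#   xissues = crawler.parse_collection_content(collection_html)
#   for xissue in xissues:
#       crawler.parse_issue_content(issue_html, xissue)
#       for xarticle in xissue.articles:
#           crawler.parse_article_content(article_html, xissue, xarticle, xarticle.url)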