Coverage for src/crawler/by_source/slc_crawler.py: 8%

165 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
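# Crawler for the "Séminaire Lotharingien de Combinatoire" website
# (https://www.mat.univie.ac.at/~slc/), built on top of BaseCollectionCrawler.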

import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


# Helper: a node is "relevant" if it still contains visible text after cleanup.
def is_relevant_tag(tag: PageElement):
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True


class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    # Patterns for the "Vol. <n> (<year>)" text of the collection index, the
    # abstract block, and the "Received/Revised/Accepted/Final Version" dates
    # line of an article page.
    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"

    # Hand-maintained title overrides, keyed by article pid.
    title_corrections = {
        "SLC_1989_22_a15": "On the Growth Rate of Certain Combinatorial Functions",
    }

    def parse_collection_content(self, content):
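        # Build one issue per volume cell of the collection page: the linked URL,
        # plus the volume number and year(s) parsed from the cell text.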

        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            # Expand an abbreviated end year, e.g. "1994/95" -> "1994-1995".
            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
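        # Gather the article links listed on the issue page, plus the issue-level
        # "Scanned copy" PDF and the preface when present.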

        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    self.logger.debug(
                        "Couldn't download file", extra={"url": preface_href, "pid": xissue.pid}
                    )

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
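        # Extract the PDF/TeX links, authors, title, submission dates and abstracts
        # from a single article page.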

        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            self.logger.debug(
                "Couldn't find article pdf", extra={"pid": xarticle.pid, "url": xarticle.url}
            )
            return
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.pid in self.title_corrections:
            xarticle.title_tex = self.title_corrections[xarticle.pid]

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            dates_str = cleanup_str(dates.text)
            # Normalise punctuation and abbreviated month names before parsing.
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
        # Replace inline <img> elements with their alt text before reading the abstract.
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract: invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            # No "Abstract"/"Résumé" marker found: treat the remaining text as the abstract.
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["abstract"]), lang="en")
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(value_tex=cleanup_str(abstract_dict["resume"]), lang="fr")
            )

        return xarticle

    def parse_slc_preface(self, content: str):
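        # Stub: parse_issue_content downloads the preface page, but its content is
        # currently ignored.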

        pass