Coverage for src/crawler/by_source/slc_crawler.py: 8%

165 statements  

coverage.py v7.8.2, created at 2025-06-16 07:44 +0000

import re
from time import strftime, strptime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, Comment, PageElement, Tag
from ptf.model_data import (
    ArticleData,
    IssueData,
    create_abstract,
    create_articledata,
    create_contributor,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

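# Helper: returns False for strings or tags whose cleaned-up text is empty, True otherwise.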

def is_relevant_tag(tag: PageElement):
    if isinstance(tag, str):
        if cleanup_str(tag) == "":
            return False
        return True

    if cleanup_str(tag.text) == "":
        return False
    return True

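# Crawler for the Séminaire Lotharingien de Combinatoire website (https://www.mat.univie.ac.at/~slc/):
# issues come from the collection's volume table, articles from each issue's link list.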

class Slc_Crawler(BaseCollectionCrawler):
    source_name = "Séminaire Lotharingien de Combinatoire website"
    source_domain = "SLC"
    source_website = "https://www.mat.univie.ac.at/~slc/"

    # Extracts the volume label, the year and an optional end year from strings like
    # "Vol. <volume> ... (<year>)" or "... (<year>-<year_end>)".
    year_regex = r"Vol\. (?P<volume>\w+).+\((?P<year>\d+)(?:[\-\/](?P<year_end>\d+))?\)"
    # Splits the article page text into an optional French "Résumé" and an (English) "Abstract".
    abstract_regex = (
        r"<b>(?:Résumé.<\/b>(?P<resume>.+))?(?:(?:English )?Abstract.<\/b>(?P<abstract>.+))?"
    )
    # Captures the "Received", "Revised (Version)", "Accepted" and "Final Version" dates;
    # only the accepted date is mandatory.
    dates_regex = r"(?:Received: (?P<received>[\w ]+,? \d{1,2},? \d+), )?(?:Revised(?: Version)?:? (?P<revised>[\w ]+,? \d{1,2},? \d+), )?Accepted:? (?P<accepted>[\w ]+,? \d{1,2},? \d+),(?: Final Version: (?P<final>[\w ]+,? \d{1,2},? \d+))?"

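    # parse_collection_content: walks the bordered volume table on the collection page and
    # creates one IssueData entry per linked volume, deriving the year or year span from the cell text.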

    def parse_collection_content(self, content):
        xissues: list[IssueData] = []

        soup = BeautifulSoup(content, "html5lib")
        issue_tags = soup.select("table[border='1'] > tbody > tr > td")
        for i_tag in issue_tags:
            a_tag = i_tag.select_one("a")
            if not a_tag:
                continue
            href = a_tag.get("href")
            if not isinstance(href, str):
                raise ValueError("Couldn't parse issue href")
            href = urljoin(self.collection_url, href)
            issue_dict = regex_to_dict(
                self.year_regex, cleanup_str(i_tag.text), error_msg="Couldn't parse issue year"
            )

            # Rebuild the year or year span, expanding short end years (e.g. "1998/99" becomes "1998-1999").
            year = issue_dict["year"]
            if issue_dict.get("year_end") is not None:
                year += "-"
                if len(issue_dict["year_end"]) < 3:
                    year += year[0 : len(issue_dict["year_end"])]
                year += issue_dict["year_end"]

            issue_data = self.create_xissue(href, year, issue_dict["volume"], issue_number=None)
            xissues.append(issue_data)

        return xissues

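    # parse_issue_content: downloads the issue preface (if any), registers one article stub per link in
    # the issue's <dl> list, and attaches links labelled "Scanned copy" to the issue itself as its PDF.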

    def parse_issue_content(self, content: str, xissue: IssueData):
        if not xissue.url:
            raise ValueError("xissue must have an url")
        if not xissue.url.endswith(".html") and not xissue.url.endswith("/"):
            xissue.url += "/"

        soup = BeautifulSoup(content, "html5lib")

        # Preface
        preface_tag = soup.select_one("a:-soup-contains-own('Preface')")
        if preface_tag:
            preface_href = preface_tag.get("href")
            if isinstance(preface_href, str):
                preface_href = urljoin(xissue.url, preface_href)
                try:
                    preface_content = self.download_file(preface_href)
                    self.parse_slc_preface(preface_content)
                except requests.exceptions.HTTPError:
                    print("Couldn't download file : " + preface_href)

        # Articles
        articles_tags = soup.select("dl a")
        for index, a_tag in enumerate(articles_tags):
            href = a_tag.get("href", None)
            if not isinstance(href, str):
                continue
            href = urljoin(xissue.url, href)
            if a_tag.text == "Scanned copy":
                add_pdf_link_to_xarticle(xissue, href)
                continue
            a_href = a_tag.get("href")
            if not isinstance(a_href, str):
                continue
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            xarticle.url = urljoin(xissue.url, a_href)
            xissue.articles.append(xarticle)

        pdf_tag = soup.select_one("a[href]:-soup-contains-own('Scanned copy')")
        if pdf_tag:
            pdf_href = pdf_tag.get("href")
            if isinstance(pdf_href, str):
                add_pdf_link_to_xarticle(xissue, urljoin(xissue.url, pdf_href))

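    # parse_article_content: extracts the PDF (or scan) link, authors, title, an optional TeX source
    # link, the Received/Revised/Accepted/Final dates and the abstracts (English and French) from an
    # article page.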

    def parse_article_content(
        self, content: str, xissue: IssueData, xarticle: ArticleData, url: str
    ) -> ArticleData | None:
        if not xarticle.url:
            raise ValueError("Article must have an url")

        soup = BeautifulSoup(content, "html5lib")
        body = soup.select_one("body")
        if not body:
            raise ValueError("Couldn't parse article body")

        # PDF
        pdf_tag = body.select_one(
            "a[href]:-soup-contains-own('PDF'), a[href]:-soup-contains-own('Scan of original article')"
        )
        if not pdf_tag:
            print("Couldn't find article pdf : " + xarticle.url)
            return None
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse pdf href")
        add_pdf_link_to_xarticle(xarticle, urljoin(xarticle.url, pdf_href))

        # Author
        author_tag = body.select_one("h1:nth-of-type(1)")
        if not isinstance(author_tag, Tag):
            raise ValueError("Couldn't find article authors")
        authors_str = cleanup_str(author_tag.text).replace(" and ", ", ")
        if authors_str != "":
            for author in authors_str.split(", "):
                xarticle.contributors.append(create_contributor(role="author", string_name=author))

        # Title
        title_tag = body.select_one("h1:nth-of-type(2)")
        if not isinstance(title_tag, Tag):
            raise ValueError("Couldn't find article title")
        xarticle.title_tex = cleanup_str(title_tag.text)

        if xarticle.title_tex == "":
            # Special case: this page appears to provide no title, so a placeholder is used.
            if xarticle.url == "https://www.mat.univie.ac.at/~slc/opapers/s22thumser.html":
                xarticle.title_tex = " "

        # TEX
        tex_tag = body.select_one("a[href]:-soup-contains-own('Tex version')")
        if tex_tag:
            tex_href = tex_tag.get("href")
            if isinstance(tex_href, str):
                add_pdf_link_to_xarticle(
                    xarticle, urljoin(xarticle.url, tex_href), mimetype="application/x-tex"
                )

        # Here we decompose/extract every element to keep only the abstracts
        author_tag.decompose()
        title_tag.decompose()
        body.select_one("h5").decompose()
        form = body.select_one("form")
        if form:
            form.decompose()
        links = body.select_one(
            "ul:-soup-contains('Scan of original article'), ul:-soup-contains('PDF')"
        )
        if links is not None:
            link_header = links.find_previous_sibling("p")
            if isinstance(link_header, Tag):
                link_header.decompose()
            links.decompose()
        body.select_one("body")
        for child in body.children:
            if isinstance(child, Comment):
                child.extract()

        # Dates
        dates = next(
            (c for c in body.children if cleanup_str(str(c)).startswith("Received")), None
        )
        if dates:
            # Normalise separators and abbreviated month names ("Oct", "Sept") so strptime's %B can parse them.
            dates_str = cleanup_str(dates.text)
            dates_str = re.sub(r"[\.;]", ",", dates_str)
            dates_str = re.sub(r"Oct,? ", "October ", dates_str)
            dates_str = re.sub(r"Sept,? ", "September ", dates_str)

            dates_dict = regex_to_dict(
                self.dates_regex, dates_str, error_msg="Couldn't parse dates"
            )
            xarticle.date_accepted = strftime(
                "%Y-%m-%d", strptime(dates_dict["accepted"].replace(",", ""), "%B %d %Y")
            )
            if dates_dict["received"] is not None:
                xarticle.date_received = strftime(
                    "%Y-%m-%d", strptime(dates_dict["received"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["revised"] is not None:
                xarticle.date_revised = strftime(
                    "%Y-%m-%d", strptime(dates_dict["revised"].replace(",", ""), "%B %d %Y")
                )
            if dates_dict["final"] is not None:
                xarticle.date_published = strftime(
                    "%Y-%m-%d", strptime(dates_dict["final"].replace(",", ""), "%B %d %Y")
                )
            dates.extract()

        # Abstract
        for img in body.select("img[alt]"):
            alt = img.get("alt")
            if not isinstance(alt, str):
                raise ValueError("Couldn't parse abstract : invalid img alt")
            img.replace_with(alt)

        abstract_text = cleanup_str("".join([str(c) for c in body.contents]))
        try:
            abstract_dict = regex_to_dict(
                self.abstract_regex, abstract_text, error_msg="Couldn't parse article abstract"
            )
        except ValueError:
            abstract_dict = {"abstract": abstract_text}

        if abstract_dict["abstract"] is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["abstract"]), lang="en"
                )
            )

        if abstract_dict.get("resume", None) is not None:
            xarticle.abstracts.append(
                create_abstract(
                    tag="abstract", value_tex=cleanup_str(abstract_dict["resume"]), lang="fr"
                )
            )

        return xarticle

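    # parse_slc_preface: preface pages are downloaded in parse_issue_content but not parsed yet (stub).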

    def parse_slc_preface(self, content: str):
        pass