Coverage for src/crawler/by_source/ptm_crawler.py: 23%

124 statements  

coverage.py v7.8.2, created at 2025-06-03 13:39 +0000

import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class PtmCrawler(BaseCollectionCrawler):
    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )
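    # Illustrative headings that issue_re is written to capture (made-up examples, not
    # copied from the site): "Vol 12, No 1 (2001)", "Tom 45, Nr 1-2 (2010)", "Vol 30 (1999)".
    # The volume and year are required; the "No"/"Nr" part is optional.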

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()
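    # The detector above is limited to the languages the journal is expected to publish in,
    # so short abstracts are not misclassified into unrelated languages. The
    # self.detect_language() helper used below for abstracts presumably delegates to this
    # detector via BaseCollectionCrawler (assumption: the base class is not shown here).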

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()

    def download_file(self, url, force_refresh=False):
        content = super().download_file(url, force_refresh)
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content

    def update_cookies(self):
        script_content = super().download_file(self.source_website)
        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:  # coverage: branch never taken (condition was never true)
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                # "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )
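    # How the cookie challenge appears to work (inferred from the regex above): the landing
    # page answers plain requests with a small script along the lines of
    #   createCookie('vjs','123456',60)   # '123456' is an illustrative value
    # instead of the real content. update_cookies() extracts that value and replays it as a
    # "Cookie: vjs=..." header, so later requests pass the "JavaScript and Cookies enabled"
    # check; download_file() refreshes the cookie and retries with force_refresh=True
    # whenever it still receives the block page.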

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_search = regex.search(self.issue_re, issue_tag.text)
            if not issue_search:  # coverage: branch never taken
                raise ValueError("Couldn't parse issue data")
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):  # coverage: branch never taken
                raise ValueError("Couldn't parse issue url")
            issue_data = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues
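    # Note on the method above: the (?:, (?:No|Nr) ...) group in issue_re is optional, so a
    # heading without a number part (e.g. the made-up "Vol 30 (1999)") yields
    # issue_data["number"] == None, which is passed to create_xissue as-is.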

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")

        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            print(f"Couldn't parse article {xissue.pid}_{xarticle.pid} ({real_url}). Skipping")
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)
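        # A missing #articleTitle is treated as a soft failure: the article is logged and
        # skipped rather than aborting the whole issue (presumably because some TOC links
        # point at pages without the usual article markup; assumption, not verified).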

        # DOI
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)

        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            if len(abstract_text) > 0 and abstract_text not in (
                "Artykuł nie zawiera streszczenia",  # "The article has no abstract"
                "-",
            ):
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )

        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
            "strong:-soup-contains-own('Strony:')"
        )
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            pages_splitted = pages_tag.text.split("-")
            if len(pages_splitted) > 0:
                xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]
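        # Illustrative page values (assumed format): "15-32" gives fpage "15" and
        # lpage "32"; a single page such as "7" only sets fpage.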

        # pdf
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")

            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

            if "(ENGLISH)" in pdf_tag.text:
                xarticle.lang = "en"
            elif "(POLSKI)" in pdf_tag.text:
                xarticle.lang = "pl"
            else:
                xarticle.lang = "pl"
        else:
            print(f"Couldn't find article pdf for article {xissue.pid}_{xarticle.pid}")
            return None
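        # Rewriting the galley URL from /view/ to /download/ presumably points at the
        # direct PDF endpoint rather than the HTML viewer page (assumption based on the
        # URL shape); the galley label "(ENGLISH)"/"(POLSKI)" is used to infer the article
        # language, defaulting to Polish.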

        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        # msc
        msc_header = soup.select_one(
            "strong:-soup-contains-own('Subject classification:')"
        ) or soup.select_one("strong:-soup-contains-own('Kklasyfikacja tematyczna:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
            "strong:-soup-contains-own('Słowa kluczowe:')"
        )
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        # References
        # Disabling references for now: PTM doesn't have a "clean" way to display references
        # (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)

        # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
        #     "h4:-soup-contains-own('Cytowania')"
        # )
        # if refs_header:
        #     refs_tag = refs_header.next_sibling.next_sibling
        #     if refs_tag and isinstance(refs_tag, Tag):
        #         self.parse_references(xarticle, refs_tag)

        return xarticle

    def parse_references(self, xarticle: ArticleData, references: Tag):
        bibitems = []
        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the text is inserted
        for ref in references.get_text(strip=True, separator="\n").splitlines():
            bibitem = JatsBase.bake_ref(cleanup_str(ref))
            bibitems.append(bibitem)
        if len(bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
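
A minimal standalone sketch of how the issue_re pattern above behaves, using made-up issue headings rather than strings taken from the PTM site; it only needs the third-party regex package:

import regex

ISSUE_RE = r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"

# Hypothetical headings in the two labelling styles the pattern accepts (English and Polish)
for heading in ("Vol 12, No 1 (2001)", "Tom 45, Nr 1-2 (2010)", "Vol 30 (1999)"):
    match = regex.search(ISSUE_RE, heading)
    # groupdict() gives {'volume': ..., 'number': ... or None, 'year': ...}
    print(heading, "->", match.groupdict() if match else None)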