Coverage for src/crawler/by_source/ptm_crawler.py: 22%

121 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class PtmCrawler(BaseCollectionCrawler):
    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )
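    # Illustrative issue headings this pattern is meant to match (assumed
    # examples, not taken from the site):
    #   "Vol 12 (1999)"         -> volume="12", number=None, year="1999"
    #   "Tom 45, Nr 2-3 (2010)" -> volume="45", number="2-3", year="2010"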

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()

    def download_file(self, url, force_refresh=False):
        content = super().download_file(url, force_refresh)
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content

    def update_cookies(self):
        script_content = super().download_file(self.source_website)
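        # The landing page embeds an anti-bot script that sets a "vjs" cookie.
        # Assumed (illustrative) form of the snippet parsed below:
        #   createCookie('vjs','1234567890',60)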

        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                # "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            issue_data = regex_to_dict(
                self.issue_re,
                issue_tag.text,
                error_msg="Couldn't parse issue data",
            )
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")

        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            self.logger.debug(
                "Couldn't parse article. Skipping", extra={"pid": xarticle.pid, "url": real_url}
            )
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)

        # DOI
        # :-soup-contains-own() matches elements whose own text contains the given string
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)

        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            # "Artykuł nie zawiera streszczenia" = "The article has no abstract"
            if len(abstract_text) > 0 and abstract_text not in (
                "Artykuł nie zawiera streszczenia",
                "-",
            ):
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )

        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
            "strong:-soup-contains-own('Strony:')"
        )
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            # e.g. "12-34" -> fpage="12", lpage="34"
            pages_splitted = pages_tag.text.split("-")
            if len(pages_splitted) > 0:
                xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]

        # pdf
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")

            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

            if "(ENGLISH)" in pdf_tag.text:
                xarticle.lang = "en"
            else:
                # Covers "(POLSKI)" and unlabelled links alike
                xarticle.lang = "pl"
        else:
            self.logger.debug("Couldn't find article pdf", extra={"pid": xarticle.pid})
            return None

        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        # msc
        msc_header = soup.select_one(
            "strong:-soup-contains-own('Subject classification:')"
        ) or soup.select_one("strong:-soup-contains-own('Kklasyfikacja tematyczna:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
            "strong:-soup-contains-own('Słowa kluczowe:')"
        )
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        # References
        # References are disabled for now: PTM has no "clean" way of displaying them
        # (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)

        # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
        #     "h4:-soup-contains-own('Cytowania')"
        # )
        # if refs_header:
        #     refs_tag = refs_header.next_sibling.next_sibling
        #     if refs_tag and isinstance(refs_tag, Tag):
        #         self.parse_references(xarticle, refs_tag)

        return xarticle

    def parse_references(self, xarticle: ArticleData, references: Tag):
        bibitems = []
        # TODO: extensive parsing (authors, title, etc.); currently only the raw text is kept
        for ref in references.get_text(strip=True, separator="\n").splitlines():
            bibitem = JatsBase.bake_ref(cleanup_str(ref))
            bibitems.append(bibitem)
        if len(bibitems) > 0:
            xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
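

# A minimal, self-contained sketch (not part of the crawler) showing how the
# class-level issue_re can be exercised offline, e.g. to cover the regex paths.
# The sample heading "Tom 45, Nr 2-3 (2010)" is an assumed example.
if __name__ == "__main__":
    sample = "Tom 45, Nr 2-3 (2010)"
    match = regex.search(PtmCrawler.issue_re, sample)
    # Expected: {'volume': '45', 'number': '2-3', 'year': '2010'}
    print(match.groupdict() if match else None)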

217 xarticle.abstracts.append(JatsBase.compile_refs(bibitems))