Coverage for src/crawler/by_source/ptm_crawler.py: 13%

118 statements  


import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

class PtmCrawler(BaseCollectionCrawler):
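    """Crawler for the "Annales Societatis Mathematicae Polonae" series published on
    https://wydawnictwa.ptm.org.pl/ (PTM).

    Issue links on the collection page carry labels that issue_re parses into volume,
    number and year, e.g. "Vol 12, No 1 (2004)" or "Tom 12, Nr 1 (2004)"; the
    "No/Nr ..." part is optional.
    """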

    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"
    is_checkable = False

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()

    def download_file(self, url, force_refresh=False):
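        """Download a page, refreshing the 'vjs' cookie and retrying with
        force_refresh when the site answers with its "JavaScript and Cookies"
        interstitial instead of the requested content."""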

        content = super().download_file(url, force_refresh)
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content

    def update_cookies(self):
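        """Fetch the collection homepage, extract the 'vjs' cookie value from its
        inline createCookie() script and set browser-like request headers."""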

        script_content = super().download_file(self.source_website)
        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                # "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )

    def parse_collection_content(self, content):
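        """Build one issue per "#issue h4 a" link on the collection page; the link
        text is matched against issue_re to get volume, number and year."""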

        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            issue_data = regex_to_dict(
                self.issue_re,
                issue_tag.text,
                error_msg="Couldn't parse issue data",
            )
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
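        """Append a minimal article entry (pid + url) to xissue for every
        ".tocTitle a" link found in the issue table of contents."""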

        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
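        """Parse an article page. The landing page is a frameset, so the first frame
        is downloaded first; title, DOI, abstract, pages, PDF link, language, authors,
        MSC codes and keywords are then extracted from that frame's document."""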

        soup = BeautifulSoup(content, "html.parser")
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")

        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            self.logger.debug(
                "Couldn't parse article. Skipping", extra={"pid": xarticle.pid, "url": real_url}
            )
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)

        # DOI
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)

        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            if len(abstract_text) > 0 and abstract_text not in (
                "Artykuł nie zawiera streszczenia",  # "The article has no abstract"
                "-",
            ):
                xarticle.abstracts.append(
                    create_abstract(
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )

        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
            "strong:-soup-contains-own('Strony:')"
        )
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            pages_splitted = pages_tag.text.split("-")
            if len(pages_splitted) > 0:
                xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]

        # PDF
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")

            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

            if "(ENGLISH)" in pdf_tag.text:
                xarticle.lang = "en"
            elif "(POLSKI)" in pdf_tag.text:
                xarticle.lang = "pl"
            else:
                # No explicit language label on the PDF link: default to Polish
                xarticle.lang = "pl"
        else:
            self.logger.debug("Couldn't find article pdf", extra={"pid": xarticle.pid})
            return None

        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        # MSC
        msc_header = soup.select_one(
            "strong:-soup-contains-own('Subject classification:')"
        ) or soup.select_one("strong:-soup-contains-own('Kklasyfikacja tematyczna:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
            "strong:-soup-contains-own('Słowa kluczowe:')"
        )
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        # References
        # References are disabled for now: PTM doesn't have a "clean" way to display them
        # (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)

        # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
        #     "h4:-soup-contains-own('Cytowania')"
        # )
        # if refs_header:
        #     refs_tag = refs_header.next_sibling.next_sibling
        #     if refs_tag and isinstance(refs_tag, Tag):
        #         self.parse_references(xarticle, refs_tag)

        return xarticle

    def parse_references(self, xarticle: ArticleData, references: Tag):
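        """Turn each line of the references block into a plain-text bibitem via
        JatsBase.bake_ref."""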

        # TODO: extensive parsing (authors, title, etc.)
        # Currently only the text is inserted.
        for ref in references.get_text(strip=True, separator="\n").splitlines():
            xarticle.bibitems.append(JatsBase.bake_ref(cleanup_str(ref)))