Coverage for src/crawler/by_source/ptm_crawler.py: 25%

110 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str


class PtmCrawler(BaseCollectionCrawler):
    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )
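    # Illustrative matches for issue_re (constructed examples, not taken from the live site):
    #   "Vol 12 (1999)"       -> volume="12", number=None, year="1999"
    #   "Tom 45, Nr 2 (2010)" -> volume="45", number="2", year="2010"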

    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()
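    # The detector only needs to distinguish the three languages that occur in this
    # collection; self.detect_language() used further down is assumed to be a
    # BaseCollectionCrawler helper built on top of this detector (not shown in this file).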

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()
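    # Cookies are primed at construction time so that the very first download
    # already carries the "vjs" cookie expected by the site's JavaScript check.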

    def download_file(self, url, force_refresh=False):
        content = super().download_file(url, force_refresh)
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content
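    # If the anti-bot interstitial is returned instead of the requested page, the cookie
    # is refreshed and the request retried with force_refresh=True. Note that the retry
    # recurses until the interstitial text disappears from the response.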

    def update_cookies(self):
        script_content = super().download_file(self.source_website)
        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:  # coverage: this branch was never taken in the recorded run
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )
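    # The landing page serves a small script that calls createCookie('vjs', <value>, 60);
    # instead of executing it, the crawler extracts <value> with a regex and replays it
    # in a Cookie header alongside browser-like request headers.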

    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_search = regex.search(self.issue_re, issue_tag.text)
            if not issue_search:  # coverage: this branch was never taken in the recorded run
                raise ValueError("Couldn't parse issue data")
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):  # coverage: this branch was never taken in the recorded run
                raise ValueError("Couldn't parse issue url")
            issue_data = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues
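    # Each "#issue h4 a" anchor on the collection page carries both the issue URL and a
    # "Vol/Tom ... (year)" heading parsed by issue_re; create_xissue is assumed to be
    # provided by BaseCollectionCrawler.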

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)
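    # Article pids are positional ("a0", "a1", ...); only the URL is recorded here and
    # the remaining metadata is filled in later by parse_article_content.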

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")
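        # The article URL from the TOC points at a frameset wrapper; the first frame's
        # src is the actual metadata page, which is downloaded and parsed instead.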

        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            print(f"Couldn't parse article: {xissue.pid}_{xarticle.pid} {real_url}. Skipping")
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)

        # DOI
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)
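        # The DOI (and the Pages / Subject classification / Keywords blocks below) use the
        # same pattern: locate the <strong> label, drop it with decompose(), and read the
        # remaining text of its parent element.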

        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            if abstract_text != "Artykuł nie zawiera streszczenia":  # Polish: "The article has no abstract"
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )

        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')")
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            pages_splitted = pages_tag.text.split("-")
            if len(pages_splitted) > 0:
                xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]
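        # Example: a "Pages:" value of "117-133" yields fpage="117" and lpage="133"
        # (constructed example; a single page number leaves lpage unset).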

        # PDF
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")

            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"Couldn't find article pdf for article {xissue.pid}_{xarticle.pid}")
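        # The "/view/" link opens the journal's PDF viewer page; rewriting it to
        # "/download/" is assumed to yield the direct file URL registered on the article.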

        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        # MSC
        msc_header = soup.select_one("strong:-soup-contains-own('Subject classification:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')")
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        return xarticle
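
# Rough call sequence (sketch only; the actual orchestration lives in BaseCollectionCrawler
# and is not shown in this file):
#   crawler = PtmCrawler(...)                       # __init__ primes the anti-bot cookie
#   issues = crawler.parse_collection_content(...)  # one xissue per "Vol/Tom ... (year)" heading
#   crawler.parse_issue_content(...)                # fills xissue.articles with stub xarticles
#   crawler.parse_article_content(...)              # adds title, DOI, abstract, pages, PDF, authors, subjects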