Coverage for src/crawler/by_source/ptm_crawler.py: 74%

105 statements  

coverage.py v7.6.4, created at 2025-02-14 14:36 +0000

  1  import regex
  2  from bs4 import BeautifulSoup
  3  from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj
  4
  5  from crawler.base_crawler import BaseCollectionCrawler
  6  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
  7
  8
  9  class PtmCrawler(BaseCollectionCrawler):
 10      source_name = "Annales Societatis Mathematicae Polonae Series "
 11      source_domain = "PTM"
 12      source_website = "https://wydawnictwa.ptm.org.pl/"
 13
 14      issue_re = (
 15          r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
 16      )
 17
 18      def __init__(self, *args, **kwargs):
 19          super().__init__(*args, **kwargs)
 20          self.update_cookies()
 21
 22      def download_file(self, url, force_refresh=False):
 23          content = super().download_file(url, force_refresh)
 24          if (
 25              "Access to this website is possible only using browser with JavaScript and Cookies enabled."
 26              in content
 27          ):
 28              self.update_cookies()
 29              return self.download_file(url, force_refresh=True)
 30          return content
 31
 32      def update_cookies(self):
 33          script_content = super().download_file(self.source_website)
 34          cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
 35          if not cookie_search:  [35 ↛ 36: condition on line 35 was never true]
 36              raise ValueError("Couldn't set cookie for ptm")
 37          self.headers.update(
 38              {
 39                  "Cookie": f"vjs={cookie_search.group(1)}",
 40                  "Accept-Language": "en-US,en;q=0.5",
 41                  "Accept-Encoding": "gzip, deflate, br, zstd",
 42                  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 43                  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
 44              }
 45          )
 46
 47      def parse_collection_content(self, content):
 48          soup = BeautifulSoup(content, "html.parser")
 49          xissues = []
 50          for issue_tag in soup.select("#issue h4 a"):
 51              issue_search = regex.search(self.issue_re, issue_tag.text)
 52              if not issue_search:  [52 ↛ 53: condition on line 52 was never true]
 53                  raise ValueError("Couldn't parse issue data")
 54              issue_url = issue_tag.get("href")
 55              if not isinstance(issue_url, str):  [55 ↛ 56: condition on line 55 was never true]
 56                  raise ValueError("Couldn't parse issue url")
 57              issue_data = issue_search.groupdict()
 58              xissues.append(
 59                  self.create_xissue(
 60                      issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
 61                  )
 62              )
 63          return xissues
 64
 65      def parse_issue_content(self, content, xissue):
 66          soup = BeautifulSoup(content, "html.parser")
 67          for index, article in enumerate(soup.select(".tocTitle a")):
 68              xarticle = create_articledata()
 69              xarticle.pid = f"a{index}"
 70              article_url = article.get("href")
 71              if not isinstance(article_url, str):  [71 ↛ 72: condition on line 71 was never true]
 72                  raise ValueError("Couldn't parse article url")
 73              xarticle.url = article_url
 74              xissue.articles.append(xarticle)
 75
 76      def parse_article_content(self, content, xissue, xarticle, url, pid):
 77          soup = BeautifulSoup(content, "html.parser")
 78          frame = soup.select_one("frameset frame:first-child")
 79          if not frame:  [79 ↛ 80: condition on line 79 was never true]
 80              raise ValueError("Couldn't parse article")
 81          real_url = frame.get("src")
 82          if not isinstance(real_url, str):  [82 ↛ 83: condition on line 82 was never true]
 83              raise ValueError("Couldn't find article url")
 84
 85          content = self.download_file(real_url)
 86          soup = BeautifulSoup(content, "html.parser")
 87
 88          # Title
 89          title_tag = soup.select_one("#articleTitle")
 90          if not title_tag:  [90 ↛ 91: condition on line 90 was never true]
 91              print(f"Couldn't parse article : {pid} {real_url}. Skipping")
 92              return None
 93              raise ValueError("Couldn't parse title")  [note: unreachable, the return on line 92 always exits first]
 94          xarticle.title_tex = cleanup_str(title_tag.text)
 95
 96          # DOI
 97          doi_header = soup.select_one(
 98              "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
 99          )
100          if doi_header:  [100 ↛ 106: condition on line 100 was always true]
101              doi_tag = doi_header.parent
102              doi_header.decompose()
103              xarticle.doi = cleanup_str(doi_tag.text)
104
105          # Abstract
106          abstract_tag = soup.select_one("#articleAbstract div")
107          if abstract_tag:  [107 ↛ 111: condition on line 107 was always true]
108              xarticle.abstracts.append(create_abstract(tag="abstract", value_tex=abstract_tag.text))
109
110          # Pages
111          pages_header = soup.select_one("strong:-soup-contains-own('Pages:')")
112          if pages_header:  [112 ↛ 122: condition on line 112 was always true]
113              pages_tag = pages_header.parent
114              pages_header.decompose()
115              pages_splitted = pages_tag.text.split("-")
116              if len(pages_splitted) > 0:  [116 ↛ 118: condition on line 116 was always true]
117                  xarticle.fpage = pages_splitted[0]
118              if len(pages_splitted) > 1:  [118 ↛ 122: condition on line 118 was always true]
119                  xarticle.lpage = pages_splitted[1]
120
121          # pdf
122          pdf_tag = soup.select_one("a.file")
123          if pdf_tag:  [123 ↛ 131: condition on line 123 was always true]
124              pdf_url = pdf_tag.get("href")
125              if not isinstance(pdf_url, str):  [125 ↛ 126: condition on line 125 was never true]
126                  raise ValueError("Couldn't parse pdf url")
127
128              pdf_url = pdf_url.replace("/view/", "/download/")
129              add_pdf_link_to_xarticle(xarticle, pdf_url)
130          else:
131              print(f"Couldn't find article pdf for article {pid}")
132
133          # Authors
134          authors_tag = soup.select_one("#authorString")
135          if authors_tag:  [135 ↛ 140: condition on line 135 was always true]
136              for author in cleanup_str(authors_tag.text).split(", "):
137                  xarticle.contributors.append(create_contributor(string_name=author, role="author"))
138
139          # msc
140          msc_header = soup.select_one("strong:-soup-contains-own('Subject classification:')")
141          if msc_header:  [141 ↛ 142: condition on line 141 was never true]
142              msc_tag = msc_header.parent
143              msc_header.decompose()
144              for msc in cleanup_str(msc_tag.text).split("; "):
145                  xarticle.kwds.append(create_subj(type="msc", value=msc))
146
147          # Keywords
148          kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')")
149          if kwd_header:  [149 ↛ 155: condition on line 149 was always true]
150              kwd_tag = kwd_header.parent
151              kwd_header.decompose()
152              for kwd in cleanup_str(kwd_tag.text).split("; "):
153                  xarticle.kwds.append(create_subj(value=kwd))
154
155          return super().parse_article_content(content, xissue, xarticle, url, pid)
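
For context, a minimal sketch of how the two regular expressions above behave. The sample issue labels and the createCookie() call below are invented for illustration; the strings actually served by wydawnictwa.ptm.org.pl may differ.

    import regex

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )

    # Hypothetical issue labels, as they might appear in the "#issue h4 a" tags.
    for label in ("Tom 45, Nr 1-2 (2010)", "Vol 30 (1999)"):
        match = regex.search(issue_re, label)
        print(match.groupdict() if match else None)
    # -> {'volume': '45', 'number': '1-2', 'year': '2010'}
    # -> {'volume': '30', 'number': None, 'year': '1999'}

    # Hypothetical inline script content; the cookie value is made up.
    script = "createCookie('vjs','1739543760',60);"
    cookie = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script)
    print(cookie.group("cookie") if cookie else None)  # -> 1739543760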
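
The DOI, pages, MSC, and keyword fields are all extracted with the same pattern: select the <strong> label via soupsieve's ':-soup-contains-own()' pseudo-class, take its parent, decompose the label, and keep the remaining text. A minimal sketch against an invented HTML fragment (the real article pages may be structured differently):

    from bs4 import BeautifulSoup

    # Hypothetical fragment standing in for part of an article landing page.
    html = "<div><strong>Pages:</strong> 101-117</div>"
    soup = BeautifulSoup(html, "html.parser")

    # Matches the <strong> whose own text contains 'Pages:'.
    pages_header = soup.select_one("strong:-soup-contains-own('Pages:')")
    if pages_header:
        pages_tag = pages_header.parent   # the enclosing <div>
        pages_header.decompose()          # drop the label, keep only the value
        print(pages_tag.text.strip())     # -> 101-117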