Coverage for src/crawler/by_source/ptm_crawler.py: 23%
124 statements
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
1import lingua
2import regex
3from bs4 import BeautifulSoup, Tag
4from lingua import LanguageDetectorBuilder
5from ptf.cmds.xml.jats.jats_parser import JatsBase
6from ptf.model_data import (
7 ArticleData,
8 create_abstract,
9 create_articledata,
10 create_contributor,
11 create_subj,
12)
14from crawler.base_crawler import BaseCollectionCrawler
15from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
18class PtmCrawler(BaseCollectionCrawler):
19 source_name = "Annales Societatis Mathematicae Polonae Series "
20 source_domain = "PTM"
21 source_website = "https://wydawnictwa.ptm.org.pl/"
23 issue_re = (
24 r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
25 )
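For reference, a minimal sketch of what this pattern accepts; the sample headings below are illustrative, derived from the regex itself rather than taken from the site:

    import regex

    issue_re = r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"

    # English-style heading with volume, number and year
    m = regex.search(issue_re, "Vol 45, No 1-2 (2010)")
    assert m and m.groupdict() == {"volume": "45", "number": "1-2", "year": "2010"}

    # Polish-style heading without an issue number: "number" stays None
    m = regex.search(issue_re, "Tom 38 (2002)")
    assert m and m.group("number") is None and m.group("year") == "2002"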
27 language_detector = LanguageDetectorBuilder.from_languages(
28 lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
29 ).build()
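The crawler goes through self.detect_language(), which is defined in the base class and assumed here to wrap this detector; queried directly, the lingua detector behaves roughly like this sketch:

    import lingua
    from lingua import LanguageDetectorBuilder

    detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()
    # detect_language_of returns a lingua.Language member, or None if undecided
    print(detector.detect_language_of("Artykuł nie zawiera streszczenia"))  # Language.POLISH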
31 def __init__(self, *args, **kwargs):
32 super().__init__(*args, **kwargs)
33 self.update_cookies()
35 def download_file(self, url, force_refresh=False):
36 content = super().download_file(url, force_refresh)
37 if (
38 "Access to this website is possible only using browser with JavaScript and Cookies enabled."
39 in content
40 ):
41 self.update_cookies()
42 return self.download_file(url, force_refresh=True)
43 return content
45 def update_cookies(self):
46 script_content = super().download_file(self.source_website)
47 cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
48 if not cookie_search:  [branch 48 ↛ 49 not taken: the condition on line 48 was never true]
49 raise ValueError("Couldn't set cookie for ptm")
50 self.headers.update(
51 {
52 "Cookie": f"vjs={cookie_search.group(1)}",
53 # "Accept-Language": "en-US,en;q=0.5",
54 "Accept-Encoding": "gzip, deflate, br, zstd",
55 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56 "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
57 }
58 )
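update_cookies() relies on the landing page serving a small JavaScript challenge that sets a "vjs" cookie. A minimal sketch of the extraction step, using a hypothetical script excerpt (the real page markup is not reproduced here):

    import regex

    # Hypothetical excerpt; only the createCookie('vjs', ...) call matters to the regex
    script_content = "<script>createCookie('vjs','1748955000',60);</script>"
    cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
    if cookie_search:
        headers = {"Cookie": f"vjs={cookie_search.group('cookie')}"}
        print(headers)  # {'Cookie': 'vjs=1748955000'}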
60 def parse_collection_content(self, content):
61 soup = BeautifulSoup(content, "html.parser")
62 xissues = []
63 for issue_tag in soup.select("#issue h4 a"):
64 issue_search = regex.search(self.issue_re, issue_tag.text)
65 if not issue_search:  [branch 65 ↛ 66 not taken: the condition on line 65 was never true]
66 raise ValueError("Couldn't parse issue data")
67 issue_url = issue_tag.get("href")
68 if not isinstance(issue_url, str):  [branch 68 ↛ 69 not taken: the condition on line 68 was never true]
69 raise ValueError("Couldn't parse issue url")
70 issue_data = issue_search.groupdict()
71 xissues.append(
72 self.create_xissue(
73 issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
74 )
75 )
76 return xissues
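A sketch of the markup shape parse_collection_content() expects, inferred from the "#issue h4 a" selector and the issue_re pattern; the HTML below is illustrative, not copied from the site:

    from bs4 import BeautifulSoup

    html = """
    <div id="issue">
      <h4><a href="https://wydawnictwa.ptm.org.pl/index.php/example/issue/view/123">Tom 45, Nr 1 (2010)</a></h4>
      <h4><a href="https://wydawnictwa.ptm.org.pl/index.php/example/issue/view/124">Vol 46 (2011)</a></h4>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    # each anchor yields (issue_url, heading text) for create_xissue
    print([(a.get("href"), a.text) for a in soup.select("#issue h4 a")])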
78 def parse_issue_content(self, content, xissue):
79 soup = BeautifulSoup(content, "html.parser")
80 for index, article in enumerate(soup.select(".tocTitle a")):
81 xarticle = create_articledata()
82 xarticle.pid = f"a{index}"
83 article_url = article.get("href")
84 if not isinstance(article_url, str):
85 raise ValueError("Couldn't parse article url")
86 xarticle.url = article_url
87 xissue.articles.append(xarticle)
89 def parse_article_content(self, content, xissue, xarticle, url):
90 soup = BeautifulSoup(content, "html.parser")
91 frame = soup.select_one("frameset frame:first-child")
92 if not frame:
93 raise ValueError("Couldn't parse article")
94 real_url = frame.get("src")
95 if not isinstance(real_url, str):
96 raise ValueError("Couldn't find article url")
98 content = self.download_file(real_url)
99 soup = BeautifulSoup(content, "html.parser")
101 # Title
102 title_tag = soup.select_one("#articleTitle")
103 if not title_tag:
104 print(f"Couldn't parse article : {xissue.pid}_{xarticle.pid} {real_url}. Skipping")
105 return None
107 xarticle.title_tex = cleanup_str(title_tag.text)
109 # DOI
110 doi_header = soup.select_one(
111 "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
112 )
113 if doi_header:
114 doi_tag = doi_header.parent
115 doi_header.decompose()
116 xarticle.doi = cleanup_str(doi_tag.text)
118 # Abstract
119 abstract_tag = soup.select_one("#articleAbstract div")
120 if abstract_tag:
121 abstract_text = cleanup_str(abstract_tag.text)
122 if len(abstract_text) > 0 and abstract_text not in (
123 "Artykuł nie zawiera streszczenia",
124 "-",
125 ):
126 xarticle.abstracts.append(
127 create_abstract(
128 tag="abstract",
129 value_tex=abstract_text,
130 lang=self.detect_language(abstract_text),
131 )
132 )
134 # Pages
135 pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
136 "strong:-soup-contains-own('Strony:')"
137 )
138 if pages_header:
139 pages_tag = pages_header.parent
140 pages_header.decompose()
141 pages_splitted = pages_tag.text.split("-")
142 if len(pages_splitted) > 0:
143 xarticle.fpage = pages_splitted[0]
144 if len(pages_splitted) > 1:
145 xarticle.lpage = pages_splitted[1]
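The page-range handling splits on a single hyphen; illustrative values (not taken from the site):

    # "123-145" -> fpage "123", lpage "145"; a single page such as "17" only sets fpage
    for raw in ("123-145", "17"):
        parts = raw.split("-")
        fpage = parts[0]
        lpage = parts[1] if len(parts) > 1 else None
        print(fpage, lpage)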
147 # pdf
148 pdf_tag = soup.select_one("a.file")
149 if pdf_tag:
150 pdf_url = pdf_tag.get("href")
151 if not isinstance(pdf_url, str):
152 raise ValueError("Couldn't parse pdf url")
154 pdf_url = pdf_url.replace("/view/", "/download/")
155 add_pdf_link_to_xarticle(xarticle, pdf_url)
157 if "(ENGLISH)" in pdf_tag.text:
158 xarticle.lang = "en"
159 elif "(POLSKI)" in pdf_tag.text:
160 xarticle.lang = "pl"
161 else:
162 xarticle.lang = "pl"
163 else:
164 print(f"Couldn't find article pdf for article {xissue.pid}_{xarticle.pid}")
165 return None
167 # Authors
168 authors_tag = soup.select_one("#authorString")
169 if authors_tag:
170 for author in cleanup_str(authors_tag.text).split(", "):
171 xarticle.contributors.append(create_contributor(string_name=author, role="author"))
173 # msc
174 msc_header = soup.select_one(
175 "strong:-soup-contains-own('Subject classification:')"
176 ) or soup.select_one("strong:-soup-contains-own('Klasyfikacja tematyczna:')")
177 if msc_header:
178 msc_tag = msc_header.parent
179 msc_header.decompose()
180 for msc in cleanup_str(msc_tag.text).split("; "):
181 xarticle.kwds.append(create_subj(type="msc", value=msc))
183 # Keywords
184 kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
185 "strong:-soup-contains-own('Słowa kluczowe:')"
186 )
187 if kwd_header:
188 kwd_tag = kwd_header.parent
189 kwd_header.decompose()
190 for kwd in cleanup_str(kwd_tag.text).split("; "):
191 xarticle.kwds.append(create_subj(value=kwd))
193 # References
194 # Disabling references for now: PTM doesn't have a "clean" way to display references (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)
196 # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
197 # "h4:-soup-contains-own('Cytowania')"
198 # )
199 # if refs_header:
200 # refs_tag = refs_header.next_sibling.next_sibling
201 # if refs_tag and isinstance(refs_tag, Tag):
202 # self.parse_references(xarticle, refs_tag)
204 return xarticle
206 def parse_references(self, xarticle: ArticleData, references: Tag):
207 bibitems = []
208 # TODO: extensive parsing (authors, title, etc.)
209 # Currently, only the raw text of each reference is inserted
210 for ref in references.get_text(strip=True, separator="\n").splitlines():
211 bibitem = JatsBase.bake_ref(cleanup_str(ref))
212 bibitems.append(bibitem)
213 if len(bibitems) > 0:
214 xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
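parse_references() treats each text line of the reference block as one bibliography item. A sketch of the line-extraction step on illustrative markup (the real OJS reference layout is assumed, not verified):

    from bs4 import BeautifulSoup

    refs = BeautifulSoup(
        "<div><p>A. Author, Some paper, 2001.</p><p>B. Author, Another paper, 2003.</p></div>",
        "html.parser",
    ).div
    # one entry per reference line, ready to be passed to JatsBase.bake_ref
    print(refs.get_text(strip=True, separator="\n").splitlines())
    # ['A. Author, Some paper, 2001.', 'B. Author, Another paper, 2003.']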