Coverage for src/crawler/by_source/ptm_crawler.py: 25% (110 statements)
coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
import lingua
import regex
from bs4 import BeautifulSoup
from lingua import LanguageDetectorBuilder
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
class PtmCrawler(BaseCollectionCrawler):
    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"

    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )
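    # Hypothetical examples of headings the pattern above should match:
    #   "Tom 45, Nr 2 (2004)" -> volume=45, number=2, year=2004
    #   "Vol 52 (2012)"       -> volume=52, number=None, year=2012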
    language_detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()
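    # Built once at class level and shared by all instances; restricting
    # lingua to the languages expected in this collection keeps detection
    # fast and avoids spurious matches against unrelated languages.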
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()
    def download_file(self, url, force_refresh=False):
        content = super().download_file(url, force_refresh)
        # The site sometimes answers with a JavaScript/cookie challenge page
        # instead of the content; refresh the 'vjs' cookie and retry with the
        # cache bypassed.
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content
    def update_cookies(self):
        # Fetch the landing page and extract the anti-bot token from its
        # inline createCookie() call, then replay it as a regular cookie.
        script_content = super().download_file(self.source_website)
        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )
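    # The landing page is assumed to serve an inline script along these lines
    # (hypothetical reconstruction, matching the regex above; the token value
    # is made up):
    #   <script>createCookie('vjs','1712148970',60)</script>
    # Replaying that token as the 'vjs' cookie is what unlocks the real pages.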
    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_search = regex.search(self.issue_re, issue_tag.text)
            if not issue_search:
                raise ValueError("Couldn't parse issue data")
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            issue_data = issue_search.groupdict()
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues
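    # The collection page is assumed to list issues as (hypothetical markup,
    # inferred from the "#issue h4 a" selector above):
    #   <div id="issue"><h4><a href="...">Tom 45, Nr 2 (2004)</a></h4></div>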
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)
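    # Issue TOCs are assumed to wrap each article link as (hypothetical markup,
    # inferred from the ".tocTitle a" selector above):
    #   <div class="tocTitle"><a href="...">Article title</a></div>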
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        # The article page is a frameset; the real content lives in the
        # first frame, so follow its src before parsing.
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")
        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            print(f"Couldn't parse article: {xissue.pid}_{xarticle.pid} {real_url}. Skipping")
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)
        # DOI: the value sits next to a <strong> label inside the same parent,
        # so drop the label and read what remains.
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)
        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            # Skip the site's Polish placeholder ("The article contains no abstract").
            if abstract_text != "Artykuł nie zawiera streszczenia":
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract",
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )
        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')")
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            # A page range like "101-117"; a bare single page leaves lpage unset.
            pages_splitted = pages_tag.text.split("-")
            xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]
        # pdf
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")
            # The link points at the viewer page; swap in the direct download path.
            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)
        else:
            print(f"Couldn't find article pdf for article {xissue.pid}_{xarticle.pid}")
        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))
        # msc
        msc_header = soup.select_one("strong:-soup-contains-own('Subject classification:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))
        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')")
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        return xarticle
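# Minimal usage sketch (hypothetical: BaseCollectionCrawler's constructor
# arguments are not shown in this file, so the no-argument call below is an
# assumption -- adapt it to the real signature):
#
#     crawler = PtmCrawler()
#     html = crawler.download_file(crawler.source_website)
#     for xissue in crawler.parse_collection_content(html):
#         print(xissue.pid)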