Coverage for src/crawler/by_source/ptm_crawler.py: 13% (118 statements)
import lingua
import regex
from bs4 import BeautifulSoup, Tag
from lingua import LanguageDetectorBuilder
from ptf.cmds.xml.jats.jats_parser import JatsBase
from ptf.model_data import (
    ArticleData,
    create_abstract,
    create_articledata,
    create_contributor,
    create_subj,
)

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

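# Crawler for the journal series published by PTM (Polskie Towarzystwo Matematyczne,
# the Polish Mathematical Society) at https://wydawnictwa.ptm.org.pl/. The collection
# page lists issues, each issue page lists its articles, and each article page is a
# frameset wrapping the actual article view.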
class PtmCrawler(BaseCollectionCrawler):
    source_name = "Annales Societatis Mathematicae Polonae Series "
    source_domain = "PTM"
    source_website = "https://wydawnictwa.ptm.org.pl/"
    is_checkable = False

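    # issue_re matches issue headings on the collection page, e.g. "Tom 45, Nr 2 (2017)"
    # or "Vol 45 (2017)"; the issue number part is optional.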
    issue_re = (
        r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
    )

    _language_detector_builder = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.update_cookies()

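    # Overridden to work around the site's anti-bot interstitial: when a response only
    # says that JavaScript and cookies are required, refresh the 'vjs' cookie and retry
    # the download with force_refresh=True.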
    def download_file(self, url, force_refresh=False):
        content = super().download_file(url, force_refresh)
        if (
            "Access to this website is possible only using browser with JavaScript and Cookies enabled."
            in content
        ):
            self.update_cookies()
            return self.download_file(url, force_refresh=True)
        return content

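    # The landing page embeds a script calling createCookie('vjs', <value>, 60). Since the
    # crawler does not run JavaScript, the cookie value is extracted with a regex and sent
    # explicitly, along with browser-like request headers.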
    def update_cookies(self):
        script_content = super().download_file(self.source_website)
        cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
        if not cookie_search:
            raise ValueError("Couldn't set cookie for ptm")
        self.headers.update(
            {
                "Cookie": f"vjs={cookie_search.group(1)}",
                # "Accept-Language": "en-US,en;q=0.5",
                "Accept-Encoding": "gzip, deflate, br, zstd",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
            }
        )

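    # The collection page lists one link per issue under '#issue h4 a'; the link text holds
    # the volume, number and year, which issue_re extracts to build the issue metadata.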
    def parse_collection_content(self, content):
        soup = BeautifulSoup(content, "html.parser")
        xissues = []
        for issue_tag in soup.select("#issue h4 a"):
            issue_url = issue_tag.get("href")
            if not isinstance(issue_url, str):
                raise ValueError("Couldn't parse issue url")
            issue_data = regex_to_dict(
                self.issue_re,
                issue_tag.text,
                error_msg="Couldn't parse issue data",
            )
            xissues.append(
                self.create_xissue(
                    issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
                )
            )
        return xissues

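    # An issue's table of contents lists its articles under '.tocTitle a'; only the article
    # URL is recorded here, the remaining metadata is gathered in parse_article_content.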
    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html.parser")
        for index, article in enumerate(soup.select(".tocTitle a")):
            xarticle = create_articledata()
            xarticle.pid = f"a{index}"
            article_url = article.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article url")
            xarticle.url = article_url
            xissue.articles.append(xarticle)

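    # The article landing page is a frameset whose first frame points to the real article
    # view; that page is downloaded and scraped for title, DOI, abstract, page range, PDF
    # link, authors, MSC codes and keywords.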
    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html.parser")
        frame = soup.select_one("frameset frame:first-child")
        if not frame:
            raise ValueError("Couldn't parse article")
        real_url = frame.get("src")
        if not isinstance(real_url, str):
            raise ValueError("Couldn't find article url")

        content = self.download_file(real_url)
        soup = BeautifulSoup(content, "html.parser")

        # Title
        title_tag = soup.select_one("#articleTitle")
        if not title_tag:
            self.logger.debug(
                "Couldn't parse article. Skipping", extra={"pid": xarticle.pid, "url": real_url}
            )
            return None
        xarticle.title_tex = cleanup_str(title_tag.text)

        # DOI
        doi_header = soup.select_one(
            "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
        )
        if doi_header:
            doi_tag = doi_header.parent
            doi_header.decompose()
            xarticle.doi = cleanup_str(doi_tag.text)

        # Abstract
        abstract_tag = soup.select_one("#articleAbstract div")
        if abstract_tag:
            abstract_text = cleanup_str(abstract_tag.text)
            if len(abstract_text) > 0 and abstract_text not in (
                "Artykuł nie zawiera streszczenia",
                "-",
            ):
                xarticle.abstracts.append(
                    create_abstract(
                        value_tex=abstract_text,
                        lang=self.detect_language(abstract_text),
                    )
                )

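        # 'Artykuł nie zawiera streszczenia' is Polish for 'the article contains no
        # abstract', hence the placeholder filter above.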
        # Pages
        pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
            "strong:-soup-contains-own('Strony:')"
        )
        if pages_header:
            pages_tag = pages_header.parent
            pages_header.decompose()
            pages_splitted = pages_tag.text.split("-")
            if len(pages_splitted) > 0:
                xarticle.fpage = pages_splitted[0]
            if len(pages_splitted) > 1:
                xarticle.lpage = pages_splitted[1]

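        # The page range is printed as "fpage-lpage"; when only a single value is
        # present, lpage is left unset.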
        # pdf
        pdf_tag = soup.select_one("a.file")
        if pdf_tag:
            pdf_url = pdf_tag.get("href")
            if not isinstance(pdf_url, str):
                raise ValueError("Couldn't parse pdf url")

            pdf_url = pdf_url.replace("/view/", "/download/")
            add_pdf_link_to_xarticle(xarticle, pdf_url)

            if "(ENGLISH)" in pdf_tag.text:
                xarticle.lang = "en"
            elif "(POLSKI)" in pdf_tag.text:
                xarticle.lang = "pl"
            else:
                xarticle.lang = "pl"
        else:
            self.logger.debug("Couldn't find article pdf", extra={"pid": xarticle.pid})
            return None

        # Authors
        authors_tag = soup.select_one("#authorString")
        if authors_tag:
            for author in cleanup_str(authors_tag.text).split(", "):
                xarticle.contributors.append(create_contributor(string_name=author, role="author"))

        # msc
        msc_header = soup.select_one(
            "strong:-soup-contains-own('Subject classification:')"
        ) or soup.select_one("strong:-soup-contains-own('Kklasyfikacja tematyczna:')")
        if msc_header:
            msc_tag = msc_header.parent
            msc_header.decompose()
            for msc in cleanup_str(msc_tag.text).split("; "):
                xarticle.kwds.append(create_subj(type="msc", value=msc))

        # Keywords
        kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
            "strong:-soup-contains-own('Słowa kluczowe:')"
        )
        if kwd_header:
            kwd_tag = kwd_header.parent
            kwd_header.decompose()
            for kwd in cleanup_str(kwd_tag.text).split("; "):
                xarticle.kwds.append(create_subj(value=kwd))

        # References
        # Disabling references for now: PTM doesn't have a "clean" way to display references
        # (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)
        # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
        #     "h4:-soup-contains-own('Cytowania')"
        # )
        # if refs_header:
        #     refs_tag = refs_header.next_sibling.next_sibling
        #     if refs_tag and isinstance(refs_tag, Tag):
        #         self.parse_references(xarticle, refs_tag)

        return xarticle

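    # Currently unused: its only call site, in the commented-out references block above,
    # is disabled. Each reference line is kept as raw text via JatsBase.bake_ref.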
    def parse_references(self, xarticle: ArticleData, references: Tag):
        # TODO: extensive parsing (authors, title, etc.)
        # Currently, only the raw text of each reference is inserted
        for ref in references.get_text(strip=True, separator="\n").splitlines():
            xarticle.bibitems.append(JatsBase.bake_ref(cleanup_str(ref)))