Coverage for src/crawler/by_source/ptm_crawler.py: 23%
124 statements
coverage.py v7.8.2, created at 2025-06-03 13:39 +0000
1import lingua
2import regex
3from bs4 import BeautifulSoup, Tag
4from lingua import LanguageDetectorBuilder
5from ptf.cmds.xml.jats.jats_parser import JatsBase
6from ptf.model_data import (
7 ArticleData,
8 create_abstract,
9 create_articledata,
10 create_contributor,
11 create_subj,
12)
14from crawler.base_crawler import BaseCollectionCrawler
15from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
18class PtmCrawler(BaseCollectionCrawler):
19 source_name = "Annales Societatis Mathematicae Polonae Series "
20 source_domain = "PTM"
21 source_website = "https://wydawnictwa.ptm.org.pl/"
23 issue_re = (
24 r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"
25 )
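For reference, a minimal sketch of what this pattern accepts; the sample headings below are illustrative, derived from the regex itself rather than taken from the site:

    import regex

    issue_re = r"(?:Vol|Tom) (?P<volume>\d+)(?:, (?:No|Nr) (?P<number>[\d\-\/]+))? \((?P<year>\d{4})\)"

    # English-style heading with volume, number and year
    m = regex.search(issue_re, "Vol 45, No 1-2 (2010)")
    assert m and m.groupdict() == {"volume": "45", "number": "1-2", "year": "2010"}

    # Polish-style heading without an issue number: "number" stays None
    m = regex.search(issue_re, "Tom 38 (2002)")
    assert m and m.group("number") is None and m.group("year") == "2002"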
27 language_detector = LanguageDetectorBuilder.from_languages(
28 lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
29 ).build()
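The crawler goes through self.detect_language(), which is defined in the base class and assumed here to wrap this detector; queried directly, the lingua detector behaves roughly like this sketch:

    import lingua
    from lingua import LanguageDetectorBuilder

    detector = LanguageDetectorBuilder.from_languages(
        lingua.Language.ENGLISH, lingua.Language.POLISH, lingua.Language.RUSSIAN
    ).build()
    # detect_language_of returns a lingua.Language member, or None if undecided
    print(detector.detect_language_of("Artykuł nie zawiera streszczenia"))  # Language.POLISH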
31 def __init__(self, *args, **kwargs):
32 super().__init__(*args, **kwargs)
33 self.update_cookies()
35 def download_file(self, url, force_refresh=False):
36 content = super().download_file(url, force_refresh)
37 if (
38 "Access to this website is possible only using browser with JavaScript and Cookies enabled."
39 in content
40 ):
41 self.update_cookies()
42 return self.download_file(url, force_refresh=True)
43 return content
45 def update_cookies(self):
46 script_content = super().download_file(self.source_website)
47 cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
48 if not cookie_search:  [branch 48 ↛ 49 not taken: the condition on line 48 was never true]
49 raise ValueError("Couldn't set cookie for ptm")
50 self.headers.update(
51 {
52 "Cookie": f"vjs={cookie_search.group(1)}",
53 # "Accept-Language": "en-US,en;q=0.5",
54 "Accept-Encoding": "gzip, deflate, br, zstd",
55 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56 "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
57 }
58 )
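update_cookies() relies on the landing page serving a small JavaScript challenge that sets a "vjs" cookie. A minimal sketch of the extraction step, using a hypothetical script excerpt (the real page markup is not reproduced here):

    import regex

    # Hypothetical excerpt; only the createCookie('vjs', ...) call matters to the regex
    script_content = "<script>createCookie('vjs','1748955000',60);</script>"
    cookie_search = regex.search(r"createCookie\('vjs','(?P<cookie>\d+)',60\)", script_content)
    if cookie_search:
        headers = {"Cookie": f"vjs={cookie_search.group('cookie')}"}
        print(headers)  # {'Cookie': 'vjs=1748955000'}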
60 def parse_collection_content(self, content):
61 soup = BeautifulSoup(content, "html.parser")
62 xissues = []
63 for issue_tag in soup.select("#issue h4 a"):
64 issue_search = regex.search(self.issue_re, issue_tag.text)
65 if not issue_search:  [branch 65 ↛ 66 not taken: the condition on line 65 was never true]
66 raise ValueError("Couldn't parse issue data")
67 issue_url = issue_tag.get("href")
68 if not isinstance(issue_url, str):  [branch 68 ↛ 69 not taken: the condition on line 68 was never true]
69 raise ValueError("Couldn't parse issue url")
70 issue_data = issue_search.groupdict()
71 xissues.append(
72 self.create_xissue(
73 issue_url, issue_data["year"], issue_data["volume"], issue_data["number"]
74 )
75 )
76 return xissues
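A sketch of the markup shape parse_collection_content() expects, inferred from the "#issue h4 a" selector and the issue_re pattern; the HTML below is illustrative, not copied from the site:

    from bs4 import BeautifulSoup

    html = """
    <div id="issue">
      <h4><a href="https://wydawnictwa.ptm.org.pl/index.php/example/issue/view/123">Tom 45, Nr 1 (2010)</a></h4>
      <h4><a href="https://wydawnictwa.ptm.org.pl/index.php/example/issue/view/124">Vol 46 (2011)</a></h4>
    </div>
    """
    soup = BeautifulSoup(html, "html.parser")
    # each anchor yields (issue_url, heading text) for create_xissue
    print([(a.get("href"), a.text) for a in soup.select("#issue h4 a")])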
78 def parse_issue_content(self, content, xissue):
79 soup = BeautifulSoup(content, "html.parser")
80 for index, article in enumerate(soup.select(".tocTitle a")):
81 xarticle = create_articledata()
82 xarticle.pid = f"a{index}"
83 article_url = article.get("href")
84 if not isinstance(article_url, str):
85 raise ValueError("Couldn't parse article url")
86 xarticle.url = article_url
87 xissue.articles.append(xarticle)
89 def parse_article_content(self, content, xissue, xarticle, url):
90 soup = BeautifulSoup(content, "html.parser")
91 frame = soup.select_one("frameset frame:first-child")
92 if not frame:
93 raise ValueError("Couldn't parse article")
94 real_url = frame.get("src")
95 if not isinstance(real_url, str):
96 raise ValueError("Couldn't find article url")
98 content = self.download_file(real_url)
99 soup = BeautifulSoup(content, "html.parser")
101 # Title
102 title_tag = soup.select_one("#articleTitle")
103 if not title_tag:
104 print(f"Couldn't parse article : {xissue.pid}_{xarticle.pid} {real_url}. Skipping")
105 return None
107 xarticle.title_tex = cleanup_str(title_tag.text)
109 # DOI
110 doi_header = soup.select_one(
111 "strong:-soup-contains-own('Digital Object Identifier (DOI):')"
112 )
113 if doi_header:
114 doi_tag = doi_header.parent
115 doi_header.decompose()
116 xarticle.doi = cleanup_str(doi_tag.text)
118 # Abstract
119 abstract_tag = soup.select_one("#articleAbstract div")
120 if abstract_tag:
121 abstract_text = cleanup_str(abstract_tag.text)
122 if len(abstract_text) > 0 and abstract_text not in (
123 "Artykuł nie zawiera streszczenia",
124 "-",
125 ):
126 xarticle.abstracts.append(
127 create_abstract(
128 tag="abstract",
129 value_tex=abstract_text,
130 lang=self.detect_language(abstract_text),
131 )
132 )
134 # Pages
135 pages_header = soup.select_one("strong:-soup-contains-own('Pages:')") or soup.select_one(
136 "strong:-soup-contains-own('Strony:')"
137 )
138 if pages_header:
139 pages_tag = pages_header.parent
140 pages_header.decompose()
141 pages_splitted = pages_tag.text.split("-")
142 if len(pages_splitted) > 0:
143 xarticle.fpage = pages_splitted[0]
144 if len(pages_splitted) > 1:
145 xarticle.lpage = pages_splitted[1]
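The page-range handling splits on a single hyphen; illustrative values (not taken from the site):

    # "123-145" -> fpage "123", lpage "145"; a single page such as "17" only sets fpage
    for raw in ("123-145", "17"):
        parts = raw.split("-")
        fpage = parts[0]
        lpage = parts[1] if len(parts) > 1 else None
        print(fpage, lpage)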
147 # pdf
148 pdf_tag = soup.select_one("a.file")
149 if pdf_tag:
150 pdf_url = pdf_tag.get("href")
151 if not isinstance(pdf_url, str):
152 raise ValueError("Couldn't parse pdf url")
154 pdf_url = pdf_url.replace("/view/", "/download/")
155 add_pdf_link_to_xarticle(xarticle, pdf_url)
157 if "(ENGLISH)" in pdf_tag.text:
158 xarticle.lang = "en"
159 elif "(POLSKI)" in pdf_tag.text:
160 xarticle.lang = "pl"
161 else:
162 xarticle.lang = "pl"
163 else:
164 print(f"Couldn't find article pdf for article {xissue.pid}_{xarticle.pid}")
165 return None
167 # Authors
168 authors_tag = soup.select_one("#authorString")
169 if authors_tag:
170 for author in cleanup_str(authors_tag.text).split(", "):
171 xarticle.contributors.append(create_contributor(string_name=author, role="author"))
173 # msc
174 msc_header = soup.select_one(
175 "strong:-soup-contains-own('Subject classification:')"
176 ) or soup.select_one("strong:-soup-contains-own('Klasyfikacja tematyczna:')")
177 if msc_header:
178 msc_tag = msc_header.parent
179 msc_header.decompose()
180 for msc in cleanup_str(msc_tag.text).split("; "):
181 xarticle.kwds.append(create_subj(type="msc", value=msc))
183 # Keywords
184 kwd_header = soup.select_one("strong:-soup-contains-own('Keywords:')") or soup.select_one(
185 "strong:-soup-contains-own('Słowa kluczowe:')"
186 )
187 if kwd_header:
188 kwd_tag = kwd_header.parent
189 kwd_header.decompose()
190 for kwd in cleanup_str(kwd_tag.text).split("; "):
191 xarticle.kwds.append(create_subj(value=kwd))
193 # References
194 # Disabling references for now: PTM doesn't have a "clean" way to display references (e.g. https://wydawnictwa.ptm.org.pl/index.php/antiquitates-mathematicae/article/view/7321)
196 # refs_header = soup.select_one("h4:-soup-contains-own('References')") or soup.select_one(
197 # "h4:-soup-contains-own('Cytowania')"
198 # )
199 # if refs_header:
200 # refs_tag = refs_header.next_sibling.next_sibling
201 # if refs_tag and isinstance(refs_tag, Tag):
202 # self.parse_references(xarticle, refs_tag)
204 return xarticle
206 def parse_references(self, xarticle: ArticleData, references: Tag):
207 bibitems = []
208 # TODO: extensive parsing (authors, title, etc.)
209 # Currently, only the raw text of each reference is inserted
210 for ref in references.get_text(strip=True, separator="\n").splitlines():
211 bibitem = JatsBase.bake_ref(cleanup_str(ref))
212 bibitems.append(bibitem)
213 if len(bibitems) > 0:
214 xarticle.abstracts.append(JatsBase.compile_refs(bibitems))
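parse_references() treats each text line of the reference block as one bibliography item. A sketch of the line-extraction step on illustrative markup (the real OJS reference layout is assumed, not verified):

    from bs4 import BeautifulSoup

    refs = BeautifulSoup(
        "<div><p>A. Author, Some paper, 2001.</p><p>B. Author, Another paper, 2003.</p></div>",
        "html.parser",
    ).div
    # one entry per reference line, ready to be passed to JatsBase.bake_ref
    print(refs.get_text(strip=True, separator="\n").splitlines())
    # ['A. Author, Some paper, 2001.', 'B. Author, Another paper, 2003.']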