Coverage for src/crawler/by_source/nsjom_crawler.py: 71%
40 statements
« prev ^ index » next coverage.py v7.7.0, created at 2025-04-03 12:36 +0000
1from ptf.model_data import IssueData
2from requests import Response
4from crawler.base_crawler import BaseCollectionCrawler
6from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
7from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
8from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
9from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
10from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
11from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml
# Parsers for the three NSJOM source layouts (1971-2009 pages, 2010-2014
# pages, XML feed from 2015 on); each yields the issues for its own years.
collection_crawlers = (parse_1971, parse_2010, parse_xml)
class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement different crawlers for each layout.
    """

    source_name = "Novi sad journal of mathematics website"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    def parse_collection_content(self, content):
        """Parse the whole collection by running every layout-specific parser
        and concatenating their results.

        Each parser in ``collection_crawlers`` only yields the issues belonging
        to its own year range, so the concatenation covers all years.
        """
        xissues: list[IssueData] = []
        for crawler in collection_crawlers:
            xissues += crawler(self, content, self.source_domain)
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        """Dispatch issue parsing to the layout-specific parser selected by
        the issue's publication year.

        Raises:
            ValueError: if ``xissue.year`` is not set.
        """
        if not xissue.year:
            raise ValueError("Issue year is not set")
        year = int(xissue.year)  # hoisted: parsed once instead of per branch
        if year >= 2015:
            parse_issue_xml(self, content, xissue)
        elif year >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Decode ``response`` with *encoding*, falling back to windows-1252
        and then windows-1250 when the payload cannot be decoded.

        Raises:
            BufferError: if none of the attempted encodings can decode the
                response (kept as BufferError for caller compatibility).
        """
        # Try the requested encoding first, then the legacy fallbacks the
        # NSJOM site is known to serve.
        fallbacks = ["windows-1252", "windows-1250"]
        current = encoding
        while True:
            try:
                return super().decode_response(response, current)
            except UnicodeDecodeError as err:
                if not fallbacks:
                    raise BufferError(
                        f"[{self.source_domain}] cannot parse resource using {current} : {response.url}. Cannot read"
                    ) from err
                next_encoding = fallbacks.pop(0)
                print(
                    f"[{self.source_domain}] cannot parse resource using {current} : {response.url}. Attempting {next_encoding}"
                )
                current = next_encoding