Coverage for src/crawler/by_source/nsjom_crawler.py: 71%
40 statements
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2025-01-15 14:09 +0000
1from ptf.model_data import IssueData
2from requests import Response
4from crawler.base_crawler import BaseCollectionCrawler
6from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
7from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
8from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
9from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
10from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
11from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml
# One collection-level parser per historical site layout; they are run in
# sequence and each extracts only the issues belonging to its era.
collection_crawlers = (
    parse_1971,  # 1971-2009: one HTML page per year
    parse_2010,  # 2010-2014: one HTML page per year
    parse_xml,   # 2015+: single NSJOM.xml file
)
class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement different crawlers for each layout.
    """

    source_name = "Novi sad journal of mathematics website"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    def parse_collection_content(self, content):
        """Parse the whole collection by running every layout-specific parser.

        Each crawler in ``collection_crawlers`` extracts the issues of the era
        it handles (bounded by ``self.periode_begin``/``self.periode_end``);
        the per-crawler results are concatenated into a single list.
        """
        xissues: list[IssueData] = []
        for crwlr in collection_crawlers:
            xissues = xissues + crwlr(
                self, content, self.periode_begin, self.periode_end, self.source_domain
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        """Dispatch issue parsing to the parser matching the issue's publication year.

        Raises:
            ValueError: if ``xissue.year`` is not set (the year picks the layout).
        """
        if not xissue.year:
            raise ValueError("Issue year is not set")
        if int(xissue.year) >= 2015:
            parse_issue_xml(self, content, xissue)
        elif int(xissue.year) >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Decode *response*, falling back to legacy Windows encodings.

        Tries ``encoding`` (utf-8 by default), then windows-1252, then
        windows-1250 — older pages of the site are not UTF-8 encoded.

        Raises:
            BufferError: if none of the attempted encodings can decode
                the response body.
        """
        attempts = (encoding, "windows-1252", "windows-1250")
        for index, current in enumerate(attempts):
            try:
                return super().decode_response(response, current)
            except UnicodeDecodeError:
                # Log and fall through to the next encoding, if any remains.
                if index + 1 < len(attempts):
                    print(
                        f"[{self.source_domain}] cannot parse resource using {current} : {response.url}. Attempting {attempts[index + 1]}"
                    )
        raise BufferError(
            f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
        )