Coverage for src/crawler/by_source/nsjom/nsjom_crawler.py: 71%
40 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
1from ptf.model_data import IssueData
2from requests import Response
4from crawler.base_crawler import BaseCollectionCrawler
6from .nsjom_1971_crawler import parse_collection_content as parse_1971
7from .nsjom_1971_crawler import parse_issue_content as parse_issue_1971
8from .nsjom_2010_crawler import parse_collection_content as parse_2010
9from .nsjom_2010_crawler import parse_issue_content as parse_issue_2010
10from .nsjom_xml_crawler import parse_collection_content as parse_xml
11from .nsjom_xml_crawler import parse_issue_content as parse_issue_xml
# Collection parsers for each historical site layout, tried in order:
# 1971-2009 per-year HTML, 2010-2014 per-year HTML, then the 2015+ XML feed.
collection_crawlers = (parse_1971, parse_2010, parse_xml)
class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending of the publication year.
    - 1971 to 2009: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today : an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement different crawlers for each layout.
    """

    source_name = "Novi sad journal of mathematics website"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    def parse_collection_content(self, content):
        """Parse the collection by running every layout-specific parser over
        *content* and concatenating the issues each one finds.

        One parser per layout (see `collection_crawlers`); each returns a
        (possibly empty) list of IssueData for the years it handles.
        """
        xissues: list[IssueData] = []
        for crwlr in collection_crawlers:
            xissues = xissues + crwlr(self, content, self.source_domain)
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        """Dispatch issue parsing to the layout-specific parser based on the
        issue's publication year.

        Raises:
            ValueError: if `xissue.year` is not set (dispatch is impossible).
        """
        if not xissue.year:
            raise ValueError("Issue year is not set")
        # Convert once; the original converted on every comparison.
        year = int(xissue.year)
        if year >= 2015:
            parse_issue_xml(self, content, xissue)
        elif year >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Decode *response*, falling back to legacy Windows encodings.

        Tries *encoding* first, then windows-1252, then windows-1250 — some
        historical NSJOM pages are served in those encodings and would
        otherwise make the abstract unreadable.

        Raises:
            BufferError: if none of the encodings can decode the content
                (chained to the last UnicodeDecodeError for debuggability).
        """
        attempts = (encoding, "windows-1252", "windows-1250")
        last_err = None
        for i, enc in enumerate(attempts):
            try:
                return super().decode_response(response, enc)
            except UnicodeDecodeError as err:
                last_err = err
                if i + 1 < len(attempts):
                    self.logger.debug(
                        f"Cannot parse resource using {enc}. Attempting {attempts[i + 1]}",
                        extra={"url": response.url},
                    )
        raise BufferError(
            f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
        ) from last_err