Coverage for src/crawler/by_source/nsjom_crawler.py: 71%
45 statements
from datetime import datetime

from crawler.base_crawler import BaseCollectionCrawler
from requests import Response

from ptf.model_data import IssueData

from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml

# Layout-specific collection parsers; parse_collection_content runs them all.
collection_crawlers = (
    parse_1971,
    parse_2010,
    parse_xml,
)


class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.
    - 1971 to 2009: one HTML page per year: https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year: https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: one XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement a different crawler for each layout.
    """

    source_name = "Novi Sad Journal of Mathematics (NSJOM)"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    periode_begin = 0
    periode_end = datetime.now().year

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Parses all articles from one XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml
        """
        xissues: list[IssueData] = []
        for crwlr in collection_crawlers:
            xissues += crwlr(
                self, content, self.periode_begin, self.periode_end, self.source_domain
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if int(xissue.year) >= 2015:
            parse_issue_xml(self, content, xissue)
        elif int(xissue.year) >= 2010:  # coverage: branch never taken during the recorded run
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Fall back to windows-1252, then windows-1250, when the response
        (e.g. an abstract) cannot be decoded with the requested encoding."""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding}: {response.url}. Attempting windows-1252"
            )
            try:
                return super().decode_response(response, "windows-1252")
            except UnicodeDecodeError:
                print(
                    f"[{self.source_domain}] cannot parse resource using windows-1252: {response.url}. Attempting windows-1250"
                )
                try:
                    return super().decode_response(response, "windows-1250")
                except UnicodeDecodeError:
                    raise BufferError(
                        f"[{self.source_domain}] cannot parse resource using windows-1250: {response.url}. Cannot read"
                    )
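

# The encoding-fallback chain above is easier to see in isolation. Below is a
# minimal, standalone sketch of the same strategy, assuming only the raw bytes
# of a response body; the helper name `_decode_with_fallbacks` is illustrative
# and is not part of this module or of BaseCollectionCrawler.
def _decode_with_fallbacks(
    raw: bytes,
    encodings: tuple[str, ...] = ("utf-8", "windows-1252", "windows-1250"),
) -> str:
    """Try each candidate encoding in order; raise if none can decode `raw`."""
    last_error: UnicodeDecodeError | None = None
    for enc in encodings:
        try:
            return raw.decode(enc)
        except UnicodeDecodeError as err:
            last_error = err
    raise BufferError(f"cannot decode resource with any of {encodings}") from last_error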