Coverage for src/crawler/by_source/nsjom_crawler.py: 71%

45 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from datetime import datetime

from crawler.base_crawler import BaseCollectionCrawler
from requests import Response

from ptf.model_data import IssueData

from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml

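# One collection-level parser per layout era; parse_collection_content runs them all.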
collection_crawlers = (
    parse_1971,
    parse_2010,
    parse_xml,
)


class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement a different crawler for each layout.
    """
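
    # Year-based dispatch (see parse_issue_content below):
    #   year >= 2015  -> parse_issue_xml
    #   2010 to 2014  -> parse_issue_2010
    #   before 2010   -> parse_issue_1971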

    source_name = "Novi Sad Journal of Mathematics (NSJOM)"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    periode_begin = 0
    periode_end = datetime.now().year

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """Parses all articles from one XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"""
        xissues: list[IssueData] = []
        for crawler in collection_crawlers:
            xissues += crawler(
                self, content, self.periode_begin, self.periode_end, self.source_domain
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if int(xissue.year) >= 2015:
            parse_issue_xml(self, content, xissue)
        # coverage.py reported this elif branch as never taken in the recorded run
        elif int(xissue.year) >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Fall back to windows-1252, then windows-1250, if the response cannot be decoded."""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding}: {response.url}. Attempting windows-1252"
            )
            try:
                return super().decode_response(response, "windows-1252")
            except UnicodeDecodeError:
                print(
                    f"[{self.source_domain}] cannot parse resource using windows-1252: {response.url}. Attempting windows-1250"
                )
                try:
                    return super().decode_response(response, "windows-1250")
                except UnicodeDecodeError:
                    raise BufferError(
                        f"[{self.source_domain}] cannot parse resource using windows-1250: {response.url}. Cannot read"
                    )
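

# A minimal, self-contained sketch of the same fallback technique, shown here for
# clarity only (not part of the crawler source; the helper name and signature are
# illustrative assumptions): try each candidate encoding in turn on a bytes payload.
def _decode_with_fallback(payload: bytes, encodings=("utf-8", "windows-1252", "windows-1250")) -> str:
    for enc in encodings:
        try:
            return payload.decode(enc)
        except UnicodeDecodeError:
            continue
    raise BufferError(f"could not decode payload with any of {encodings}")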