Coverage for src/crawler/by_source/nsjom_crawler.py: 71%

40 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1from ptf.model_data import IssueData 

2from requests import Response 

3 

4from crawler.base_crawler import BaseCollectionCrawler 

5 

6from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971 

7from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971 

8from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010 

9from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010 

10from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml 

11from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml 

12 

# One collection-level parser per historical site layout (1971-2009 HTML,
# 2010-2014 HTML, 2015+ XML); NsjomCrawler.parse_collection_content runs
# each of them in turn and concatenates their results.
collection_crawlers = (
    parse_1971,
    parse_2010,
    parse_xml,
)

18 

19 

class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement different crawlers for each layout.
    """

    source_name = "Novi sad journal of mathematics website"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    def parse_collection_content(self, content):
        """Collect issues from every layout-specific crawler.

        Each entry of ``collection_crawlers`` handles one historical site
        layout; their results are concatenated into a single list.

        Returns:
            list[IssueData]: all issues found by the layout crawlers.
        """
        xissues: list[IssueData] = []
        for crwlr in collection_crawlers:
            xissues.extend(
                crwlr(self, content, self.periode_begin, self.periode_end, self.source_domain)
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        """Dispatch issue parsing to the crawler matching the issue's era.

        Raises:
            ValueError: if ``xissue.year`` is not set.
        """
        if not xissue.year:
            raise ValueError("Issue year is not set")
        year = int(xissue.year)  # hoisted: avoid converting twice
        if year >= 2015:
            parse_issue_xml(self, content, xissue)
        elif year >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Force windows-1250 encoding if we cannot read the abstract.

        Tries the requested encoding first, then falls back to the legacy
        Windows encodings used by older NSJOM pages.

        Raises:
            BufferError: if none of the known encodings can decode the body
                (chained to the last ``UnicodeDecodeError``).
        """
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding} : {response.url}. Attempting windows-1252"
            )
        try:
            return super().decode_response(response, "windows-1252")
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using windows-1252 : {response.url}. Attempting windows-1250"
            )
        try:
            return super().decode_response(response, "windows-1250")
        except UnicodeDecodeError as err:
            # Preserve the original decode failure for debugging.
            raise BufferError(
                f"[{self.source_domain}] cannot parse resource using windows-1250 : {response.url}. Cannot read"
            ) from err