# Coverage for src/crawler/by_source/nsjom_crawler.py: 71% (40 statements)
# coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

from ptf.model_data import IssueData
from requests import Response

from crawler.base_crawler import BaseCollectionCrawler

from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml

# One parse_collection_content per historical site layout (1971 HTML,
# 2010 HTML, post-2015 XML); NsjomCrawler runs every one of them and
# concatenates their results.
collection_crawlers = (
    parse_1971,
    parse_2010,
    parse_xml,
)

18 

19 

class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement different crawlers for each layout.
    """

    source_name = "Novi sad journal of mathematics website"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    def parse_collection_content(self, content):
        """Parse the whole collection by running every layout-specific crawler.

        Each entry in ``collection_crawlers`` handles one of the three
        historical site layouts; their results are concatenated.

        :param content: raw collection page content passed to each crawler
        :returns: combined list of IssueData from all layout crawlers
        """
        xissues: list[IssueData] = []
        for crawler in collection_crawlers:
            xissues.extend(crawler(self, content, self.source_domain))
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        """Dispatch issue parsing to the crawler matching the issue's year.

        :param content: raw issue page content
        :param xissue: issue whose ``year`` selects the layout parser
        :raises ValueError: if ``xissue.year`` is not set
        """
        if not xissue.year:
            raise ValueError("Issue year is not set")
        year = int(xissue.year)
        if year >= 2015:
            parse_issue_xml(self, content, xissue)
        elif year >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Decode the response, falling back to legacy Windows encodings
        if we cannot read the content with the requested one.

        Tries ``encoding`` first, then windows-1252, then windows-1250
        (some NSJOM pages appear to be served with a wrong or missing
        charset declaration).

        :raises BufferError: when every encoding attempt fails
        """
        attempts = (encoding, "windows-1252", "windows-1250")
        for index, enc in enumerate(attempts):
            try:
                return super().decode_response(response, enc)
            except UnicodeDecodeError:
                if index + 1 < len(attempts):
                    # Same message as before: report the failure and the
                    # next encoding we are about to try.
                    print(
                        f"[{self.source_domain}] cannot parse resource using {enc} : {response.url}. Attempting {attempts[index + 1]}"
                    )
                else:
                    # Exhausted all encodings; keep the original exception
                    # type so existing callers' handlers still match.
                    raise BufferError(
                        f"[{self.source_domain}] cannot parse resource using {enc} : {response.url}. Cannot read"
                    )