Coverage for src/crawler/by_source/nsjom_crawler.py: 71%

45 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from datetime import datetime

from crawler.base_crawler import BaseCollectionCrawler
from requests import Response

from ptf.model_data import IssueData

from .nsjom.nsjom_1971_crawler import parse_collection_content as parse_1971
from .nsjom.nsjom_1971_crawler import parse_issue_content as parse_issue_1971
from .nsjom.nsjom_2010_crawler import parse_collection_content as parse_2010
from .nsjom.nsjom_2010_crawler import parse_issue_content as parse_issue_2010
from .nsjom.nsjom_xml_crawler import parse_collection_content as parse_xml
from .nsjom.nsjom_xml_crawler import parse_issue_content as parse_issue_xml

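# One collection-level parser per layout era; parse_collection_content runs them all.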
collection_crawlers = (
    parse_1971,
    parse_2010,
    parse_xml,
)


class NsjomCrawler(BaseCollectionCrawler):
    """NSJOM has multiple source layouts for articles depending on the publication year.

    - 1971 to 2009: one HTML page per year, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    - 2010 to 2014: one HTML page per year, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    - 2015 to today: an XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml

    We have to implement a different crawler for each layout.
    """
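
    # Year-based dispatch (see parse_issue_content below):
    #   year >= 2015  -> parse_issue_xml
    #   2010 to 2014  -> parse_issue_2010
    #   before 2010   -> parse_issue_1971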

    source_name = "Novi Sad Journal of Mathematics (NSJOM)"
    source_domain = "NSJOM"
    source_website = "https://sites.dmi.uns.ac.rs/nsjom/"

    periode_begin = 0
    periode_end = datetime.now().year

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.source = self.get_or_create_source()
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """Parses all articles from one XML file: https://sites.dmi.uns.ac.rs/nsjom/NSJOM.xml"""
        xissues: list[IssueData] = []
        for crawler in collection_crawlers:
            xissues += crawler(
                self, content, self.periode_begin, self.periode_end, self.source_domain
            )
        return xissues

    def parse_issue_content(self, content: str, xissue: IssueData):
        if int(xissue.year) >= 2015:
            parse_issue_xml(self, content, xissue)
        # coverage.py reported this elif branch as never taken in the recorded run
        elif int(xissue.year) >= 2010:
            parse_issue_2010(self, content, xissue)
        else:
            parse_issue_1971(self, content, xissue)

    def decode_response(self, response: Response, encoding: str = "utf-8"):
        """Fall back to windows-1252, then windows-1250, if the response cannot be decoded."""
        try:
            return super().decode_response(response, encoding)
        except UnicodeDecodeError:
            print(
                f"[{self.source_domain}] cannot parse resource using {encoding}: {response.url}. Attempting windows-1252"
            )
            try:
                return super().decode_response(response, "windows-1252")
            except UnicodeDecodeError:
                print(
                    f"[{self.source_domain}] cannot parse resource using windows-1252: {response.url}. Attempting windows-1250"
                )
                try:
                    return super().decode_response(response, "windows-1250")
                except UnicodeDecodeError:
                    raise BufferError(
                        f"[{self.source_domain}] cannot parse resource using windows-1250: {response.url}. Cannot read"
                    )
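

# A minimal, self-contained sketch of the same fallback technique, shown here for
# clarity only (not part of the crawler source; the helper name and signature are
# illustrative assumptions): try each candidate encoding in turn on a bytes payload.
def _decode_with_fallback(payload: bytes, encodings=("utf-8", "windows-1252", "windows-1250")) -> str:
    for enc in encodings:
        try:
            return payload.decode(enc)
        except UnicodeDecodeError:
            continue
    raise BufferError(f"could not decode payload with any of {encodings}")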