Coverage for src/crawler/by_source/arsia_crawler.py: 91%

29 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from crawler.by_source.da_crawler import DaCrawler

from ptf.external.datacite import get_datacite_articles_in_journal
from ptf.model_data import create_issuedata

class ArsiaCrawler(DaCrawler):
    source_name = "Ars Inveniendi Analytica website"
    source_domain = "ARSIA"
    source_website = "https://ars-inveniendi-analytica.com/"
    periode_begin = 2021
    periode_end = 2024

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode = self.get_or_create_periode()

        # self.has_dynamic_article_pages = True

    def parse_collection_content(self, content):
        """
        Ars Inveniendi Analytica.
        We ignore the journal web page and query DataCite to get the list of articles.
        The articles are then grouped into xissues based on their publication year.
        Each xissue has its year + the list of articles with their URLs.
        """

        what = ["published", "year", "primary_url"]
        xarticles = get_datacite_articles_in_journal("Ars Inveniendi Analytica", what)

        xissues = []
        years = {}

        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues
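
For context, the grouping loop in parse_collection_content can be read in isolation. Below is a minimal, self-contained sketch of the same year-grouping pattern; ArticleStub and IssueStub are hypothetical stand-ins for the ptf ArticleData and IssueData objects, and the sample years are made up.

# Minimal sketch of the year-grouping pattern used in parse_collection_content.
# ArticleStub and IssueStub are hypothetical stand-ins for the ptf model_data
# objects; the sample input years below are made up for illustration.
from dataclasses import dataclass, field


@dataclass
class ArticleStub:
    year: int


@dataclass
class IssueStub:
    pid: str
    year: str
    articles: list = field(default_factory=list)


def group_by_year(xarticles, collection_id="ARSIA"):
    # One issue per publication year; each article joins its year's issue.
    xissues = []
    years = {}
    for xarticle in xarticles:
        year = str(xarticle.year)
        if year not in years:
            xissue = IssueStub(pid=f"{collection_id}_{year}__", year=year)
            years[year] = xissue
            xissues.append(xissue)
        else:
            xissue = years[year]
        xissue.articles.append(xarticle)
    return xissues


sample = [ArticleStub(2021), ArticleStub(2021), ArticleStub(2022)]
print([(i.pid, len(i.articles)) for i in group_by_year(sample)])
# [('ARSIA_2021__', 2), ('ARSIA_2022__', 1)]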