Coverage for src/crawler/by_source/arsia_crawler.py: 100%

24 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

from ptf.external.datacite import get_datacite_articles_in_journal
from ptf.model_data import create_issuedata

from crawler.by_source.da_crawler import DaCrawler


class ArsiaCrawler(DaCrawler):
    source_name = "Ars Inveniendi Analytica website"
    source_domain = "ARSIA"
    source_website = "https://ars-inveniendi-analytica.com/"

    # def __init__(self, *args, **kwargs):
    #     # We want to skip the init of DaCrawler and go straight to BaseCollectionCrawler
    #     super(DaCrawler, self).__init__(*args, **kwargs)

    def parse_collection_content(self, content):

        """
        Ars Inveniendi Analytica.
        We ignore the journal web page and query DataCite to get the list of articles.
        The articles are grouped into xissues based on their publication year.
        Each xissue has its year + list of articles with their URLs.
        """


        what = ["published", "year", "primary_url"]
        xarticles = get_datacite_articles_in_journal("Ars Inveniendi Analytica", what)

        # Skip the entry that points to the journal home page itself
        xarticles = [article for article in xarticles if article.url != self.source_website]
        xissues = []
        years = {}

        # Group the articles into one xissue per publication year
        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues
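
For reference, a minimal, self-contained sketch of the same year-grouping idea. Plain dicts stand in for the ptf issue objects returned by create_issuedata(); the group_articles_by_year helper, the sample articles, and the "ARSIA" collection id are illustrative assumptions, not part of the crawler.

# Sketch of the "one xissue per publication year" grouping, with plain dicts.
def group_articles_by_year(articles, collection_id="ARSIA"):
    issues = []
    years = {}
    for article in articles:
        year = str(article["year"])
        if year not in years:
            # First article seen for this year: create the issue, with a pid such as "ARSIA_2020__"
            issue = {"pid": collection_id + "_" + year + "__", "year": year, "articles": []}
            years[year] = issue
            issues.append(issue)
        else:
            issue = years[year]
        issue["articles"].append(article)
    return issues


if __name__ == "__main__":
    sample = [
        {"year": 2020, "url": "https://ars-inveniendi-analytica.com/article/1"},
        {"year": 2021, "url": "https://ars-inveniendi-analytica.com/article/2"},
        {"year": 2020, "url": "https://ars-inveniendi-analytica.com/article/3"},
    ]
    for issue in group_articles_by_year(sample):
        print(issue["pid"], len(issue["articles"]), "article(s)")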