Coverage for src/crawler/by_source/advc_crawler.py: 29%

31 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

import json
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

from dateutil import parser
from ptf.model_data import IssueData, create_articledata

from crawler.by_source.da_crawler import DaCrawler


class AdvcCrawler(DaCrawler):
    source_name = "Advances in Combinatorics website"
    source_domain = "ADVC"
    source_website: str = "https://www.advancesincombinatorics.com/"

    def parse_collection_content(self, content):
        # Articles are grouped into one virtual issue per publication year.
        xissues_years: dict[int, IssueData] = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)

        # Page through the JSON API until it returns an empty article list,
        # advancing the "offset" query parameter by "per_page" on each request.
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))

        for a in articles_dicts:
            article = create_articledata()
            article.url = a["url"]
            article.date_published_iso_8601_date_str = a["published_at"]

            # Create the year's issue on first sight, then number the articles
            # sequentially within it ("a0", "a1", ...).
            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))

            article.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(article)

        return list(xissues_years.values())
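
With roughly 9 of the 31 statements executed (29%), only the module imports and class attributes are covered; the pagination loop and the per-year grouping run untested. A minimal pytest sketch along the following lines could drive parse_collection_content end to end. The API path, the payload shapes, and the stubbed download_file/create_xissue behaviour are illustrative assumptions, not part of the crawler code shown.

import json
from types import SimpleNamespace
from unittest.mock import patch

from crawler.by_source.advc_crawler import AdvcCrawler


def test_parse_collection_content_groups_articles_by_year():
    # Build the crawler without running __init__, whose required arguments
    # live in DaCrawler and are not shown here (assumption: no __slots__).
    crawler = object.__new__(AdvcCrawler)
    # Hypothetical API URL; only the offset/per_page parameters matter.
    crawler.collection_url = (
        "https://www.advancesincombinatorics.com/api/articles?offset=0&per_page=2"
    )

    # First page holds two articles from different years; the empty second
    # page terminates the pagination loop.
    page_1 = json.dumps({"articles": [
        {"url": "https://example.org/a1", "published_at": "2019-05-01"},
        {"url": "https://example.org/a2", "published_at": "2020-01-15"},
    ]})
    page_2 = json.dumps({"articles": []})

    with (
        patch.object(AdvcCrawler, "download_file", return_value=page_2),
        # Stand-in for create_xissue: anything with an .articles list works
        # for this method, so avoid guessing at ptf internals.
        patch.object(
            AdvcCrawler,
            "create_xissue",
            side_effect=lambda *a, **k: SimpleNamespace(articles=[]),
        ),
    ):
        xissues = crawler.parse_collection_content(page_1)

    # One synthetic issue per publication year, each holding one article.
    assert len(xissues) == 2
    assert all(len(issue.articles) == 1 for issue in xissues)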