Coverage for src/crawler/by_source/advc_crawler.py: 29%
31 statements
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000
import json
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

from dateutil import parser
from ptf.model_data import IssueData, create_articledata

from crawler.by_source.da_crawler import DaCrawler

class AdvcCrawler(DaCrawler):
    source_name = "Advances in Combinatorics website"
    source_domain = "ADVC"
    source_website: str = "https://www.advancesincombinatorics.com/"

    def parse_collection_content(self, content):
        xissues_years: dict[int, IssueData] = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)
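
        # Page through the JSON API: self.collection_url is expected to carry
        # "offset" and "per_page" query parameters; each iteration advances
        # the offset by one page until an empty batch of articles is returned.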
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))
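
        # Group the collected articles into one virtual issue per publication
        # year, numbering each article sequentially within its year.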
        for a in articles_dicts:
            article = create_articledata()
            article.url = a["url"]
            article.date_published_iso_8601_date_str = a["published_at"]

            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))

            article.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(article)

        return list(xissues_years.values())
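
For reference, the pagination step above works by rewriting the URL's query string in place. The following is a minimal standalone sketch of that technique using only the standard library; the endpoint URL and page size are hypothetical, chosen purely for illustration:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Hypothetical paginated endpoint; "offset" and "per_page" mirror the query
# parameters the crawler expects to find in self.collection_url.
url = urlparse("https://example.org/api/articles?offset=0&per_page=20")
query = parse_qs(url.query)

# Advance the offset by one page, exactly as the while-loop above does.
query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
next_url = urlunparse(url._replace(query=urlencode(query, doseq=True)))

print(next_url)  # https://example.org/api/articles?offset=20&per_page=20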