Coverage for src / crawler / by_source / arsia_crawler.py: 21%
46 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-03-19 14:59 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-03-19 14:59 +0000
1from bs4 import BeautifulSoup
2from ptf.external.arxiv import get_arxiv_article
3from ptf.external.datacite import get_datacite_articles_in_journal
4from ptf.model_data import create_issuedata
6from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
7from crawler.utils import add_pdf_link_to_xarticle
class ArsiaCrawler(BaseCollectionCrawler):
    """Crawler for the Ars Inveniendi Analytica journal.

    The journal web pages are not scraped for the collection listing;
    instead DataCite is queried for the article records, and each article
    page is only parsed to extract the arXiv identifier from which the
    full metadata is fetched.
    """

    source_name = "Ars Inveniendi Analytica website"
    source_domain = "ARSIA"
    source_website = "https://ars-inveniendi-analytica.com/"

    def parse_collection_content(self, content):
        """
        Ars Inveniendi Analytica.
        We ignore the journal web page and query DataCite to get the list of articles.
        The articles are grouped into one xissue per publication year.
        Each xissue has its year + list of articles with their URLs.
        """
        what = ["published", "year", "primary_url"]
        xarticles = get_datacite_articles_in_journal("Ars Inveniendi Analytica", what)

        # Some DataCite records point at the journal homepage rather than
        # a real article page; drop those.
        xarticles = [article for article in xarticles if article.url != self.source_website]

        xissues = []
        years = {}  # year (str) -> xissue, so each year is created only once

        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.

        The article page is only used to find the arXiv identifier; the
        bibliographic metadata itself comes from arXiv.

        Raises ValueError if the arXiv link cannot be found or the arXiv
        record cannot be fetched.
        """
        # We only parse the arXiv id in the Ars Inveniendi Analytica article page
        soup = BeautifulSoup(content, "html.parser")

        a_node = soup.select_one("div.main_entry a:-soup-contains-own('Read article')")
        if a_node is None:
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")

        # The arXiv identifier is the last path component of the link.
        arxiv_id = href.split("/")[-1]

        new_xarticle = get_arxiv_article(arxiv_id)
        if new_xarticle is None:
            raise ValueError("new_xarticle is None")

        # Keep DOI, external links and publication date from the DataCite
        # record; only titles/authors/abstract come from arXiv.
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle