Coverage for src/crawler/by_source/da_crawler.py: 54% of 47 statements
from bs4 import BeautifulSoup
from ptf.external.arxiv import get_arxiv_article
from ptf.external.crossref import get_crossref_articles_in_journal
from ptf.model_data import create_issuedata

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class DaCrawler(BaseCollectionCrawler):
    source_name = "Discrete Analysis website"
    source_domain = "DA"
    source_website = "https://discreteanalysisjournal.com"

    def parse_collection_content(self, content):
        """
        Discrete Analysis.
        We ignore the journal web page and query Crossref to get the list of articles.
        The articles are grouped into xissues based on their publication date.
        Each xissue has its year + the list of articles with their URLs.
        """
        what = ["published", "year", "primary_url"]
        xarticles = get_crossref_articles_in_journal("2397-3129", what)

        xissues = []
        years = {}

        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                # First article seen for this year: create the xissue.
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues
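
    # Illustrative sketch, not part of the crawler: with articles from, say,
    # 2016 and 2017, the grouping above yields one xissue per year, roughly
    #   xissues[0].pid      == self.collection_id + "_2016__"
    #   xissues[0].year     == "2016"
    #   xissues[0].articles == [<ArticleData>, ...]
    # (collection_id is assumed to be provided by BaseCollectionCrawler).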

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        # We only parse the arXiv id in the Discrete Analysis article page
        soup = BeautifulSoup(content, "html.parser")

        metadata_node = soup.find("div", {"class": "article-metadata"})
        if metadata_node is None:
            raise ValueError("metadata_node is None")

        a_node = metadata_node.find("a", {"class": "outline-alt button"})
        if a_node is None:
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")
        # The arXiv id is the last path segment of the button link.
        arxiv_id = href.split("/")[-1]

        # Fetch the article metadata from arXiv, then graft the Discrete
        # Analysis-specific fields (DOI, external links, URL, date) onto it.
        new_xarticle = get_arxiv_article(arxiv_id)
        if new_xarticle is None:
            raise ValueError("new_xarticle is None")
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle
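

# Minimal usage sketch (assumptions: BaseCollectionCrawler normally drives
# these hooks itself, and its constructor signature is not shown here, so the
# instantiation below is hypothetical):
#
#     crawler = DaCrawler()
#     xissues = crawler.parse_collection_content(collection_html)
#     for xissue in xissues:
#         for xarticle in xissue.articles:
#             crawler.parse_article_content(article_html, xissue, xarticle, xarticle.url)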