Coverage for src/crawler/by_source/da_crawler.py: 54%

44 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from bs4 import BeautifulSoup
from ptf.external.arxiv import get_arxiv_article
from ptf.external.crossref import get_crossref_articles_in_journal
from ptf.model_data import create_issuedata

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class DaCrawler(BaseCollectionCrawler):
    source_name = "Discrete Analysis website"
    source_domain = "DAS"
    source_website = "https://discreteanalysisjournal.com"


    def parse_collection_content(self, content):
        """
        Discrete Analysis.
        We ignore the journal web page and query Crossref to get the list of articles.
        We then group the articles into one xissue per publication year.
        Each xissue has its year and the list of its articles, with their URLs.
        """


        # Fields to retrieve from Crossref; "2397-3129" is the ISSN of Discrete Analysis
        what = ["published", "year", "primary_url"]
        xarticles = get_crossref_articles_in_journal("2397-3129", what)

        xissues = []
        years = {}

        # Group the articles into one xissue per publication year
        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues


    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """


        # We only parse the arXiv id from the Discrete Analysis article page
        soup = BeautifulSoup(content, "html.parser")

        a_node = soup.select_one("div.main_entry a:-soup-contains-own('Read article')")
        if a_node is None:  # coverage: branch never taken in the measured run
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):  # coverage: branch never taken in the measured run
            raise ValueError("href is not a string")
        arxiv_id = href.split("/")[-1]
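        # e.g. an href such as "https://arxiv.org/abs/1611.00000" (hypothetical
        # value) would yield the arXiv identifier "1611.00000"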


        new_xarticle = get_arxiv_article(arxiv_id)
        if new_xarticle is None:  # coverage: branch never taken in the measured run
            raise ValueError("new_xarticle is None")
        # Keep the DOI, external links, URL and publication date from the
        # Discrete Analysis metadata; the rest of the metadata comes from arXiv
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle
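

# A minimal usage sketch, assuming BaseCollectionCrawler supplies the download
# machinery and a constructor that sets collection_id (assumptions, not shown
# in this file):
#
#     crawler = DaCrawler(collection_id="DA")
#     xissues = crawler.parse_collection_content("")  # content is ignored;
#     for xissue in xissues:                          # Crossref is queried instead
#         print(xissue.pid, len(xissue.articles))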