Coverage for src/crawler/by_source/da_crawler.py: 55%

48 statements  

coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

 1  from bs4 import BeautifulSoup
 2  from ptf.external.arxiv import get_arxiv_article
 3  from ptf.external.crossref import get_crossref_articles_in_journal
 4  from ptf.model_data import create_issuedata
 5
 6  from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle
 7
 8
 9  class DaCrawler(BaseCollectionCrawler):
10      source_name = "Discrete Analysis website"
11      source_domain = "DA"
12      source_website = "https://discreteanalysisjournal.com"
13      periode_begin = 2016
14      periode_end = 2024
15
16      def parse_collection_content(self, content):

17 """ 

18 Discrete Analysis. 

19 We ignore the journal web page and query Crossref to get the list of articles. 

20 We query crossref for each article to get the list of xissues based on the publication date. 

21 Each xissue has its year + list of articles with their URLs 

22 """ 

23
24          what = ["published", "year", "primary_url"]
25          xarticles = get_crossref_articles_in_journal("2397-3129", what)
26
27          xissues = []
28          years = {}
29
30          for xarticle in xarticles:
31              year = str(xarticle.year)
32              if year not in years:
33                  xissue = create_issuedata()
34                  xissue.pid = self.collection_id + "_" + year + "__"
35                  xissue.year = year
36
37                  years[year] = xissue
38                  xissues.append(xissue)
39              else:
40                  xissue = years[year]
41
42              xissue.articles.append(xarticle)
43
44          return xissues
45
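The loop above is one-bucket-per-year grouping: the first article of a given year creates the xissue, and later articles reuse it. A minimal, self-contained sketch of the same pattern, with tuples and a plain dict standing in for ptf's ArticleData/IssueData (stand-ins for illustration, not the real classes):

    # Group (name, year) pairs into one issue dict per year,
    # mirroring the years/xissues bookkeeping in parse_collection_content.
    articles = [("a1", 2016), ("a2", 2016), ("a3", 2017)]

    issues_by_year = {}
    for name, year in articles:
        # setdefault creates the bucket on first sight, reuses it afterwards
        issue = issues_by_year.setdefault(str(year), {"year": str(year), "articles": []})
        issue["articles"].append(name)

    assert issues_by_year["2016"]["articles"] == ["a1", "a2"]
    assert issues_by_year["2017"]["articles"] == ["a3"]
    # dicts preserve insertion order, so the buckets come out in first-seen order
    assert [i["year"] for i in issues_by_year.values()] == ["2016", "2017"]

Since dicts preserve insertion order (Python 3.7+), list(years.values()) would reproduce the method's xissues list; the explicit list kept alongside the dict is equivalent and arguably more self-documenting.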

46      def parse_article_content(self, content, xissue, xarticle, url, pid):
47          """
48          Parse the content with BeautifulSoup and return an ArticleData.
49          """

50
51          # We only parse the arXiv id in the Discrete Analysis article page
52          soup = BeautifulSoup(content, "html.parser")
53
54          metadata_node = soup.find("div", {"class": "article-metadata"})
55          if metadata_node is None:  # 55 ↛ 56: branch never taken (condition was never true)
56              raise ValueError("metadata_node is None")
57
58          a_node = metadata_node.find("a", {"class": "outline-alt button"})
59          if a_node is None:  # 59 ↛ 60: branch never taken
60              raise ValueError("a_node is None")
61
62          href = a_node.get("href")
63          if not isinstance(href, str):  # 63 ↛ 64: branch never taken
64              raise ValueError("href is not a string")
65          id = href.split("/")[-1]
66
67          new_xarticle = get_arxiv_article(id)
68          if new_xarticle is None:  # 68 ↛ 69: branch never taken
69              raise ValueError("new_xarticle is None")
70
71          new_xarticle.pid = xarticle.pid
72          new_xarticle.doi = xarticle.doi
73          new_xarticle.lang = "en"
74          new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str
75
76          add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)
77
78          return new_xarticle
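The four ↛ annotations (55 ↛ 56, 59 ↛ 60, 63 ↛ 64, 68 ↛ 69) are the defensive guards that never fired during the recorded run, which is a large part of why the file sits at 55%. Below is a hedged sketch of the same BeautifulSoup lookup chain in isolation, showing how a degenerate page would exercise one of those branches. The CSS class names are taken from the method above; the HTML snippets and the arXiv id are invented placeholders.

    from bs4 import BeautifulSoup

    def extract_arxiv_id(content):
        # Same lookup chain as parse_article_content, isolated for illustration
        soup = BeautifulSoup(content, "html.parser")
        metadata_node = soup.find("div", {"class": "article-metadata"})
        if metadata_node is None:
            raise ValueError("metadata_node is None")
        a_node = metadata_node.find("a", {"class": "outline-alt button"})
        if a_node is None:
            raise ValueError("a_node is None")
        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")
        return href.split("/")[-1]

    # Well-formed page: every guard passes, the arXiv id is the last path segment
    good_page = (
        '<div class="article-metadata">'
        '<a class="outline-alt button" href="https://arxiv.org/abs/2101.00001">arXiv</a>'
        '</div>'
    )
    assert extract_arxiv_id(good_page) == "2101.00001"

    # Page without the metadata div: this is the 55 ↛ 56 branch
    try:
        extract_arxiv_id("<html><body>no metadata div</body></html>")
        raise AssertionError("expected ValueError")
    except ValueError as exc:
        assert str(exc) == "metadata_node is None"

Feeding pages like the second one through parse_article_content in a test would lift these branches, assuming a DaCrawler can be instantiated in isolation; the BaseCollectionCrawler constructor isn't shown in this report, so that part is left out.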