Coverage for src/crawler/by_source/da_crawler.py: 51%

51 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

from bs4 import BeautifulSoup

from crawler.base_crawler import BaseCollectionCrawler
from crawler.base_crawler import add_pdf_link_to_xarticle

from ptf.external.arxiv import get_arxiv_article
from ptf.external.crossref import get_crossref_articles_in_journal
from ptf.model_data import create_issuedata


class DaCrawler(BaseCollectionCrawler):
    source_name = "Discrete Analysis website"
    source_domain = "DA"
    source_website = "https://discreteanalysisjournal.com"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.source = self.get_or_create_source()

        self.periode_begin = 2016
        self.periode_end = 2024
        self.periode = self.get_or_create_periode()

    def parse_collection_content(self, content):
        """
        Discrete Analysis.
        We ignore the journal web page and query Crossref to get the list of articles.
        The articles are then grouped by publication year into xissues.
        Each xissue has its year and the list of its articles with their URLs.
        """

        what = ["published", "year", "primary_url"]
        xarticles = get_crossref_articles_in_journal("2397-3129", what)

        xissues = []
        years = {}

        for xarticle in xarticles:
            year = str(xarticle.year)
            if year not in years:
                xissue = create_issuedata()
                xissue.pid = self.collection_id + "_" + year + "__"
                xissue.year = year

                years[year] = xissue
                xissues.append(xissue)
            else:
                xissue = years[year]

            xissue.articles.append(xarticle)

        return xissues

    def parse_article_content(self, content, xissue, xarticle, url, pid):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """

        # We only parse the arXiv id in the Discrete Analysis article page
        soup = BeautifulSoup(content, "html.parser")

        metadata_node = soup.find("div", {"class": "article-metadata"})
        if metadata_node is None:  # partial branch: condition was never true in the coverage run
            return None

        a_node = metadata_node.find("a", {"class": "outline-alt button"})
        if a_node is None:  # partial branch: condition was never true in the coverage run
            return None

        href = a_node.get("href")
        id = href.split("/")[-1]

        new_xarticle = get_arxiv_article(id)
        if new_xarticle is None:  # partial branch: condition was never true in the coverage run
            return None

        new_xarticle.pid = xarticle.pid
        new_xarticle.doi = xarticle.doi
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle
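
For reference, a minimal sketch of the two string manipulations exercised above: the issue pid built in parse_collection_content and the arXiv id extracted from the metadata link in parse_article_content. The collection id "DA" and the href value are hypothetical example inputs, not taken from a real crawl.

# Hedged illustration only; "DA" and the href below are assumed example values.
collection_id = "DA"
year = "2016"
pid = collection_id + "_" + year + "__"
assert pid == "DA_2016__"

href = "https://arxiv.org/abs/1509.03622"  # hypothetical arXiv abstract URL
arxiv_id = href.split("/")[-1]
assert arxiv_id == "1509.03622"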