Coverage for src/crawler/by_source/scholastica_crawler.py: 24% (57 statements)

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import json
from urllib.parse import parse_qs, urlparse, urlunparse

from bs4 import BeautifulSoup
from dateutil import parser
from django.conf import settings
from ptf.external.arxiv import get_arxiv_url, parse_arxiv_response
from ptf.model_data import IssueData, create_articledata
from pysolr import urlencode

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class ScholasticaCrawler(BaseCollectionCrawler):
    source_name = "Scholastica"
    source_domain = "SCHOLASTICA"
    source_website = "https://scholasticahq.com"

    # Interval between requests: settings.REQUESTS_INTERVAL if set (default 90), never below 5.
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 5)

    def parse_collection_content(self, content):
        xissues_years: "dict[int, IssueData]" = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)

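        # Page through the Scholastica JSON feed: advance the "offset" query
        # parameter by "per_page" and re-download until a response comes back
        # with an empty "articles" list.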
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))

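        # Group the collected articles into one issue per publication year.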
        for a in articles_dicts:
            xarticle = create_articledata()
            xarticle.url = a["url"]
            xarticle.date_published_iso_8601_date_str = a["published_at"]

            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))

            xarticle.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(xarticle)

        return list(xissues_years.values())

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData
        """

        # We only parse the arXiv id in the Discrete Analysis article page
        soup = BeautifulSoup(content, "html.parser")

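        # The "Read article" link points to arXiv; its last path segment is the arXiv identifier.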
        a_node = soup.select_one("div#article-show-page a:-soup-contains-own('Read article')")
        if a_node is None:
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")
        id = href.split("/")[-1]
        text = self.download_file(get_arxiv_url(id))

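        # Build the article metadata from the arXiv response, then restore the
        # identifiers, links and date already gathered from Scholastica.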
        new_xarticle = parse_arxiv_response(text, id)
        if new_xarticle is None:
            raise ValueError("new_xarticle is None")
        new_xarticle.pid = xarticle.pid
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle
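
For context, parse_collection_content assumes the configured collection_url already carries "offset" and "per_page" query parameters and that each page of the feed is a JSON object with an "articles" list. A minimal sketch of the pagination arithmetic, using urllib.parse directly (the crawler takes the same urlencode helper via pysolr) and a hypothetical feed URL:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Hypothetical collection URL; real ones come from the crawler configuration.
url = "https://example.scholasticahq.com/api/articles?offset=0&per_page=20"

parsed = urlparse(url)
query = parse_qs(parsed.query)

# Advance the offset exactly as parse_collection_content does.
query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
next_url = urlunparse(parsed._replace(query=urlencode(query, True)))

print(next_url)
# https://example.scholasticahq.com/api/articles?offset=20&per_page=20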