Coverage for src/crawler/by_source/scholastica_crawler.py: 25%

58 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-03-19 14:59 +0000

1import json 

2from urllib.parse import parse_qs, urlparse, urlunparse 

3 

4from bs4 import BeautifulSoup 

5from dateutil import parser 

6from django.conf import settings 

7from ptf.external.arxiv import get_arxiv_url, parse_arxiv_response 

8from ptf.model_data import IssueData, create_articledata 

9from pysolr import urlencode 

10 

11from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler 

12from crawler.utils import add_pdf_link_to_xarticle 

13 

14 

class ScholasticaCrawler(BaseCollectionCrawler):
    """Crawler for journals hosted on the Scholastica platform.

    The collection endpoint is a paginated JSON listing of articles; the
    individual article pages only expose a "Read article" link pointing at
    arXiv, so full article metadata is fetched from the arXiv API.
    """

    source_name = "Scholastica"
    source_domain = "SCHOLASTICA"
    source_website = "https://scholasticahq.com"

    # Honour the configured request interval, but never poll faster than
    # every 5 seconds (90 s default when the Django setting is absent).
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 5)

    def parse_collection_content(self, content):
        """Walk the paginated article listing and group articles by year.

        ``content`` is the JSON body of the first page reached through
        ``self.collection_url``, whose query string must carry ``offset``
        and ``per_page`` parameters used to advance the pagination.

        Returns a list of IssueData, one pseudo-issue per publication year.
        """
        # Canonical stdlib source of urlencode (pysolr merely re-exports it).
        from urllib.parse import urlencode

        xissues_years: "dict[int, IssueData]" = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)

        # Follow the pagination until the server returns an empty page.
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            # Advance offset by one page and re-issue the request.
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))

        for a in articles_dicts:
            xarticle = create_articledata()
            xarticle.url = a["url"]
            xarticle.date_published_iso_8601_date_str = a["published_at"]

            # One virtual issue per publication year; created lazily.
            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))

            # pid is the article's position inside its year-issue.
            xarticle.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(xarticle)

        return list(xissues_years.values())

    def parse_article_content(self, content, xissue, xarticle, url):
        """Parse an article page and return its ArticleData.

        The Scholastica page only carries a "Read article" link to arXiv
        (e.g. for Discrete Analysis), so the metadata is retrieved from the
        arXiv API and merged with the fields already collected from the
        listing (pid, doi, external links, publication date).

        Raises ValueError when the arXiv link or the arXiv response is
        missing or malformed.
        """
        soup = BeautifulSoup(content, "html.parser")

        a_node = soup.select_one("div#article-show-page a:-soup-contains-own('Read article')")
        if a_node is None:
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")
        # The arXiv identifier is the last path component of the link.
        # (Renamed from `id`, which shadowed the builtin.)
        arxiv_id = href.split("/")[-1]
        text = self.download_file(get_arxiv_url(arxiv_id))

        new_xarticle = parse_arxiv_response(text, arxiv_id)
        if new_xarticle is None:
            raise ValueError("new_xarticle is None")
        # Preserve the identifiers and links gathered from the collection
        # listing; arXiv only supplies title/authors/abstract/pdf.
        new_xarticle.pid = xarticle.pid
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str

        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle