Coverage for src/crawler/by_source/scholastica_crawler.py: 25% (58 statements)
import json
from urllib.parse import parse_qs, urlparse, urlunparse

from bs4 import BeautifulSoup
from dateutil import parser
from django.conf import settings
from ptf.external.arxiv import get_arxiv_url, parse_arxiv_response
from ptf.model_data import IssueData, create_articledata
from pysolr import urlencode

from crawler.abstract_crawlers.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle


class ScholasticaCrawler(BaseCollectionCrawler):
    source_name = "Scholastica"
    source_domain = "SCHOLASTICA"
    source_website = "https://scholasticahq.com"
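
    # Respect the configured REQUESTS_INTERVAL setting (default 90s), but never
    # poll the server faster than once every 5 seconds.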
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 5)

    def parse_collection_content(self, content):
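        """Page through the collection's JSON article listing and group the
        articles into one issue per publication year."""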
        xissues_years: "dict[int, IssueData]" = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)
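
        # The listing API is paginated via the "offset"/"per_page" query
        # parameters: fetch page after page until one comes back empty.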
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))
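
        # Bucket each article into a pseudo-issue keyed by its publication year.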
        for a in articles_dicts:
            xarticle = create_articledata()
            xarticle.url = a["url"]
            xarticle.date_published_iso_8601_date_str = a["published_at"]

            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))
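
            # Derive the pid from the article's position within its year's issue.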
            xarticle.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(xarticle)

        return list(xissues_years.values())

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """
        # We only parse the arXiv id from the Discrete Analysis article page
        soup = BeautifulSoup(content, "html.parser")

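        # :-soup-contains-own is a soupsieve pseudo-class that matches elements
        # whose own text contains the given string.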
        a_node = soup.select_one("div#article-show-page a:-soup-contains-own('Read article')")
        if a_node is None:
            raise ValueError("a_node is None")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("href is not a string")
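        # The link's last path segment is the arXiv identifier.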
        arxiv_id = href.split("/")[-1]
        text = self.download_file(get_arxiv_url(arxiv_id))

        new_xarticle = parse_arxiv_response(text, arxiv_id)
        if new_xarticle is None:
            raise ValueError("new_xarticle is None")
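        # Carry over the identifiers and links already collected from the
        # collection page; the remaining metadata comes from the arXiv response.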
        new_xarticle.pid = xarticle.pid
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str
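
        # Register the article's PDF url (set by parse_arxiv_response) as a
        # downloadable PDF link.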
        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle