Coverage for src/crawler/by_source/scholastica_crawler.py: 24%
57 statements
import json
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

from bs4 import BeautifulSoup
from dateutil import parser
from django.conf import settings
from ptf.external.arxiv import get_arxiv_url, parse_arxiv_response
from ptf.model_data import IssueData, create_articledata

from crawler.base_crawler import BaseCollectionCrawler, add_pdf_link_to_xarticle


class ScholasticaCrawler(BaseCollectionCrawler):
    source_name = "Scholastica"
    source_domain = "SCHOLASTICA"
    source_website = "https://scholasticahq.com"
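
    # Pause between HTTP requests: honour settings.REQUESTS_INTERVAL
    # (defaulting to 90 s) but never drop below 5 s.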
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 5)

    def parse_collection_content(self, content):
        xissues_years: "dict[int, IssueData]" = {}
        articles_dicts = []
        parsed_url = urlparse(self.collection_url)
        query = parse_qs(parsed_url.query)
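
        # Walk the paginated JSON API: accumulate the articles from each page,
        # advance `offset` by `per_page`, and stop at the first empty page.
        # (`content` is the first page; collection_url must carry both params.)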
        while True:
            data = json.loads(content)
            if len(data["articles"]) == 0:
                break
            articles_dicts.extend(data["articles"])
            query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
            parsed_url = parsed_url._replace(query=urlencode(query, True))
            content = self.download_file(urlunparse(parsed_url))
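
        # No issue information is parsed here: articles are grouped into one
        # pseudo-issue per publication year.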
        for a in articles_dicts:
            xarticle = create_articledata()
            xarticle.url = a["url"]
            xarticle.date_published_iso_8601_date_str = a["published_at"]

            year = parser.parse(a["published_at"]).year
            xissues_years.setdefault(year, self.create_xissue(None, str(year), None, None))
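
            # The article pid is its positional index within the year's issue.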
            xarticle.pid = f"a{len(xissues_years[year].articles)}"

            xissues_years[year].articles.append(xarticle)

        return list(xissues_years.values())

    def parse_article_content(self, content, xissue, xarticle, url):
        """
        Parse the content with BeautifulSoup and return an ArticleData.
        """

        # Only the arXiv id is parsed from the article page: Discrete Analysis
        # is an arXiv overlay journal, so the metadata comes from arXiv itself.
        soup = BeautifulSoup(content, "html.parser")

        a_node = soup.select_one("div#article-show-page a:-soup-contains-own('Read article')")
        if a_node is None:
            raise ValueError("Couldn't find the 'Read article' link")

        href = a_node.get("href")
        if not isinstance(href, str):
            raise ValueError("The 'Read article' link has no href")
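        # The last path segment of the link is the arXiv identifier.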
        arxiv_id = href.split("/")[-1]
        text = self.download_file(get_arxiv_url(arxiv_id))
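
        # Rebuild the article data from the arXiv API response, then copy the
        # collection-specific fields back onto it.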
        new_xarticle = parse_arxiv_response(text, arxiv_id)
        if new_xarticle is None:
            raise ValueError(f"Couldn't parse the arXiv response for {arxiv_id}")
        new_xarticle.pid = xarticle.pid
        new_xarticle.doi = xarticle.doi
        new_xarticle.ext_links = xarticle.ext_links
        new_xarticle.url = url
        new_xarticle.lang = "en"
        new_xarticle.date_published_iso_8601_date_str = xarticle.date_published_iso_8601_date_str
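
        # Register the PDF link parsed from the arXiv response.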
        add_pdf_link_to_xarticle(new_xarticle, new_xarticle.pdf_url)

        return new_xarticle
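
# For reference, the offset pagination above as a standalone round trip
# (illustrative sketch only; the URL is made up, stdlib only):
#
#     from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
#
#     def next_page_url(url: str) -> str:
#         parsed = urlparse(url)
#         query = parse_qs(parsed.query)
#         query["offset"] = [str(int(query["offset"][0]) + int(query["per_page"][0]))]
#         return urlunparse(parsed._replace(query=urlencode(query, True)))
#
#     next_page_url("https://example.org/api/articles?offset=0&per_page=20")
#     # -> 'https://example.org/api/articles?offset=20&per_page=20'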