Coverage for src / crawler / by_source / compositio_crawler.py: 25%
42 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-06-19 13:33 +0000
1from urllib.parse import urljoin
3import requests
4from ptf.model_data import create_articledata
6from crawler.abstract_crawlers.matching_crawler import MatchingCrawler
7from crawler.utils import add_pdf_link_to_xarticle
class CompositioCrawler(MatchingCrawler):
    """Crawler that reads volume/issue/article metadata from a GraphQL API.

    Both parsing hooks ignore the ``content`` argument they receive and
    instead query ``api_url`` directly with ``requests``.
    """

    source_name = "Fondation Compositio Mathematica"
    source_domain = "COMPOSITIO"
    source_website = "https://algebraicgeometry.nl/"
    api_url = "https://api.algebraicgeometry.nl/graphql"

    # Headers sent with every API request (browser-like Origin/Referer/User-Agent).
    # Shared by all queries so the two parsing hooks cannot drift apart.
    _api_headers = {
        "Content-Type": "application/json",
        "Origin": "https://algebraicgeometry.nl",
        "Referer": "https://algebraicgeometry.nl/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0",
    }

    # Seconds to wait for the API before giving up. ``requests`` has no default
    # timeout, so without this a stalled connection would block the crawl forever.
    _api_timeout = 60

    def _graphql_request(self, query, variables=None):
        """POST a GraphQL query to ``api_url`` and return the ``data`` payload.

        Args:
            query: GraphQL query document.
            variables: Optional mapping of GraphQL variables. When ``None`` the
                ``variables`` key is omitted from the request body.

        Returns:
            The value of the ``data`` key of the decoded JSON response.

        Raises:
            requests.HTTPError: The API answered with a 4xx/5xx status.
            requests.Timeout: The API did not answer within ``_api_timeout``.
            ValueError: The response carries no ``data`` (e.g. the query failed).
        """
        payload = {"query": query}
        if variables is not None:
            payload["variables"] = variables

        response = requests.post(
            self.api_url,
            json=payload,
            headers=self._api_headers,
            timeout=self._api_timeout,
        )
        # Fail on HTTP errors here rather than on a confusing JSON/KeyError later.
        response.raise_for_status()

        result = response.json()
        data = result.get("data")
        if data is None:
            raise ValueError(
                f"GraphQL request to {self.api_url} returned no data: {result.get('errors')}"
            )
        return data

    def parse_collection_content(self, content):
        """Build one xissue per issue of every volume listed by the API.

        Args:
            content: Unused; the volume list is fetched from the GraphQL API.

        Returns:
            A list of the objects returned by ``self.create_xissue``, one per
            issue, in the order the API lists volumes and issues.
        """
        query = """
            query GetVolumes {
                volumes {
                    id
                    year
                    issues {
                        id
                        issueNumber
                        date
                    }
                }
            }
        """
        data = self._graphql_request(query)

        xissues = []
        for volume in data["volumes"]:
            year = volume["year"]
            # NOTE(review): the API's volume *id* is what gets passed as the
            # volume argument of create_xissue — confirm it matches the
            # published volume number.
            volume_id = volume["id"]
            for issue in volume["issues"]:
                xissues.append(
                    self.create_xissue(
                        # The issue id is the last URL segment; parse_issue_content
                        # reads it back from there.
                        urljoin(self.source_website, f"issues/{issue['id']}"),
                        str(year),
                        str(volume_id),
                        str(issue["issueNumber"]),
                    )
                )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Fetch the published articles of ``xissue`` and append them to it.

        Args:
            content: Unused; the article list is fetched from the GraphQL API.
            xissue: Issue object to fill. Its ``url`` must end with the numeric
                issue id (``.../issues/<id>``); ``articles`` is extended in place.

        Returns:
            None.

        Raises:
            ValueError: The last segment of ``xissue.url`` is not an integer,
                or the API returned no data.
        """
        query = """
            query getPublishedIssue($id: Int) {
                issue(id: $id) {
                    volume
                    issueNumber
                    date
                    year
                    publishedArticles {
                        id
                        startPage
                        endPage
                        title
                        authors
                        url
                    }
                }
            }
        """
        issue_id = int(xissue.url.split("/")[-1])
        data = self._graphql_request(query, {"id": issue_id})
        issue = data["issue"]

        # Articles are numbered from 1 in API order to build a pid per article.
        for position, article in enumerate(issue["publishedArticles"], start=1):
            xarticle = create_articledata()
            xarticle.title_tex = article["title"]
            xarticle.authors = article["authors"]
            # Every article points at the issue page, not an article-specific page.
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(xarticle, article["url"])
            xarticle.pid = (
                f"compositio_{xissue.year}_{xissue.volume}_{xissue.number}_{position}"
            )
            xissue.articles.append(xarticle)