Coverage for src / crawler / by_source / compositio_crawler.py: 25%

42 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-06-19 13:33 +0000

1from urllib.parse import urljoin 

2 

3import requests 

4from ptf.model_data import create_articledata 

5 

6from crawler.abstract_crawlers.matching_crawler import MatchingCrawler 

7from crawler.utils import add_pdf_link_to_xarticle 

8 

9 

class CompositioCrawler(MatchingCrawler):
    """Crawler that lists issues and articles through the site's GraphQL API.

    Both parsing hooks ignore the ``content`` argument they receive and
    query ``api_url`` directly instead.
    """

    source_name = "Fondation Compositio Mathematica"
    source_domain = "COMPOSITIO"
    source_website = "https://algebraicgeometry.nl/"
    api_url = "https://api.algebraicgeometry.nl/graphql"

    # Headers sent with every GraphQL request. They mirror what a browser
    # visiting the website would send.
    # NOTE(review): presumably the API rejects requests without them - confirm.
    _graphql_headers = {
        "Content-Type": "application/json",
        "Origin": "https://algebraicgeometry.nl",
        "Referer": "https://algebraicgeometry.nl/",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0",
    }

    # Seconds to wait for the API before giving up; without a timeout
    # ``requests`` would block forever on a stalled connection.
    _graphql_timeout = 60

    def _post_graphql(self, query, variables=None):
        """POST a GraphQL query to ``api_url`` and return the ``data`` member.

        Args:
            query: GraphQL document to execute.
            variables: optional mapping of GraphQL variables.

        Returns:
            The value stored under the ``"data"`` key of the JSON response.

        Raises:
            requests.HTTPError: if the API answers with an error status.
            requests.Timeout: if the API does not answer in time.
        """
        payload = {"query": query}
        if variables is not None:
            payload["variables"] = variables

        response = requests.post(
            self.api_url,
            json=payload,
            headers=self._graphql_headers,
            timeout=self._graphql_timeout,
        )
        # Fail loudly on HTTP errors rather than on a confusing KeyError later.
        response.raise_for_status()
        return response.json()["data"]

    def parse_collection_content(self, content):
        """Build one xissue per issue of every volume returned by the API.

        Args:
            content: unused; the volume list is fetched from the GraphQL API.

        Returns:
            A list of the objects returned by ``self.create_xissue``, one per
            issue, in the order the API lists volumes and issues.
        """
        query = """
        query GetVolumes {
            volumes {
                id
                year
                issues {
                    id
                    issueNumber
                    date
                }
            }
        }
        """

        volumes = self._post_graphql(query)["volumes"]

        xissues = []
        for volume in volumes:
            year = volume["year"]
            volume_id = volume["id"]
            for issue in volume["issues"]:
                xissues.append(
                    self.create_xissue(
                        # The issue id is the last URL segment;
                        # parse_issue_content reads it back from there.
                        urljoin(self.source_website, f"issues/{issue['id']}"),
                        str(year),
                        str(volume_id),
                        str(issue["issueNumber"]),
                    )
                )
        return xissues

    def parse_issue_content(self, content, xissue):
        """Append one article to ``xissue.articles`` per published article.

        Args:
            content: unused; the issue is fetched from the GraphQL API.
            xissue: issue to populate. Its ``url`` must end with the numeric
                issue id, as produced by ``parse_collection_content``.

        Raises:
            ValueError: if the last segment of ``xissue.url`` is not an integer.
        """
        query = """
        query getPublishedIssue($id: Int) {
            issue(id: $id) {
                volume
                issueNumber
                date
                year
                publishedArticles {
                    id
                    startPage
                    endPage
                    title
                    authors
                    url
                }
            }
        }
        """

        # Tolerate a trailing slash when reading the id back from the URL.
        issue_id = int(xissue.url.rstrip("/").rsplit("/", 1)[-1])
        issue = self._post_graphql(query, {"id": issue_id})["issue"]

        # Articles are numbered from 1 in API order to build a unique pid.
        for index, article in enumerate(issue["publishedArticles"], start=1):
            xarticle = create_articledata()
            xarticle.title_tex = article["title"]
            xarticle.authors = article["authors"]
            # Every article points at the issue page, not an article page.
            xarticle.url = xissue.url
            add_pdf_link_to_xarticle(xarticle, article["url"])
            xarticle.pid = (
                f"compositio_{xissue.year}_{xissue.volume}_{xissue.number}_{index}"
            )
            xissue.articles.append(xarticle)