# src/crawler/by_source/ejc_crawler.py


from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata

from crawler.matching_crawler import MatchingCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class EjcCrawler(MatchingCrawler):
    source_name = "The Electronic Journal of Combinatorics website"
    source_domain = "EJC"
    source_website = "https://www.combinatorics.org"
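    # EJC appears to be hosted on OJS (Open Journal Systems); the selectors
    # used below (".obj_issue_summary", ".obj_article_summary",
    # ".galleys_links") target standard OJS page markup.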

    # Parse a single page of the paginated issue archive.
    def parse_ejc_collection_page(self, soup: "Tag"):
        xissues = []
        xissues_tags = soup.select(".obj_issue_summary a.title")
        for tag in xissues_tags:
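            # The regex accepts archive headings such as
            # "Volume 31, Issue 2 (2024)" or "Volume 2 (1995) (Festschrift)";
            # the issue number, a year range like "(2014-15)", and a trailing
            # parenthesised title are all optional.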

            try:
                volume_data = regex_to_dict(
                    pattern=r"Volume (?P<volume>\d+)(?:, Issue (?P<issue>\d+))? \( ?(?P<year>\d{4})(?:[-\d]+)? ?\)(?: \((?P<title>[\w ]+)\))?",
                    value=tag.text,
                )
            except ValueError:
                self.logger.warning(f"Couldn't parse issue with name {cleanup_str(tag.text)}")
                continue
            xissue = self.create_xissue(
                url=self.get_str_attr(tag, "href"),
                year=volume_data["year"],
                volume_number=volume_data["volume"],
                issue_number=volume_data["issue"],
            )
            if volume_data.get("title", ""):
                xissue.title_tex = volume_data["title"]
            xissues.append(xissue)
        return xissues

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        next_page = soup.select_one("a.next")
        xissues.extend(self.parse_ejc_collection_page(soup))
        # Handle pagination
        while next_page is not None:
            content = self.download_file(self.get_str_attr(next_page, "href"))
            soup = BeautifulSoup(content, "html5lib")
            next_page = soup.select_one("a.next")
            xissues.extend(self.parse_ejc_collection_page(soup))
        return xissues

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        sections = soup.select(".section")
        article_number = 0
        for section in sections:
            articles = section.select("ul.articles li .obj_article_summary .title a")
            atype = ""
            atype_tag = section.select_one("h2")
            if atype_tag:
                atype = atype_tag.text
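            # Article pids are numbered sequentially across the whole issue
            # ("a0", "a1", ...), not reset per section.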

            for article_tag in articles:
                xarticle = create_articledata()
                xarticle.url = self.get_str_attr(article_tag, "href")
                xarticle.pid = f"a{article_number}"
                xarticle.atype = atype
                article_number += 1
                xissue.articles.append(xarticle)

        return super().parse_issue_content(content, xissue)

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["author", "doi", "title"])
        self.get_metadata_using_dcterms(xarticle, soup, ("date_published", "article_type"))
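        # These helpers presumably read the Highwire-style <meta name="citation_*">
        # and Dublin Core metadata tags embedded in the article page head.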

        # Abstract
        abstract_tag = soup.select_one(".abstract")
        if not abstract_tag:
            raise ValueError("Cannot find abstract")
        label = abstract_tag.select_one(".label")
        if label:
            # Drop the "Abstract" heading so it doesn't end up in the abstract text.
            label.decompose()
        xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text)))

        # Article number
        article_number_label = soup.select_one(
            ".sub_item label:-soup-contains-own('Article Number')"
        )
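        # ":-soup-contains-own" is a soupsieve CSS extension (not standard CSS):
        # it matches elements whose own text contains the given substring.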

        if article_number_label:
            # The container is the label's parent (".sub_item"), not the soup root:
            # soup.parent is always None, so the original lookup could never succeed.
            article_number_container = article_number_label.parent
            if not article_number_container:
                raise ValueError("Couldn't find article number container")
            article_number_tag = article_number_container.select_one(".value .pages")
            if not article_number_tag:
                raise ValueError("Couldn't find article number")
            xarticle.article_number = cleanup_str(article_number_tag.text)

        # PDF and other galley links
        links_tags = soup.select(".galleys_links a")
        for tag in links_tags:
            tag_text = cleanup_str(tag.text)
            if tag_text.lower().startswith("comment"):
                self.logger.warning(f"COMMENT file ignored for article at {xarticle.url}")
                continue
            if tag_text in ("CODE", "Sage Package"):
                self.logger.warning(f"CODE file ignored for article at {xarticle.url}")
                continue
            if tag_text == "Source Code":
                self.logger.warning(f"Source Code file ignored for article at {xarticle.url}")
                continue
            if tag_text.lower().startswith("data"):
                self.logger.warning(f"DATA file ignored for article at {xarticle.url}")
                continue

            if "pdf" in tag.get("class", []):
                mimetype = "application/pdf"
            elif "file" in tag.get("class", []):
                mimetype = "text/html"
            else:
                raise ValueError(f"Couldn't get mimetype for class {tag.get('class', [])}")
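            # "pdf" / "file" appear to be the CSS classes OJS assigns to its
            # galley download buttons; any other class is treated as unexpected.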

            if tag_text.startswith("PDF"):
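                # OJS serves the raw file at ".../article/download/...", while
                # ".../article/view/..." returns an HTML viewer page, hence the
                # URL rewrite below.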

                pdf_url = self.get_str_attr(tag, "href")
                pdf_url = pdf_url.replace("article/view/", "article/download/")
                add_pdf_link_to_xarticle(xarticle, pdf_url)
                continue
            if tag_text == "HTML":
                add_pdf_link_to_xarticle(
                    xarticle, self.get_str_attr(tag, "href"), mimetype=mimetype
                )
                continue

            if tag_text in ("Supplementary File", "Supplementary material"):
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": tag_text,
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "supplementary-material",
                    }
                )
                continue
            if tag_text.lower().startswith("appendix"):
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Appendix",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "appendix",
                    }
                )
                continue
            if tag_text.lower().startswith("addendum"):
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Addendum",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "addendum",
                    }
                )
                continue

            raise ValueError(f"Unimplemented file {tag_text} for {xarticle.url}")

        return xarticle
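
# A minimal usage sketch (hypothetical: the MatchingCrawler constructor and the
# crawl entry point are defined elsewhere in this repo, so the calls below are
# assumptions, not the documented API):
#
#     crawler = EjcCrawler(...)                         # constructor args unknown
#     html = crawler.download_file(archive_url)         # OJS issue archive page
#     xissues = crawler.parse_collection_content(html)  # one xissue per volume/issue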