Coverage for src/crawler/by_source/ejc_crawler.py: 7% (113 statements)
from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata

from crawler.matching_crawler import MatchingCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class EjcCrawler(MatchingCrawler):
    source_name = "The Electronic Journal of Combinatorics website"
    source_domain = "EJC"
    source_website = "https://www.combinatorics.org"

    # Parse a single page of the paginated issue archive.
    def parse_ejc_collection_page(self, soup: "Tag"):
        xissues = []
        xissues_tags = soup.select(".obj_issue_summary a.title")
        for tag in xissues_tags:
            try:
                volume_data = regex_to_dict(
                    pattern=r"Volume (?P<volume>\d+)(?:, Issue (?P<issue>\d+))? \( ?(?P<year>\d{4})(?:[-\d]+)? ?\)(?: \((?P<title>[\w ]+)\))?",
                    value=tag.text,
                )
            except ValueError:
                self.logger.warning(f"Couldn't parse issue with name {cleanup_str(tag.text)}")
                continue
            xissue = self.create_xissue(
                url=self.get_str_attr(tag, "href"),
                year=volume_data["year"],
                volume_number=volume_data["volume"],
                issue_number=volume_data["issue"],
            )
            if volume_data.get("title", ""):
                xissue.title_tex = volume_data["title"]
            xissues.append(xissue)
        return xissues
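
    # Illustrative archive headings the regex above is written to accept (the
    # example strings are assumptions based on the pattern, not taken from the
    # live site):
    #   "Volume 28, Issue 2 (2021)"        -> volume=28, issue=2, year=2021
    #   "Volume 3 (1996-1997)"             -> volume=3, year=1996 (the "-1997" part is discarded)
    #   "Volume 24 (2017) (Special Issue)" -> volume=24, year=2017, title="Special Issue"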

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        next_tag = soup.select_one("a.next")
        xissues.extend(self.parse_ejc_collection_page(soup))
        # Handle pagination
        while next_tag is not None:
            content = self.download_file(self.get_str_attr(next_tag, "href"))
            soup = BeautifulSoup(content, "html5lib")
            next_tag = soup.select_one("a.next")
            xissues.extend(self.parse_ejc_collection_page(soup))
        return xissues
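
    # The archive listing is assumed to follow the usual OJS layout, in which
    # each page links to the following one through an <a class="next"> element;
    # parse_collection_content simply follows that chain until no such link
    # remains.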

    def parse_issue_content(self, content, xissue):
        soup = BeautifulSoup(content, "html5lib")
        sections = soup.select(".section")
        article_number = 0
        for section in sections:
            articles = section.select("ul.articles li .obj_article_summary .title a")
            atype = ""
            atype_tag = section.select_one("h2")
            if atype_tag:
                atype = atype_tag.text
            for article_tag in articles:
                xarticle = create_articledata()
                xarticle.url = self.get_str_attr(article_tag, "href")
                xarticle.pid = f"a{article_number}"
                xarticle.atype = atype
                article_number += 1
                xissue.articles.append(xarticle)

        return super().parse_issue_content(content, xissue)
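
    # Note: the articles gathered above only carry a URL, a pid and a section
    # type; delegating to super().parse_issue_content() is presumably what lets
    # MatchingCrawler fetch each article page and call parse_article_content to
    # fill in the remaining metadata (an assumption about the base class, not
    # verified here).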

    def parse_article_content(self, content, xissue, xarticle, url):
        soup = BeautifulSoup(content, "html5lib")
        self.get_metadata_using_citation_meta(xarticle, xissue, soup, ["author", "doi", "title"])
        self.get_metadata_using_dcterms(xarticle, soup, ("date_published", "article_type"))

        # abstract
        abstract_tag = soup.select_one(".abstract")
        if not abstract_tag:
            raise ValueError("Cannot find abstract")
        label = abstract_tag.select_one(".label")
        if label:
            label.decompose()
        xarticle.abstracts.append(create_abstract(value_tex=cleanup_str(abstract_tag.text)))

        # article number
        article_number_label = soup.select_one(
            ".sub_item label:-soup-contains-own('Article Number')"
        )
        if article_number_label:
            # The value lives in the label's enclosing .sub_item block; note that
            # soup.parent is always None for the document root, so the container
            # has to come from the label itself.
            article_number_container = article_number_label.parent
            if not article_number_container:
                raise ValueError("Couldn't find article number container")
            article_number_tag = article_number_container.select_one(".value .pages")
            if not article_number_tag:
                raise ValueError("Couldn't find article number")
            xarticle.article_number = cleanup_str(article_number_tag.text)
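
        # Expected "Article Number" markup (an illustration inferred from the
        # selectors above, not a verified snippet of the live page):
        #   <div class="sub_item">
        #     <label>Article Number</label>
        #     <div class="value"><span class="pages">P2.13</span></div>
        #   </div>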

        # PDF
        links_tags = soup.select(".galleys_links a")
        for tag in links_tags:
            tag_text = cleanup_str(tag.text)
            if tag_text.lower().startswith("comment"):
                self.logger.warning(f"COMMENT file ignored for article at {xarticle.url}")
                continue
            if tag_text in ("CODE", "Sage Package"):
                self.logger.warning(f"CODE file ignored for article at {xarticle.url}")
                continue
            if tag_text == "Source Code":
                self.logger.warning(f"Source Code file ignored for article at {xarticle.url}")
                continue
            if tag_text.lower().startswith("data"):
                self.logger.warning(f"DATA file ignored for article at {xarticle.url}")
                continue

            if "pdf" in tag.get("class", []):
                mimetype = "application/pdf"
            elif "file" in tag.get("class", []):
                mimetype = "text/html"
            else:
                raise ValueError(f"Couldn't get mimetype for class {tag.get('class', [])}")
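
            # Illustrative galley markup these branches assume (OJS-style; the
            # exact classes and link labels are an assumption, not copied from
            # the live site):
            #   <a class="obj_galley_link pdf" href=".../article/view/1234/5678">PDF</a>
            #   <a class="obj_galley_link file" href=".../article/view/1234/5679">HTML</a>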

            if tag_text.startswith("PDF"):
                pdf_url = self.get_str_attr(tag, "href")
                pdf_url = pdf_url.replace("article/view/", "article/download/")
                add_pdf_link_to_xarticle(xarticle, pdf_url)
                continue
            if tag_text == "HTML":
                add_pdf_link_to_xarticle(
                    xarticle, self.get_str_attr(tag, "href"), mimetype=mimetype
                )
                continue

            if tag_text == "Supplementary File":
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Supplementary File",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "supplementary-material",
                    }
                )
                continue
            if tag_text == "Supplementary material":
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Supplementary material",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "supplementary-material",
                    }
                )
                continue
            if tag_text.lower().startswith("appendix"):
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Appendix",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "appendix",
                    }
                )
                continue
            if tag_text.lower().startswith("addendum"):
                xarticle.supplementary_materials.append(
                    {
                        "base": "",
                        "caption": "Addendum",
                        "location": self.get_str_attr(tag, "href"),
                        "metadata": "",
                        "mimetype": mimetype,
                        "rel": "addendum",
                    }
                )
                continue

            raise ValueError(f"Unimplemented file {tag_text} for {xarticle.url}")

        return xarticle