Coverage for src/crawler/by_source/jsig_crawler.py: 9% (98 statements)
coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class JsigCrawler(BaseCollectionCrawler):
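    """Crawler for the Journal of Singularities (https://journalofsing.org).

    Scrapes the collection page for volume links, each volume page for
    article links, and each article page for title, PDF link, authors,
    MSC codes, abstract, page range and DOI.
    """
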
    source_name = "Journal of Singularities website"
    source_domain = "JSIG"
    source_website = "https://journalofsing.org"

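    # Expected to match issue link text such as "volume 12, 2015"
    # (captures the volume number and the four-digit year).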
    issue_re = r"volume (?P<volume>\d+), (?P<year>\d{4})"

    def parse_collection_content(self, content):
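        """Collect issue links from the collection landing page.

        Volumes 1-10 are listed on a dedicated page (the "volume1-10"
        link), which is downloaded separately and merged into the result.
        """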
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("#navcontainer a[href^='volume']")

        # issues 1-10 are listed in a dedicated html page
        nested = soup.select_one("#navcontainer a[href^='volume1-10']")
        if isinstance(nested, Tag):
            issues_tags.remove(nested)
            nested_href = nested.get("href")
            if isinstance(nested_href, str):
                additional_content = self.download_file(urljoin(self.collection_url, nested_href))
                more_soup = BeautifulSoup(additional_content, "html.parser")
                more_issues = more_soup.select("#col-text-content2 a")
                issues_tags.extend(more_issues)

        for tag in issues_tags:
            issue_dict = regex_to_dict(
                self.issue_re, tag.text, error_msg="Couldn't parse issue data"
            )

            issue_href = tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
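        """Collect article links from an issue page and attach them to xissue."""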
        if not xissue.url:
            raise ValueError("xissue must have an url")

        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("#col-text-content2 a[href$='.html']")
        for index, tag in enumerate(articles_tags):
            article_url = tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article data")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, article_url)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
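        """Populate xarticle with metadata scraped from one article page."""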
        if not xarticle.url:
            raise ValueError("xarticle must have an url")

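        # html5lib is used here (html.parser elsewhere), presumably for its
        # tolerance of less well-formed markup on article pages.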
        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one("#col-text-content")

        if not soup:
            raise ValueError("Couldn't parse article page")

        # Title and pdf
        title_tag = soup.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)
        pdf_tag = title_tag.select_one("a")
        if not pdf_tag:
            raise ValueError("Couldn't find article pdf")
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse article pdf")
        pdf_href = urljoin(xarticle.url, pdf_href)
        add_pdf_link_to_xarticle(xarticle, pdf_href)
        title_tag.decompose()

        # Authors
        authors_tag = soup.select_one("div > span")
        if authors_tag:
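            # Author names are separated by commas and/or " and ";
            # normalize " and " to a comma so a single split handles both.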
            authors_list = authors_tag.text.replace(" and ", ", ").split(", ")
            for author_str in authors_list:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author_str), role="author")
                )
            authors_tag.decompose()

        # MSC
        msc_header = soup.select_one("p:-soup-contains('Mathematical Subject Classification')")
        if msc_header:
            msc_tag = msc_header.find_next("p")
            if isinstance(msc_tag, Tag):
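                # Normalize e.g. "Primary 14B05; Secondary 32S05, 32S25" into
                # individual codes: strip the Primary/Secondary labels, unify
                # separators, then split. (Example text is illustrative.)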
                for msc_str in (
                    msc_tag.text.replace(";", ",")
                    .replace("Primary", "")
                    .replace("Secondary", "")
                    .replace(" ", "")
                    .split(",")
                ):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc_str)))
                msc_tag.decompose()
            msc_header.decompose()

        # Abstract
        abstract_header = soup.select_one("p:-soup-contains('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("p")
            if isinstance(abstract_tag, Tag):
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract", value_tex=cleanup_str(abstract_tag.text), lang="en"
                    )
                )
                abstract_tag.decompose()
            abstract_header.decompose()

        # Pages
        pages_tag = soup.select_one("p.style3")
        if isinstance(pages_tag, Tag):
            pages_text = cleanup_str(pages_tag.text)
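            # The page-range line reads like "... volume 22 (2020), 104-123";
            # the regex captures the first and last page numbers.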
            pages_dict = regex_to_dict(
                r"volume \d+ \(\d+\), (?P<fpage>\d+)\-(?P<lpage>\d+)", pages_text
            )
            xarticle.fpage = pages_dict["fpage"]
            xarticle.lpage = pages_dict["lpage"]
            pages_tag.decompose()

        # DOI
        doi_tag = soup.select_one("p:-soup-contains('DOI:')")
        if isinstance(doi_tag, Tag):
            doi_link = doi_tag.select_one("a")
            doi_href = doi_link.get("href") if doi_link else None
            if isinstance(doi_href, str):
                xarticle.doi = cleanup_str(doi_href.removeprefix("http://dx.doi.org/"))
            doi_tag.decompose()
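        # Hard-coded correction for volume 22, article 2, whose page
        # presumably lists an incorrect DOI.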
        if xarticle.url == "https://journalofsing.org/volume22/article2.html":
            xarticle.doi = "10.5427/jsing.2020.22b"

        return xarticle