Coverage for src/crawler/by_source/jsig_crawler.py: 10%
100 statements
coverage.py v7.12.0, created at 2025-12-11 14:57 +0000
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from django.conf import settings
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class JsigCrawler(BaseCollectionCrawler):
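    """Crawler for the Journal of Singularities website (journalofsing.org)."""
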
    source_name = "Journal of Singularities website"
    source_domain = "JSIG"
    source_website = "https://journalofsing.org"
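
    # Matches issue link text of the form "volume 12, 2015" (illustrative values).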
    issue_re = r"volume (?P<volume>\d+), (?P<year>\d{4})"
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 35)

    def parse_collection_content(self, content):
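        """Parse the collection page into a list of xissues (one per volume)."""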
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues_tags = soup.select("#navcontainer a[href^='volume']")

        # issues 1-10 are listed in a dedicated html page
        nested = soup.select_one("#navcontainer a[href^='volume1-10']")
        if isinstance(nested, Tag):
            issues_tags.remove(nested)
            nested_href = nested.get("href")
            if isinstance(nested_href, str):
                additional_content = self.download_file(urljoin(self.collection_url, nested_href))
                more_soup = BeautifulSoup(additional_content, "html5lib")
                more_issues = more_soup.select("#col-text-content2 a")
                issues_tags.extend(more_issues)

        for tag in issues_tags:
            issue_dict = regex_to_dict(
                self.issue_re, tag.text, error_msg="Couldn't parse issue data"
            )

            issue_href = tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
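        """Append one stub xarticle per article link found on the issue page."""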
        if not xissue.url:
            raise ValueError("xissue must have an url")

        soup = BeautifulSoup(content, "html5lib")
        articles_tags = soup.select("#col-text-content2 a[href$='.html']")
        for index, tag in enumerate(articles_tags):
            article_url = tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article data")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, article_url)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
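        """Fill xarticle with the metadata scraped from its article page."""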
        if not xarticle.url:
            raise ValueError("xarticle must have an url")

        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one("#col-text-content")

        if not soup:
            raise ValueError("Couldn't parse article page")

        # Title and pdf
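        # The <h2> holds the title text and wraps the anchor pointing to the PDF.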
        title_tag = soup.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)
        pdf_tag = title_tag.select_one("a")
        if not pdf_tag:
            raise ValueError("Couldn't find article pdf")
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse article pdf")
        pdf_href = urljoin(xarticle.url, pdf_href)
        add_pdf_link_to_xarticle(xarticle, pdf_href)
        title_tag.decompose()

        # Authors
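        # The byline uses ", " and " and " as separators: a hypothetical
        # "A. Foo, B. Bar and C. Baz" yields three contributors.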
        authors_tag = soup.select_one("div > span")
        if authors_tag:
            authors_list = authors_tag.text.replace(" and ", ", ").split(", ")
            for author_str in authors_list:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author_str), role="author")
                )
            authors_tag.decompose()

        # MSC
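        # The paragraph after the header lists codes such as (hypothetically)
        # "Primary 14B05; Secondary 32S25"; separators and qualifiers are
        # stripped so each remaining token is one MSC code.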
        msc_header = soup.select_one("p:-soup-contains('Mathematical Subject Classification')")
        if msc_header:
            msc_tag = msc_header.findNext("p")
            if isinstance(msc_tag, Tag):
                for msc_str in (
                    msc_tag.text.replace(";", ",")
                    .replace("Primary", "")
                    .replace("Secondary", "")
                    .replace(" ", "")
                    .split(",")
                ):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc_str)))
                msc_tag.decompose()
            msc_header.decompose()

        # Abstract
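        # The abstract is the first paragraph following the "Abstract" header.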
        abstract_header = soup.select_one("p:-soup-contains('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.findNext("p")
            if isinstance(abstract_tag, Tag):
                xarticle.abstracts.append(
                    create_abstract(value_tex=cleanup_str(abstract_tag.text), lang="en")
                )
                abstract_tag.decompose()
            abstract_header.decompose()

        # Pages
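        # The regex expects text like "volume 22 (2020), 1-39" (illustrative values).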
        pages_tag = soup.select_one("p.style3")
        if isinstance(pages_tag, Tag):
            pages_text = cleanup_str(pages_tag.text)
            pages_dict = regex_to_dict(
                r"volume \d+ \(\d+\), (?P<fpage>\d+)\-(?P<lpage>\d+)", pages_text
            )
            xarticle.fpage = pages_dict["fpage"]
            xarticle.lpage = pages_dict["lpage"]
            pages_tag.decompose()

        # DOI
        doi_tag = soup.select_one("p:-soup-contains('DOI:')")
        if isinstance(doi_tag, Tag):
            doi_anchor = doi_tag.select_one("a")
            doi_href = doi_anchor.get("href") if doi_anchor else None
            if isinstance(doi_href, str):
                xarticle.doi = cleanup_str(doi_href.removeprefix("http://dx.doi.org/"))
            doi_tag.decompose()
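        # Override for one known article whose scraped DOI is replaced by hand.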
        if xarticle.url == "https://journalofsing.org/volume22/article2.html":
            xarticle.doi = "10.5427/jsing.2020.22b"

        return xarticle