Coverage for src/crawler/by_source/jsig_crawler.py: 9%

98 statements  

coverage.py v7.9.0, created at 2025-07-30 09:47 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict


class JsigCrawler(BaseCollectionCrawler):
    source_name = "Journal of Singularities website"
    source_domain = "JSIG"
    source_website = "https://journalofsing.org"

    issue_re = r"volume (?P<volume>\d+), (?P<year>\d{4})"
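    # issue_re is matched against the issue link text, e.g. "volume 12, 2015";
    # the named groups feed create_xissue below.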

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html.parser")
        issues_tags = soup.select("#navcontainer a[href^='volume']")

        # Issues 1-10 are listed on a dedicated HTML page, so the link to that
        # page is swapped out for the issue links it contains.
        nested = soup.select_one("#navcontainer a[href^='volume1-10']")
        if isinstance(nested, Tag):
            issues_tags.remove(nested)
            nested_href = nested.get("href")
            if isinstance(nested_href, str):
                additional_content = self.download_file(urljoin(self.collection_url, nested_href))
                more_soup = BeautifulSoup(additional_content, "html.parser")
                more_issues = more_soup.select("#col-text-content2 a")
                issues_tags.extend(more_issues)

        for tag in issues_tags:
            issue_dict = regex_to_dict(
                self.issue_re, tag.text, error_msg="Couldn't parse issue data"
            )

            issue_href = tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("xissue must have a url")

        soup = BeautifulSoup(content, "html.parser")
        articles_tags = soup.select("#col-text-content2 a[href$='.html']")
        for index, tag in enumerate(articles_tags):
            article_url = tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article data")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, article_url)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("xarticle must have a url")

        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one("#col-text-content")

        if not soup:
            raise ValueError("Couldn't parse article page")

        # Title and PDF
        title_tag = soup.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)
        pdf_tag = title_tag.select_one("a")
        if not pdf_tag:
            raise ValueError("Couldn't find article pdf")
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse article pdf")
        pdf_href = urljoin(xarticle.url, pdf_href)
        add_pdf_link_to_xarticle(xarticle, pdf_href)
        title_tag.decompose()
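        # Parsed nodes are decomposed so the broader selectors below cannot re-match them.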

        # Authors
        authors_tag = soup.select_one("div > span")
        if authors_tag:
            # Fold "and" into the comma separator, then split the name list
            authors_list = authors_tag.text.replace(" and ", ", ").split(", ")
            for author_str in authors_list:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author_str), role="author")
                )
            authors_tag.decompose()

        # MSC
        msc_header = soup.select_one("p:-soup-contains('Mathematical Subject Classification')")
        if msc_header:
            msc_tag = msc_header.find_next("p")
            if isinstance(msc_tag, Tag):
                # e.g. "Primary 14B05; Secondary 32S25" -> ["14B05", "32S25"]
                for msc_str in (
                    msc_tag.text.replace(";", ",")
                    .replace("Primary", "")
                    .replace("Secondary", "")
                    .replace(" ", "")
                    .split(",")
                ):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc_str)))
                msc_tag.decompose()
            msc_header.decompose()

        # Abstract
        abstract_header = soup.select_one("p:-soup-contains('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("p")
            if isinstance(abstract_tag, Tag):
                xarticle.abstracts.append(
                    create_abstract(
                        tag="abstract", value_tex=cleanup_str(abstract_tag.text), lang="en"
                    )
                )
                abstract_tag.decompose()
            abstract_header.decompose()

        # Pages
        pages_tag = soup.select_one("p.style3")
        if isinstance(pages_tag, Tag):
            pages_text = cleanup_str(pages_tag.text)
            pages_dict = regex_to_dict(
                r"volume \d+ \(\d+\), (?P<fpage>\d+)\-(?P<lpage>\d+)", pages_text
            )
            xarticle.fpage = pages_dict["fpage"]
            xarticle.lpage = pages_dict["lpage"]
            pages_tag.decompose()

        # DOI
        doi_tag = soup.select_one("p:-soup-contains('DOI:')")
        if isinstance(doi_tag, Tag):
            doi_anchor = doi_tag.select_one("a")
            if doi_anchor:
                doi_href = doi_anchor.get("href")
                if isinstance(doi_href, str):
                    xarticle.doi = cleanup_str(doi_href.removeprefix("http://dx.doi.org/"))
            doi_tag.decompose()
        # Known special case: this article's DOI is set explicitly.
        if xarticle.url == "https://journalofsing.org/volume22/article2.html":
            xarticle.doi = "10.5427/jsing.2020.22b"

        return xarticle
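

# A minimal offline sketch of the string heuristics above, runnable without any
# network access. The sample strings are illustrative assumptions, not captured
# site content, and the stdlib `re` module stands in for regex_to_dict so the
# block stays self-contained.
if __name__ == "__main__":
    import re

    # Issue links: volume and year are captured by name.
    m = re.search(JsigCrawler.issue_re, "volume 22, 2020")
    assert m is not None
    print(m.group("volume"), m.group("year"))  # -> 22 2020

    # Author strings: "and" is folded into the comma separator before splitting.
    print("Jane Doe, John Roe and Maria Silva".replace(" and ", ", ").split(", "))
    # -> ['Jane Doe', 'John Roe', 'Maria Silva']

    # MSC strings: separators and labels are stripped before splitting.
    msc = "Primary 14B05; Secondary 32S25"
    cleaned = (
        msc.replace(";", ",")
        .replace("Primary", "")
        .replace("Secondary", "")
        .replace(" ", "")
        .split(",")
    )
    print(cleaned)  # -> ['14B05', '32S25']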