Coverage for src/crawler/by_source/jsig_crawler.py: 10%

100 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from django.conf import settings
from ptf.model_data import create_abstract, create_articledata, create_contributor, create_subj

from crawler.base_crawler import BaseCollectionCrawler
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str, regex_to_dict

class JsigCrawler(BaseCollectionCrawler):
    source_name = "Journal of Singularities website"
    source_domain = "JSIG"
    source_website = "https://journalofsing.org"

    issue_re = r"volume (?P<volume>\d+), (?P<year>\d{4})"
    requests_interval = max(getattr(settings, "REQUESTS_INTERVAL", 90), 35)
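    # Example: issue_re is matched against the text of each issue link, so a
    # hypothetical label such as "volume 22, 2020" would yield the named groups
    # {"volume": "22", "year": "2020"} via regex_to_dict. requests_interval is
    # clamped to at least 35 (90 when settings.REQUESTS_INTERVAL is unset).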

    def parse_collection_content(self, content):
        xissues = []
        soup = BeautifulSoup(content, "html5lib")
        issues_tags = soup.select("#navcontainer a[href^='volume']")

        # Volumes 1-10 are listed on a dedicated HTML page
        nested = soup.select_one("#navcontainer a[href^='volume1-10']")
        if isinstance(nested, Tag):
            issues_tags.remove(nested)
            nested_href = nested.get("href")
            if isinstance(nested_href, str):
                additional_content = self.download_file(urljoin(self.collection_url, nested_href))
                more_soup = BeautifulSoup(additional_content, "html5lib")
                more_issues = more_soup.select("#col-text-content2 a")
                issues_tags.extend(more_issues)

        for tag in issues_tags:
            issue_dict = regex_to_dict(
                self.issue_re, tag.text, error_msg="Couldn't parse issue data"
            )

            issue_href = tag.get("href")
            if not isinstance(issue_href, str):
                raise ValueError("Couldn't parse issue url")

            xissues.append(
                self.create_xissue(
                    urljoin(self.collection_url, issue_href),
                    issue_dict["year"],
                    issue_dict["volume"],
                    None,
                )
            )
        return xissues

    def parse_issue_content(self, content, xissue):
        if not xissue.url:
            raise ValueError("xissue must have an url")

        soup = BeautifulSoup(content, "html5lib")
        articles_tags = soup.select("#col-text-content2 a[href$='.html']")
        for index, tag in enumerate(articles_tags):
            article_url = tag.get("href")
            if not isinstance(article_url, str):
                raise ValueError("Couldn't parse article data")
            xarticle = create_articledata()
            xarticle.pid = "a" + str(index)
            xarticle.url = urljoin(xissue.url, article_url)
            xissue.articles.append(xarticle)

    def parse_article_content(self, content, xissue, xarticle, url):
        if not xarticle.url:
            raise ValueError("xarticle must have an url")

        soup = BeautifulSoup(content, "html5lib")
        soup = soup.select_one("#col-text-content")

        if not soup:
            raise ValueError("Couldn't parse article page")

        # Title and pdf
        title_tag = soup.select_one("h2")
        if not title_tag:
            raise ValueError("Couldn't parse article title")
        xarticle.title_tex = cleanup_str(title_tag.text)
        pdf_tag = title_tag.select_one("a")
        if not pdf_tag:
            raise ValueError("Couldn't find article pdf")
        pdf_href = pdf_tag.get("href")
        if not isinstance(pdf_href, str):
            raise ValueError("Couldn't parse article pdf")
        pdf_href = urljoin(xarticle.url, pdf_href)
        add_pdf_link_to_xarticle(xarticle, pdf_href)
        title_tag.decompose()

        # Authors
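        # Example: a hypothetical byline "A. Author, B. Author and C. Author" is
        # split below into ["A. Author", "B. Author", "C. Author"], one contributor each.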

        authors_tag = soup.select_one("div > span")
        if authors_tag:
            authors_list = authors_tag.text.replace(" and ", ", ").split(", ")
            for author_str in authors_list:
                xarticle.contributors.append(
                    create_contributor(string_name=cleanup_str(author_str), role="author")
                )
            authors_tag.decompose()

        # MSC
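        # Example: a hypothetical classification line "Primary 32S25; Secondary 14B05"
        # is normalised by the chained replaces below into ["32S25", "14B05"],
        # each stored as an "msc" subject.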

        msc_header = soup.select_one("p:-soup-contains('Mathematical Subject Classification')")
        if msc_header:
            msc_tag = msc_header.find_next("p")
            if isinstance(msc_tag, Tag):
                for msc_str in (
                    msc_tag.text.replace(";", ",")
                    .replace("Primary", "")
                    .replace("Secondary", "")
                    .replace(" ", "")
                    .split(",")
                ):
                    xarticle.kwds.append(create_subj(type="msc", value=cleanup_str(msc_str)))
                msc_tag.decompose()
            msc_header.decompose()

        # Abstract
        abstract_header = soup.select_one("p:-soup-contains('Abstract')")
        if abstract_header:
            abstract_tag = abstract_header.find_next("p")
            if isinstance(abstract_tag, Tag):
                xarticle.abstracts.append(
                    create_abstract(value_tex=cleanup_str(abstract_tag.text), lang="en")
                )
                abstract_tag.decompose()
            abstract_header.decompose()

        # Pages
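        # Example: a hypothetical pages line "volume 22 (2020), 1-27" gives
        # fpage "1" and lpage "27" through the regex below.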

        pages_tag = soup.select_one("p.style3")
        if isinstance(pages_tag, Tag):
            pages_text = cleanup_str(pages_tag.text)
            pages_dict = regex_to_dict(
                r"volume \d+ \(\d+\), (?P<fpage>\d+)\-(?P<lpage>\d+)", pages_text
            )
            xarticle.fpage = pages_dict["fpage"]
            xarticle.lpage = pages_dict["lpage"]
            pages_tag.decompose()

        # DOI
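        # Example: an href like "http://dx.doi.org/10.5427/jsing.2020.22b" is reduced
        # to the bare DOI "10.5427/jsing.2020.22b" by stripping the resolver prefix.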

        doi_tag = soup.select_one("p:-soup-contains('DOI:')")
        if isinstance(doi_tag, Tag):
            doi_link = doi_tag.select_one("a")
            if not doi_link:
                raise ValueError("Couldn't parse article doi")
            doi_href = doi_link.get("href")
            if not isinstance(doi_href, str):
                raise ValueError("Couldn't parse article doi")
            xarticle.doi = cleanup_str(doi_href.removeprefix("http://dx.doi.org/"))
            doi_tag.decompose()
        # Hard-coded DOI for one specific article (overrides whatever was parsed from the page)
        if xarticle.url == "https://journalofsing.org/volume22/article2.html":
            xarticle.doi = "10.5427/jsing.2020.22b"

        return xarticle