Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 85%

95 statements  

coverage.py v7.12.0, created at 2026-02-02 15:55 +0000

import re
import typing
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from .nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"

def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html
    From 1971 to 2009 (inclusive).
    """
    xissues: list[IssueData] = []
    year_start = 1971
    year_end = min(2009, datetime.now().year)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Add context to the error before re-raising
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues

def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:  # coverage: this condition was never true
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)

def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page.
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """

    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
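    # Each top-level <table> on the year page holds one issue: an issue-header row, a column-header row, then one row per article.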

    for issue_tag in issues_tags:
        xissue = parse_issue_tag(self, issue_tag, year)
        if not xissue:  # coverage: this condition was never true
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues

def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag.
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
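    # The first row is the issue header, e.g. "NSJOM Vol 39. No. 1"; the "No." part is absent for single-issue volumes.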

    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
    # Issue Summary ?
    if match is None:  # coverage: this condition was never true
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: this condition was always true
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:  # coverage: this condition was never true
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue

def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag.
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:  # coverage: this condition was always true
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:  # coverage: this condition was never true
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data
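    # The three cells hold the authors (one per <br/>-separated line), the title with its PDF link, and the page range.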

    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"
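    # The PDF link is attached to the title cell; rows without a link are logged and skipped below.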

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        self.logger.warning(
            f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link",
            extra={"pid": xarticle.pid},
        )
        return None

    href = self.get_str_attr(title_link_tag, "href")
    pdf_link = urljoin(self.source_website, href)
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle
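
The two least obvious parsing steps above are the issue-header regex and the <br/>-based author split. The snippet below exercises both in isolation; it is not part of the crawled module, and the sample header and author-cell strings are invented for illustration, but the pattern and the splitting logic are copied verbatim from parse_issue_tag and parse_article_tag.

import re

from bs4 import BeautifulSoup

# Same pattern as parse_issue_tag: captures the volume and an optional issue number.
ISSUE_HEADER = r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?"

m = re.search(ISSUE_HEADER, "NSJOM Vol 39. No. 1")
assert m and m.group("Volume") == "39" and m.group("Issue") == "1"

# Volumes published as a single issue omit the "No." part.
m = re.search(ISSUE_HEADER, "NSJOM Vol 5.")
assert m and m.group("Volume") == "5" and m.group("Issue") is None

# Same author split as parse_article_tag: one name per <br/>-separated line in the cell.
author_cell = BeautifulSoup("<td>J. Doe<br/>A. N. Other</td>", "html.parser").td
names = [BeautifulSoup(part, "html.parser").text.strip() for part in str(author_cell).split("<br/>")]
assert names == ["J. Doe", "A. N. Other"]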