Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 83%

98 statements  

coverage.py v7.12.0, created at 2025-12-11 14:57 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.crawler_utils import set_pages
from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

if typing.TYPE_CHECKING:
    from .nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html,
    from 1971 to 2009 (inclusive).
    """
    xissues: list[IssueData] = []
    year_start = 1971
    year_end = min(2009, datetime.now().year)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach the source and year to the error before re-raising.
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues


def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    if not xissue.year:  # coverage: branch never taken (condition was never true)
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page (see the page-structure sketch after this function).
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """

    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        # Forward the pid filter so only the requested issue is parsed.
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:  # coverage: branch never taken (condition was never true)
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues
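

# Rough shape of a year page assumed by the selectors in this file (a sketch
# inferred from the code, not taken from the live site):
#
#   <body>
#     <table>                        <!-- one <table> per issue -->
#       <tr>...issue title...</tr>   <!-- e.g. "NSJOM Vol 39. No. 1" -->
#       <tr>...column headers...</tr>
#       <tr>                         <!-- one row per article -->
#         <td>authors</td> <td><a href="...">title</a></td> <td>pages</td>
#       </tr>
#     </table>
#   </body>
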

def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag.
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    # Extract the volume and optional issue number from the title row
    # (sample matches are sketched after this function).
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
    # Issue Summary ?
    if match is None:  # coverage: branch never taken (condition was never true)
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: else branch never taken (condition was always true)
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    # Note: issue_number may be None, in which case the pid embeds "None".
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:  # coverage: branch never taken
        return
    table_lines.pop(0)  # skip the column header row
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue
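

# Sample header texts matched by the volume/issue regex in parse_issue_tag
# above (reconstructions from the pattern, not copied from the site):
#
#     "NSJOM Vol 39. No. 1"  ->  Volume="39", Issue="1"
#     "NSJOM Vol 5."         ->  Volume="5",  Issue=None (no issue number)
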

def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag.
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:  # coverage: skip branch never taken (condition was always true)
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:  # coverage: branch never taken
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    # Author names are separated by <br/> tags, e.g. "A. Author<br/>B. Author".
    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        self.logger.warning(
            f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link",
            extra={"pid": xarticle.pid},
        )
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:  # coverage: branch never taken
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):  # coverage: branch never taken
        raise ValueError("Article has multiple pdf hrefs")
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle
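

# Minimal usage sketch (illustration only, not part of the original module;
# it assumes the framework constructs an NsjomCrawler exposing download_file(),
# source_website and logger, as the type hints above suggest):
#
#     crawler = NsjomCrawler()  # hypothetical: real construction may differ
#     xissues = parse_collection_content(crawler, "")  # `content` is unused here
#     for xissue in xissues:
#         print(xissue.pid, len(xissue.articles))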