Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 81%

99 statements  

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.utils import add_pdf_link_to_xarticle, cleanup_str

# NsjomCrawler is imported only for static type checking, presumably to avoid
# a circular import with nsjom_crawler at runtime.
if typing.TYPE_CHECKING:
    from ..nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"

def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html,
    from 1971 to 2009 inclusive.
    """
    xissues: list[IssueData] = []
    # Clamp the requested range to the years this crawler handles.
    year_start = max(1971, periode_start)
    year_end = min(2009, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.download_file(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach source and year context to the error before re-raising.
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues


def parse_issue_content(self, content: str, xissue: IssueData):
    if not xissue.year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    # Each top-level <table> on the page corresponds to one issue.
    issues_tags = soup.select("body>table")
    for issue_tag in issues_tags:
        # Forward pid_to_parse so parse_issue_tag can filter on it; the
        # original code accepted the argument here but never passed it on.
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues
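
# Expected page structure, inferred from the selectors used by the parsers in
# this module (illustrative; not copied from a live page):
#
#   <body>
#     <table>                                  <!-- one issue -->
#       <tr> NSJOM Vol 39. No. 1 </tr>         <!-- title row, matched by regex -->
#       <tr> Author(s) | Title | Pages </tr>   <!-- header row, skipped -->
#       <tr>
#         <td>A. One<br/>B. Two</td>
#         <td><a href="relative/path.pdf">Article title</a></td>
#         <td>1-10</td>
#       </tr>
#     </table>
#   </body>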


def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag,
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
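    # Matches titles such as "NSJOM Vol 39. No. 1" (Volume="39", Issue="1")
    # or "NSJOM Vol 5." with no issue part; examples are illustrative.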

    # If the title does not match, the row may be an issue summary; it is
    # treated as an error here.
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:
        return
    table_lines.pop(0)  # Skip the table header row.
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            # Attach volume/issue context to the error before re-raising.
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue


def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag,
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    # Author names are separated by <br/> inside the first cell: split on the
    # raw markup, then strip tags from each fragment.
    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        print(f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link")
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):
        raise ValueError("Article has multiple pdf hrefs")
    # hrefs on these pages are relative; prefix the crawler's base website.
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    self.set_pages(xarticle, cleanup_str(pages_tag.text))
    return xarticle
176 return xarticle