Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 83%

98 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.utils import cleanup_str

if typing.TYPE_CHECKING:  # coverage: never true at runtime
    from ..nsjom_crawler import NsjomCrawler


source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages
    (e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html), from 1971 to 2009 inclusive.
    """
    xissues: list[IssueData] = []
    year_start = max(1971, periode_start)
    year_end = min(2009, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.get_page_content(url)
        try:
            xissues = xissues + parse_year(self, year_content, year, url)
        except ValueError as e:
            # Attach source and year context to the raised error
            e.add_note(f"[{source_domain}]: {year}")
            raise
    return xissues
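
# Minimal usage sketch (illustrative, assuming a configured NsjomCrawler
# instance; `content` is unused here since each year page is fetched anew):
#
#   xissues = parse_collection_content(crawler, "", periode_start=1971, periode_end=1973)
#   for xissue in xissues:
#       print(xissue.pid, len(xissue.articles))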



def parse_issue_content(self: "NsjomCrawler", content: str, xissue: IssueData):
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_parse: str | None = None,
):
    """Parses one year page,
    e.g. https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
    soup = BeautifulSoup(content, "html.parser")
    xissues: list[IssueData] = []
    issues_tags = soup.select("body>table")  # one top-level <table> per issue
    for issue_tag in issues_tags:
        # Forward pid_to_parse so parse_issue_tag can filter on it
        xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
        if not xissue:  # coverage: never true
            continue
        xissue.url = url
        xissues.append(xissue)
    return xissues
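
# Illustrative page layout assumed by the "body>table" selector above
# (markup sketch, not the site's actual HTML):
#
#   <body>
#     <table> <!-- one issue: title row, header row, article rows --> </table>
#     <table> <!-- next issue --> </table>
#   </body>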



def parse_issue_tag(
    self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
):
    """Parses one issue tag,
    e.g. `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)

    table_lines = issue_tag.select("tr")
    issue_title_tag = table_lines.pop(0)  # first row holds the issue title
    match = re.search(
        r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
        issue_title_tag.text,
    )
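    # Illustrative titles accepted by the pattern above:
    #   "NSJOM Vol 39. No. 1" -> Volume "39", Issue "1"
    #   "NSJOM Vol 5."        -> Volume "5", no issue number (Issue group absent)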

    # Issue Summary ?
    if match is None:  # coverage: never true
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: always true
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_parse and xissue.pid != pid_to_parse:  # coverage: never true
        return
    table_lines.pop(0)  # table header
    for index, table_line_tag in enumerate(table_lines):
        try:
            xarticle = parse_article_tag(
                self,
                table_line_tag,
                xissue.pid,
                index,
                f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
            )
        except ValueError as e:
            e.add_note(f"{volume_number}_{issue_number}")
            raise
        if xarticle:
            xissue.articles.append(xarticle)
    return xissue



def parse_article_tag(
    self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
):
    """Parses one article tag,
    e.g. `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
    """
    xarticle = create_articledata()
    if source:  # coverage: always true
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)

    article_data = article_tag.select("td")
    if len(article_data) != 3:  # coverage: never true
        raise ValueError("Issue table doesn't have three columns")
    author_tag, title_tag, pages_tag = article_data

    authors = [
        BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
    ]
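    # Illustrative: author names are separated by <br/> in the cell, so a cell
    # like "<td>J. Doe<br/>A. Roe</td>" (hypothetical markup) splits into
    # ["J. Doe", "A. Roe"] after each fragment is re-parsed and stripped.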

    for author_name in authors:
        if author_name == "":
            continue

        author_name = cleanup_str(author_name)

        author = create_contributor(role="author", string_name=author_name)
        xarticle.contributors.append(author)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    title_link_tag = title_tag.select_one("a")
    if title_link_tag is None:
        print(f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link")
        return None
    pdf_link = title_link_tag.get("href")
    if pdf_link is None:  # coverage: never true
        raise ValueError("Article pdf link is None")
    if isinstance(pdf_link, list):  # coverage: never true
        raise ValueError("Article has multiple pdf hrefs")
    pdf_link = self.source_website + pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)
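    # Illustrative (hypothetical values): with source_website
    # "https://sites.dmi.uns.ac.rs" and href "/nsjom/paper.pdf", the joined
    # pdf_link is "https://sites.dmi.uns.ac.rs/nsjom/paper.pdf"; plain
    # concatenation assumes the href and the site root align without urljoin.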


    xarticle.page_range = cleanup_str(pages_tag.text)
    return xarticle