Coverage for src/crawler/by_source/nsjom/nsjom_1971_crawler.py: 81%

99 statements  

coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

  1  import re
  2  import typing
  3  from datetime import datetime
  4
  5  from bs4 import BeautifulSoup, Tag
  6  from ptf.model_data import (
  7      IssueData,
  8      create_articledata,
  9      create_contributor,
 10      create_extlink,
 11      create_issuedata,
 12  )
 13
 14  from crawler.utils import add_pdf_link_to_xarticle, cleanup_str
 15
 16  if typing.TYPE_CHECKING:    # 16 ↛ 17 (condition never true)
 17      from ..nsjom_crawler import NsjomCrawler
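     # NsjomCrawler is imported only for type annotations; TYPE_CHECKING is
     # False at run time, so the 16 ↛ 17 coverage gap is expected.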

 18
 19
 20  source_domain = "NSJOM"
 21
 22
 23  def parse_collection_content(
 24      self: "NsjomCrawler",
 25      content: str,
 26      source_domain: str = "NSJOM",
 27  ):
 28      """
 29      Parses all articles from the year-specific webpages, e.g. https://sites.dmi.uns.ac.rs/nsjom/ns1971.html,
 30      covering 1971 to 2009 (inclusive).
 31      """
 32      xissues: list[IssueData] = []
 33      year_start = 1971
 34      year_end = min(2009, datetime.now().year)
 35
 36      for year in range(year_start, year_end + 1):
 37          url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
 38          year_content = self.download_file(url)
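         # Assumption: download_file, provided by the crawler base class,
         # returns the page body as decoded text suitable for BeautifulSoup.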

 39          try:
 40              xissues = xissues + parse_year(self, year_content, year, url)
 41          except ValueError as e:
 42              # Attach context to the propagated error
 43              e.add_note(f"[{source_domain}]: {year}")
 44              raise
 45      return xissues
 46
 47
 48  def parse_issue_content(self, content: str, xissue: IssueData):
 49      if not xissue.year:    # 49 ↛ 50 (condition never true)
 50          raise ValueError("Issue year is not set")
 51      parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)
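     # Note that parse_year's return value (a list of freshly built issues) is
     # discarded here, so nothing is copied back onto the xissue argument.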

 52
 53
 54  def parse_year(
 55      self: "NsjomCrawler",
 56      content: str,
 57      year: int,
 58      url: str | None = None,
 59      pid_to_parse: str | None = None,
 60  ):
 61      """Parses one page.
 62      eg: https://sites.dmi.uns.ac.rs/nsjom/ns2009.html"""
 63      soup = BeautifulSoup(content, "html.parser")
 64      xissues: list[IssueData] = []
 65      issues_tags = soup.select("body>table")
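     # Each top-level <table> on the page holds one issue: a title row, a
     # header row, then one row per article (see parse_issue_tag below).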

 66      for issue_tag in issues_tags:
 67          xissue = parse_issue_tag(self, issue_tag, year, pid_to_parse)
 68          if not xissue:    # 68 ↛ 69 (condition never true)
 69              continue
 70          xissue.url = url
 71          xissues.append(xissue)
 72      return xissues
 73
 74
 75  def parse_issue_tag(
 76      self: "NsjomCrawler", issue_tag: Tag, year: int, pid_to_parse: str | None = None
 77  ):
 78      """Parses one issue tag.
 79      eg: `document.querySelector('body>table')` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
 80      """
 81      xissue = create_issuedata()
 82      xissue.year = str(year)
 83      ext_link = create_extlink(
 84          rel="source",
 85          location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
 86          metadata=source_domain,
 87      )
 88      xissue.ext_links.append(ext_link)
 89
 90      table_lines = issue_tag.select("tr")
 91      issue_title_tag = table_lines.pop(0)
 92      match = re.search(
 93          r"[\n\r ]*NSJOM[\n\r ]*Vol (?P<Volume>\d+)\.(?: No\. (?P<Issue>\d+))?",
 94          issue_title_tag.text,
 95      )
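     # The pattern matches issue titles such as "NSJOM Vol 39. No. 1" (volume
     # and issue) or "NSJOM Vol 5." (volume only; the Issue group stays empty).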

 96      # Issue summary?
 97      if match is None:    # 97 ↛ 98 (condition never true)
 98          raise ValueError("Cannot find volume number")
 99      volume_number = match.group("Volume")
100      issue_number = match.group("Issue")
101      if volume_number:    # 101 ↛ 104 (condition always true)
102          xissue.volume = volume_number
103      else:
104          raise ValueError("Cannot read volume number")
105      if issue_number:
106          xissue.number = issue_number
107      xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"
108
109      if pid_to_parse and xissue.pid != pid_to_parse:    # 109 ↛ 110 (condition never true)
110          return
111      table_lines.pop(0)  # table header
112      for index, table_line_tag in enumerate(table_lines):
113          try:
114              xarticle = parse_article_tag(
115                  self,
116                  table_line_tag,
117                  xissue.pid,
118                  index,
119                  f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
120              )
121          except ValueError as e:
122              e.add_note(f"{volume_number}_{issue_number}")
123              raise
124          if xarticle:
125              xissue.articles.append(xarticle)
126      return xissue
127
128
129  def parse_article_tag(
130      self: "NsjomCrawler", article_tag: Tag, issue_pid: str, index: int, source: str | None = None
131  ):
132      """Parses one article tag.
133      eg: `document.querySelector("body>table tr:nth-child(3)")` in https://sites.dmi.uns.ac.rs/nsjom/ns2009.html
134      """
135      xarticle = create_articledata()
136      if source:    # 136 ↛ 140 (condition always true)
137          ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
138          xarticle.ext_links.append(ext_link)
139
140      article_data = article_tag.select("td")
141      if len(article_data) != 3:    # 141 ↛ 142 (condition never true)
142          raise ValueError("Issue table doesn't have three columns")
143      author_tag, title_tag, pages_tag = article_data
144
145      authors = [
146          BeautifulSoup(_, "html.parser").text.strip() for _ in str(author_tag).split("<br/>")
147      ]
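     # Splitting the cell's raw HTML on "<br/>" separates authors listed on
     # their own lines; each fragment is then re-parsed to strip its markup.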

148      for author_name in authors:
149          if author_name == "":
150              continue
151
152          author_name = cleanup_str(author_name)
153
154          author = create_contributor(role="author", string_name=author_name)
155          xarticle.contributors.append(author)
156
157      title = cleanup_str(title_tag.text)
158      xarticle.title_tex = title
159      xarticle.pid = f"{issue_pid}a_{index}"
160
161      title_link_tag = title_tag.select_one("a")
162      if title_link_tag is None:
163          print(f"[{source_domain}] {issue_pid}_{index} : Cannot find article pdf link")
164          return None
165      pdf_link = title_link_tag.get("href")
166      if pdf_link is None:    # 166 ↛ 167 (condition never true)
167          raise ValueError("Article pdf link is None")
168      if isinstance(pdf_link, list):    # 168 ↛ 169 (condition never true)
169          raise ValueError("Article has multiple pdf hrefs")
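     # bs4's Tag.get can return a list for multi-valued attributes, so the
     # value is narrowed to a plain str before the full URL is assembled below.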

170      pdf_link = self.source_website + pdf_link
171      add_pdf_link_to_xarticle(xarticle, pdf_link)
172
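     # Assumption: set_pages, provided by the crawler base class, parses the
     # page-range text (e.g. "3-10") into the article's page metadata.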

173      self.set_pages(xarticle, cleanup_str(pages_tag.text))
174      return xarticle
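
A minimal pytest sketch for the uncovered 49 ↛ 50 branch above, assuming that
create_issuedata() leaves year unset and that the module is importable as
crawler.by_source.nsjom.nsjom_1971_crawler; self is unused before the year
check, so None stands in for the crawler instance:

    import pytest

    from ptf.model_data import create_issuedata
    from crawler.by_source.nsjom.nsjom_1971_crawler import parse_issue_content

    def test_parse_issue_content_requires_year():
        xissue = create_issuedata()  # assumption: year defaults to an empty value
        with pytest.raises(ValueError, match="Issue year is not set"):
            parse_issue_content(None, "", xissue)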