Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 79%

109 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2025-01-15 14:09 +0000

1import re 

2import typing 

3from datetime import datetime 

4 

5from bs4 import BeautifulSoup, Tag 

6from ptf.model_data import ( 

7 IssueData, 

8 create_articledata, 

9 create_contributor, 

10 create_extlink, 

11 create_issuedata, 

12) 

13 

14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

15 

# Typing-only import: NsjomCrawler is needed solely for annotations, so the
# runtime dependency on the parent crawler module is avoided.
if typing.TYPE_CHECKING:
    from ..nsjom_crawler import NsjomCrawler

# Collection identifier: used to build issue/article PIDs and as the
# metadata tag on "source" external links.
source_domain = "NSJOM"

20 

21 

22def parse_collection_content( 

23 self: "NsjomCrawler", 

24 content: str, 

25 periode_start: int = 0, 

26 periode_end: int = datetime.now().year, 

27 source_domain: str = "NSJOM", 

28): 

29 """ 

30 Parses all articles from year-specific webpages : https://sites.dmi.uns.ac.rs/nsjom/ns2010.html 

31 From 2010 to 2014 (included) 

32 """ 

33 xissues: list[IssueData] = [] 

34 year_start = max(2010, periode_start) 

35 year_end = min(2014, periode_end) 

36 

37 for year in range(year_start, year_end + 1): 

38 url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html" 

39 year_content = self.download_file(url) 

40 xissues = xissues + parse_year(self, year_content, year, url) 

41 

42 return xissues 

43 

44 

def is_heading(element: Tag):
    """Return True when *element* is (or contains) an NSJOM issue heading.

    A heading is recognised either by a descendant carrying the
    ``HeadingNSJOM`` CSS class, or by the element itself having that class.
    bs4 may expose the ``class`` attribute as a plain string or as a list
    of tokens; both shapes are handled.
    """
    if element.select_one(".HeadingNSJOM"):
        return True

    css_classes = element.get("class")
    if not css_classes:
        return False

    # Defensive: a plain string instead of bs4's usual multi-valued list.
    if isinstance(css_classes, str):
        return css_classes == "HeadingNSJOM"

    return "HeadingNSJOM" in css_classes

60 

61 

def parse_issue_content(self, content: str, xissue: IssueData):
    """Parse the articles of one issue from its year page.

    :param content: HTML of the year page holding this issue.
    :param xissue: issue shell; ``year`` must be set, and ``pid`` selects the
        matching issue on the page.
    :raises ValueError: if ``xissue.year`` is not set.
    """
    if not xissue.year:
        raise ValueError("Issue year is not set")
    # NOTE(review): parse_year returns freshly parsed issues (filtered down to
    # xissue.pid) but the result is discarded and the passed-in xissue is not
    # updated here — confirm whether the helpers mutate shared state or whether
    # the returned issue's articles should be merged into xissue.
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)

66 

67 

def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    """Parse one year page (2010-2014 layout) into a list of issues.

    :param content: HTML of the year page.
    :param year: publication year shown on the page.
    :param url: page URL, stored on each parsed issue.
    :param pid_to_crawl: when set, only the issue with this pid is kept.
    :returns: list of IssueData found on the page.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Headings delimit issues; the p.style1 paragraphs and the blockquote links
    # that follow them carry article metadata and title links.
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")
    issues: list[list[Tag]] = []
    # Sort tags into issues
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            # Non-breaking-space filler paragraph: ignore.
            continue
        if not issues:
            # Malformed page: content appearing before the first heading.
            # Skip it instead of crashing with IndexError on issues[-1].
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            # Either not the requested pid_to_crawl, or nothing parsable.
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues

96 

97 

def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    """Build an IssueData from the tags belonging to one issue.

    ``tags`` starts with the issue heading, followed by alternating article
    metadata paragraphs and article title links; the list is consumed in place.

    :returns: the parsed issue, or None when ``pid_to_crawl`` is given and
        does not match this issue.
    :raises ValueError: when the heading carries no volume number.
    """
    xissue = create_issuedata()
    xissue.year = str(year)

    issue_url = f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}"
    xissue.ext_links.append(
        create_extlink(rel="source", location=issue_url, metadata=source_domain)
    )

    heading_tag = tags.pop(0)
    title_match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", heading_tag.text
    )
    if title_match is None:
        raise ValueError("Cannot find volume number")

    volume = title_match.group("Volume")
    if not volume:
        raise ValueError("Cannot read volume number")
    xissue.volume = volume

    number = title_match.group("Issue")
    if number:
        xissue.number = number
    xissue.pid = f"{source_domain}_{year}_{volume}_{number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return

    # Remaining tags alternate: a metadata paragraph, then the title link.
    while tags:
        meta_text = cleanup_str(tags.pop(0).text)
        if not meta_text:
            # Empty filler paragraph: no matching title follows it.
            continue

        title_tag = tags.pop(0)
        xarticle = parse_article(
            self,
            meta_text,
            title_tag,
            xissue.pid,
            len(xissue.articles),
            issue_url,
        )
        if xarticle is not None:
            xissue.articles.append(xarticle)

    return xissue

147 

148 

def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    """Create an article from one metadata line and its title link.

    ``meta_text`` is expected to look like
    "<vol> / <num> / <year> / <pages>: <authors>".

    :returns: the article, or None when the title link has no href.
    :raises ValueError: when the metadata line cannot be parsed, or when the
        href attribute is a list.
    """
    xarticle = create_articledata()

    if source:
        xarticle.ext_links.append(
            create_extlink(rel="source", location=source, metadata=self.source_domain)
        )

    meta_match = re.search(
        r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text
    )
    if meta_match is None:
        raise ValueError("Cannot parse authors or page number")
    self.set_pages(xarticle, meta_match.group("Pages"))

    # Findall yields the name runs delimited by ", " or " and " separators.
    for author_name in re.findall(
        r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", meta_match.group("Authors")
    ):
        xarticle.contributors.append(
            create_contributor(role="author", string_name=author_name)
        )

    pdf_href = title_tag.get("href")
    if pdf_href is None:
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(pdf_href, list):
        raise ValueError("Article link is a list")
    add_pdf_link_to_xarticle(xarticle, self.source_website + pdf_href)

    xarticle.title_tex = cleanup_str(title_tag.text)
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle