Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 80%

110 statements  

coverage.py v7.6.4, created at 2024-11-20 09:03 +0000

import re
import typing
from datetime import datetime

from bs4 import BeautifulSoup, Tag
from ptf.model_data import (
    IssueData,
    create_articledata,
    create_contributor,
    create_extlink,
    create_issuedata,
)

from crawler.base_crawler import add_pdf_link_to_xarticle
from crawler.utils import cleanup_str

if typing.TYPE_CHECKING:  # coverage: branch never taken (TYPE_CHECKING is False at runtime)
    from ..nsjom_crawler import NsjomCrawler

source_domain = "NSJOM"


def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    periode_start: int = 0,
    periode_end: int = datetime.now().year,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from the year-specific webpages
    (https://sites.dmi.uns.ac.rs/nsjom/ns2010.html), covering 2010 to 2014 inclusive.
    """
    xissues: list[IssueData] = []
    year_start = max(2010, periode_start)
    year_end = min(2014, periode_end)

    for year in range(year_start, year_end + 1):
        url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{year}.html"
        year_content = self.get_page_content(url)
        xissues = xissues + parse_year(self, year_content, year, url)

    return xissues
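# Hypothetical usage sketch (illustrative, not part of the module): assumes a
# configured NsjomCrawler instance with network access. Note that `content` is
# unused here, since this parser fetches each year page itself.
def _example_crawl(crawler: "NsjomCrawler") -> list[IssueData]:
    # Requested bounds are clamped to the 2010-2014 window this file covers,
    # so this call yields issues for 2012, 2013 and 2014 only.
    return parse_collection_content(crawler, "", periode_start=2012, periode_end=2020)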


def is_heading(element: Tag):
    if element.select_one(".HeadingNSJOM"):
        return True
    classname = element.get("class")
    if not classname:
        return False

    # Depending on the parser and attribute handling, bs4 may return the class
    # attribute as a plain string or as a list of tokens.
    if isinstance(classname, str):  # coverage: branch never taken in the test run
        if classname == "HeadingNSJOM":
            return True
        return False

    if "HeadingNSJOM" in classname:
        return True
    return False
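# Illustrative check (invented markup): with html.parser, bs4 returns the class
# attribute as a list of tokens, so the membership branch above is the one taken.
def _example_is_heading() -> bool:
    frag = BeautifulSoup('<p class="style1 HeadingNSJOM">NSJOM Vol. 40</p>', "html.parser")
    return is_heading(frag.p)  # True: "HeadingNSJOM" is in ["style1", "HeadingNSJOM"]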


def parse_issue_content(self, content: str, xissue: IssueData):
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)


def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    soup = BeautifulSoup(content, "html.parser")
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")
    issues: list[list[Tag]] = []
    # Sort tags into issues
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:  # coverage: branch never taken in the test run
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues
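# Hypothetical smoke test (markup invented from the selectors above; assumes a
# configured NsjomCrawler supplying source_website and source_domain):
def _example_parse_year(crawler: "NsjomCrawler") -> None:
    html = (
        '<p class="HeadingNSJOM">NSJOM Vol. 40, No. 1</p>'
        '<p class="style1">40 / 1 / 2010 / 1-13: John Doe</p>'
        '<blockquote><a href="/Papers/40_1/a.pdf">A sample title</a></blockquote>'
    )
    xissues = parse_year(crawler, html, 2010)
    assert xissues[0].pid == "NSJOM_2010_40_1"
    assert len(xissues[0].articles) == 1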


def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)
    issue_title_tag = tags.pop(0)
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:  # coverage: branch never taken in the test run
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:  # coverage: condition always true in the test run
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:  # coverage: condition always true in the test run
        xissue.number = issue_number
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:  # coverage: branch never taken
        return

    # The remaining tags alternate between an article meta line and a title link.
    while len(tags) > 0:
        article_meta = tags.pop(0)
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":  # coverage: branch never taken in the test run
            continue

        if len(tags) == 0:  # coverage: branch never taken in the test run
            continue
        article_title = tags.pop(0)
        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),
            f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        )
        if article is None:  # coverage: branch never taken in the test run
            continue
        xissue.articles.append(article)
    return xissue
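# The heading regex above, applied to an invented heading string:
def _example_heading_match() -> None:
    m = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", "NSJOM Vol. 44, No. 2"
    )
    assert m is not None
    assert m.group("Volume") == "44" and m.group("Issue") == "2"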


def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    xarticle = create_articledata()
    if source:  # coverage: condition always true in the test run
        ext_link = create_extlink(rel="source", location=source, metadata=self.source_domain)
        xarticle.ext_links.append(ext_link)
    # The meta line appears to follow "volume / number / year / pages: authors".
    match = re.search(r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text)
    if match is None:  # coverage: branch never taken in the test run
        raise ValueError("Cannot parse authors or page number")
    xarticle.page_range = match.group("Pages")
    # Splits an "A, B and C" style author string into individual names.
    authors = re.findall(r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", match.group("Authors"))
    for a in authors:
        author = create_contributor(role="author", string_name=a)
        xarticle.contributors.append(author)

    article_pdf_link = title_tag.get("href")
    if article_pdf_link is None:  # coverage: branch never taken in the test run
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(article_pdf_link, list):  # coverage: branch never taken in the test run
        raise ValueError("Article link is a list")
    pdf_link = self.source_website + article_pdf_link
    add_pdf_link_to_xarticle(xarticle, pdf_link)

    title = cleanup_str(title_tag.text)
    xarticle.title_tex = title
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle
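# Illustrative parse of an invented meta line; the "volume / number / year /
# pages: authors" shape is inferred from the regexes above, not documented.
def _example_meta_parse() -> None:
    meta = "44 / 2 / 2014 / 1-12: John Doe and Jane Roe"
    m = re.search(r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta)
    assert m is not None and m.group("Pages") == "1-12"
    names = re.findall(r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", m.group("Authors"))
    assert names == ["John Doe", "Jane Roe"]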