Coverage for src/crawler/by_source/nsjom/nsjom_2010_crawler.py: 79%

109 statements  

« prev     ^ index     » next       coverage.py v7.7.0, created at 2025-04-03 12:36 +0000

1import re 

2import typing 

3from datetime import datetime 

4 

5from bs4 import BeautifulSoup, Tag 

6from ptf.model_data import ( 

7 IssueData, 

8 create_articledata, 

9 create_contributor, 

10 create_extlink, 

11 create_issuedata, 

12) 

13 

14from crawler.utils import add_pdf_link_to_xarticle, cleanup_str 

15 

16if typing.TYPE_CHECKING: 16 ↛ 17line 16 didn't jump to line 17 because the condition on line 16 was never true

17 from ..nsjom_crawler import NsjomCrawler 

18 

19source_domain = "NSJOM" 

20 

21 

def parse_collection_content(
    self: "NsjomCrawler",
    content: str,
    source_domain: str = "NSJOM",
):
    """
    Parses all articles from year-specific webpages : https://sites.dmi.uns.ac.rs/nsjom/ns2010.html
    From 2010 to 2014 (included)

    Each year in the range gets its own page download and parse; the
    ``content`` and ``source_domain`` parameters are currently unused here
    (kept for interface compatibility with the crawler framework).
    """
    first_year = 2010
    # Never crawl past the current year, even inside the 2010-2014 window.
    last_year = min(2014, datetime.now().year)

    xissues: list[IssueData] = []
    for crawl_year in range(first_year, last_year + 1):
        year_url = f"https://sites.dmi.uns.ac.rs/nsjom/ns{crawl_year}.html"
        year_html = self.download_file(year_url)
        xissues.extend(parse_year(self, year_html, crawl_year, year_url))

    return xissues

41 

42 

def is_heading(element: Tag) -> bool:
    """Return True if *element* is (or contains) an NSJOM issue heading.

    A heading is recognised by the ``HeadingNSJOM`` CSS class, either on a
    descendant (matched via ``select_one``) or in the element's own
    ``class`` attribute, which BeautifulSoup may expose as a plain string
    or as a list of class names.
    """
    # NOTE: keep the truthiness check (not an `is not None` test) — an empty
    # bs4 Tag is falsy and the original treated that as "no heading".
    if element.select_one(".HeadingNSJOM"):
        return True

    css_class = element.get("class")
    if not css_class:
        return False

    if isinstance(css_class, str):
        # Single class name as a bare string.
        return css_class == "HeadingNSJOM"

    # Otherwise bs4 gives a list of class names.
    return "HeadingNSJOM" in css_class

58 

59 

def parse_issue_content(self, content: str, xissue: IssueData):
    """Parse one issue's page content, restricted to ``xissue.pid``.

    Delegates to :func:`parse_year`, forwarding ``xissue.pid`` as
    ``pid_to_crawl`` so only the matching issue on the year page is parsed.

    Raises:
        ValueError: if ``xissue.year`` is missing (the year is required to
            re-parse the year page).

    NOTE(review): the list returned by ``parse_year`` is discarded, so the
    freshly parsed issue (and its articles) is never merged back into the
    *xissue* passed in — confirm whether a caller relies on a side effect
    or whether the result should be assigned/merged here.
    """
    if not xissue.year:
        raise ValueError("Issue year is not set")
    parse_year(self, content, int(xissue.year), xissue.url, xissue.pid)

64 

65 

def parse_year(
    self: "NsjomCrawler",
    content: str,
    year: int,
    url: str | None = None,
    pid_to_crawl: str | None = None,
):
    """Parse a 2010-2014 NSJOM year page into a list of issues.

    Args:
        content: HTML of the year page.
        year: publication year of the page.
        url: page URL; stored on every resulting issue.
        pid_to_crawl: when set, only the issue with this pid is returned
            (others are filtered out by ``parse_issue_tags``).

    Returns:
        list[IssueData]: the issues found on the page.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Headings delimit issues; ``p.style1`` paragraphs carry article
    # metadata and the blockquote anchors carry article titles/PDF links.
    page_elements = soup.select("p.HeadingNSJOM, p.style1, p.style1+blockquote a")
    issues: list[list[Tag]] = []
    # Sort tags into issues: each heading starts a new bucket and every
    # following element belongs to the most recent bucket.
    for current_element in page_elements:
        if is_heading(current_element):
            issues.append([current_element])
            continue
        if current_element.text == "\xa0":
            # Skip non-breaking-space spacer paragraphs.
            continue
        if not issues:
            # Fix: a stray element before the first heading used to raise
            # IndexError on ``issues[-1]``; there is no issue to attach it to.
            continue
        issues[-1].append(current_element)

    xissues: list[IssueData] = []
    for issue_elements in issues:
        xissue = parse_issue_tags(self, issue_elements, year, pid_to_crawl)
        if not xissue:
            # parse_issue_tags returns None when pid_to_crawl excludes it.
            continue
        xissue.url = url
        xissues.append(xissue)

    return xissues

94 

95 

def parse_issue_tags(
    self: "NsjomCrawler", tags: list[Tag], year: int, pid_to_crawl: str | None = None
):
    """Build an IssueData from the tags belonging to one issue.

    ``tags`` starts with the issue heading, followed by alternating
    article-metadata paragraphs and article-title anchors. The list is
    consumed destructively via ``pop(0)``.

    Args:
        tags: heading tag followed by (metadata, title) tag pairs.
        year: publication year, used for the pid and the source link.
        pid_to_crawl: when set and different from the parsed issue's pid,
            parsing stops and None is returned.

    Returns:
        IssueData with its articles populated, or None when filtered out
        by ``pid_to_crawl``.

    Raises:
        ValueError: if the heading does not contain a volume number.
    """
    xissue = create_issuedata()
    xissue.year = str(year)
    ext_link = create_extlink(
        rel="source",
        location=f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        metadata=source_domain,
    )
    xissue.ext_links.append(ext_link)
    # First tag is the issue heading, e.g. "NSJOM Vol. 40, No. 2".
    issue_title_tag = tags.pop(0)
    match = re.search(
        r"NSJOM Vol\. (?P<Volume>\d+)(?:, No. (?P<Issue>\d+))?", issue_title_tag.text
    )
    if match is None:
        raise ValueError("Cannot find volume number")
    volume_number = match.group("Volume")
    issue_number = match.group("Issue")
    if volume_number:
        xissue.volume = volume_number
    else:
        raise ValueError("Cannot read volume number")
    if issue_number:
        xissue.number = issue_number
    # NOTE(review): when the heading has no issue number, ``issue_number``
    # is None and the pid becomes "..._None" — confirm this is intended.
    xissue.pid = f"{source_domain}_{year}_{volume_number}_{issue_number}"

    if pid_to_crawl and xissue.pid != pid_to_crawl:
        return

    # Remaining tags come in (metadata, title) pairs; consume them in order.
    while len(tags) > 0:
        article_meta = tags.pop(0)
        article_meta_text = cleanup_str(article_meta.text)
        if article_meta_text == "":
            continue

        article_title = tags.pop(0)
        article = parse_article(
            self,
            article_meta_text,
            article_title,
            xissue.pid,
            len(xissue.articles),  # article index within the issue
            f"https://sites.dmi.uns.ac.rs/nsjom/issue.html?year={year}",
        )
        if article is None:
            # Articles without a PDF link are skipped by parse_article.
            continue
        xissue.articles.append(article)
    return xissue

145 

146 

def parse_article(
    self: "NsjomCrawler",
    meta_text: str,
    title_tag: Tag,
    issue_pid: str,
    index: int,
    source: str | None = None,
):
    """Build an article from one metadata line and its title anchor.

    Args:
        meta_text: cleaned metadata line, shaped like
            "<n> / <n> / <n> / <pages>: <authors>".
        title_tag: anchor tag whose text is the title and whose ``href``
            is the relative PDF link.
        issue_pid: pid of the enclosing issue; the article pid is derived
            from it and ``index``.
        index: position of the article within the issue.
        source: optional source URL stored as an ext link.

    Returns:
        The populated article data, or None when the anchor has no href.

    Raises:
        ValueError: if the metadata line cannot be parsed, or the href is
            unexpectedly a list.
    """
    xarticle = create_articledata()

    if source:
        xarticle.ext_links.append(
            create_extlink(rel="source", location=source, metadata=self.source_domain)
        )

    meta_match = re.search(
        r"\d+ \/ \d+ \/ \d+ \/ (?P<Pages>\d+(?:-\d+)?): (?P<Authors>.+)", meta_text
    )
    if meta_match is None:
        raise ValueError("Cannot parse authors or page number")
    self.set_pages(xarticle, meta_match.group("Pages"))

    # Split the author string on "," and " and " separators.
    author_names = re.findall(
        r"(?: and )?((?:(?<!,)(?<! and).(?!and ))+)", meta_match.group("Authors")
    )
    for author_name in author_names:
        xarticle.contributors.append(
            create_contributor(role="author", string_name=author_name)
        )

    href = title_tag.get("href")
    if href is None:
        print("[NSJOM] article does not have a pdf")
        return None
    if isinstance(href, list):
        raise ValueError("Article link is a list")
    add_pdf_link_to_xarticle(xarticle, self.source_website + href)

    xarticle.title_tex = cleanup_str(title_tag.text)
    xarticle.pid = f"{issue_pid}a_{index}"

    return xarticle